Wikipedia River: A river to index wikipedia, closes #403.

kimchy 2010-10-03 22:22:45 +02:00
parent 425744e0db
commit c4d17860a1
21 changed files with 1227 additions and 0 deletions
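
Once the plugin is installed, a river of this type would be started by indexing a settings document into the _river index, following the usual river convention. The sketch below is illustrative only: the river name "my_wiki_river" and the "wikipedia" type key are assumptions, while the "wikipedia.url" and "index" keys mirror what the WikipediaRiver constructor further down in this commit actually parses.

import org.elasticsearch.client.Client;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

public class RegisterWikipediaRiver {
    // Hedged sketch: registers the wikipedia river; the caller supplies a connected Client.
    public static void register(Client client) throws Exception {
        client.prepareIndex("_river", "my_wiki_river", "_meta") // river name is illustrative
                .setSource(jsonBuilder()
                        .startObject()
                        .field("type", "wikipedia") // assumed river type key
                        .startObject("wikipedia")
                        .field("url", "http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2")
                        .endObject()
                        .startObject("index")
                        .field("index", "wikipedia") // defaults to the river name when omitted
                        .field("type", "page")
                        .endObject()
                        .endObject())
                .execute().actionGet();
    }
}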


@@ -148,6 +148,7 @@
<w>uuid</w>
<w>versioned</w>
<w>warmup</w>
<w>wikipedia</w>
<w>wildcards</w>
<w>xcontent</w>
<w>xson</w>


@@ -14,6 +14,7 @@
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-river-couchdb.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-river-couchdb.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-river-rabbitmq.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-river-rabbitmq.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-river-twitter.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-river-twitter.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-river-wikipedia.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-river-wikipedia.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-transport-memcached.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-transport-memcached.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-transport-thrift.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-transport-thrift.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugins-hadoop.iml" filepath="$PROJECT_DIR$/.idea/modules//plugins-hadoop.iml" />


@@ -24,6 +24,7 @@
<orderEntry type="module" module-name="plugin-river-twitter" />
<orderEntry type="module" module-name="plugin-river-couchdb" />
<orderEntry type="module" module-name="plugin-river-rabbitmq" />
<orderEntry type="module" module-name="plugin-river-wikipedia" />
<orderEntry type="module" module-name="test-integration" />
</component>
</module>


@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../plugins/river/wikipedia/build/classes/main" />
<output-test url="file://$MODULE_DIR$/../../plugins/river/wikipedia/build/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$/../../plugins/river/wikipedia">
<sourceFolder url="file://$MODULE_DIR$/../../plugins/river/wikipedia/src/main/java" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="module" module-name="elasticsearch" />
<orderEntry type="module" module-name="test-testng" scope="TEST" />
<orderEntry type="library" scope="TEST" name="testng" level="project" />
<orderEntry type="library" scope="TEST" name="hamcrest" level="project" />
</component>
</module>


@@ -0,0 +1,136 @@
dependsOn(':elasticsearch')
apply plugin: 'java'
apply plugin: 'maven'
archivesBaseName = "elasticsearch-river-wikipedia"
explodedDistDir = new File(distsDir, 'exploded')
manifest.mainAttributes("Implementation-Title": "ElasticSearch::Plugins::River::Wikipedia", "Implementation-Version": rootProject.version, "Implementation-Date": buildTimeStr)
configurations.compile.transitive = true
configurations.testCompile.transitive = true
// no need to use the resource dir
sourceSets.main.resources.srcDirs 'src/main/java'
sourceSets.test.resources.srcDirs 'src/test/java'
// add the source files to the dist jar
//jar {
// from sourceSets.main.allJava
//}
configurations {
dists
distLib {
visible = false
transitive = false
}
}
dependencies {
compile project(':elasticsearch')
testCompile project(':test-testng')
testCompile('org.testng:testng:5.10:jdk15') { transitive = false }
testCompile 'org.hamcrest:hamcrest-all:1.1'
}
test {
useTestNG()
jvmArgs = ["-ea", "-Xmx1024m"]
suiteName = project.name
listeners = ["org.elasticsearch.util.testng.Listeners"]
systemProperties["es.test.log.conf"] = System.getProperty("es.test.log.conf", "log4j-gradle.properties")
}
task explodedDist(dependsOn: [jar], description: 'Builds the plugin zip file') << {
[explodedDistDir]*.mkdirs()
copy {
from configurations.distLib
into explodedDistDir
}
// remove elasticsearch files (compile above adds the elasticsearch one)
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") }
copy {
from libsDir
into explodedDistDir
}
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*-javadoc.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*-sources.jar") }
}
task zip(type: Zip, dependsOn: ['explodedDist']) {
from(explodedDistDir) {
}
}
task release(dependsOn: [zip]) << {
ant.delete(dir: explodedDistDir)
copy {
from distsDir
into(new File(rootProject.distsDir, "plugins"))
}
}
configurations {
deployerJars
}
dependencies {
deployerJars "org.apache.maven.wagon:wagon-http:1.0-beta-2"
}
task sourcesJar(type: Jar, dependsOn: classes) {
classifier = 'sources'
from sourceSets.main.allSource
}
task javadocJar(type: Jar, dependsOn: javadoc) {
classifier = 'javadoc'
from javadoc.destinationDir
}
artifacts {
archives sourcesJar
archives javadocJar
}
uploadArchives {
repositories.mavenDeployer {
configuration = configurations.deployerJars
repository(url: rootProject.mavenRepoUrl) {
authentication(userName: rootProject.mavenRepoUser, password: rootProject.mavenRepoPass)
}
snapshotRepository(url: rootProject.mavenSnapshotRepoUrl) {
authentication(userName: rootProject.mavenRepoUser, password: rootProject.mavenRepoPass)
}
pom.project {
inceptionYear '2009'
name 'elasticsearch-plugins-river-wikipedia'
description 'Wikipedia River Plugin for ElasticSearch'
licenses {
license {
name 'The Apache Software License, Version 2.0'
url 'http://www.apache.org/licenses/LICENSE-2.0.txt'
distribution 'repo'
}
}
scm {
connection 'git://github.com/elasticsearch/elasticsearch.git'
developerConnection 'git@github.com:elasticsearch/elasticsearch.git'
url 'http://github.com/elasticsearch/elasticsearch'
}
}
pom.whenConfigured {pom ->
pom.dependencies = pom.dependencies.findAll {dep -> dep.scope != 'test' } // removes the test scoped ones
}
}
}


@@ -0,0 +1,2 @@
plugin=org.elasticsearch.plugin.river.wikipedia.WikipediaRiverPlugin


@@ -0,0 +1,40 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.river.wikipedia;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.plugins.AbstractPlugin;
/**
* @author kimchy (shay.banon)
*/
public class WikipediaRiverPlugin extends AbstractPlugin {
@Inject public WikipediaRiverPlugin() {
}
@Override public String name() {
return "river-wikipedia";
}
@Override public String description() {
return "River Wikipedia Plugin";
}
}


@@ -0,0 +1,235 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.client.action.bulk.BulkRequestBuilder;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.indices.IndexAlreadyExistsException;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;
import org.elasticsearch.river.wikipedia.support.PageCallbackHandler;
import org.elasticsearch.river.wikipedia.support.WikiPage;
import org.elasticsearch.river.wikipedia.support.WikiXMLParser;
import org.elasticsearch.river.wikipedia.support.WikiXMLParserFactory;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author kimchy (shay.banon)
*/
public class WikipediaRiver extends AbstractRiverComponent implements River {
private StringBuilder sb = new StringBuilder();
private final Client client;
private final URL url;
private final String indexName;
private final String typeName;
private final int bulkSize;
private final int dropThreshold;
private final AtomicInteger onGoingBulks = new AtomicInteger();
private volatile Thread thread;
private volatile boolean closed = false;
private volatile BulkRequestBuilder currentRequest;
@Inject public WikipediaRiver(RiverName riverName, RiverSettings settings, Client client) throws MalformedURLException {
super(riverName, settings);
this.client = client;
String url = "http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2";
if (settings.settings().containsKey("wikipedia")) {
Map<String, Object> wikipediaSettings = (Map<String, Object>) settings.settings().get("wikipedia");
url = XContentMapValues.nodeStringValue(wikipediaSettings.get("url"), url);
}
logger.info("creating wikipedia stream river for [{}]", url);
this.url = new URL(url);
if (settings.settings().containsKey("index")) {
Map<String, Object> indexSettings = (Map<String, Object>) settings.settings().get("index");
indexName = XContentMapValues.nodeStringValue(indexSettings.get("index"), riverName.name());
typeName = XContentMapValues.nodeStringValue(indexSettings.get("type"), "page");
this.bulkSize = XContentMapValues.nodeIntegerValue(settings.settings().get("bulk_size"), 100);
this.dropThreshold = XContentMapValues.nodeIntegerValue(settings.settings().get("drop_threshold"), 10);
} else {
indexName = riverName.name();
typeName = "page";
bulkSize = 100;
dropThreshold = 10;
}
}
@Override public void start() {
logger.info("starting wikipedia stream");
try {
client.admin().indices().prepareCreate(indexName).execute().actionGet();
} catch (Exception e) {
if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) {
// that's fine
} else if (ExceptionsHelper.unwrapCause(e) instanceof ClusterBlockException) {
// ok, not recovered yet..., let's start indexing and hope we recover by the first bulk
// TODO: a smarter logic can be to register for cluster event listener here, and only start sampling when the block is removed...
} else {
logger.warn("failed to create index [{}], disabling river...", e, indexName);
return;
}
}
currentRequest = client.prepareBulk();
WikiXMLParser parser = WikiXMLParserFactory.getSAXParser(url);
try {
parser.setPageCallback(new PageCallback());
} catch (Exception e) {
logger.error("failed to create parser", e);
return;
}
thread = EsExecutors.daemonThreadFactory(settings.globalSettings(), "wikipedia_slurper").newThread(new Parser(parser));
thread.start();
}
@Override public void close() {
logger.info("closing wikipedia river");
closed = true;
if (thread != null) {
thread.interrupt();
}
}
private class Parser implements Runnable {
private final WikiXMLParser parser;
private Parser(WikiXMLParser parser) {
this.parser = parser;
}
@Override public void run() {
try {
parser.parse();
} catch (Exception e) {
if (closed) {
return;
}
logger.error("failed to parse stream", e);
}
}
}
private class PageCallback implements PageCallbackHandler {
@Override public void process(WikiPage page) {
if (closed) {
return;
}
String title = stripTitle(page.getTitle());
if (logger.isTraceEnabled()) {
logger.trace("page {} : {}", page.getID(), page.getTitle());
}
try {
XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
builder.field("title", title);
builder.field("text", page.getText());
builder.field("redirect", page.isRedirect());
builder.field("special", page.isSpecialPage());
builder.field("stub", page.isStub());
builder.field("disambiguation", page.isDisambiguationPage());
builder.startArray("category");
for (String s : page.getCategories()) {
builder.value(s);
}
builder.endArray();
builder.startArray("link");
for (String s : page.getLinks()) {
builder.value(s);
}
builder.endArray();
builder.endObject();
// For now, we index (and not create) since we need to keep track of what we indexed...
currentRequest.add(Requests.indexRequest(indexName).type(typeName).id(page.getID()).create(false).source(builder));
processBulkIfNeeded();
} catch (Exception e) {
logger.warn("failed to construct index request", e);
}
}
private void processBulkIfNeeded() {
if (currentRequest.numberOfActions() >= bulkSize) {
// execute the bulk operation
int currentOnGoingBulks = onGoingBulks.incrementAndGet();
if (currentOnGoingBulks > dropThreshold) {
// TODO, just wait here!, we can slow down the wikipedia parsing
onGoingBulks.decrementAndGet();
logger.warn("dropping bulk, [{}] crossed threshold [{}]", onGoingBulks, dropThreshold);
} else {
try {
currentRequest.execute(new ActionListener<BulkResponse>() {
@Override public void onResponse(BulkResponse bulkResponse) {
onGoingBulks.decrementAndGet();
}
@Override public void onFailure(Throwable e) {
logger.warn("failed to execute bulk", e);
}
});
} catch (Exception e) {
logger.warn("failed to process bulk", e);
}
}
currentRequest = client.prepareBulk();
}
}
}
private String stripTitle(String title) {
sb.setLength(0);
sb.append(title);
while (sb.length() > 0 && (sb.charAt(sb.length() - 1) == '\n' || (sb.charAt(sb.length() - 1) == ' '))) {
sb.deleteCharAt(sb.length() - 1);
}
return sb.toString();
}
}


@@ -0,0 +1,33 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia;
import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.river.River;
/**
* @author kimchy (shay.banon)
*/
public class WikipediaRiverModule extends AbstractModule {
@Override protected void configure() {
bind(River.class).to(WikipediaRiver.class).asEagerSingleton();
}
}


@@ -0,0 +1,37 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
/**
* A class abstracting Wiki infobox
*
* @author Delip Rao
*/
public class InfoBox {
String infoBoxWikiText = null;
InfoBox(String infoBoxWikiText) {
this.infoBoxWikiText = infoBoxWikiText;
}
public String dumpRaw() {
return infoBoxWikiText;
}
}


@@ -0,0 +1,34 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
public class IteratorHandler implements PageCallbackHandler {
private WikiXMLParser parser = null;
public IteratorHandler(WikiXMLParser myParser) {
parser = myParser;
}
public void process(WikiPage page) {
parser.notifyPage(page);
}
}


@@ -0,0 +1,26 @@
package org.elasticsearch.river.wikipedia.support;
/**
* Interface to allow streamed processing of pages.
* This allows a SAX style processing of Wikipedia XML files.
* The registered callback is executed on each page
* element in the XML file.
* <p>
* Using callbacks consumes less memory, a useful feature for large
* dumps such as the English and German ones.
*
* @author Delip Rao
* @see WikiXMLSAXParser
* @see WikiPage
*/
public interface PageCallbackHandler {
/**
* This is the callback method to implement before
* registering with <code>WikiXMLSAXParser</code>.
*
* @param page a wikipedia page object
* @see WikiPage
*/
public void process(WikiPage page);
}
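
A hedged sketch of a custom handler in the streaming style described above; the class name and the redirect counter are made up for the example, but the wiring mirrors how WikipediaRiver.start() uses the parser, and the dump URL is the default from that river.

package org.elasticsearch.river.wikipedia.support;

import java.net.URL;

// Illustrative handler: counts redirect pages while the dump streams past, without building a DOM.
public class RedirectCounter implements PageCallbackHandler {
    private int redirects = 0;

    @Override
    public void process(WikiPage page) {
        if (page.isRedirect()) {
            redirects++;
        }
    }

    public static void main(String[] args) throws Exception {
        URL dump = new URL("http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2");
        WikiXMLParser parser = WikiXMLParserFactory.getSAXParser(dump);
        RedirectCounter counter = new RedirectCounter();
        parser.setPageCallback(counter);
        parser.parse();
        System.out.println("redirect pages: " + counter.redirects);
    }
}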


@@ -0,0 +1,60 @@
package org.elasticsearch.river.wikipedia.support;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* A Wrapper class for the PageCallbackHandler
*
* @author Jason Smith
*/
public class SAXPageCallbackHandler extends DefaultHandler {
private PageCallbackHandler pageHandler;
private WikiPage currentPage;
private String currentTag;
private String currentWikitext;
private String currentTitle;
private String currentID;
public SAXPageCallbackHandler(PageCallbackHandler ph) {
pageHandler = ph;
}
public void startElement(String uri, String name, String qName, Attributes attr) {
currentTag = qName;
if (qName.equals("page")) {
currentPage = new WikiPage();
currentWikitext = "";
currentTitle = "";
currentID = "";
}
}
public void endElement(String uri, String name, String qName) {
if (qName.equals("page")) {
currentPage.setTitle(currentTitle);
currentPage.setID(currentID);
currentPage.setWikiText(currentWikitext);
pageHandler.process(currentPage);
}
if (qName.equals("mediawiki")) {
// TODO hasMoreElements() should now return false
}
}
public void characters(char ch[], int start, int length) {
if (currentTag.equals("title")) {
currentTitle = currentTitle.concat(new String(ch, start, length));
}
// TODO: To avoid looking at the revision ID, only the first ID is taken.
// I'm not sure how big the block size is in each call to characters(),
// so this may be unsafe.
else if ((currentTag.equals("id")) && (currentID.length() == 0)) {
currentID = new String(ch, start, length);
} else if (currentTag.equals("text")) {
currentWikitext = currentWikitext.concat(new String(ch, start, length));
}
}
}


@@ -0,0 +1,150 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
import java.util.List;
/**
* Data structures for a wikipedia page.
*
* @author Delip Rao
*/
public class WikiPage {
private String title = null;
private WikiTextParser wikiTextParser = null;
private String id = null;
/**
* Set the page title. This is not intended for direct use.
*
* @param title
*/
public void setTitle(String title) {
this.title = title;
}
/**
* Set the wiki text associated with this page.
* This setter also introduces side effects. This is not intended for direct use.
*
* @param wtext wiki-formatted text
*/
public void setWikiText(String wtext) {
wikiTextParser = new WikiTextParser(wtext);
}
/**
* @return a string containing the page title.
*/
public String getTitle() {
return title;
}
/**
* @param languageCode
* @return a string containing the title translated
* in the given languageCode.
*/
public String getTranslatedTitle(String languageCode) {
return wikiTextParser.getTranslatedTitle(languageCode);
}
/**
* @return true if this is a disambiguation page.
*/
public boolean isDisambiguationPage() {
return title.contains("(disambiguation)") || wikiTextParser.isDisambiguationPage();
}
/**
* @return true for "special pages" -- like Category:, Wikipedia:, etc
*/
public boolean isSpecialPage() {
return title.contains(":");
}
/**
* Use this method to get the wiki text associated with this page.
* Useful for custom processing the wiki text.
*
* @return a string containing the wiki text.
*/
public String getWikiText() {
return wikiTextParser.getText();
}
/**
* @return true if this is a redirection page
*/
public boolean isRedirect() {
return wikiTextParser.isRedirect();
}
/**
* @return true if this is a stub page
*/
public boolean isStub() {
return wikiTextParser.isStub();
}
/**
* @return the title of the page being redirected to.
*/
public String getRedirectPage() {
return wikiTextParser.getRedirectText();
}
/**
* @return plain text stripped of all wiki formatting.
*/
public String getText() {
return wikiTextParser.getPlainText();
}
/**
* @return a list of categories the page belongs to, or null if this is a redirection/disambiguation page
*/
public List<String> getCategories() {
return wikiTextParser.getCategories();
}
/**
* @return a list of links contained in the page
*/
public List<String> getLinks() {
return wikiTextParser.getLinks();
}
public void setID(String id) {
this.id = id;
}
public InfoBox getInfoBox() {
return wikiTextParser.getInfoBox();
}
public String getID() {
return id;
}
}


@@ -0,0 +1,47 @@
package org.elasticsearch.river.wikipedia.support;
import java.util.Vector;
/**
* A class to iterate the pages after the wikipedia XML file has been parsed with a {@link WikiXMLParser}.
*
* @author Delip Rao
* @see WikiXMLParser
*/
public class WikiPageIterator {
private int currentPage = 0;
private int lastPage = 0;
Vector<WikiPage> pageList = null;
public WikiPageIterator(Vector<WikiPage> list) {
pageList = list;
if (pageList != null)
lastPage = pageList.size();
}
/**
* @return true if there are more pages to be read
*/
public boolean hasMorePages() {
return (currentPage < lastPage);
}
/**
* Reset the iterator.
*/
public void reset() {
currentPage = 0;
}
/**
* Advances the iterator by one position.
*
* @return a {@link WikiPage}
*/
public WikiPage nextPage() {
if (hasMorePages())
return pageList.elementAt(currentPage++);
return null;
}
}


@@ -0,0 +1,196 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* For internal use only -- Used by the {@link WikiPage} class.
* Can also be used as a stand alone class to parse wiki formatted text.
*
* @author Delip Rao
*/
public class WikiTextParser {
private String wikiText = null;
private ArrayList<String> pageCats = null;
private ArrayList<String> pageLinks = null;
private boolean redirect = false;
private String redirectString = null;
private static Pattern redirectPattern =
Pattern.compile("#REDIRECT\\s+\\[\\[(.*?)\\]\\]");
private boolean stub = false;
private boolean disambiguation = false;
private static Pattern stubPattern = Pattern.compile("\\-stub\\}\\}");
private static Pattern disambCatPattern = Pattern.compile("\\{\\{disambig\\}\\}");
private InfoBox infoBox = null;
public WikiTextParser(String wtext) {
wikiText = wtext;
Matcher matcher = redirectPattern.matcher(wikiText);
if (matcher.find()) {
redirect = true;
if (matcher.groupCount() == 1)
redirectString = matcher.group(1);
}
matcher = stubPattern.matcher(wikiText);
stub = matcher.find();
matcher = disambCatPattern.matcher(wikiText);
disambiguation = matcher.find();
}
public boolean isRedirect() {
return redirect;
}
public boolean isStub() {
return stub;
}
public String getRedirectText() {
return redirectString;
}
public String getText() {
return wikiText;
}
public ArrayList<String> getCategories() {
if (pageCats == null) parseCategories();
return pageCats;
}
public ArrayList<String> getLinks() {
if (pageLinks == null) parseLinks();
return pageLinks;
}
private void parseCategories() {
pageCats = new ArrayList<String>();
Pattern catPattern = Pattern.compile("\\[\\[Category:(.*?)\\]\\]", Pattern.MULTILINE);
Matcher matcher = catPattern.matcher(wikiText);
while (matcher.find()) {
String[] temp = matcher.group(1).split("\\|");
pageCats.add(temp[0]);
}
}
private void parseLinks() {
pageLinks = new ArrayList<String>();
Pattern catPattern = Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.MULTILINE);
Matcher matcher = catPattern.matcher(wikiText);
while (matcher.find()) {
String[] temp = matcher.group(1).split("\\|");
if (temp == null || temp.length == 0) continue;
String link = temp[0];
if (link.contains(":") == false) {
pageLinks.add(link);
}
}
}
public String getPlainText() {
String text = wikiText.replaceAll("&gt;", ">");
text = text.replaceAll("&lt;", "<");
text = text.replaceAll("<ref>.*?</ref>", " ");
text = text.replaceAll("</?.*?>", " ");
text = text.replaceAll("\\{\\{.*?\\}\\}", " ");
text = text.replaceAll("\\[\\[.*?:.*?\\]\\]", " ");
text = text.replaceAll("\\[\\[(.*?)\\]\\]", "$1");
text = text.replaceAll("\\s(.*?)\\|(\\w+\\s)", " $2");
text = text.replaceAll("\\[.*?\\]", " ");
text = text.replaceAll("\\'+", "");
return text;
}
public InfoBox getInfoBox() {
//parseInfoBox is expensive. Doing it only once like other parse* methods
if (infoBox == null)
infoBox = parseInfoBox();
return infoBox;
}
private InfoBox parseInfoBox() {
String INFOBOX_CONST_STR = "{{Infobox";
int startPos = wikiText.indexOf(INFOBOX_CONST_STR);
if (startPos < 0) return null;
int bracketCount = 2;
int endPos = startPos + INFOBOX_CONST_STR.length();
for (; endPos < wikiText.length(); endPos++) {
switch (wikiText.charAt(endPos)) {
case '}':
bracketCount--;
break;
case '{':
bracketCount++;
break;
default:
}
if (bracketCount == 0) break;
}
String infoBoxText = wikiText.substring(startPos, endPos + 1);
infoBoxText = stripCite(infoBoxText); // strip clumsy {{cite}} tags
// strip any html formatting
infoBoxText = infoBoxText.replaceAll("&gt;", ">");
infoBoxText = infoBoxText.replaceAll("&lt;", "<");
infoBoxText = infoBoxText.replaceAll("<ref.*?>.*?</ref>", " ");
infoBoxText = infoBoxText.replaceAll("</?.*?>", " ");
return new InfoBox(infoBoxText);
}
private String stripCite(String text) {
String CITE_CONST_STR = "{{cite";
int startPos = text.indexOf(CITE_CONST_STR);
if (startPos < 0) return text;
int bracketCount = 2;
int endPos = startPos + CITE_CONST_STR.length();
for (; endPos < text.length(); endPos++) {
switch (text.charAt(endPos)) {
case '}':
bracketCount--;
break;
case '{':
bracketCount++;
break;
default:
}
if (bracketCount == 0) break;
}
text = text.substring(0, startPos - 1) + text.substring(endPos);
return stripCite(text);
}
public boolean isDisambiguationPage() {
return disambiguation;
}
public String getTranslatedTitle(String languageCode) {
Pattern pattern = Pattern.compile("^\\[\\[" + languageCode + ":(.*?)\\]\\]$", Pattern.MULTILINE);
Matcher matcher = pattern.matcher(wikiText);
if (matcher.find()) {
return matcher.group(1);
}
return null;
}
}
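
The class javadoc above notes that the parser can also be used standalone on wiki-formatted text. A small hedged sketch of that usage follows; the class name and snippet string are invented, and the printed values in the comments are approximate.

package org.elasticsearch.river.wikipedia.support;

// Illustrative standalone use of WikiTextParser on a tiny, made-up wiki snippet.
public class WikiTextParserExample {
    public static void main(String[] args) {
        WikiTextParser p = new WikiTextParser(
                "'''Example''' is a page about examples. [[Category:Samples]] See also [[Other page]].");
        System.out.println(p.isRedirect());    // false: no #REDIRECT directive
        System.out.println(p.getCategories()); // [Samples]
        System.out.println(p.getLinks());      // [Other page]; links containing ':' are skipped
        System.out.println(p.getPlainText());  // roughly "Example is a page about examples. See also Other page."
    }
}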


@@ -0,0 +1,92 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
import org.elasticsearch.common.compress.bzip2.CBZip2InputStream;
import org.xml.sax.InputSource;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.zip.GZIPInputStream;
/**
* @author Delip Rao
* @author Jason Smith
*/
public abstract class WikiXMLParser {
private URL wikiXMLFile = null;
protected WikiPage currentPage = null;
public WikiXMLParser(URL fileName) {
wikiXMLFile = fileName;
}
/**
* Set a callback handler. The callback is executed every time a
* page instance is detected in the stream. Custom handlers are
* implementations of {@link PageCallbackHandler}
*
* @param handler
* @throws Exception
*/
public abstract void setPageCallback(PageCallbackHandler handler) throws Exception;
/**
* The main parse method.
*
* @throws Exception
*/
public abstract void parse() throws Exception;
/**
* @return an iterator to the list of pages
* @throws Exception
*/
public abstract WikiPageIterator getIterator() throws Exception;
/**
* @return An InputSource created from wikiXMLFile
* @throws Exception
*/
protected InputSource getInputSource() throws Exception {
BufferedReader br = null;
if (wikiXMLFile.toExternalForm().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(wikiXMLFile.openStream()), "UTF-8"));
} else if (wikiXMLFile.toExternalForm().endsWith(".bz2")) {
InputStream fis = wikiXMLFile.openStream();
byte[] ignoreBytes = new byte[2];
fis.read(ignoreBytes); //"B", "Z" bytes from commandline tools
br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF-8"));
} else {
br = new BufferedReader(new InputStreamReader(wikiXMLFile.openStream(), "UTF-8"));
}
return new InputSource(br);
}
protected void notifyPage(WikiPage page) {
currentPage = page;
}
}


@@ -0,0 +1,14 @@
package org.elasticsearch.river.wikipedia.support;
import java.net.URL;
/**
* @author Delip Rao
*/
public class WikiXMLParserFactory {
public static WikiXMLParser getSAXParser(URL fileName) {
return new WikiXMLSAXParser(fileName);
}
}


@@ -0,0 +1,97 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.wikipedia.support;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import java.net.URL;
/**
* A SAX Parser for Wikipedia XML dumps.
*
* @author Jason Smith
*/
public class WikiXMLSAXParser extends WikiXMLParser {
private XMLReader xmlReader;
private PageCallbackHandler pageHandler = null;
public WikiXMLSAXParser(URL fileName) {
super(fileName);
try {
xmlReader = XMLReaderFactory.createXMLReader();
pageHandler = new IteratorHandler(this);
} catch (SAXException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* Set a callback handler. The callback is executed every time a
* page instance is detected in the stream. Custom handlers are
* implementations of {@link PageCallbackHandler}
*
* @param handler
* @throws Exception
*/
public void setPageCallback(PageCallbackHandler handler) throws Exception {
pageHandler = handler;
}
/**
* The main parse method.
*
* @throws Exception
*/
public void parse() throws Exception {
xmlReader.setContentHandler(new SAXPageCallbackHandler(pageHandler));
xmlReader.parse(getInputSource());
}
/**
* This parser is event driven, so it
* can't provide a page iterator.
*/
@Override
public WikiPageIterator getIterator() throws Exception {
if (!(pageHandler instanceof IteratorHandler)) {
throw new Exception("Custom page callback found. Will not iterate.");
}
throw new UnsupportedOperationException();
}
/**
* A convenience method for the Wikipedia SAX interface
*
* @param dumpFile - path to the Wikipedia dump
* @param handler - callback handler used for parsing
* @throws Exception
*/
public static void parseWikipediaDump(URL dumpFile,
PageCallbackHandler handler) throws Exception {
WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(dumpFile);
wxsp.setPageCallback(handler);
wxsp.parse();
}
}
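
A short hedged usage of the static convenience method above, with an anonymous handler; the class name is made up, and the dump URL is the default used by WikipediaRiver.

package org.elasticsearch.river.wikipedia.support;

import java.net.URL;

// Illustrative: print each page title as the dump streams past.
public class DumpTitles {
    public static void main(String[] args) throws Exception {
        URL dump = new URL("http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2");
        WikiXMLSAXParser.parseWikipediaDump(dump, new PageCallbackHandler() {
            @Override
            public void process(WikiPage page) {
                System.out.println(page.getTitle());
            }
        });
    }
}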


@@ -0,0 +1,6 @@
/**
* Copied from wikixmlj on 2010-10-03.
*
* Changed from File handling to URL handling, and removed Dom parser.
*/
package org.elasticsearch.river.wikipedia.support;


@@ -21,6 +21,7 @@ include 'plugins-transport-memcached'
include 'plugins-transport-thrift'
include 'plugins-river-twitter'
include 'plugins-river-wikipedia'
include 'plugins-river-rabbitmq'
include 'plugins-river-couchdb'