lucene/solr/bin/post

229 lines
7.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====== Common code copied/adapted from bin/solr (TODO: centralize/share this kind of thing across bin/solr, etc)
THIS_SCRIPT="$0"
# Resolve symlinks to this script
while [ -h "$THIS_SCRIPT" ] ; do
ls=`ls -ld "$THIS_SCRIPT"`
# Drop everything prior to ->
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
THIS_SCRIPT="$link"
else
THIS_SCRIPT=`dirname "$THIS_SCRIPT"`/"$link"
fi
done
SOLR_TIP=`dirname "$THIS_SCRIPT"`/..
SOLR_TIP=`cd "$SOLR_TIP"; pwd`
if [ -n "$SOLR_JAVA_HOME" ]; then
JAVA="$SOLR_JAVA_HOME/bin/java"
elif [ -n "$JAVA_HOME" ]; then
for java in "$JAVA_HOME"/bin/amd64/java "$JAVA_HOME"/bin/java; do
if [ -x "$java" ]; then
JAVA="$java"
break
fi
done
else
JAVA=java
fi
# test that Java exists and is executable on this server
"$JAVA" -version >/dev/null 2>&1 || { echo >&2 "Java is required to run this tool! Please install Java 8 or greater before running this script."; exit 1; }
# ===== post specific code
TOOL_JAR=("$SOLR_TIP/dist"/solr-core-*.jar)
function print_usage() {
echo ""
echo 'Usage: post -c <collection> [OPTIONS] <files|directories|urls|-d ["...",...]>'
echo " or post -help"
echo ""
echo " collection name defaults to DEFAULT_SOLR_COLLECTION if not specified"
echo ""
echo "OPTIONS"
echo "======="
echo " Solr options:"
echo " -url <base Solr update URL> (overrides collection, host, and port)"
echo " -host <host> (default: localhost)"
echo " -p or -port <port> (default: 8983)"
echo " -commit yes|no (default: yes)"
# optimize intentionally omitted, but can be used as '-optimize yes' (default: no)
echo ""
echo " Web crawl options:"
echo " -recursive <depth> (default: 1)"
echo " -delay <seconds> (default: 10)"
echo ""
echo " Directory crawl options:"
echo " -delay <seconds> (default: 0)"
echo ""
echo " stdin/args options:"
echo " -type <content/type> (default: application/xml)"
echo ""
echo " Other options:"
echo " -filetypes <type>[,<type>,...] (default: xml,json,jsonl,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log)"
echo " -params \"<key>=<value>[&<key>=<value>...]\" (values must be URL-encoded; these pass through to Solr update request)"
echo " -out yes|no (default: no; yes outputs Solr response to console)"
echo " -format solr (sends application/json content as Solr commands to /update instead of /update/json/docs)"
echo ""
echo ""
echo "Examples:"
echo ""
echo "* JSON file: $THIS_SCRIPT -c wizbang events.json"
echo "* XML files: $THIS_SCRIPT -c records article*.xml"
echo "* CSV file: $THIS_SCRIPT -c signals LATEST-signals.csv"
echo "* Directory of files: $THIS_SCRIPT -c myfiles ~/Documents"
echo "* Web crawl: $THIS_SCRIPT -c gettingstarted http://lucene.apache.org/solr -recursive 1 -delay 1"
echo "* Standard input (stdin): echo '{"commit": {}}' | $THIS_SCRIPT -c my_collection -type application/json -out yes -d"
echo "* Data as string: $THIS_SCRIPT -c signals -type text/csv -out yes -d $'id,value\n1,0.47'"
echo ""
} # end print_usage
if [[ $# -eq 1 && ("$1" == "-help" || "$1" == "-h" || "$1" == "-usage") ]]; then
print_usage
exit
fi
COLLECTION="$DEFAULT_SOLR_COLLECTION"
PROPS=('-Dauto=yes')
RECURSIVE=""
FILES=()
URLS=()
ARGS=()
while [ $# -gt 0 ]; do
# TODO: natively handle the optional parameters to SPT
# but for now they can be specified as bin/post -c collection-name delay=5 http://lucidworks.com
if [[ -d "$1" ]]; then
# Directory
# echo "$1: DIRECTORY"
RECURSIVE=yes
FILES+=("$1")
elif [[ -f "$1" ]]; then
# File
# echo "$1: FILE"
FILES+=("$1")
elif [[ "$1" == http* ]]; then
# URL
# echo "$1: URL"
URLS+=("$1")
else
if [[ "$1" == -* ]]; then
if [[ "$1" == "-c" ]]; then
# Special case, pull out collection name
shift
COLLECTION="$1"
elif [[ "$1" == "-p" ]]; then
# -p alias for -port for convenience and compatibility with `bin/solr start`
shift
PROPS+=("-Dport=$1")
elif [[ ("$1" == "-d" || "$1" == "--data" || "$1" == "-") ]]; then
if [[ ! -t 0 ]]; then
MODE="stdin"
else
# when no stdin exists and -d specified, the rest of the arguments
# are assumed to be strings to post as-is
MODE="args"
shift
if [[ $# -gt 0 ]]; then
ARGS=("$@")
shift $#
else
# SPT needs a valid args string, useful for 'bin/post -c foo -d' to force a commit
ARGS+=("<add/>")
fi
fi
else
key="${1:1}"
shift
# echo "$1: PROP"
PROPS+=("-D$key=$1")
if [[ "$key" == "url" ]]; then
SOLR_URL=$1
fi
fi
else
echo -e "\nUnrecognized argument: $1\n"
echo -e "If this was intended to be a data file, it does not exist relative to $PWD\n"
exit 1
fi
fi
shift
done
# Check for errors
if [[ $COLLECTION == "" && $SOLR_URL == "" ]]; then
echo -e "\nCollection or URL must be specified. Use -c <collection name> or set DEFAULT_SOLR_COLLECTION in your environment, or use -url instead.\n"
echo -e "See '$THIS_SCRIPT -h' for usage instructions.\n"
exit 1
fi
# Unsupported: bin/post -c foo
if [[ ${#FILES[@]} == 0 && ${#URLS[@]} == 0 && $MODE != "stdin" && $MODE != "args" ]]; then
echo -e "\nNo files, directories, URLs, -d strings, or stdin were specified.\n"
echo -e "See '$THIS_SCRIPT -h' for usage instructions.\n"
exit 1
fi
# SPT does not support mixing different data mode types, just files, just URLs, just stdin, or just argument strings.
# The following are unsupported constructs:
# bin/post -c foo existing_file.csv http://example.com
# echo '<xml.../>' | bin/post -c foo existing_file.csv
# bin/post -c foo existing_file.csv -d 'anything'
if [[ (${#FILES[@]} != 0 && ${#URLS[@]} != 0 && $MODE != "stdin" && $MODE != "args")
|| ((${#FILES[@]} != 0 || ${#URLS[@]} != 0) && ($MODE == "stdin" || $MODE == "args")) ]]; then
echo -e "\nCombining files/directories, URLs, stdin, or args is not supported. Post them separately.\n"
exit 1
fi
PARAMS=""
# TODO: let's simplify this
if [[ $MODE != "stdin" && $MODE != "args" ]]; then
if [[ $FILES != "" ]]; then
MODE="files"
PARAMS=("${FILES[@]}")
fi
if [[ $URLS != "" ]]; then
MODE="web"
PARAMS=("${URLS[@]}")
fi
else
PARAMS=("${ARGS[@]}")
fi
PROPS+=("-Dc=$COLLECTION" "-Ddata=$MODE")
if [[ -n "$RECURSIVE" ]]; then
PROPS+=('-Drecursive=yes')
fi
echo "$JAVA" -classpath "${TOOL_JAR[0]}" "${PROPS[@]}" org.apache.solr.util.SimplePostTool "${PARAMS[@]}"
"$JAVA" -classpath "${TOOL_JAR[0]}" "${PROPS[@]}" org.apache.solr.util.SimplePostTool "${PARAMS[@]}"
# post smoker:
# bin/post -c signals -out yes -type application/json -d '[{"id": 2, "val": 0.47}]'
# bin/post -c signals -out yes -params "wt=json" -d '<add><doc><field name="id">1</field></doc></add>'