SOLR-6127: Improve example docs, using films data
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1647918 13f79535-47bb-0310-9956-ffa450edef68
parent 53ee828c86
commit 6d665f146a

@@ -0,0 +1,71 @@
We have a movie data set in JSON, Solr XML, and CSV formats.
All 3 formats contain the same data. You can use any one format to index documents to Solr.

The data is fetched from Freebase and the data license is present in the films-LICENSE.txt file.

This data consists of the following fields -
 * "id" - unique identifier for the movie
 * "name" - name of the movie
 * "directed_by" - the person(s) who directed the making of the film
 * "initial_release_date" - the earliest official initial film screening date in any country
 * "genre" - the genre(s) that the movie belongs to
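For a concrete sense of the data, here is one record as it appears in the JSON format (this is the hand-added first entry from exampledocs_generator.py, with the formatting condensed):

  {
    "id": "/en/001",
    "name": "The Grand Budapest Hotel",
    "directed_by": ["Wes Anderson"],
    "initial_release_date": "2014-03-28",
    "genre": ["Comedy"]
  }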

Steps:
 * Start Solr:
     bin/solr start

 * Create a "films" core:
     bin/solr create_core -n films -c data_driven_schema_configs

 * Update the schema (by default it will guess the field types based on the data as it is indexed):
     curl http://localhost:8983/solr/films/schema/fields -X POST -H 'Content-type:application/json' --data-binary '
     [
         {
             "name":"genre",
             "type":"string",
             "stored":true,
             "multiValued":true
         },
         {
             "name":"directed_by",
             "type":"string",
             "stored":true,
             "multiValued":true
         },
         {
             "name":"name",
             "type":"text_general",
             "stored":true
         },
         {
             "name":"initial_release_date",
             "type":"tdate",
             "stored":true
         }
     ]'
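   To verify the fields were registered, a GET against the same Schema API endpoint should list them (the exact response shape depends on your Solr version):

     curl http://localhost:8983/solr/films/schema/fields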

 * Now let's index the data. You can run any of the following commands from the example/exampledocs directory.

   For JSON -
     curl 'http://localhost:8983/solr/films/update?commit=true' --data-binary @films.json -H 'Content-type:application/json'

   For XML -
     curl 'http://localhost:8983/solr/films/update?commit=true' --data-binary @films.xml -H 'Content-type:text/xml'

   For CSV -
     curl 'http://localhost:8983/solr/films/update?f.genre.split=true&f.directed_by.split=true&f.genre.separator=|&f.directed_by.separator=|&commit=true' --data-binary @films.csv -H 'Content-type:text/csv; charset=utf-8'
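   As an optional sanity check, the numFound value returned by an empty query should match the number of films indexed:

     curl 'http://localhost:8983/solr/films/query?q=*:*&rows=0'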

 * Let's get searching.
   - Search for 'Batman':
     http://localhost:8983/solr/films/query?q=name:batman

   - Show me all 'Superhero movie' films:
     http://localhost:8983/solr/films/query?q=*:*&fq=genre:%22Superhero%20movie%22

   - Let's see the distribution of genres across all the movies. See the facet section of the response for the counts:
     http://localhost:8983/solr/films/query?q=*:*&facet=true&facet.field=genre
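   - One further example (not part of the original walkthrough): since initial_release_date is a tdate field, standard Solr date range syntax can filter on it, e.g. films released in 2014 or later:
     http://localhost:8983/solr/films/query?q=*:*&fq=initial_release_date:[2014-01-01T00:00:00Z%20TO%20*]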

Exploring the data further -

 * Increase the MAX_ITERATIONS value, put in your Freebase API_KEY, and run the exampledocs_generator.py script using Python 3.
   Then re-index Solr with the new data.
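   If you want the re-index to start from a clean slate (optional), you can delete the existing documents first:

     curl 'http://localhost:8983/solr/films/update?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>*:*</query></delete>'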

@@ -0,0 +1,121 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
|
||||
This will generate a movie data set of 1100 records.
|
||||
These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
|
||||
Here is the link to the freebase page - https://www.freebase.com/film/film?schema=
|
||||
|
||||
Usage - python3 exampledocs_generator.py
|
||||
"""
|
||||
|
||||
import csv
import copy
import json
import datetime
import urllib.parse
import urllib.request
import xml.etree.cElementTree as ET
from xml.dom import minidom

MAX_ITERATIONS = 10  # 10 iterations limits it to 1100 docs

# You need a Google API key to run this
API_KEY = '<insert your Google developer API key>'
service_url = 'https://www.googleapis.com/freebase/v1/mqlread'

# MQL query: films released after 2000, with their names, directors and genres
query = [{
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>": "2000"
}]
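
# For reference (inferred from do_query below, not from official Freebase docs):
# each mqlread response is a JSON envelope of the form
#   {"result": [ ...film dicts... ], "cursor": "<token for the next page>"}
# so pagination continues until the service returns a falsy cursor.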

def gen_csv(filmlist):
    filmlistDup = copy.deepcopy(filmlist)
    # Convert multi-valued fields to a pipe-delimited string
    for film in filmlistDup:
        for key in film:
            if isinstance(film[key], list):
                film[key] = '|'.join(film[key])
    keys = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
    with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
        dict_writer = csv.DictWriter(csvfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(filmlistDup)
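
# Note: the '|' separator above matches the f.genre.separator=| and
# f.directed_by.separator=| parameters in the README's CSV indexing command,
# which split these columns back into multi-valued fields at index time.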

def gen_json(filmlist):
    # No transformation needed; write the list out as-is
    with open('films.json', 'w') as jsonfile:
        jsonfile.write(json.dumps(filmlist, indent=2))

def gen_xml(filmlist):
    # Build a Solr XML update message: <add><doc><field name="...">value</field>...</doc>...</add>
    root = ET.Element("add")
    for film in filmlist:
        doc = ET.SubElement(root, "doc")
        for key in film:
            if isinstance(film[key], list):
                for value in film[key]:
                    field = ET.SubElement(doc, "field")
                    field.set("name", key)
                    field.text = value
            else:
                field = ET.SubElement(doc, "field")
                field.set("name", key)
                field.text = film[key]
    tree = ET.ElementTree(root)
    with open('films.xml', 'w') as f:
        f.write(minidom.parseString(ET.tostring(tree.getroot(), 'utf-8')).toprettyxml(indent="  "))
def do_query(filmlist, cursor=""):
|
||||
params = {
|
||||
'query': json.dumps(query),
|
||||
'key': API_KEY,
|
||||
'cursor': cursor
|
||||
}
|
||||
url = service_url + '?' + urllib.parse.urlencode(params)
|
||||
data = urllib.request.urlopen(url).read().decode('utf-8')
|
||||
response = json.loads(data)
|
||||
for item in response['result']:
|
||||
del item['type'] # It's always /film/film. No point of adding this.
|
||||
try:
|
||||
datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")
|
||||
except ValueError:
|
||||
#Date time not formatted properly. Keeping it simple by removing the date field from that doc
|
||||
del item['initial_release_date']
|
||||
filmlist.append(item)
|
||||
return response.get("cursor")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
filmlist = []
|
||||
#Adding 1 entry manually to play nice with schemaless mode
|
||||
firstFilm = {'directed_by': ['Wes Anderson'], 'initial_release_date': '2014-03-28', 'genre': ['Comedy'],
|
||||
'name': 'The Grand Budapest Hotel', 'id': '/en/001'}
|
||||
filmlist.append(firstFilm)
|
||||
cursor = do_query(filmlist)
|
||||
i=0
|
||||
while(cursor):
|
||||
cursor = do_query(filmlist, cursor)
|
||||
i = i+1
|
||||
if i==MAX_ITERATIONS:
|
||||
break
|
||||
|
||||
gen_json(filmlist)
|
||||
gen_csv(filmlist)
|
||||
gen_xml(filmlist)
|
|

@@ -0,0 +1,3 @@
The films data (films.json/.xml/.csv) is licensed under the Creative Commons Attribution 2.5 Generic License.
To view a copy of this license, visit http://creativecommons.org/licenses/by/2.5/
or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA.

(3 file diffs suppressed because they are too large)