mirror of https://github.com/apache/lucene.git
117 lines
3.7 KiB
Python
117 lines
3.7 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
This will generate a movie data set of 1100 records.
|
|
These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
|
|
Here is the link to the freebase page - https://www.freebase.com/film/film?schema=
|
|
|
|
Usage - python3 film_data_generator.py
|
|
"""
|
|
|
|
import codecs
import copy
import csv
import datetime
import json
import urllib.parse
import urllib.request
# xml.etree.cElementTree was a deprecated alias and was removed in Python 3.9;
# plain ElementTree has used the C accelerator automatically since Python 3.3.
import xml.etree.ElementTree as ET
from xml.dom import minidom
|
|
|
# Stop paging after this many follow-up requests; 10 keeps the data set at 1100 docs.
MAX_ITERATIONS = 10

# You need an API Key by Google to run this
API_KEY = '<insert your Google developer API key>'

service_url = 'https://www.googleapis.com/freebase/v1/mqlread'

# MQL query: every /film/film released after 2000, with the fields we export.
# `None` requests a single value, `[]` requests all values of a multi-valued field.
query = [{
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>": "2000",
}]
|
|
|
|
def gen_csv(filmlist):
    """Write *filmlist* to films.csv.

    Multi-valued fields (lists) are flattened to a single '|'-delimited
    string so each film fits on one CSV row.  The input list is deep-copied
    first so the caller's dicts are never mutated.
    """
    films = copy.deepcopy(filmlist)
    # Convert multi-valued fields to a '|'-delimited string.
    # (The original comment said '%'-delimited, but the join uses '|'.)
    for film in films:
        for key, value in film.items():
            if isinstance(value, list):
                film[key] = '|'.join(value)
    keys = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
    with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
        dict_writer = csv.DictWriter(csvfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(films)
|
|
|
|
def gen_json(filmlist):
    """Write *filmlist* to films.json as pretty-printed JSON.

    The original built an unused deepcopy (`filmlistDup`) and then serialized
    the original list anyway; serialization never mutates the input, so the
    copy was dead code and has been removed.
    """
    with open('films.json', 'w', encoding='utf8') as jsonfile:
        json.dump(filmlist, jsonfile, indent=2)
|
|
|
|
def gen_xml(filmlist):
    """Write *filmlist* to films.xml as a Solr-style <add> document.

    Each film becomes a <doc> element; each scalar value becomes one
    <field name="..."> child and each list value becomes one <field>
    child per list item.
    """

    def add_field(doc, name, text):
        # One <field name="..."> element per value.
        field = ET.SubElement(doc, "field")
        field.set("name", name)
        field.text = text

    root = ET.Element("add")
    for film in filmlist:
        doc = ET.SubElement(root, "doc")
        for key, value in film.items():
            if isinstance(value, list):
                for item in value:
                    add_field(doc, key, item)
            else:
                add_field(doc, key, value)

    tree = ET.ElementTree(root)
    pretty = minidom.parseString(ET.tostring(tree.getroot(), 'utf-8')).toprettyxml(indent=" ")
    with open('films.xml', 'w') as f:
        f.write(pretty)
|
|
|
|
def do_query(filmlist, cursor=""):
    """Fetch one page of /film/film results from the Freebase MQL API and
    append the cleaned items to *filmlist* (mutated in place).

    Returns the cursor for the next page; falsy when the result set is
    exhausted.  Raises urllib.error.URLError on network failure.
    """
    params = {
        'query': json.dumps(query),
        'key': API_KEY,
        'cursor': cursor,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    response = json.loads(data)
    for item in response.get('result', []):
        # It's always /film/film. No point of adding this.
        # pop() instead of del: tolerate a missing key.
        item.pop('type', None)
        try:
            datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")
        except (ValueError, TypeError, KeyError):
            # The MQL query requests the date with `None`, so films without a
            # release date arrive as None and strptime raises TypeError (the
            # original only caught ValueError and crashed).  Keeping it simple
            # by removing the badly-formatted/missing date field from that doc.
            item.pop('initial_release_date', None)
        filmlist.append(item)
    return response.get("cursor")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
filmlist = []
|
|
cursor = do_query(filmlist)
|
|
i=0
|
|
while(cursor):
|
|
cursor = do_query(filmlist, cursor)
|
|
i = i+1
|
|
if i==MAX_ITERATIONS:
|
|
break
|
|
|
|
gen_json(filmlist)
|
|
gen_csv(filmlist)
|
|
gen_xml(filmlist) |