# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Generates a movie data set of 1100 records. These are the first 1100 movies
returned when querying Freebase for entities of type '/film/film'.
Link to the Freebase schema page: https://www.freebase.com/film/film?schema=

Usage:
    python3 film_data_generator.py
"""

import csv
import copy
import json
import datetime
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from xml.dom import minidom

MAX_ITERATIONS = 10  # 10 extra pages of 100 results each limits the output to 1100 docs

# You need an API key from Google to run this
API_KEY = ''

service_url = 'https://www.googleapis.com/freebase/v1/mqlread'

query = [{
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>": "2000"
}]


def gen_csv(filmlist):
    filmlistDup = copy.deepcopy(filmlist)
    # Convert multi-valued fields to a '|'-delimited string
    for film in filmlistDup:
        for key in film:
            if isinstance(film[key], list):
                film[key] = '|'.join(film[key])
    # 'type' is stripped from every doc in do_query(), so it is not a column here
    keys = ['name', 'directed_by', 'genre', 'id', 'initial_release_date']
    with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
        dict_writer = csv.DictWriter(csvfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(filmlistDup)


def gen_json(filmlist):
    with open('films.json', 'w', encoding='utf8') as jsonfile:
        jsonfile.write(json.dumps(filmlist, indent=2))


def gen_xml(filmlist):
    root = ET.Element("add")
    for film in filmlist:
        doc = ET.SubElement(root, "doc")
        for key in film:
            if isinstance(film[key], list):
                # Multi-valued field: emit one <field> element per value
                for value in film[key]:
                    field = ET.SubElement(doc, "field")
                    field.set("name", key)
                    field.text = value
            else:
                field = ET.SubElement(doc, "field")
                field.set("name", key)
                field.text = film[key]
    tree = ET.ElementTree(root)
    with open('films.xml', 'w', encoding='utf8') as f:
        f.write(
            minidom.parseString(ET.tostring(tree.getroot(), 'utf-8')).toprettyxml(indent="  ")
        )


def do_query(filmlist, cursor=""):
    params = {
        'query': json.dumps(query),
        'key': API_KEY,
        'cursor': cursor
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    response = json.loads(data)
    for item in response['result']:
        del item['type']  # It's always /film/film. No point in adding this.
        try:
            datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")
        except ValueError:
            # Date not formatted properly. Keeping it simple by removing the
            # date field from that doc.
            del item['initial_release_date']
        filmlist.append(item)
    return response.get("cursor")


if __name__ == "__main__":
    filmlist = []
    cursor = do_query(filmlist)
    i = 0
    while cursor:
        cursor = do_query(filmlist, cursor)
        i += 1
        if i == MAX_ITERATIONS:
            break
    gen_json(filmlist)
    gen_csv(filmlist)
    gen_xml(filmlist)