lucene/solr/example/films/film_data_generator.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script generates a movie data set of 1100 records.
These are the first 1100 movies returned when querying Freebase for objects of type '/film/film'.
Here is the link to the Freebase page - https://www.freebase.com/film/film?schema=

Usage - python3 film_data_generator.py
"""

import codecs
import copy
import csv
import datetime
import json
import urllib.parse
import urllib.request
from xml.dom import minidom

try:
  import xml.etree.cElementTree as ET
except ImportError:
  # xml.etree.cElementTree was removed in Python 3.9; ElementTree is the replacement.
  import xml.etree.ElementTree as ET

# Number of follow-up pages requested after the first one (10 limits it to 1100 docs).
MAX_ITERATIONS = 10

# A Google developer API key is required to run this script.
API_KEY = '<insert your Google developer API key>'
service_url = 'https://www.googleapis.com/freebase/v1/mqlread'

# MQL query sent with every request. Keys set to None / [] ask the service to
# fill in the value(s); "type" and "initial_release_date>" act as filters.
# The key order is kept stable because the dict is serialized into the URL.
query = [
  {
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>": "2000",
  },
]

def gen_csv(filmlist):
  """Write the films to 'films.csv' in the current working directory.

  Multi-valued (list) fields are flattened into a single '|'-delimited
  string. The input list and its dicts are left unmodified.

  Args:
    filmlist: list of dicts, one per film. Keys must be a subset of the CSV
      columns below; list values must contain only strings.

  Raises:
    ValueError: if a film contains a key that is not one of the CSV columns
      (csv.DictWriter's default behaviour for unexpected keys).
  """
  columns = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
  # Build flattened rows directly instead of deep-copying the whole data set
  # and rewriting it in place; the caller's data is never touched.
  rows = [
    {field: '|'.join(value) if isinstance(value, list) else value
     for field, value in film.items()}
    for film in filmlist
  ]
  # newline='' lets the csv module control line endings itself.
  with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
    writer = csv.DictWriter(csvfile, columns)
    writer.writeheader()
    writer.writerows(rows)

def gen_json(filmlist):
  """Write the films to 'films.json' in the current working directory.

  The list is serialized as a JSON array with a 2-space indent. Non-ASCII
  characters are written as \\uXXXX escapes (json's default).

  Args:
    filmlist: JSON-serializable list of film dicts; it is not modified.
  """
  # Serialization does not mutate its input, so no defensive copy is needed.
  with open('films.json', 'w', encoding='utf8') as jsonfile:
    json.dump(filmlist, jsonfile, indent=2)

def gen_xml(filmlist):
  """Write the films to 'films.xml' in the current working directory.

  The document has an <add> root with one <doc> per film. Every value becomes
  a <field name="KEY"> element; a list value yields one <field> per entry
  (an empty list yields none).

  Args:
    filmlist: list of film dicts whose values are strings, None, or lists
      of strings.
  """
  root = ET.Element("add")
  for film in filmlist:
    doc = ET.SubElement(root, "doc")
    for key, value in film.items():
      # Treat single values like one-element lists so both cases share one path.
      entries = value if isinstance(value, list) else [value]
      for entry in entries:
        field = ET.SubElement(doc, "field")
        field.set("name", key)
        field.text = entry
  # Serialize before opening the file so a failure cannot leave a truncated file.
  pretty = minidom.parseString(ET.tostring(root, 'utf-8')).toprettyxml(indent="  ")
  # The generated XML declaration names no encoding, so readers assume UTF-8;
  # write UTF-8 explicitly instead of relying on the platform default.
  with open('films.xml', 'w', encoding='utf8') as f:
    f.write(pretty)

def do_query(filmlist, cursor=""):
  """Fetch one page of results from the service and append them to filmlist.

  Each returned item has its 'type' entry removed (it is always /film/film)
  and its 'initial_release_date' dropped unless it is a full YYYY-MM-DD date.

  Args:
    filmlist: list that receives the cleaned film dicts (modified in place).
    cursor: paging cursor returned by the previous call; "" for the first page.

  Returns:
    The "cursor" value of the response, or None if the response has none.
    The caller stops paging when this value is falsy.

  Raises:
    urllib.error.URLError: if the HTTP request fails.
    KeyError: if the response has no 'result' entry.
  """
  params = {
          'query': json.dumps(query),
          'key': API_KEY,
          'cursor': cursor
  }
  url = service_url + '?' + urllib.parse.urlencode(params)
  # Close the HTTP response deterministically instead of leaking it.
  with urllib.request.urlopen(url) as reply:
    response = json.loads(reply.read().decode('utf-8'))
  for item in response['result']:
    item.pop('type', None) # It's always /film/film. No point of adding this.
    try:
      datetime.datetime.strptime(item.get('initial_release_date'), "%Y-%m-%d")
    except (TypeError, ValueError):
      # Date missing, null, or not formatted properly. Keeping it simple by
      # removing the date field from that doc.
      item.pop('initial_release_date', None)
    filmlist.append(item)
  return response.get("cursor")


if __name__ == "__main__":
  filmlist = []
  # The first request carries no cursor and returns the first page.
  cursor = do_query(filmlist)
  # Follow the cursor for at most MAX_ITERATIONS further pages. A bounded
  # loop also stops when MAX_ITERATIONS <= 0, which the previous
  # "increment, then compare with ==" check never did.
  for _ in range(MAX_ITERATIONS):
    if not cursor:
      break
    cursor = do_query(filmlist, cursor)

  # Emit the same data set in all three formats.
  gen_json(filmlist)
  gen_csv(filmlist)
  gen_xml(filmlist)
SOLR-6127: Improve example docs, using films data git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1647918 13f79535-47bb-0310-9956-ffa450edef68 2014-12-25 16:27:12 -05:00			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`"""`
			`This will generate a movie data set of 1100 records.`
			`These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.`
			`Here is the link to the freebase page - https://www.freebase.com/film/film?schema=`

SOLR-6127: move films example (data) to its own subdirectory git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1648540 13f79535-47bb-0310-9956-ffa450edef68 2014-12-30 10:24:28 -05:00			`Usage - python3 film_data_generator.py`
SOLR-6127: Improve example docs, using films data git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1647918 13f79535-47bb-0310-9956-ffa450edef68 2014-12-25 16:27:12 -05:00			`"""`

			`import csv`
			`import copy`
			`import json`
			`import codecs`
			`import datetime`
			`import urllib.parse`
			`import urllib.request`
			`import xml.etree.cElementTree as ET`
			`from xml.dom import minidom`

			`MAX_ITERATIONS=10 #10 limits it to 1100 docs`

			`# You need an API Key by Google to run this`
			`API_KEY = '<insert your Google developer API key>'`
			`service_url = 'https://www.googleapis.com/freebase/v1/mqlread'`
			`query = [{`
			`"id": None,`
			`"name": None,`
			`"initial_release_date": None,`
			`"directed_by": [],`
			`"genre": [],`
			`"type": "/film/film",`
			`"initial_release_date>" : "2000"`
			`}]`

			`def gen_csv(filmlist):`
			`filmlistDup = copy.deepcopy(filmlist)`
			`#Convert multi-valued to % delimited string`
			`for film in filmlistDup:`
			`for key in film:`
			`if isinstance(film[key], list):`
			`film[key] = '\|'.join(film[key])`
			`keys = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']`
			`with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:`
			`dict_writer = csv.DictWriter(csvfile, keys)`
			`dict_writer.writeheader()`
			`dict_writer.writerows(filmlistDup)`

			`def gen_json(filmlist):`
			`filmlistDup = copy.deepcopy(filmlist)`
			`with open('films.json', 'w') as jsonfile:`
			`jsonfile.write(json.dumps(filmlist, indent=2))`

			`def gen_xml(filmlist):`
			`root = ET.Element("add")`
			`for film in filmlist:`
			`doc = ET.SubElement(root, "doc")`
			`for key in film:`
			`if isinstance(film[key], list):`
			`for value in film[key]:`
			`field = ET.SubElement(doc, "field")`
			`field.set("name", key)`
			`field.text=value`
			`else:`
			`field = ET.SubElement(doc, "field")`
			`field.set("name", key)`
			`field.text=film[key]`
			`tree = ET.ElementTree(root)`
			`with open('films.xml', 'w') as f:`
			`f.write( minidom.parseString(ET.tostring(tree.getroot(),'utf-8')).toprettyxml(indent=" ") )`

			`def do_query(filmlist, cursor=""):`
			`params = {`
			`'query': json.dumps(query),`
			`'key': API_KEY,`
			`'cursor': cursor`
			`}`
			`url = service_url + '?' + urllib.parse.urlencode(params)`
			`data = urllib.request.urlopen(url).read().decode('utf-8')`
			`response = json.loads(data)`
			`for item in response['result']:`
			`del item['type'] # It's always /film/film. No point of adding this.`
			`try:`
			`datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")`
			`except ValueError:`
			`#Date time not formatted properly. Keeping it simple by removing the date field from that doc`
			`del item['initial_release_date']`
			`filmlist.append(item)`
			`return response.get("cursor")`


			`if __name__ == "__main__":`
			`filmlist = []`
			`cursor = do_query(filmlist)`
			`i=0`
			`while(cursor):`
			`cursor = do_query(filmlist, cursor)`
			`i = i+1`
			`if i==MAX_ITERATIONS:`
			`break`

			`gen_json(filmlist)`
			`gen_csv(filmlist)`
			`gen_xml(filmlist)`