[
  {
    "path": ".gitignore",
    "content": "data\n*.sqlite3"
  },
  {
    "path": "00-download.sh",
    "content": "wget -r -l1 -np -nd -P data ftp://ftp.fu-berlin.de/pub/misc/movies/database/ "
  },
  {
    "path": "01-movies.sql",
    "content": "-- Usage: sqlite3 movies.sqlite3 < 01-movies.sql\n\nCREATE TABLE Movies (\nid INTEGER PRIMARY KEY,\ntitle varchar(250),\nyear integer,\nbudget integer,\nlength integer,\nimdb_rating float,\nimdb_votes integer,\nimdb_rating_votes varchar(10),\nmpaa_rating varchar(5)\n);\n\nCREATE TABLE Ratings (id INTEGER PRIMARY KEY, movie_id integer, score varchar(10), outof10 float, votes integer);\nCREATE TABLE Genres (id INTEGER PRIMARY KEY , movie_id integer, genre varchar(50));\n\nCREATE INDEX title on Movies (title);\nCREATE INDEX year on Movies (year);\nCREATE INDEX titleyear on Movies (title, year);\nCREATE INDEX id on Movies (id);\nCREATE INDEX rid on Ratings (id);\nCREATE INDEX rmid on Ratings (movie_id);\nCREATE INDEX gid on Genres (id);\nCREATE INDEX gmid on Genres (movie_id);\n"
  },
  {
    "path": "02-import.rb",
    "content": "require 'rubygems'\nrequire 'sqlite3'\n$db = SQLite3::Database.new( \"movies.sqlite3\" )\n$title = \"[a-z,&-;0-9$#+=\\/!?. ]+\"\n\ndef import_movies\n\t#$100,000 Pyramid, The (2001) (VG)\t\t\t2001\n\ttitle_re = /^(#{$title}) \\s+ \\([0-9]+\\) \\s+ ([0-9]+)$/ix\n\ti = 0\n\n\tstmt = $db.prepare(\"INSERT INTO Movies (title, year) VALUES (?, ?);\")\n\t$db.transaction do\n\t\t$db.execute \"DELETE FROM Movies;\"\n\t\n\t\tFile.new(\"data/movies.list\").each_line do |l|\n\t\t\tprint \".\" if (i = i + 1) % 5000 == 0; STDOUT.flush\n\t\t\tif match = title_re.match(l)\n\t\t\t\tstmt.execute!(match[1], match[2].to_i)\n\t\t\tend\n\t\tend\n\tend\n\t\n\tputs\nend\n\ndef import_times\n\t#\"Ballyskillen Opera House, The\" (1980)\t\t\t30\t(6 episodes)\n\ttime_re = /^(#{$title}) \\s+ \\(([0-9]+)\\) \\s+ (?:[a-z]+:)?([0-9]+)/ix \n\ti = 0\n\n\tstmt = $db.prepare(\"UPDATE Movies set length=? WHERE title=? AND year=?;\")\n  $db.transaction do \n\t\tFile.new(\"data/running-times.list\").each_line do |l|\n\t\t\tprint \".\" if (i = i + 1) % 5000 == 0; STDOUT.flush\n\t\t  \n\t\t\tif match = time_re.match(l)\n\t\t\t\tstmt.execute!(match[3].to_i, match[1], match[2].to_i)\n\t\t\tend\n\t\tend\n  end\n\t\n\tputs\nend\n\n\ndef import_budgets\n\tdashes = \"-------------------------------------------------------------------------------\"\n\ttitle_re = /MV:\\s+(#{$title}?) \\s \\(([0-9]+)\\)/ix\n\tbudget_re = /BT:\\s+USD\\s+([0-9,.]+)/ix\n\n\tstmt = $db.prepare(\"UPDATE Movies set budget=? WHERE title=? AND year=?;\")\n\t$db.transaction do \n\t\tFile.new(\"data/business.list\").each(dashes) do |l|\n\t\t\tif match = title_re.match(l.to_s) and bt = budget_re.match(l.to_s)\n\t\t\t\tstmt.execute!(bt[1].gsub!(\",\",\"\").to_i, match[1], match[2].to_i) \n\t\t\tend\n\t\tend\n\tend\nend\n\ndef import_mpaa_ratings\n\tdashes = \"-------------------------------------------------------------------------------\"\n\ttitle_re = /MV:\\s+(#{$title}?) \\s \\(([0-9]+)\\)/ix\n\trating_re = /RE: Rated (.*?) 
/i\n\n\tstmt = $db.prepare(\"UPDATE Movies set mpaa_rating=? WHERE title=? AND year=?;\")\n\t$db.transaction do \n\t\tFile.new(\"data/mpaa-ratings-reasons.list\").each(dashes) do |l|\n\t\t\tif match = title_re.match(l.to_s) and rt = rating_re.match(l.to_s)\n\t\t\t\tstmt.execute!(rt[1], match[1], match[2].to_i)\n\t\t\tend\n\t\tend\n\tend\nend\n\n\ndef import_genres\n\t#D2: The Mighty Ducks (1994)\t\t\t\tFamily\n\tgenre_re = /^(#{$title}?) \\s+ \\(([0-9]+)\\) (?:\\s*[({].*[})])*  \\s+(.*?)$/ix\n\ti = 0\n\t\n\tstmt = $db.prepare(\"INSERT INTO Genres (genre, movie_id) VALUES (?, (SELECT id FROM Movies WHERE title=? AND year=?));\")\n\t$db.transaction do \n\t\t$db.execute \"DELETE FROM Genres;\"\n\t\t\n\t\tFile.new(\"data/genres.list\").each_line do |l|\n\t\t\tprint \".\" if (i = i + 1) % 1000 == 0; STDOUT.flush\n\t\t\tif match = genre_re.match(l)\n\t\t\t\tstmt.execute!(match[3], match[1], match[2].to_i)\n\t\t\tend\n\t\tend\n\t\tputs\n\tend\nend\n\n\ndef import_ratings\n\t#.0.1112000      14   5.9  365 Nights in Hollywood (1934)\n\tratings_re = /([0-9.\\*]+) \\s+ ([0-9]+) \\s+ ([0-9.]+) \\s+ (#{$title}?) \\s+ \\(([0-9]+)\\)/ix\n\n\tstmt = $db.prepare(\"UPDATE Movies set imdb_votes=?, imdb_rating=?, imdb_rating_votes=? WHERE title=? 
AND year=?;\")\n\t$db.transaction\n\t\n\tFile.new(\"data/ratings.list\").each_line do |l|\n\t\tif match = ratings_re.match(l)\n\t\t\trating, votes, outof10, title, year = match[1], match[2], match[3], match[4], match[5]\n\t\t\tstmt.execute!(votes, outof10, rating, title, year)\n\t\tend\n\tend\n\t$db.commit\n\t\nend\n\n# puts \"Importing movies\"\n# import_movies\nputs \"Importing times\"\nimport_times\nputs \"Importing budgets\"\nimport_budgets\nputs \"Importing ratings\"\nimport_mpaa_ratings\nputs \"Importing votes\"\nimport_ratings\nputs \"Importing genres\"\nimport_genres\n\n\n#puts Movie.count( \"budget > 0\")\n#puts Movie.count( \"length > 0\")\n#puts Movie.count( \"budget > 0 and length > 0\")\n#puts Movie.count( \"imdb_votes > 0 and length > 0\")\n#puts Movie.count( \"budget > 0 and length > 0 and imdb_votes > 0\")"
  },
  {
    "path": "03-export.rb",
    "content": "require \"rubygems\"\nrequire \"arrayfields\"\nrequire \"sqlite3\"\nrequire \"set\"\n\n$genres_of_interest = [\"Action\", \"Animation\", \"Comedy\", \"Drama\", \"Documentary\", \"Romance\", \"Short\"]\n$ratings_map = {\".\" => 0, \"0\" => 4.5, \"1\" => 14.5, \"2\" => 24.5, \"3\" => 34.5, \"4\" => 44.5, \"5\" => 45.5, \"6\" => 64.5, \"7\" => 74.5, \"8\" => 84.5, \"9\" => 94.5, \"*\" => 100}\n\n\ndef genres_binary(id, db)\n\tgenres = db.execute(\"SELECT genre FROM Genres where movie_id = #{id};\").flatten.to_set\n\t$genres_of_interest.map { |genre| (genres.include? genre) ? 1 : 0}\nend\n\ndef ratings_breakdown(ratings)\n\tratings[0..ratings.length].to_s.split(//).map{|s| $ratings_map[s]}\nend\n\ndb = SQLite3::Database.new( \"movies.sqlite3\" )\nsql = \"\n\tSELECT Movies.* \n\tFROM Movies\n\tWHERE length > 0 and imdb_votes > 0\n\tORDER BY title\"\n\ni = 0 \n\nFile.open(\"movies.tab\", \"w\") do |out|\n\tout << [\n\t\t'title', 'year', 'length', 'budget', \n\t\t'rating', 'votes', (1..10).map{|i| \"r\" + i.to_s}, \n\t\t'mpaa', $genres_of_interest\n\t].flatten.join(\"\\t\") + \"\\n\"\n\n\tdb.execute(sql) do |row| \n\t\tputs i if (i = i + 1) % 5000 == 0\n\n\t\tout << [\n\t\t\trow[\"title\"], \n\t\t\trow[\"year\"], \n\t\t\trow[\"length\"], \n\t\t\trow[\"budget\"], \n\t\t\trow[\"imdb_rating\"], row[\"imdb_votes\"], ratings_breakdown(row[\"imdb_rating_votes\"]), \n\t\t\trow[\"mpaa_rating\"], genres_binary(row['id'], db)\n\t\t].flatten.join(\"\\t\") + \"\\n\" rescue nil\n\tend\nend"
  },
  {
    "path": "readme.md",
    "content": "The Internet Movie Database, [imdb.com](http://imdb.com/), is a website devoted to collecting movie data supplied by studios and fans.  It claims to be the biggest movie database on the web and is run by Amazon.  More information about imdb.com can be found [online](http://imdb.com/help/show_leaf?about), including information about the [data collection process](http://imdb.com/help/show_leaf?infosource).\n\nIMDB makes their [raw data available](http://uk.imdb.com/interfaces/). Unfortunately, the data is divided into many text files and the format of each file differs slightly.  To create one data file containing all the desired information, these Ruby scripts extract the relevant information and store it in a database.  Finally, this data is exported to a tab-delimited text file (movies.tab) to make it easier to import into data analysis packages.\n\nThe following text files were downloaded and used:\n\n* business.list.  Total budget.\n* genres.list.  Genres that a movie belongs to (e.g. comedy and action).\n* movies.list.  Master list of all movie titles with year of production.\n* mpaa-ratings-reasons.list.  MPAA ratings.\n* ratings.list.  IMDB fan ratings.\n* running-times.list.  Movie length in minutes.\n\nMovies were selected for inclusion if they had a known length and had been rated by at least one IMDB user. The final output contains the following fields:\n\n* title.  Title of the movie.\n* year.  Year of release.\n* budget.  Total budget (if known) in US dollars.\n* length.  Length in minutes.\n* rating.  Average IMDB user rating.\n* votes.  Number of IMDB users who rated this movie.\n* r1-r10.  Distribution of votes for each rating, to the midpoint of the nearest decile: 0 = no votes, 4.5 = 1-9% of votes, 14.5 = 10-19% of votes, etc.  Due to rounding errors these may not sum to 100.\n* mpaa.  MPAA rating.\n* action, animation, comedy, drama, documentary, romance, short.  Binary variables indicating whether the movie was classified as belonging to that genre.\n"
  }
]