Repository: hadley/data-movies Branch: master Commit: 70b58fc8e197 Files: 6 Total size: 7.6 KB Directory structure: gitextract_8p8la6va/ ├── .gitignore ├── 00-download.sh ├── 01-movies.sql ├── 02-import.rb ├── 03-export.rb └── readme.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ data *.sqlite3 ================================================ FILE: 00-download.sh ================================================ wget -r -l1 -np -nd -P data ftp://ftp.fu-berlin.de/pub/misc/movies/database/ ================================================ FILE: 01-movies.sql ================================================ -- sqlite3 movies.sqlite3 < movies.sql CREATE TABLE Movies ( id INTEGER PRIMARY KEY, title varchar(250), year integer, budget integer, length integer, imdb_rating float, imdb_votes integer, imdb_rating_votes varchar(10), mpaa_rating varchar(5) ); CREATE TABLE Ratings (id INTEGER PRIMARY KEY, movie_id integer, score varchar(10), outof10 float, votes integer); CREATE TABLE Genres (id INTEGER PRIMARY KEY , movie_id integer, genre varchar(50)); CREATE INDEX title on Movies (title); CREATE INDEX year on Movies (year); CREATE INDEX titleyear on Movies (title, year); CREATE INDEX id on Movies (id); CREATE INDEX rid on Ratings (id); CREATE INDEX rmid on Ratings (movie_id); CREATE INDEX gid on Genres (id); CREATE INDEX gmid on Genres (movie_id); ================================================ FILE: 02-import.rb ================================================ require 'rubygems' require 'sqlite3' $db = SQLite3::Database.new( "movies.sqlite3" ) $title = "[a-z,&-;0-9$#+=\/!?. 
]+" def import_movies #$100,000 Pyramid, The (2001) (VG) 2001 title_re = /^(#{$title}) \s+ \([0-9]+\) \s+ ([0-9]+)$/ix i = 0 stmt = $db.prepare("INSERT INTO Movies (title, year) VALUES (?, ?);") $db.transaction do $db.execute "DELETE FROM Movies;" File.new("data/movies.list").each_line do |l| print "." if (i = i + 1) % 5000 == 0; STDOUT.flush if match = title_re.match(l) stmt.execute!(match[1], match[2].to_i) end end end puts end def import_times #"Ballyskillen Opera House, The" (1980) 30 (6 episodes) time_re = /^(#{$title}) \s+ \(([0-9]+)\) \s+ (?:[a-z]+:)?([0-9]+)/ix i = 0 stmt = $db.prepare("UPDATE Movies set length=? WHERE title=? AND year=?;") $db.transaction do File.new("data/running-times.list").each_line do |l| print "." if (i = i + 1) % 5000 == 0; STDOUT.flush if match = time_re.match(l) stmt.execute!(match[3].to_i, match[1], match[2].to_i) end end end puts end def import_budgets dashes = "-------------------------------------------------------------------------------" title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix budget_re = /BT:\s+USD\s+([0-9,.]+)/ix stmt = $db.prepare("UPDATE Movies set budget=? WHERE title=? AND year=?;") $db.transaction do File.new("data/business.list").each(dashes) do |l| if match = title_re.match(l.to_s) and bt = budget_re.match(l.to_s) stmt.execute!(bt[1].gsub!(",","").to_i, match[1], match[2].to_i) end end end end def import_mpaa_ratings dashes = "-------------------------------------------------------------------------------" title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix rating_re = /RE: Rated (.*?) /i stmt = $db.prepare("UPDATE Movies set mpaa_rating=? WHERE title=? AND year=?;") $db.transaction do File.new("data/mpaa-ratings-reasons.list").each(dashes) do |l| if match = title_re.match(l.to_s) and rt = rating_re.match(l.to_s) stmt.execute!(rt[1], match[1], match[2].to_i) end end end end def import_genres #D2: The Mighty Ducks (1994) Family genre_re = /^(#{$title}?) 
\s+ \(([0-9]+)\) (?:\s*[({].*[})])* \s+(.*?)$/ix i = 0 stmt = $db.prepare("INSERT INTO Genres (genre, movie_id) VALUES (?, (SELECT id FROM Movies WHERE title=? AND year=?));") $db.transaction do $db.execute "DELETE FROM Genres;" File.new("data/genres.list").each_line do |l| print "." if (i = i + 1) % 1000 == 0; STDOUT.flush if match = genre_re.match(l) stmt.execute!(match[3], match[1], match[2].to_i) end end puts end end def import_ratings #.0.1112000 14 5.9 365 Nights in Hollywood (1934) ratings_re = /([0-9.\*]+) \s+ ([0-9]+) \s+ ([0-9.]+) \s+ (#{$title}?) \s+ \(([0-9]+)\)/ix stmt = $db.prepare("UPDATE Movies set imdb_votes=?, imdb_rating=?, imdb_rating_votes=? WHERE title=? AND year=?;") $db.transaction File.new("data/ratings.list").each_line do |l| if match = ratings_re.match(l) rating, votes, outof10, title, year = match[1], match[2], match[3], match[4], match[5] stmt.execute!(votes, outof10, rating, title, year) end end $db.commit end # puts "Importing movies" # import_movies puts "Importing times" import_times puts "Importing budgets" import_budgets puts "Importing ratings" import_mpaa_ratings puts "Importing votes" import_ratings puts "Importing genres" import_genres #puts Movie.count( "budget > 0") #puts Movie.count( "length > 0") #puts Movie.count( "budget > 0 and length > 0") #puts Movie.count( "imdb_votes > 0 and length > 0") #puts Movie.count( "budget > 0 and length > 0 and imdb_votes > 0") ================================================ FILE: 03-export.rb ================================================ require "rubygems" require "arrayfields" require "sqlite3" require "set" $genres_of_interest = ["Action", "Animation", "Comedy", "Drama", "Documentary", "Romance", "Short"] $ratings_map = {"." 
=> 0, "0" => 4.5, "1" => 14.5, "2" => 24.5, "3" => 34.5, "4" => 44.5, "5" => 45.5, "6" => 64.5, "7" => 74.5, "8" => 84.5, "9" => 94.5, "*" => 100} def genres_binary(id, db) genres = db.execute("SELECT genre FROM Genres where movie_id = #{id};").flatten.to_set $genres_of_interest.map { |genre| (genres.include? genre) ? 1 : 0} end def ratings_breakdown(ratings) ratings[0..ratings.length].to_s.split(//).map{|s| $ratings_map[s]} end db = SQLite3::Database.new( "movies.sqlite3" ) sql = " SELECT Movies.* FROM Movies WHERE length > 0 and imdb_votes > 0 ORDER BY title" i = 0 File.open("movies.tab", "w") do |out| out << [ 'title', 'year', 'length', 'budget', 'rating', 'votes', (1..10).map{|i| "r" + i.to_s}, 'mpaa', $genres_of_interest ].flatten.join("\t") + "\n" db.execute(sql) do |row| puts i if (i = i + 1) % 5000 == 0 out << [ row["title"], row["year"], row["length"], row["budget"], row["imdb_rating"], row["imdb_votes"], ratings_breakdown(row["imdb_rating_votes"]), row["mpaa_rating"], genres_binary(row['id'], db) ].flatten.join("\t") + "\n" rescue nil end end ================================================ FILE: readme.md ================================================ The internet movie database, [imdb.com](http://imdb.com/), is a website devoted to collecting movie data supplied by studios and fan. It claims to be the biggest movie database on the web and is run by amazon. More about information imdb.com can be found [online](http://imdb.com/help/show_leaf?about), including information about the [data collection process](http://imdb.com/help/show_leaf?infosource). IMDB makes their [raw data available](http://uk.imdb.com/interfaces/). Unfortunately, the data is divided into many text files and the format of each file differs slightly. To create one data file containing all the desired information these ruby scripts extract the relevant information and store in a database. Finally, this data is exported to csv to make it easier to import into data analysis packages. 
The following text files were downloaded and used:

* business.list. Total budget.
* genres.list. Genres that a movie belongs to (e.g. comedy and action).
* movies.list. Master list of all movie titles with year of production.
* mpaa-ratings-reasons.list. MPAA ratings.
* ratings.list. IMDB fan ratings.
* running-times.list. Movie length in minutes.

Movies were selected for inclusion if they had a known length and had been rated by at least one IMDB user. The final output contains the following fields:

* title. Title of the movie.
* year. Year of release.
* budget. Total budget (if known) in US dollars.
* length. Length in minutes.
* rating. Average IMDB user rating.
* votes. Number of IMDB users who rated this movie.
* r1-10. Distribution of votes for each rating, to the mid-point of the nearest decile: 0 = no votes, 4.5 = 1-9% of votes, 14.5 = 10-19% of votes, etc. Due to rounding errors these may not sum to 100.
* mpaa. MPAA rating.
* action, animation, comedy, drama, documentary, romance, short. Binary variables representing whether the movie was classified as belonging to that genre.