Full Code of hadley/data-movies for AI

master 70b58fc8e197 cached
6 files
7.6 KB
2.5k tokens
19 symbols
1 requests
Download .txt
Repository: hadley/data-movies
Branch: master
Commit: 70b58fc8e197
Files: 6
Total size: 7.6 KB

Directory structure:
gitextract_8p8la6va/

├── .gitignore
├── 00-download.sh
├── 01-movies.sql
├── 02-import.rb
├── 03-export.rb
└── readme.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
data
*.sqlite3

================================================
FILE: 00-download.sh
================================================
# Fetch the IMDB plain-text list files into ./data.
#   -r -l1   recurse, but only one level deep
#   -np      never ascend to the parent directory
#   -nd      do not recreate the remote directory hierarchy locally
#   -P data  save everything under data/
wget -r -l1 -np -nd -P data ftp://ftp.fu-berlin.de/pub/misc/movies/database/ 

================================================
FILE: 01-movies.sql
================================================
-- Schema for the movies database.
-- Usage: sqlite3 movies.sqlite3 < 01-movies.sql

-- One row per title. Only title and year are set when a row is inserted;
-- the remaining columns start NULL and are filled in by later UPDATEs that
-- match on (title, year).
--   budget             amount parsed from "BT: USD ..." business records
--   length             running time as listed in running-times.list
--   imdb_rating        average user rating
--   imdb_votes         number of votes
--   imdb_rating_votes  raw vote-distribution string (digits, '.' and '*')
--   mpaa_rating        MPAA rating code
CREATE TABLE Movies (
id INTEGER PRIMARY KEY,
title varchar(250),
year integer,
budget integer,
length integer,
imdb_rating float,
imdb_votes integer,
imdb_rating_votes varchar(10),
mpaa_rating varchar(5)
);

-- NOTE(review): nothing in the accompanying import/export scripts appears to
-- read or write Ratings -- confirm whether it is still needed.
CREATE TABLE Ratings (id INTEGER PRIMARY KEY, movie_id integer, score varchar(10), outof10 float, votes integer);
-- One row per (movie, genre) pair; movie_id refers to Movies.id.
CREATE TABLE Genres (id INTEGER PRIMARY KEY , movie_id integer, genre varchar(50));

-- Lookup indexes. titleyear serves the (title, year) matching used when
-- updating Movies; gmid serves per-movie genre lookups.
-- NOTE(review): id, rid and gid index INTEGER PRIMARY KEY columns, which
-- SQLite already uses as the rowid, and title is a prefix of titleyear --
-- these look redundant; confirm before dropping them.
CREATE INDEX title on Movies (title);
CREATE INDEX year on Movies (year);
CREATE INDEX titleyear on Movies (title, year);
CREATE INDEX id on Movies (id);
CREATE INDEX rid on Ratings (id);
CREATE INDEX rmid on Ratings (movie_id);
CREATE INDEX gid on Genres (id);
CREATE INDEX gmid on Genres (movie_id);


================================================
FILE: 02-import.rb
================================================
require 'rubygems'
require 'sqlite3'
# Shared handle to the SQLite database used by every import_* method below.
$db = SQLite3::Database.new( "movies.sqlite3" )
# Character-class fragment (a String, not a Regexp) that each parser
# interpolates into its own regex to match a movie title. Every regex that
# uses it is compiled with /i, so a-z also covers upper case. The &-; range
# spans the ASCII characters from '&' to ';': apostrophe, parentheses, '*',
# '+', ',', '-', '.', '/', the digits and ':'.
$title = "[a-z,&-;0-9$#+=\/!?. ]+"

# Rebuilds the Movies table from data/movies.list.
#
# Deletes every existing row, then inserts one (title, year) row for each
# line that matches title_re; other lines are ignored. The delete and all
# inserts run inside a single transaction. Prints a progress dot every
# 5000 lines read and a newline at the end.
def import_movies
	#$100,000 Pyramid, The (2001) (VG)			2001
	# Capture 1 is the title, capture 2 the trailing year column; the
	# parenthesised year right after the title is matched but not captured.
	title_re = /^(#{$title}) \s+ \([0-9]+\) \s+ ([0-9]+)$/ix
	i = 0

	# The block forms of prepare and File.foreach release the statement and
	# the file handle even if an insert raises.
	$db.prepare("INSERT INTO Movies (title, year) VALUES (?, ?);") do |stmt|
		$db.transaction do
			$db.execute "DELETE FROM Movies;"

			File.foreach("data/movies.list") do |l|
				if (i += 1) % 5000 == 0
					print "."
					STDOUT.flush
				end

				if match = title_re.match(l)
					stmt.execute!(match[1], match[2].to_i)
				end
			end
		end
	end

	puts
end

# Fills in Movies.length from data/running-times.list.
#
# For each line matching time_re, updates the row with the same
# (title, year); lines that do not match are ignored. All updates run in one
# transaction. Prints a progress dot every 5000 lines read and a newline at
# the end.
def import_times
	#"Ballyskillen Opera House, The" (1980)			30	(6 episodes)
	# Captures: 1 = title, 2 = year, 3 = running time. The optional
	# (?:[a-z]+:) group skips a "word:" prefix in front of the number
	# (presumably a country tag such as "USA:" -- confirm against the data).
	time_re = /^(#{$title}) \s+ \(([0-9]+)\) \s+ (?:[a-z]+:)?([0-9]+)/ix
	i = 0

	# The block forms of prepare and File.foreach release the statement and
	# the file handle even if an update raises.
	$db.prepare("UPDATE Movies set length=? WHERE title=? AND year=?;") do |stmt|
		$db.transaction do
			File.foreach("data/running-times.list") do |l|
				if (i += 1) % 5000 == 0
					print "."
					STDOUT.flush
				end

				if match = time_re.match(l)
					stmt.execute!(match[3].to_i, match[1], match[2].to_i)
				end
			end
		end
	end

	puts
end


# Fills in Movies.budget from data/business.list.
#
# The file is read as records separated by a line of dashes. A record is
# used only when it contains both an "MV: title (year)" line and a
# "BT: USD amount" entry; the first such amount in the record is stored as
# an integer on the row with the same (title, year). All updates run in one
# transaction.
def import_budgets
	dashes = "-------------------------------------------------------------------------------"
	title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix
	budget_re = /BT:\s+USD\s+([0-9,.]+)/ix

	# The block forms of prepare and File.foreach release the statement and
	# the file handle even if an update raises.
	$db.prepare("UPDATE Movies set budget=? WHERE title=? AND year=?;") do |stmt|
		$db.transaction do
			File.foreach("data/business.list", dashes) do |l|
				record = l.to_s
				if match = title_re.match(record) and bt = budget_re.match(record)
					# String#delete, not gsub!: gsub! returns nil when the amount
					# has no comma, and nil.to_i would store a budget of 0.
					amount = bt[1].delete(",").to_i
					stmt.execute!(amount, match[1], match[2].to_i)
				end
			end
		end
	end
end

# Fills in Movies.mpaa_rating from data/mpaa-ratings-reasons.list.
#
# The file is read as records separated by a line of dashes. A record is
# used only when it contains both an "MV: title (year)" line and an
# "RE: Rated X ..." line; the word following "Rated" is stored on the row
# with the same (title, year). All updates run in one transaction.
def import_mpaa_ratings
	dashes = "-------------------------------------------------------------------------------"
	title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix
	# Lazy capture up to the next space, i.e. just the rating code.
	rating_re = /RE: Rated (.*?) /i

	# The block forms of prepare and File.foreach release the statement and
	# the file handle even if an update raises.
	$db.prepare("UPDATE Movies set mpaa_rating=? WHERE title=? AND year=?;") do |stmt|
		$db.transaction do
			File.foreach("data/mpaa-ratings-reasons.list", dashes) do |l|
				record = l.to_s
				if match = title_re.match(record) and rt = rating_re.match(record)
					stmt.execute!(rt[1], match[1], match[2].to_i)
				end
			end
		end
	end
end


# Rebuilds the Genres table from data/genres.list.
#
# Deletes every existing row, then inserts one row per line matching
# genre_re, resolving movie_id with a subquery on (title, year). The delete
# and all inserts run in one transaction. Prints a progress dot every 1000
# lines read and a newline at the end.
#
# NOTE(review): when no Movies row matches, the subquery yields NULL and the
# genre row is still inserted with a NULL movie_id -- confirm this is wanted.
def import_genres
	#D2: The Mighty Ducks (1994)				Family
	# Captures: 1 = title, 2 = year, 3 = genre. The middle group skips any
	# (...) or {...} annotations between the year and the genre column.
	genre_re = /^(#{$title}?) \s+ \(([0-9]+)\) (?:\s*[({].*[})])*  \s+(.*?)$/ix
	i = 0

	insert_sql = "INSERT INTO Genres (genre, movie_id) VALUES (?, (SELECT id FROM Movies WHERE title=? AND year=?));"

	# The block forms of prepare and File.foreach release the statement and
	# the file handle even if an insert raises.
	$db.prepare(insert_sql) do |stmt|
		$db.transaction do
			$db.execute "DELETE FROM Genres;"

			File.foreach("data/genres.list") do |l|
				if (i += 1) % 1000 == 0
					print "."
					STDOUT.flush
				end

				if match = genre_re.match(l)
					stmt.execute!(match[3], match[1], match[2].to_i)
				end
			end
			puts
		end
	end
end


# Fills in the IMDB vote columns of Movies from data/ratings.list.
#
# For each line matching ratings_re, stores the vote count, the average
# rating and the raw distribution string on the row with the same
# (title, year). Captured values are bound as strings, unconverted.
def import_ratings
	#.0.1112000      14   5.9  365 Nights in Hollywood (1934)
	# Captures: 1 = distribution string, 2 = votes, 3 = average rating,
	# 4 = title, 5 = year.
	ratings_re = /([0-9.\*]+) \s+ ([0-9]+) \s+ ([0-9.]+) \s+ (#{$title}?) \s+ \(([0-9]+)\)/ix

	update_sql = "UPDATE Movies set imdb_votes=?, imdb_rating=?, imdb_rating_votes=? WHERE title=? AND year=?;"

	# Block-form transaction, as in the other import_* methods, so an
	# exception part-way through does not leave a transaction open with no
	# commit or rollback. The block forms of prepare and File.foreach likewise
	# release the statement and the file handle.
	$db.prepare(update_sql) do |stmt|
		$db.transaction do
			File.foreach("data/ratings.list") do |l|
				if match = ratings_re.match(l)
					rating, votes, outof10, title, year = match[1], match[2], match[3], match[4], match[5]
					stmt.execute!(votes, outof10, rating, title, year)
				end
			end
		end
	end
end

# Run each import step in order, announcing it on stdout first.
# The import_movies step is listed but commented out, so it does not run.
[
	# ["Importing movies", :import_movies],
	["Importing times", :import_times],
	["Importing budgets", :import_budgets],
	["Importing ratings", :import_mpaa_ratings],
	["Importing votes", :import_ratings],
	["Importing genres", :import_genres]
].each do |label, step|
	puts label
	send(step)
end


#puts Movie.count( "budget > 0")
#puts Movie.count( "length > 0")
#puts Movie.count( "budget > 0 and length > 0")
#puts Movie.count( "imdb_votes > 0 and length > 0")
#puts Movie.count( "budget > 0 and length > 0 and imdb_votes > 0")

================================================
FILE: 03-export.rb
================================================
require "rubygems"
require "arrayfields"
require "sqlite3"
require "set"

# Genres exported as 0/1 indicator columns, in output column order.
$genres_of_interest = ["Action", "Animation", "Comedy", "Drama", "Documentary", "Romance", "Short"]
# Maps one character of the vote-distribution string to a percentage:
# "." => 0, a digit d => d * 10 + 4.5 (so "5" is 54.5), and "*" => 100.
$ratings_map = {"." => 0, "0" => 4.5, "1" => 14.5, "2" => 24.5, "3" => 34.5, "4" => 44.5, "5" => 54.5, "6" => 64.5, "7" => 74.5, "8" => 84.5, "9" => 94.5, "*" => 100}


# Returns one 0/1 flag per entry of $genres_of_interest, in that order:
# 1 when the movie with the given id has a Genres row for that genre,
# 0 otherwise. Runs one query against db per call.
def genres_binary(id, db)
	rows = db.execute("SELECT genre FROM Genres where movie_id = #{id};")
	present = rows.flatten.to_set
	$genres_of_interest.map do |wanted|
		present.member?(wanted) ? 1 : 0
	end
end

# Translates a vote-distribution string into an array with one entry per
# character, looked up in $ratings_map (nil for characters not in the map).
def ratings_breakdown(ratings)
	whole = ratings[0..ratings.length]
	symbols = whole.to_s.split(//)
	symbols.map do |symbol|
		$ratings_map[symbol]
	end
end

# Export every movie that has both a running time and at least one IMDB
# vote to movies.tab: one header line, then one tab-separated line per
# movie, ordered by title.
db = SQLite3::Database.new( "movies.sqlite3" )
sql = "
	SELECT Movies.*
	FROM Movies
	WHERE length > 0 and imdb_votes > 0
	ORDER BY title"

# Rows read so far; printed every 5000 rows as a progress indicator.
i = 0

File.open("movies.tab", "w") do |out|
	# The block parameter is named n so that it cannot shadow (or, on old
	# Rubies, overwrite) the row counter i.
	header = [
		'title', 'year', 'length', 'budget',
		'rating', 'votes', (1..10).map { |n| "r" + n.to_s },
		'mpaa', $genres_of_interest
	]
	out << header.flatten.join("\t") + "\n"

	db.execute(sql) do |row|
		i += 1
		puts i if i % 5000 == 0

		begin
			fields = [
				row["title"],
				row["year"],
				row["length"],
				row["budget"],
				row["imdb_rating"], row["imdb_votes"], ratings_breakdown(row["imdb_rating_votes"]),
				row["mpaa_rating"], genres_binary(row['id'], db)
			]
			out << fields.flatten.join("\t") + "\n"
		rescue StandardError => e
			# Still best effort: a row that cannot be formatted is skipped and
			# the export continues, but the skip is reported on stderr.
			warn "Skipping row #{i}: #{e.class}: #{e.message}"
		end
	end
end

================================================
FILE: readme.md
================================================
The internet movie database, [imdb.com](http://imdb.com/), is a website devoted to collecting movie data supplied by studios and fans.  It claims to be the biggest movie database on the web and is run by Amazon.  More information about imdb.com can be found [online](http://imdb.com/help/show_leaf?about), including information about the [data collection process](http://imdb.com/help/show_leaf?infosource).

IMDB makes their [raw data available](http://uk.imdb.com/interfaces/). Unfortunately, the data is divided into many text files and the format of each file differs slightly.  To create one data file containing all the desired information, these ruby scripts extract the relevant information and store it in a database.  Finally, this data is exported to a tab-delimited text file (`movies.tab`) to make it easier to import into data analysis packages.

The following text files were downloaded and used:

* business.list. Total budget
* genres.list.  Genres that a movie belongs to (eg. comedy and action)
* movies.list.  Master list of all movie titles with year of production.
* mpaa-ratings-reasons.list.  MPAA ratings.
* ratings.list.  IMDB fan ratings.
* running-times.list.  Movie length in minutes.

Movies were selected for inclusion if they had a known length and had been rated by at least one IMDB user. The final output contains the following fields:

* title.  Title of the movie.
* year.  Year of release.
* budget.  Total budget (if known) in US dollars
* length.  Length in minutes.
* rating.  Average IMDB user rating.
* votes.  Number of IMDB users who rated this movie.
* r1-10.  Distribution of votes for each rating, to mid point of nearest decile: 0 = no votes, 4.5 = 1-9% of votes, 14.5 = 11-19% of votes, etc.  Due to rounding errors these may not sum to 100.
* mpaa.  MPAA rating.
* action, animation, comedy, drama, documentary, romance, short.  Binary variables representing if movie was classified as belonging to that genre.
Download .txt
gitextract_8p8la6va/

├── .gitignore
├── 00-download.sh
├── 01-movies.sql
├── 02-import.rb
├── 03-export.rb
└── readme.md
Download .txt
SYMBOL INDEX (19 symbols across 3 files)

FILE: 01-movies.sql
  type Movies (line 3) | CREATE TABLE Movies (
  type Ratings (line 15) | CREATE TABLE Ratings (id INTEGER PRIMARY KEY, movie_id integer, score va...
  type Genres (line 16) | CREATE TABLE Genres (id INTEGER PRIMARY KEY , movie_id integer, genre va...
  type title (line 18) | CREATE INDEX title on Movies (title)
  type year (line 19) | CREATE INDEX year on Movies (year)
  type titleyear (line 20) | CREATE INDEX titleyear on Movies (title, year)
  type id (line 21) | CREATE INDEX id on Movies (id)
  type rid (line 22) | CREATE INDEX rid on Ratings (id)
  type rmid (line 23) | CREATE INDEX rmid on Ratings (movie_id)
  type gid (line 24) | CREATE INDEX gid on Genres (id)
  type gmid (line 25) | CREATE INDEX gmid on Genres (movie_id)

FILE: 02-import.rb
  function import_movies (line 6) | def import_movies
  function import_times (line 26) | def import_times
  function import_budgets (line 46) | def import_budgets
  function import_mpaa_ratings (line 61) | def import_mpaa_ratings
  function import_genres (line 77) | def import_genres
  function import_ratings (line 97) | def import_ratings

FILE: 03-export.rb
  function genres_binary (line 10) | def genres_binary(id, db)
  function ratings_breakdown (line 15) | def ratings_breakdown(ratings)
Condensed preview — 6 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (9K chars).
[
  {
    "path": ".gitignore",
    "chars": 14,
    "preview": "data\n*.sqlite3"
  },
  {
    "path": "00-download.sh",
    "chars": 77,
    "preview": "wget -r -l1 -np -nd -P data ftp://ftp.fu-berlin.de/pub/misc/movies/database/ "
  },
  {
    "path": "01-movies.sql",
    "chars": 750,
    "preview": "-- sqlite3 movies.sqlite3 < movies.sql\n\nCREATE TABLE Movies (\nid INTEGER PRIMARY KEY,\ntitle varchar(250),\nyear integer,\n"
  },
  {
    "path": "02-import.rb",
    "chars": 3679,
    "preview": "require 'rubygems'\nrequire 'sqlite3'\n$db = SQLite3::Database.new( \"movies.sqlite3\" )\n$title = \"[a-z,&-;0-9$#+=\\/!?. ]+\"\n"
  },
  {
    "path": "03-export.rb",
    "chars": 1321,
    "preview": "require \"rubygems\"\nrequire \"arrayfields\"\nrequire \"sqlite3\"\nrequire \"set\"\n\n$genres_of_interest = [\"Action\", \"Animation\", "
  },
  {
    "path": "readme.md",
    "chars": 1923,
    "preview": "The internet movie database, [imdb.com](http://imdb.com/), is a website devoted to collecting movie data supplied by stu"
  }
]

About this extraction

This page contains the full source code of the hadley/data-movies GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 6 files (7.6 KB), approximately 2.5k tokens, and a symbol index with 19 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!