Repository: yourtion/DataminingGuideBook-Codes Branch: master Commit: ff8f41b3b5fa Files: 32 Total size: 172.3 KB Directory structure: gitextract_9dxlqu_b/ ├── .gitignore ├── README.md ├── chapter-2/ │ ├── filteringdata.py │ ├── filteringdataPearson.py │ └── recommender.py ├── chapter-3/ │ ├── adjusted_cosine_similarity.py │ └── recommender3.py ├── chapter-4/ │ ├── athletesTestSet.txt │ ├── athletesTrainingSet.txt │ ├── classifyTemplate.py │ ├── filteringdata.py │ ├── irisTestSet.data │ ├── irisTrainingSet.data │ ├── mpgTestSet.txt │ ├── mpgTrainingSet.txt │ ├── nearestNeighborClassifier.py │ ├── normalizeColumnTemplate.py │ └── testMedianAndASD.py ├── chapter-5/ │ ├── crossValidation.py │ ├── divide.py │ └── pimaKNN.py ├── chapter-6/ │ ├── naiveBayes.py │ └── naiveBayesDensityFunction.py ├── chapter-7/ │ ├── bayesSentiment.py │ └── bayesText.py └── chapter-8/ ├── cereal.csv ├── dogs.csv ├── enrondata.txt ├── hierarchicalClusterer.py ├── hierarchicalClustererTemplate.py ├── kmeans.py └── kmeansPlusPlus.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ #Ipython Notebook .ipynb_checkpoints ================================================ FILE: README.md ================================================ # DataminingGuideBook-Codes [《面向程序员的数据挖掘指南》](http://dataminingguide.books.yourtion.com) 源码 ## 目录 ### [第一章:简介](http://dataminingguide.books.yourtion.com/chapter-1.html) 讲述什么是数据挖掘,它所能解决的问题是什么,以及在阅读完本书后,你可以做些什么。 ### [第二章:推荐系统入门](http://dataminingguide.books.yourtion.com/chapter-2.html) 介绍协同过滤,基本的距离算法,包括曼哈顿距离、欧几里得距离、闵可夫斯基距离、皮尔森相关系数。使用Python实现一个基本的推荐算法。 ### [第三章:隐式评价和基于物品的过滤算法](http://dataminingguide.books.yourtion.com/chapter-3.html) 这章开始讨论可供选择的用户评价体系。用户能够显式地给予评价(好、差、五星评价等),或者隐式地给予评价——如果用户在亚马逊购买了一个MP3,我们则认为他是“喜欢”这件商品的。 ### [第四章:分类](http://dataminingguide.books.yourtion.com/chapter-4.html) 上一章中我们使用用户对商品的评价来进行推荐,这一章我们会使用商品本身的特性来进行推荐。这种算法在潘多拉等网站中采用。 ### [第五章:进一步探索分类](http://dataminingguide.books.yourtion.com/chapter-5.html) 本章会讨论如何评价分类器的效果,方法包括十折交叉验证、留一法、以及Kappa检验等,同时还会引入kNN算法。 ### [第六章:朴素贝叶斯](http://dataminingguide.books.yourtion.com/chapter-6.html) 我们会在这章探索朴素贝叶斯分类算法,使用概率密度函数来处理数值型数据。 ### [第七章:朴素贝叶斯算法和非结构化文本](http://dataminingguide.books.yourtion.com/chapter-7.html) 这一章我们会尝试使用朴素贝叶斯算法来对非结构化文本进行分类。我们是否能够判断出Twitter上的一篇影评是正面评价还是负面的呢? 
### [第八章:聚类](http://dataminingguide.books.yourtion.com/chapter-8.html) 我们会讨论层次聚类和kmeans聚类。 ================================================ FILE: chapter-2/filteringdata.py ================================================ # # FILTERINGDATA.py # # Code file for the book Programmer's Guide to Data Mining # http://guidetodatamining.com # Ron Zacharski # from math import sqrt users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0}, "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0}, "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0} } def manhattan(rating1, rating2): """Computes the Manhattan distance. 
def computeNearestNeighbor(username, users):
    """Return the other users ordered from nearest to farthest.

    username -- key of the user the distances are measured from
    users    -- ratings of the form {user: {artist: rating}}

    The result is a list of (distance, user) tuples. Distances come from
    manhattan(), which answers -1 when two users have no rating in common.
    Those users are kept in the list but placed after every user with a
    real distance, so they are never mistaken for the closest neighbour.
    """
    target = users[username]
    distances = [(manhattan(users[user], target), user)
                 for user in users
                 if user != username]
    # A plain sort would put the -1 "nothing in common" marker first, i.e.
    # treat a user we know nothing about as the best match. Sort those
    # last, and otherwise keep the (distance, user) ordering.
    distances.sort(key=lambda pair: (pair[0] < 0, pair[0], pair[1]))
    return distances
def pearson(rating1, rating2):
    """Return the Pearson correlation coefficient of two rating dicts.

    rating1, rating2 -- dictionaries of the form {item: rating}

    Only items present in both dictionaries take part in the computation,
    which uses the single-pass approximation formula. 0 is returned when
    the dictionaries share no item or when one of them has no variance
    over the shared items (the coefficient is undefined in both cases).
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    # nothing in common: dividing by n below would raise ZeroDivisionError
    if n == 0:
        return 0
    # Floating point rounding can turn a variance of zero into a tiny
    # negative number, which sqrt() rejects; clamp it at zero.
    spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
    spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
    denominator = sqrt(spread_x) * sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator
class recommender:
    """User-based collaborative filtering with k nearest neighbours.

    Ratings live in self.data as {user: {item: rating}}. The lookup tables
    self.userid2name, self.username2id and self.productid2name start out
    empty and are filled by loadBookDB.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Initialize the recommender.

        data   -- ratings of the form {user: {item: rating}}; when it is
                  not a dictionary the recommender starts without ratings
                  and loadBookDB has to be called before recommending
        k      -- the k value for k nearest neighbor
        metric -- name of the similarity formula to use; only 'pearson'
                  is implemented, for any other name self.fn is not set
        n      -- the maximum number of recommendations to make
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # the name of the metric is kept next to the function it selects
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        # isinstance also accepts dict subclasses such as defaultdict;
        # anything else yields an empty, but defined, rating table
        self.data = data if isinstance(data, dict) else {}

    def convertProductID2name(self, id):
        """Given product id number return product name.

        Ids that are not in self.productid2name are returned unchanged."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the n highest ratings of the user with the given id.

        The output is a header line with the user's name, the total number
        of ratings of that user, and one "name<TAB>rating" line per item.
        """
        print ("Ratings for " + self.userid2name[id])
        ratings = self.data[id]
        print(len(ratings))
        ratings = [(self.convertProductID2name(k), v)
                   for (k, v) in ratings.items()]
        # highest rating first, then keep the top n
        ratings.sort(key=lambda artistTuple: artistTuple[1],
                     reverse = True)
        for rating in ratings[:n]:
            print("%s\t%i" % (rating[0], rating[1]))

    def loadBookDB(self, path=''):
        """Load the BX book dataset. Path is where the BX files are located.

        Reads BX-Book-Ratings.csv into self.data, BX-Books.csv into
        self.productid2name and BX-Users.csv into self.userid2name and
        self.username2id. All files are parsed as ';' separated lines.
        Prints the total number of lines read.
        """
        self.data = {}
        i = 0
        #
        # First load book ratings into self.data
        #
        with codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                rating = int(fields[2].strip().strip('"'))
                self.data.setdefault(user, {})[book] = rating
        #
        # Now load books into self.productid2name
        # Books contains isbn, title, and author among other fields
        #
        with codecs.open(path + "BX-Books.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip().strip('"')
                self.productid2name[isbn] = title + ' by ' + author
        #
        # Now load user info into both self.userid2name and
        # self.username2id
        #
        with codecs.open(path + "BX-Users.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                if len(fields) > 3:
                    age = fields[2].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + '  (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        print(i)

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two {item: rating} dicts.

        Only items rated in both dicts are used. 0 is returned when there
        is no such item or when the denominator of the formula is zero.
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0
        # rounding may leave a zero variance slightly negative, which
        # sqrt() would reject, so clamp at zero
        spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
        spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator

    def computeNearestNeighbor(self, username):
        """Return [(user, similarity)] for all other users, most similar
        user first."""
        distances = []
        for instance in self.data:
            if instance != username:
                distance = self.fn(self.data[username],
                                   self.data[instance])
                distances.append((instance, distance))
        # self.fn is a similarity, so the largest value is the closest
        distances.sort(key=lambda artistTuple: artistTuple[1],
                       reverse=True)
        return distances

    def recommend(self, user):
        """Give list of recommendations.

        Returns at most self.n (item name, projected rating) tuples, best
        first, for items the user has not rated. The projected rating is
        the sum of the ratings of the self.k nearest neighbours, each
        weighted by its share of their total similarity. An empty list is
        returned when that total is zero, because no weights can be
        derived from it.
        """
        recommendations = {}
        # first get list of users ordered by nearness; slicing never
        # yields more neighbours than exist, whatever self.k is
        neighbors = self.computeNearestNeighbor(user)[:self.k]
        userRatings = self.data[user]
        # determine the total distance
        totalDistance = sum((similarity for (_, similarity) in neighbors),
                            0.0)
        if totalDistance == 0:
            return []
        # now iterate through the k nearest neighbors
        # accumulating their ratings
        for (name, similarity) in neighbors:
            # compute slice of pie
            weight = similarity / totalDistance
            neighborRatings = self.data[name]
            # now find bands neighbor rated that user didn't
            for artist in neighborRatings:
                if artist not in userRatings:
                    recommendations[artist] = (
                        recommendations.get(artist, 0)
                        + neighborRatings[artist] * weight)
        # now make list from dictionary
        recommendations = [(self.convertProductID2name(k), v)
                           for (k, v) in recommendations.items()]
        # finally sort and return
        recommendations.sort(key=lambda artistTuple: artistTuple[1],
                             reverse = True)
        # Return the first n items
        return recommendations[:self.n]
3, "Lorde": 3, "Fall Out Boy": 1}, "Chris": {"Kacey Musgraves": 4, "Imagine Dragons": 4, "Daft Punk": 4, "Lorde": 3, "Fall Out Boy": 1}, "Tori": {"Kacey Musgraves": 5, "Imagine Dragons": 4, "Daft Punk": 5, "Fall Out Boy": 3}} def computeSimilarity(band1, band2, userRatings): averages = {} for (key, ratings) in userRatings.items(): averages[key] = (float(sum(ratings.values())) / len(ratings.values())) num = 0 # 分子 dem1 = 0 # 分母的第一部分 dem2 = 0 for (user, ratings) in userRatings.items(): if band1 in ratings and band2 in ratings: avg = averages[user] num += (ratings[band1] - avg) * (ratings[band2] - avg) dem1 += (ratings[band1] - avg) ** 2 dem2 += (ratings[band2] - avg) ** 2 return num / (sqrt(dem1) * sqrt(dem2)) print(computeSimilarity('Kacey Musgraves', 'Lorde', users3)) print(computeSimilarity('Imagine Dragons', 'Lorde', users3)) print(computeSimilarity('Daft Punk', 'Lorde', users3)) ================================================ FILE: chapter-3/recommender3.py ================================================ import codecs from math import sqrt users2 = {"Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney Houston": 4}, "Ben": {"Taylor Swift": 5, "PSY": 2}, "Clara": {"PSY": 3.5, "Whitney Houston": 4}, "Daisy": {"Taylor Swift": 5, "Whitney Houston": 3}} users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, "Jordyn": {"Broken Bells": 
class recommender:
    """Recommender with user-based k nearest neighbour filtering and
    item-based Slope One predictions.

    Ratings live in self.data as {user: {item: rating}}. The Slope One
    tables self.deviations and self.frequencies are filled by
    computeDeviations and read by slopeOneRecommendations.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Initialize the recommender.

        data   -- ratings of the form {user: {item: rating}}; when it is
                  not a dictionary the recommender starts without ratings
                  and one of the load methods has to be called
        k      -- the k value for k nearest neighbor
        metric -- name of the similarity formula to use; only 'pearson'
                  is implemented, for any other name self.fn is not set
        n      -- the maximum number of recommendations to make
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        #
        # The following two variables are used for Slope One
        #
        self.frequencies = {}
        self.deviations = {}
        # the name of the metric is kept next to the function it selects
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        # isinstance also accepts dict subclasses such as defaultdict;
        # anything else yields an empty, but defined, rating table
        self.data = data if isinstance(data, dict) else {}

    def convertProductID2name(self, id):
        """Given product id number return product name.

        Ids that are not in self.productid2name are returned unchanged."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the n highest ratings of the user with the given id.

        The output is a header line with the user's name, the total number
        of ratings of that user, and one "name<TAB>rating" line per item.
        """
        print ("Ratings for " + self.userid2name[id])
        ratings = self.data[id]
        print(len(ratings))
        ratings = [(self.convertProductID2name(k), v)
                   for (k, v) in ratings.items()]
        # Sort before cutting down to n entries; cutting first would keep
        # whichever n items happen to come first instead of the best n.
        ratings.sort(key=lambda artistTuple: artistTuple[1],
                     reverse = True)
        for rating in ratings[:n]:
            print("%s\t%i" % (rating[0], rating[1]))

    def showUserTopItems(self, user, n):
        """Print the top n items of user, or all of them when the user
        rated fewer than n items."""
        items = list(self.data[user].items())
        items.sort(key=lambda itemTuple: itemTuple[1], reverse=True)
        for (item, rating) in items[:n]:
            print("%s\t%i" % (self.convertProductID2name(item), rating))

    def loadMovieLens(self, path=''):
        """Load the MovieLens files u.data, u.item and u.user found at path.

        u.data (tab separated) fills self.data, u.item ('|' separated)
        fills self.productid2name and u.user ('|' separated) fills
        self.userid2name and self.username2id, where the complete line
        serves as the user's name. Prints the total number of lines read.
        """
        self.data = {}
        i = 0
        #
        # First load movie ratings into self.data
        #
        with codecs.open(path + "u.data", 'r', 'ascii') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split('\t')
                user = fields[0]
                movie = fields[1]
                rating = int(fields[2].strip().strip('"'))
                self.data.setdefault(user, {})[movie] = rating
        #
        # Now load movie into self.productid2name
        # the file u.item contains movie id, title, release date among
        # other fields
        #
        with codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split('|')
                mid = fields[0].strip()
                title = fields[1].strip()
                self.productid2name[mid] = title
        #
        # Now load user info into both self.userid2name
        # and self.username2id
        #
        with open(path + "u.user") as f:
            for line in f:
                i += 1
                fields = line.split('|')
                userid = fields[0].strip('"')
                self.userid2name[userid] = line
                self.username2id[line] = userid
        print(i)

    def loadBookDB(self, path=''):
        """Load the BX book dataset. Path is where the BX files are located.

        Reads BX-Book-Ratings.csv into self.data, BX-Books.csv into
        self.productid2name and BX-Users.csv into self.userid2name and
        self.username2id. All files are parsed as ';' separated lines.
        Prints the total number of lines read.
        """
        self.data = {}
        i = 0
        #
        # First load book ratings into self.data. The ratings are stored
        # in BX-Book-Ratings.csv; "u.data" is the MovieLens ratings file.
        #
        with codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                rating = int(fields[2].strip().strip('"'))
                if rating > 5:
                    print("EXCEEDING ", rating)
                self.data.setdefault(user, {})[book] = rating
        #
        # Now load books into self.productid2name
        # Books contains isbn, title, and author among other fields
        #
        with codecs.open(path + "BX-Books.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip().strip('"')
                self.productid2name[isbn] = title + ' by ' + author
        #
        # Now load user info into both self.userid2name and
        # self.username2id
        #
        with codecs.open(path + "BX-Users.csv", 'r', 'utf8') as f:
            for line in f:
                i += 1
                # separate line into fields
                fields = line.split(';')
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                if len(fields) > 3:
                    age = fields[2].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + '  (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        print(i)

    def computeDeviations(self):
        """Build the Slope One tables from self.data.

        Afterwards self.frequencies[a][b] is the number of users who rated
        both a and b, and self.deviations[a][b] is the average of
        (rating of a - rating of b) over those users.
        """
        # Start from empty tables: the final loop turns the sums into
        # averages in place, so accumulating on top of the result of an
        # earlier call would mix averages with fresh sums.
        self.frequencies = {}
        self.deviations = {}
        # for each person in the data get their ratings
        for ratings in self.data.values():
            # for each item & rating in that set of ratings:
            for (item, rating) in ratings.items():
                self.frequencies.setdefault(item, {})
                self.deviations.setdefault(item, {})
                # for each item2 & rating2 in that set of ratings:
                for (item2, rating2) in ratings.items():
                    if item != item2:
                        # add the difference between the ratings to our
                        # computation
                        self.frequencies[item].setdefault(item2, 0)
                        self.deviations[item].setdefault(item2, 0.0)
                        self.frequencies[item][item2] += 1
                        self.deviations[item][item2] += rating - rating2
        # turn the summed differences into average deviations
        for (item, ratings) in self.deviations.items():
            for item2 in ratings:
                ratings[item2] /= self.frequencies[item][item2]

    def slopeOneRecommendations(self, userRatings):
        """Return weighted Slope One predictions for one user.

        userRatings -- that user's ratings as {item: rating}

        computeDeviations must have been called before. The result is a
        list of at most 50 (item name, predicted rating) tuples, highest
        prediction first, covering items the user has not rated.
        """
        recommendations = {}
        frequencies = {}
        # for every item and rating in the user's recommendations
        for (userItem, userRating) in userRatings.items():
            # for every item in our dataset that the user didn't rate
            for (diffItem, diffRatings) in self.deviations.items():
                if diffItem not in userRatings and \
                   userItem in self.deviations[diffItem]:
                    freq = self.frequencies[diffItem][userItem]
                    recommendations.setdefault(diffItem, 0.0)
                    frequencies.setdefault(diffItem, 0)
                    # add to the running sum representing the numerator
                    # of the formula
                    recommendations[diffItem] += (diffRatings[userItem] +
                                                  userRating) * freq
                    # keep a running sum of the frequency of diffitem
                    frequencies[diffItem] += freq
        recommendations = [(self.convertProductID2name(k),
                            v / frequencies[k])
                           for (k, v) in recommendations.items()]
        # finally sort and return
        recommendations.sort(key=lambda artistTuple: artistTuple[1],
                             reverse = True)
        # I am only going to return the first 50 recommendations
        return recommendations[:50]

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two {item: rating} dicts.

        Only items rated in both dicts are used. 0 is returned when there
        is no such item or when the denominator of the formula is zero.
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0
        # rounding may leave a zero variance slightly negative, which
        # sqrt() would reject, so clamp at zero
        spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
        spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator

    def computeNearestNeighbor(self, username):
        """Return [(user, similarity)] for all other users, most similar
        user first."""
        distances = []
        for instance in self.data:
            if instance != username:
                distance = self.fn(self.data[username],
                                   self.data[instance])
                distances.append((instance, distance))
        # self.fn is a similarity, so the largest value is the closest
        distances.sort(key=lambda artistTuple: artistTuple[1],
                       reverse=True)
        return distances

    def recommend(self, user):
        """Give list of recommendations.

        Returns at most self.n (item name, projected rating) tuples, best
        first, for items the user has not rated. The projected rating is
        the sum of the ratings of the self.k nearest neighbours, each
        weighted by its share of their total similarity. An empty list is
        returned when that total is zero, because no weights can be
        derived from it.
        """
        recommendations = {}
        # first get list of users ordered by nearness; slicing never
        # yields more neighbours than exist, whatever self.k is
        neighbors = self.computeNearestNeighbor(user)[:self.k]
        userRatings = self.data[user]
        # determine the total distance
        totalDistance = sum((similarity for (_, similarity) in neighbors),
                            0.0)
        if totalDistance == 0:
            return []
        # now iterate through the k nearest neighbors
        # accumulating their ratings
        for (name, similarity) in neighbors:
            # compute slice of pie
            weight = similarity / totalDistance
            neighborRatings = self.data[name]
            # now find bands neighbor rated that user didn't
            for artist in neighborRatings:
                if artist not in userRatings:
                    recommendations[artist] = (
                        recommendations.get(artist, 0)
                        + neighborRatings[artist] * weight)
        # now make list from dictionary
        recommendations = [(self.convertProductID2name(k), v)
                           for (k, v) in recommendations.items()]
        # Sort first and only then keep n items, so that the n best
        # recommendations are returned rather than the first n found.
        recommendations.sort(key=lambda artistTuple: artistTuple[1],
                             reverse = True)
        return recommendations[:self.n]
class Classifier:
    """Nearest neighbour classifier working on Modified Standard Scores.

    The training file is tab separated. Its first line names the role of
    every column: 'num' (numeric feature), 'class' (the label) or
    'comment' (kept but not used for classification).

    Attributes:
      format             -- list of column roles read from the first line
      data               -- list of (class, normalized vector, comments)
      rawData            -- same as data but with the values as read
      vlen               -- number of numeric features
      medianAndDeviation -- one (median, asd) tuple per numeric feature
    """

    def __init__(self, filename):
        """Read the training data in filename and normalize every numeric
        column."""
        self.medianAndDeviation = []
        # reading the data in from the file
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            # a blank line carries no instance
            if not line.strip():
                continue
            fields = line.strip().split('\t')
            ignore = []
            vector = []
            classification = None
            for (i, field) in enumerate(fields):
                if self.format[i] == 'num':
                    vector.append(int(field))
                elif self.format[i] == 'comment':
                    ignore.append(field)
                elif self.format[i] == 'class':
                    classification = field
            self.data.append((classification, vector, ignore))
        # The vectors in self.data are normalized in place below, so the
        # raw copy needs lists of its own to stay raw.
        self.rawData = [(cls, list(vec), list(comments))
                        for (cls, vec, comments) in self.data]
        # get length of instance vector
        self.vlen = len(self.data[0][1])
        # now normalize the data
        for i in range(self.vlen):
            self.normalizeColumn(i)

    ##################################################
    ###
    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE

    def getMedian(self, alist):
        """Return the median of alist ([] when alist is empty)."""
        if alist == []:
            return []
        blist = sorted(alist)
        length = len(alist)
        middle = length // 2
        if length % 2 == 1:
            # length of list is odd so return middle element
            return blist[middle]
        # length of list is even so compute midpoint
        return (blist[middle] + blist[middle - 1]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Given alist and median return the absolute standard deviation,
        i.e. the mean absolute distance of the values to the median."""
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    @staticmethod
    def _standardScore(value, median, asd):
        """Return the Modified Standard Score of value.

        A deviation of zero means every training value of the column
        equals the median; such a column cannot separate instances, so it
        scores 0.0 instead of dividing by zero."""
        if asd == 0:
            return 0.0
        return (value - median) / asd

    def normalizeColumn(self, columnNumber):
        """Given a column number, normalize that column in self.data and
        remember its median and asd in self.medianAndDeviation."""
        # first extract values to list
        col = [v[1][columnNumber] for v in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        self.medianAndDeviation.append((median, asd))
        for v in self.data:
            v[1][columnNumber] = self._standardScore(v[1][columnNumber],
                                                     median, asd)

    def normalizeVector(self, v):
        """We have stored the median and asd for each column.
        We now use them to normalize vector v. A new list is returned,
        v itself is left untouched."""
        vector = list(v)
        for i in range(len(vector)):
            (median, asd) = self.medianAndDeviation[i]
            vector[i] = self._standardScore(vector[i], median, asd)
        return vector

    ###
    ### END NORMALIZATION
    ##################################################

    def manhattan(self, vector1, vector2):
        """Computes the Manhattan distance."""
        return sum(abs(v1 - v2) for (v1, v2) in zip(vector1, vector2))

    def nearestNeighbor(self, itemVector):
        """Return nearest neighbor to itemVector.

        itemVector has to be normalized already. The result is a tuple
        (distance, (class, normalized vector, comments)); of several
        equally close training instances the first one wins."""
        candidates = ((self.manhattan(itemVector, item[1]), item)
                      for item in self.data)
        return min(candidates, key=lambda pair: pair[0])

    def classify(self, itemVector):
        """Return class we think item Vector is in"""
        return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])
assert(classifier.classify(cl[1]) == 'Basketball') assert(classifier.classify(nl[1]) == 'Gymnastics') print('Classify fn OK') unitTest() ================================================ FILE: chapter-4/filteringdata.py ================================================ # # ch4-filteringdata.py # # Code for the first example from chapter 4. # The only change from the original filteringdata.py is the addition of the music dictionary. # # Code file for the book Programmer's Guide to Data Mining # http://guidetodatamining.com # Ron Zacharski # from math import sqrt users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0}, "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0}, "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0} } music = {"Dr Dog/Fate": {"piano": 2.5, "vocals": 4, "beat": 3.5, "blues": 3, "guitar": 5, "backup vocals": 4, "rap": 1}, "Phoenix/Lisztomania": {"piano": 2, "vocals": 5, "beat": 5, "blues": 3, "guitar": 2, "backup vocals": 1, "rap": 1}, "Heartless Bastards/Out at Sea": {"piano": 1, "vocals": 5, "beat": 4, "blues": 2, 
"guitar": 4, "backup vocals": 1, "rap": 1}, "Todd Snider/Don't Tempt Me": {"piano": 4, "vocals": 5, "beat": 4, "blues": 4, "guitar": 1, "backup vocals": 5, "rap": 1}, "The Black Keys/Magic Potion": {"piano": 1, "vocals": 4, "beat": 5, "blues": 3.5, "guitar": 5, "backup vocals": 1, "rap": 1}, "Glee Cast/Jessie's Girl": {"piano": 1, "vocals": 5, "beat": 3.5, "blues": 3, "guitar":4, "backup vocals": 5, "rap": 1}, "La Roux/Bulletproof": {"piano": 5, "vocals": 5, "beat": 4, "blues": 2, "guitar": 1, "backup vocals": 1, "rap": 1}, "Mike Posner": {"piano": 2.5, "vocals": 4, "beat": 4, "blues": 1, "guitar": 1, "backup vocals": 1, "rap": 1}, "Black Eyed Peas/Rock That Body": {"piano": 2, "vocals": 5, "beat": 5, "blues": 1, "guitar": 2, "backup vocals": 2, "rap": 4}, "Lady Gaga/Alejandro": {"piano": 1, "vocals": 5, "beat": 3, "blues": 2, "guitar": 1, "backup vocals": 2, "rap": 1}} def manhattan(rating1, rating2): """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}""" distance = 0 total = 0 for key in rating1: if key in rating2: distance += abs(rating1[key] - rating2[key]) total += 1 return distance def computeNearestNeighbor(username, users): """creates a sorted list of users based on their distance to username""" distances = [] for user in users: if user != username: distance = manhattan(users[user], users[username]) distances.append((distance, user)) # sort based on distance -- closest first distances.sort() return distances def recommend(username, users): """Give list of recommendations""" # first find nearest neighbor nearest = computeNearestNeighbor(username, users)[0][1] recommendations = [] # now find bands neighbor rated that user didn't neighborRatings = users[nearest] userRatings = users[username] for artist in neighborRatings: if not artist in userRatings: recommendations.append((artist, neighborRatings[artist])) # using the fn sorted for variety - sort is more efficient return 
sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True) ================================================ FILE: chapter-4/irisTestSet.data ================================================ 5.1 3.5 1.4 0.2 Iris-setosa 4.9 3.0 1.4 0.2 Iris-setosa 4.7 3.2 1.3 0.2 Iris-setosa 4.6 3.1 1.5 0.2 Iris-setosa 5.0 3.6 1.4 0.2 Iris-setosa 5.4 3.9 1.7 0.4 Iris-setosa 4.6 3.4 1.4 0.3 Iris-setosa 5.0 3.4 1.5 0.2 Iris-setosa 4.4 2.9 1.4 0.2 Iris-setosa 4.9 3.1 1.5 0.1 Iris-setosa 7.0 3.2 4.7 1.4 Iris-versicolor 6.4 3.2 4.5 1.5 Iris-versicolor 6.9 3.1 4.9 1.5 Iris-versicolor 5.5 2.3 4.0 1.3 Iris-versicolor 6.5 2.8 4.6 1.5 Iris-versicolor 5.7 2.8 4.5 1.3 Iris-versicolor 6.3 3.3 4.7 1.6 Iris-versicolor 4.9 2.4 3.3 1.0 Iris-versicolor 6.6 2.9 4.6 1.3 Iris-versicolor 5.2 2.7 3.9 1.4 Iris-versicolor 6.7 3.1 5.6 2.4 Iris-virginica 6.9 3.1 5.1 2.3 Iris-virginica 5.8 2.7 5.1 1.9 Iris-virginica 6.8 3.2 5.9 2.3 Iris-virginica 6.7 3.3 5.7 2.5 Iris-virginica 6.7 3.0 5.2 2.3 Iris-virginica 6.3 2.5 5.0 1.9 Iris-virginica 6.5 3.0 5.2 2.0 Iris-virginica 6.2 3.4 5.4 2.3 Iris-virginica 5.9 3.0 5.1 1.8 Iris-virginica ================================================ FILE: chapter-4/irisTrainingSet.data ================================================ num num num num class 5.4 3.7 1.5 0.2 Iris-setosa 4.8 3.4 1.6 0.2 Iris-setosa 4.8 3.0 1.4 0.1 Iris-setosa 4.3 3.0 1.1 0.1 Iris-setosa 5.8 4.0 1.2 0.2 Iris-setosa 5.7 4.4 1.5 0.4 Iris-setosa 5.4 3.9 1.3 0.4 Iris-setosa 5.1 3.5 1.4 0.3 Iris-setosa 5.7 3.8 1.7 0.3 Iris-setosa 5.1 3.8 1.5 0.3 Iris-setosa 5.4 3.4 1.7 0.2 Iris-setosa 5.1 3.7 1.5 0.4 Iris-setosa 4.6 3.6 1.0 0.2 Iris-setosa 5.1 3.3 1.7 0.5 Iris-setosa 4.8 3.4 1.9 0.2 Iris-setosa 5.0 3.0 1.6 0.2 Iris-setosa 5.0 3.4 1.6 0.4 Iris-setosa 5.2 3.5 1.5 0.2 Iris-setosa 5.2 3.4 1.4 0.2 Iris-setosa 4.7 3.2 1.6 0.2 Iris-setosa 4.8 3.1 1.6 0.2 Iris-setosa 5.4 3.4 1.5 0.4 Iris-setosa 5.2 4.1 1.5 0.1 Iris-setosa 5.5 4.2 1.4 0.2 Iris-setosa 4.9 3.1 1.5 0.1 Iris-setosa 5.0 3.2 1.2 0.2 
Iris-setosa 5.5 3.5 1.3 0.2 Iris-setosa 4.9 3.1 1.5 0.1 Iris-setosa 4.4 3.0 1.3 0.2 Iris-setosa 5.1 3.4 1.5 0.2 Iris-setosa 5.0 3.5 1.3 0.3 Iris-setosa 4.5 2.3 1.3 0.3 Iris-setosa 4.4 3.2 1.3 0.2 Iris-setosa 5.0 3.5 1.6 0.6 Iris-setosa 5.1 3.8 1.9 0.4 Iris-setosa 4.8 3.0 1.4 0.3 Iris-setosa 5.1 3.8 1.6 0.2 Iris-setosa 4.6 3.2 1.4 0.2 Iris-setosa 5.3 3.7 1.5 0.2 Iris-setosa 5.0 3.3 1.4 0.2 Iris-setosa 5.0 2.0 3.5 1.0 Iris-versicolor 5.9 3.0 4.2 1.5 Iris-versicolor 6.0 2.2 4.0 1.0 Iris-versicolor 6.1 2.9 4.7 1.4 Iris-versicolor 5.6 2.9 3.6 1.3 Iris-versicolor 6.7 3.1 4.4 1.4 Iris-versicolor 5.6 3.0 4.5 1.5 Iris-versicolor 5.8 2.7 4.1 1.0 Iris-versicolor 6.2 2.2 4.5 1.5 Iris-versicolor 5.6 2.5 3.9 1.1 Iris-versicolor 5.9 3.2 4.8 1.8 Iris-versicolor 6.1 2.8 4.0 1.3 Iris-versicolor 6.3 2.5 4.9 1.5 Iris-versicolor 6.1 2.8 4.7 1.2 Iris-versicolor 6.4 2.9 4.3 1.3 Iris-versicolor 6.6 3.0 4.4 1.4 Iris-versicolor 6.8 2.8 4.8 1.4 Iris-versicolor 6.7 3.0 5.0 1.7 Iris-versicolor 6.0 2.9 4.5 1.5 Iris-versicolor 5.7 2.6 3.5 1.0 Iris-versicolor 5.5 2.4 3.8 1.1 Iris-versicolor 5.5 2.4 3.7 1.0 Iris-versicolor 5.8 2.7 3.9 1.2 Iris-versicolor 6.0 2.7 5.1 1.6 Iris-versicolor 5.4 3.0 4.5 1.5 Iris-versicolor 6.0 3.4 4.5 1.6 Iris-versicolor 6.7 3.1 4.7 1.5 Iris-versicolor 6.3 2.3 4.4 1.3 Iris-versicolor 5.6 3.0 4.1 1.3 Iris-versicolor 5.5 2.5 4.0 1.3 Iris-versicolor 5.5 2.6 4.4 1.2 Iris-versicolor 6.1 3.0 4.6 1.4 Iris-versicolor 5.8 2.6 4.0 1.2 Iris-versicolor 5.0 2.3 3.3 1.0 Iris-versicolor 5.6 2.7 4.2 1.3 Iris-versicolor 5.7 3.0 4.2 1.2 Iris-versicolor 5.7 2.9 4.2 1.3 Iris-versicolor 6.2 2.9 4.3 1.3 Iris-versicolor 5.1 2.5 3.0 1.1 Iris-versicolor 5.7 2.8 4.1 1.3 Iris-versicolor 6.3 3.3 6.0 2.5 Iris-virginica 5.8 2.7 5.1 1.9 Iris-virginica 7.1 3.0 5.9 2.1 Iris-virginica 6.3 2.9 5.6 1.8 Iris-virginica 6.5 3.0 5.8 2.2 Iris-virginica 7.6 3.0 6.6 2.1 Iris-virginica 4.9 2.5 4.5 1.7 Iris-virginica 7.3 2.9 6.3 1.8 Iris-virginica 6.7 2.5 5.8 1.8 Iris-virginica 7.2 3.6 6.1 2.5 Iris-virginica 6.5 
3.2 5.1 2.0 Iris-virginica 6.4 2.7 5.3 1.9 Iris-virginica 6.8 3.0 5.5 2.1 Iris-virginica 5.7 2.5 5.0 2.0 Iris-virginica 5.8 2.8 5.1 2.4 Iris-virginica 6.4 3.2 5.3 2.3 Iris-virginica 6.5 3.0 5.5 1.8 Iris-virginica 7.7 3.8 6.7 2.2 Iris-virginica 7.7 2.6 6.9 2.3 Iris-virginica 6.0 2.2 5.0 1.5 Iris-virginica 6.9 3.2 5.7 2.3 Iris-virginica 5.6 2.8 4.9 2.0 Iris-virginica 7.7 2.8 6.7 2.0 Iris-virginica 6.3 2.7 4.9 1.8 Iris-virginica 6.7 3.3 5.7 2.1 Iris-virginica 7.2 3.2 6.0 1.8 Iris-virginica 6.2 2.8 4.8 1.8 Iris-virginica 6.1 3.0 4.9 1.8 Iris-virginica 6.4 2.8 5.6 2.1 Iris-virginica 7.2 3.0 5.8 1.6 Iris-virginica 7.4 2.8 6.1 1.9 Iris-virginica 7.9 3.8 6.4 2.0 Iris-virginica 6.4 2.8 5.6 2.2 Iris-virginica 6.3 2.8 5.1 1.5 Iris-virginica 6.1 2.6 5.6 1.4 Iris-virginica 7.7 3.0 6.1 2.3 Iris-virginica 6.3 3.4 5.6 2.4 Iris-virginica 6.4 3.1 5.5 1.8 Iris-virginica 6.0 3.0 4.8 1.8 Iris-virginica 6.9 3.1 5.4 2.1 Iris-virginica ================================================ FILE: chapter-4/mpgTestSet.txt ================================================ 15 8 390.0 190.0 3850 8.5 amc ambassador dpl 15 8 383.0 170.0 3563 10.0 dodge challenger se 15 8 340.0 160.0 3609 8.0 plymouth 'cuda 340 15 8 400.0 150.0 3761 9.5 chevrolet monte carlo 15 8 455.0 225.0 3086 10.0 buick estate wagon (sw) 25 4 113.0 95.00 2372 15.0 toyota corona mark ii 20 6 198.0 95.00 2833 15.5 plymouth duster 20 6 199.0 97.00 2774 15.5 amc hornet 20 6 200.0 85.00 2587 16.0 ford maverick 25 4 97.00 88.00 2130 14.5 datsun pl510 25 4 97.00 46.00 1835 20.5 volkswagen 1131 deluxe sedan 25 4 110.0 87.00 2672 17.5 peugeot 504 25 4 107.0 90.00 2430 14.5 audi 100 ls 25 4 104.0 95.00 2375 17.5 saab 99e 25 4 121.0 113.0 2234 12.5 bmw 2002 20 6 199.0 90.00 2648 15.0 amc gremlin 10 8 360.0 215.0 4615 14.0 ford f250 10 8 307.0 200.0 4376 15.0 chevy c20 10 8 318.0 210.0 4382 13.5 dodge d200 10 8 304.0 193.0 4732 18.5 hi 1200d 25 4 97.00 88.00 2130 14.5 datsun pl510 30 4 140.0 90.00 2264 15.5 chevrolet vega 2300 25 4 113.0 95.00 
2228 14.0 toyota corona 20 6 232.0 100.0 2634 13.0 amc gremlin 15 6 225.0 105.0 3439 15.5 plymouth satellite custom 15 6 250.0 100.0 3329 15.5 chevrolet chevelle malibu 20 6 250.0 88.00 3302 15.5 ford torino 500 20 6 232.0 100.0 3288 15.5 amc matador 15 8 350.0 165.0 4209 12.0 chevrolet impala 15 8 400.0 175.0 4464 11.5 pontiac catalina brougham 15 8 351.0 153.0 4154 13.5 ford galaxie 500 15 8 318.0 150.0 4096 13.0 plymouth fury iii 10 8 383.0 180.0 4955 11.5 dodge monaco (sw) 15 8 400.0 170.0 4746 12.0 ford country squire (sw) 15 8 400.0 175.0 5140 12.0 pontiac safari (sw) 20 6 258.0 110.0 2962 13.5 amc hornet sportabout (sw) 20 4 140.0 72.00 2408 19.0 chevrolet vega (sw) 20 6 250.0 100.0 3282 15.0 pontiac firebird 20 6 250.0 88.00 3139 14.5 ford mustang 25 4 122.0 86.00 2220 14.0 mercury capri 2000 30 4 116.0 90.00 2123 14.0 opel 1900 30 4 79.00 70.00 2074 19.5 peugeot 304 30 4 88.00 76.00 2065 14.5 fiat 124b 30 4 71.00 65.00 1773 19.0 toyota corolla 1200 35 4 72.00 69.00 1613 18.0 datsun 1200 25 4 97.00 60.00 1834 19.0 volkswagen model 111 25 4 91.00 70.00 1955 20.5 plymouth cricket 25 4 113.0 95.00 2278 15.5 toyota corona hardtop 25 4 97.50 80.00 2126 17.0 dodge colt hardtop 25 4 97.00 54.00 2254 23.5 volkswagen type 3 ================================================ FILE: chapter-4/mpgTrainingSet.txt ================================================ class num num num num num comment 20 8 307.0 130.0 3504 12.0 chevrolet chevelle malibu 15 8 350.0 165.0 3693 11.5 buick skylark 320 20 8 318.0 150.0 3436 11.0 plymouth satellite 15 8 304.0 150.0 3433 12.0 amc rebel sst 15 8 302.0 140.0 3449 10.5 ford torino 15 8 429.0 198.0 4341 10.0 ford galaxie 500 15 8 454.0 220.0 4354 9.0 chevrolet impala 15 8 440.0 215.0 4312 8.5 plymouth fury iii 15 8 455.0 225.0 4425 10.0 pontiac catalina 20 4 140.0 90.00 2408 19.5 chevrolet vega 20 4 122.0 86.00 2226 16.5 ford pinto runabout 15 8 350.0 165.0 4274 12.0 chevrolet impala 15 8 400.0 175.0 4385 12.0 pontiac catalina 15 8 318.0 
150.0 4135 13.5 plymouth fury iii 15 8 351.0 153.0 4129 13.0 ford galaxie 500 15 8 304.0 150.0 3672 11.5 amc ambassador sst 10 8 429.0 208.0 4633 11.0 mercury marquis 15 8 350.0 155.0 4502 13.5 buick lesabre custom 10 8 350.0 160.0 4456 13.5 oldsmobile delta 88 royale 15 8 400.0 190.0 4422 12.5 chrysler newport royal 20 3 70.00 97.00 2330 13.5 mazda rx2 coupe 15 8 304.0 150.0 3892 12.5 amc matador (sw) 15 8 307.0 130.0 4098 14.0 chevrolet chevelle concours (sw) 15 8 302.0 140.0 4294 16.0 ford gran torino (sw) 15 8 318.0 150.0 4077 14.0 plymouth satellite custom (sw) 20 4 121.0 112.0 2933 14.5 volvo 145e (sw) 20 4 121.0 76.00 2511 18.0 volkswagen 411 (sw) 20 4 120.0 87.00 2979 19.5 peugeot 504 (sw) 25 4 96.00 69.00 2189 18.0 renault 12 (sw) 20 4 122.0 86.00 2395 16.0 ford pinto (sw) 30 4 97.00 92.00 2288 17.0 datsun 510 (sw) 25 4 120.0 97.00 2506 14.5 toyouta corona mark ii (sw) 30 4 98.00 80.00 2164 15.0 dodge colt (sw) 25 4 97.00 88.00 2100 16.5 toyota corolla 1600 (sw) 15 8 350.0 175.0 4100 13.0 buick century 350 15 8 304.0 150.0 3672 11.5 amc matador 15 8 350.0 145.0 3988 13.0 chevrolet malibu 15 8 302.0 137.0 4042 14.5 ford gran torino 15 8 318.0 150.0 3777 12.5 dodge coronet custom 10 8 429.0 198.0 4952 11.5 mercury marquis brougham 15 8 400.0 150.0 4464 12.0 chevrolet caprice classic 15 8 351.0 158.0 4363 13.0 ford ltd 15 8 318.0 150.0 4237 14.5 plymouth fury gran sedan 15 8 440.0 215.0 4735 11.0 chrysler new yorker brougham 10 8 455.0 225.0 4951 11.0 buick electra 225 custom 15 8 360.0 175.0 3821 11.0 amc ambassador brougham 20 6 225.0 105.0 3121 16.5 plymouth valiant 15 6 250.0 100.0 3278 18.0 chevrolet nova custom 20 6 232.0 100.0 2945 16.0 amc hornet 20 6 250.0 88.00 3021 16.5 ford maverick 25 6 198.0 95.00 2904 16.0 plymouth duster 25 4 97.00 46.00 1950 21.0 volkswagen super beetle 10 8 400.0 150.0 4997 14.0 chevrolet impala 10 8 400.0 167.0 4906 12.5 ford country 15 8 360.0 170.0 4654 13.0 plymouth custom suburb 10 8 350.0 180.0 4499 12.5 oldsmobile 
vista cruiser 20 6 232.0 100.0 2789 15.0 amc gremlin 20 4 97.00 88.00 2279 19.0 toyota carina 20 4 140.0 72.00 2401 19.5 chevrolet vega 20 4 108.0 94.00 2379 16.5 datsun 610 20 3 70.00 90.00 2124 13.5 maxda rx3 20 4 122.0 85.00 2310 18.5 ford pinto 20 6 155.0 107.0 2472 14.0 mercury capri v6 25 4 98.00 90.00 2265 15.5 fiat 124 sport coupe 15 8 350.0 145.0 4082 13.0 chevrolet monte carlo s 15 8 400.0 230.0 4278 9.50 pontiac grand prix 30 4 68.00 49.00 1867 19.5 fiat 128 25 4 116.0 75.00 2158 15.5 opel manta 20 4 114.0 91.00 2582 14.0 audi 100ls 20 4 121.0 112.0 2868 15.5 volvo 144ea 15 8 318.0 150.0 3399 11.0 dodge dart custom 25 4 121.0 110.0 2660 14.0 saab 99le 20 6 156.0 122.0 2807 13.5 toyota mark ii 10 8 350.0 180.0 3664 11.0 oldsmobile omega 20 6 198.0 95.00 3102 16.5 plymouth duster 20 6 232.0 100.0 2901 16.0 amc hornet 15 6 250.0 100.0 3336 17.0 chevrolet nova 30 4 79.00 67.00 1950 19.0 datsun b210 25 4 122.0 80.00 2451 16.5 ford pinto 30 4 71.00 65.00 1836 21.0 toyota corolla 1200 25 4 140.0 75.00 2542 17.0 chevrolet vega 15 6 250.0 100.0 3781 17.0 chevrolet chevelle malibu classic 15 6 258.0 110.0 3632 18.0 amc matador 20 6 225.0 105.0 3613 16.5 plymouth satellite sebring 15 8 302.0 140.0 4141 14.0 ford gran torino 15 8 350.0 150.0 4699 14.5 buick century luxus (sw) 15 8 318.0 150.0 4457 13.5 dodge coronet custom (sw) 15 8 302.0 140.0 4638 16.0 ford gran torino (sw) 15 8 304.0 150.0 4257 15.5 amc matador (sw) 30 4 98.00 83.00 2219 16.5 audi fox 25 4 79.00 67.00 1963 15.5 volkswagen dasher 25 4 97.00 78.00 2300 14.5 opel manta 30 4 76.00 52.00 1649 16.5 toyota corona 30 4 83.00 61.00 2003 19.0 datsun 710 30 4 90.00 75.00 2125 14.5 dodge colt 25 4 90.00 75.00 2108 15.5 fiat 128 25 4 116.0 75.00 2246 14.0 fiat 124 tc 25 4 120.0 97.00 2489 15.0 honda civic 25 4 108.0 93.00 2391 15.5 subaru 30 4 79.00 67.00 2000 16.0 fiat x1.9 20 6 225.0 95.00 3264 16.0 plymouth valiant custom 20 6 250.0 105.0 3459 16.0 chevrolet nova 15 6 250.0 72.00 3432 21.0 mercury monarch 
15 6 250.0 72.00 3158 19.5 ford maverick 15 8 400.0 170.0 4668 11.5 pontiac catalina 15 8 350.0 145.0 4440 14.0 chevrolet bel air 15 8 318.0 150.0 4498 14.5 plymouth grand fury 15 8 351.0 148.0 4657 13.5 ford ltd 15 6 231.0 110.0 3907 21.0 buick century 15 6 250.0 105.0 3897 18.5 chevroelt chevelle malibu 15 6 258.0 110.0 3730 19.0 amc matador 20 6 225.0 95.00 3785 19.0 plymouth fury 20 6 231.0 110.0 3039 15.0 buick skyhawk 20 8 262.0 110.0 3221 13.5 chevrolet monza 2+2 15 8 302.0 129.0 3169 12.0 ford mustang ii 30 4 97.00 75.00 2171 16.0 toyota corolla 25 4 140.0 83.00 2639 17.0 ford pinto 20 6 232.0 100.0 2914 16.0 amc gremlin 25 4 140.0 78.00 2592 18.5 pontiac astro 25 4 134.0 96.00 2702 13.5 toyota corona 25 4 90.00 71.00 2223 16.5 volkswagen dasher 25 4 119.0 97.00 2545 17.0 datsun 710 20 6 171.0 97.00 2984 14.5 ford pinto 30 4 90.00 70.00 1937 14.0 volkswagen rabbit 20 6 232.0 90.00 3211 17.0 amc pacer 25 4 115.0 95.00 2694 15.0 audi 100ls 25 4 120.0 88.00 2957 17.0 peugeot 504 20 4 121.0 98.00 2945 14.5 volvo 244dl 25 4 121.0 115.0 2671 13.5 saab 99le 35 4 91.00 53.00 1795 17.5 honda civic cvcc 30 4 107.0 86.00 2464 15.5 fiat 131 25 4 116.0 81.00 2220 16.9 opel 1900 25 4 140.0 92.00 2572 14.9 capri ii 25 4 98.00 79.00 2255 17.7 dodge colt 25 4 101.0 83.00 2202 15.3 renault 12tl 20 8 305.0 140.0 4215 13.0 chevrolet chevelle malibu classic 15 8 318.0 150.0 4190 13.0 dodge coronet brougham 15 8 304.0 120.0 3962 13.9 amc matador 15 8 351.0 152.0 4215 12.8 ford gran torino 20 6 225.0 100.0 3233 15.4 plymouth valiant 20 6 250.0 105.0 3353 14.5 chevrolet nova 25 6 200.0 81.00 3012 17.6 ford maverick 25 6 232.0 90.00 3085 17.6 amc hornet 30 4 85.00 52.00 2035 22.2 chevrolet chevette 25 4 98.00 60.00 2164 22.1 chevrolet woody 30 4 90.00 70.00 1937 14.2 vw rabbit 35 4 91.00 53.00 1795 17.4 honda civic 20 6 225.0 100.0 3651 17.7 dodge aspen se 20 6 250.0 78.00 3574 21.0 ford granada ghia 20 6 250.0 110.0 3645 16.2 pontiac ventura sj 20 6 258.0 95.00 3193 17.8 amc pacer 
d/l 30 4 97.00 71.00 1825 12.2 volkswagen rabbit 30 4 85.00 70.00 1990 17.0 datsun b-210 30 4 97.00 75.00 2155 16.4 toyota corolla 25 4 140.0 72.00 2565 13.6 ford pinto 20 4 130.0 102.0 3150 15.7 volvo 245 15 8 318.0 150.0 3940 13.2 plymouth volare premier v8 20 4 120.0 88.00 3270 21.9 peugeot 504 20 6 156.0 108.0 2930 15.5 toyota mark ii 15 6 168.0 120.0 3820 16.7 mercedes-benz 280s 15 8 350.0 180.0 4380 12.1 cadillac seville 15 8 350.0 145.0 4055 12.0 chevy c10 15 8 302.0 130.0 3870 15.0 ford f108 15 8 318.0 150.0 3755 14.0 dodge d100 30 4 98.00 68.00 2045 18.5 honda accord cvcc 30 4 111.0 80.00 2155 14.8 buick opel isuzu deluxe 35 4 79.00 58.00 1825 18.6 renault 5 gtl 25 4 122.0 96.00 2300 15.5 plymouth arrow gs 35 4 85.00 70.00 1945 16.8 datsun f-10 hatchback 20 8 305.0 145.0 3880 12.5 chevrolet caprice classic 15 8 260.0 110.0 4060 19.0 oldsmobile cutlass supreme 15 8 318.0 145.0 4140 13.7 dodge monaco brougham 15 8 302.0 130.0 4295 14.9 mercury cougar brougham 20 6 250.0 110.0 3520 16.4 chevrolet concours 20 6 231.0 105.0 3425 16.9 buick skylark 20 6 225.0 100.0 3630 17.7 plymouth volare custom 20 6 250.0 98.00 3525 19.0 ford granada 15 8 400.0 180.0 4220 11.1 pontiac grand prix lj 15 8 350.0 170.0 4165 11.4 chevrolet monte carlo landau 15 8 400.0 190.0 4325 12.2 chrysler cordoba 15 8 351.0 149.0 4335 14.5 ford thunderbird 30 4 97.00 78.00 1940 14.5 volkswagen rabbit custom 25 4 151.0 88.00 2740 16.0 pontiac sunbird coupe 25 4 97.00 75.00 2265 18.2 toyota corolla liftback 25 4 140.0 89.00 2755 15.8 ford mustang ii 2+2 30 4 98.00 63.00 2051 17.0 chevrolet chevette 35 4 98.00 83.00 2075 15.9 dodge colt m/m 30 4 97.00 67.00 1985 16.4 subaru dl 30 4 97.00 78.00 2190 14.1 volkswagen dasher 20 6 146.0 97.00 2815 14.5 datsun 810 20 4 121.0 110.0 2600 12.8 bmw 320i 20 3 80.00 110.0 2720 13.5 mazda rx-4 45 4 90.00 48.00 1985 21.5 volkswagen rabbit custom diesel 35 4 98.00 66.00 1800 14.4 ford fiesta 35 4 78.00 52.00 1985 19.4 mazda glc deluxe 40 4 85.00 70.00 2070 
18.6 datsun b210 gx 35 4 91.00 60.00 1800 16.4 honda civic cvcc 20 8 260.0 110.0 3365 15.5 oldsmobile cutlass salon brougham 20 8 318.0 140.0 3735 13.2 dodge diplomat 20 8 302.0 139.0 3570 12.8 mercury monarch ghia 20 6 231.0 105.0 3535 19.2 pontiac phoenix lj 20 6 200.0 95.00 3155 18.2 chevrolet malibu 20 6 200.0 85.00 2965 15.8 ford fairmont (auto) 25 4 140.0 88.00 2720 15.4 ford fairmont (man) 20 6 225.0 100.0 3430 17.2 plymouth volare 20 6 232.0 90.00 3210 17.2 amc concord 20 6 231.0 105.0 3380 15.8 buick century special 20 6 200.0 85.00 3070 16.7 mercury zephyr 20 6 225.0 110.0 3620 18.7 dodge aspen 20 6 258.0 120.0 3410 15.1 amc concord d/l 20 8 305.0 145.0 3425 13.2 chevrolet monte carlo landau 20 6 231.0 165.0 3445 13.4 buick regal sport coupe (turbo) 20 8 302.0 139.0 3205 11.2 ford futura 20 8 318.0 140.0 4080 13.7 dodge magnum xe 30 4 98.00 68.00 2155 16.5 chevrolet chevette 30 4 134.0 95.00 2560 14.2 toyota corona 25 4 119.0 97.00 2300 14.7 datsun 510 30 4 105.0 75.00 2230 14.5 dodge omni 20 4 134.0 95.00 2515 14.8 toyota celica gt liftback 25 4 156.0 105.0 2745 16.7 plymouth sapporo 25 4 151.0 85.00 2855 17.6 oldsmobile starfire sx 25 4 119.0 97.00 2405 14.9 datsun 200-sx 20 5 131.0 103.0 2830 15.9 audi 5000 15 6 163.0 125.0 3140 13.6 volvo 264gl 20 4 121.0 115.0 2795 15.7 saab 99gle 15 6 163.0 133.0 3410 15.8 peugeot 604sl 30 4 89.00 71.00 1990 14.9 volkswagen scirocco 30 4 98.00 68.00 2135 16.6 honda accord lx 20 6 231.0 115.0 3245 15.4 pontiac lemans v6 20 6 200.0 85.00 2990 18.2 mercury zephyr 6 20 4 140.0 88.00 2890 17.3 ford fairmont 4 20 6 232.0 90.00 3265 18.2 amc concord dl 6 20 6 225.0 110.0 3360 16.6 dodge aspen 6 15 8 305.0 130.0 3840 15.4 chevrolet caprice classic 20 8 302.0 129.0 3725 13.4 ford ltd landau 15 8 351.0 138.0 3955 13.2 mercury grand marquis 20 8 318.0 135.0 3830 15.2 dodge st. 
regis 15 8 350.0 155.0 4360 14.9 buick estate wagon (sw) 15 8 351.0 142.0 4054 14.3 ford country squire (sw) 20 8 267.0 125.0 3605 15.0 chevrolet malibu classic (sw) 20 8 360.0 150.0 3940 13.0 chrysler lebaron town @ country (sw) 30 4 89.00 71.00 1925 14.0 vw rabbit custom 35 4 86.00 65.00 1975 15.2 maxda glc deluxe 35 4 98.00 80.00 1915 14.4 dodge colt hatchback custom 25 4 121.0 80.00 2670 15.0 amc spirit dl 25 5 183.0 77.00 3530 20.1 mercedes benz 300d 25 8 350.0 125.0 3900 17.4 cadillac eldorado 25 4 141.0 71.00 3190 24.8 peugeot 504 25 8 260.0 90.00 3420 22.2 oldsmobile cutlass salon brougham 35 4 105.0 70.00 2200 13.2 plymouth horizon 35 4 105.0 70.00 2150 14.9 plymouth horizon tc3 30 4 85.00 65.00 2020 19.2 datsun 210 35 4 91.00 69.00 2130 14.7 fiat strada custom 30 4 151.0 90.00 2670 16.0 buick skylark limited 30 6 173.0 115.0 2595 11.3 chevrolet citation 25 6 173.0 115.0 2700 12.9 oldsmobile omega brougham 35 4 151.0 90.00 2556 13.2 pontiac phoenix 40 4 98.00 76.00 2144 14.7 vw rabbit 40 4 89.00 60.00 1968 18.8 toyota corolla tercel 30 4 98.00 70.00 2120 15.5 chevrolet chevette 35 4 86.00 65.00 2019 16.4 datsun 310 30 4 151.0 90.00 2678 16.5 chevrolet citation 25 4 140.0 88.00 2870 18.1 ford fairmont 25 4 151.0 90.00 3003 20.1 amc concord 20 6 225.0 90.00 3381 18.7 dodge aspen 35 4 97.00 78.00 2188 15.8 audi 4000 30 4 134.0 90.00 2711 15.5 toyota corona liftback 30 4 120.0 75.00 2542 17.5 mazda 626 35 4 119.0 92.00 2434 15.0 datsun 510 hatchback 30 4 108.0 75.00 2265 15.2 toyota corolla 45 4 86.00 65.00 2110 17.9 mazda glc 30 4 156.0 105.0 2800 14.4 dodge colt 40 4 85.00 65.00 2110 19.2 datsun 210 45 4 90.00 48.00 2085 21.7 vw rabbit c (diesel) 45 4 90.00 48.00 2335 23.7 vw dasher (diesel) 35 5 121.0 67.00 2950 19.9 audi 5000s (diesel) 30 4 146.0 67.00 3250 21.8 mercedes-benz 240d 45 4 91.00 67.00 1850 13.8 honda civic 1500 gl 35 4 97.00 67.00 2145 18.0 subaru dl 30 4 89.00 62.00 1845 15.3 vokswagen rabbit 35 6 168.0 132.0 2910 11.4 datsun 280-zx 25 3 
70.00 100.0 2420 12.5 mazda rx-7 gs 35 4 122.0 88.00 2500 15.1 triumph tr7 coupe 30 4 107.0 72.00 2290 17.0 honda accord 25 4 135.0 84.00 2490 15.7 plymouth reliant 25 4 151.0 84.00 2635 16.4 buick skylark 25 4 156.0 92.00 2620 14.4 dodge aries wagon (sw) 25 6 173.0 110.0 2725 12.6 chevrolet citation 30 4 135.0 84.00 2385 12.9 plymouth reliant 40 4 79.00 58.00 1755 16.9 toyota starlet 40 4 86.00 64.00 1875 16.4 plymouth champ 35 4 81.00 60.00 1760 16.1 honda civic 1300 30 4 97.00 67.00 2065 17.8 subaru 35 4 85.00 65.00 1975 19.4 datsun 210 mpg 40 4 89.00 62.00 2050 17.3 toyota tercel 35 4 91.00 68.00 1985 16.0 mazda glc 4 35 4 105.0 63.00 2215 14.9 plymouth horizon 4 35 4 98.00 65.00 2045 16.2 ford escort 4w 30 4 98.00 65.00 2380 20.7 ford escort 2h 35 4 105.0 74.00 2190 14.2 volkswagen jetta 35 4 107.0 75.00 2210 14.4 honda prelude 30 4 108.0 75.00 2350 16.8 toyota corolla 35 4 119.0 100.0 2615 14.8 datsun 200sx 30 4 120.0 74.00 2635 18.3 mazda 626 30 4 141.0 80.00 3230 20.4 peugeot 505s turbo diesel 30 6 145.0 76.00 3160 19.6 volvo diesel 25 6 168.0 116.0 2900 12.6 toyota cressida 25 6 146.0 120.0 2930 13.8 datsun 810 maxima 20 6 231.0 110.0 3415 15.8 buick century 25 8 350.0 105.0 3725 19.0 oldsmobile cutlass ls 20 6 200.0 88.00 3060 17.1 ford granada gl 20 6 225.0 85.00 3465 16.6 chrysler lebaron salon 30 4 112.0 88.00 2605 19.6 chevrolet cavalier 25 4 112.0 88.00 2640 18.6 chevrolet cavalier wagon 35 4 112.0 88.00 2395 18.0 chevrolet cavalier 2-door 30 4 112.0 85.00 2575 16.2 pontiac j2000 se hatchback 30 4 135.0 84.00 2525 16.0 dodge aries se 25 4 151.0 90.00 2735 18.0 pontiac phoenix 25 4 140.0 92.00 2865 16.4 ford fairmont futura 35 4 105.0 74.00 1980 15.3 volkswagen rabbit l 35 4 91.00 68.00 2025 18.2 mazda glc custom l 30 4 91.00 68.00 1970 17.6 mazda glc custom 40 4 105.0 63.00 2125 14.7 plymouth horizon miser 35 4 98.00 70.00 2125 17.3 mercury lynx l 35 4 120.0 88.00 2160 14.5 nissan stanza xe 35 4 107.0 75.00 2205 14.5 honda accord 35 4 108.0 70.00 
2245 16.9 toyota corolla 40 4 91.00 67.00 1965 15.0 honda civic 30 4 91.00 67.00 1965 15.7 honda civic (auto) 40 4 91.00 67.00 1995 16.2 datsun 310 gx 25 6 181.0 110.0 2945 16.4 buick century limited 40 6 262.0 85.00 3015 17.0 oldsmobile cutlass ciera (diesel) 25 4 156.0 92.00 2585 14.5 chrysler lebaron medallion 20 6 232.0 112.0 2835 14.7 ford granada l 30 4 144.0 96.00 2665 13.9 toyota celica gt 35 4 135.0 84.00 2370 13.0 dodge charger 2.2 25 4 151.0 90.00 2950 17.3 chevrolet camaro 25 4 140.0 86.00 2790 15.6 ford mustang gl 45 4 97.00 52.00 2130 24.6 vw pickup 30 4 135.0 84.00 2295 11.6 dodge rampage 30 4 120.0 79.00 2625 18.6 ford ranger 30 4 119.0 82.00 2720 19.4 chevy s-10 ================================================ FILE: chapter-4/nearestNeighborClassifier.py ================================================ # # Nearest Neighbor Classifier # # # Code file for the book Programmer's Guide to Data Mining # http://guidetodatamining.com # # Ron Zacharski # ## I am trying to make the classifier more general purpose ## by reading the data from a file. ## Each line of the file contains tab separated fields. ## The first line of the file describes how those fields (columns) should ## be interpreted. The descriptors in the fields of the first line are: ## ## comment - this field should be interpreted as a comment ## class - this field describes the class of the field ## num - this field describes an integer attribute that should ## be included in the computation. ## ## more to be described as needed ## ## ## So, for example, if our file describes athletes and is of the form: ## Shavonte Zellous basketball 70 155 ## The first line might be: ## comment class num num ## ## Meaning the first column (name of the player) should be considered a comment; ## the next column represents the class of the entry (the sport); ## and the next 2 represent attributes to use in the calculations. ## ## The classifer reads this file into the list called data. 
## The format of each entry in that list is a tuple ## ## (class, normalized attribute-list, comment-list) ## ## so, for example ## ## [('basketball', [1.28, 1.71], ['Brittainey Raven']), ## ('basketball', [0.89, 1.47], ['Shavonte Zellous']), ## ('gymnastics', [-1.68, -0.75], ['Shawn Johnson']), ## ('gymnastics', [-2.27, -1.2], ['Ksenia Semenova']), ## ('track', [0.09, -0.06], ['Blake Russell'])] ## class Classifier: def __init__(self, filename): self.medianAndDeviation = [] # reading the data in from the file f = open(filename) lines = f.readlines() f.close() self.format = lines[0].strip().split('\t') self.data = [] for line in lines[1:]: fields = line.strip().split('\t') ignore = [] vector = [] for i in range(len(fields)): if self.format[i] == 'num': vector.append(float(fields[i])) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': classification = fields[i] self.data.append((classification, vector, ignore)) self.rawData = list(self.data) # get length of instance vector self.vlen = len(self.data[0][1]) # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) ################################################## ### ### CODE TO COMPUTE THE MODIFIED STANDARD SCORE def getMedian(self, alist): """return median of alist""" if alist == []: return [] blist = sorted(alist) length = len(alist) if length % 2 == 1: # length of list is odd so return middle element return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] v2 =blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" sum = 0 for item in alist: sum += abs(item - median) return sum / len(alist) def normalizeColumn(self, columnNumber): """given a column number, normalize that column in self.data""" # first extract values to list col = [v[1][columnNumber] for v in self.data] median = 
self.getMedian(col) asd = self.getAbsoluteStandardDeviation(col, median) #print("Median: %f ASD = %f" % (median, asd)) self.medianAndDeviation.append((median, asd)) for v in self.data: v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v): """We have stored the median and asd for each column. We now use them to normalize vector v""" vector = list(v) for i in range(len(vector)): (median, asd) = self.medianAndDeviation[i] vector[i] = (vector[i] - median) / asd return vector ### ### END NORMALIZATION ################################################## def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" return min([ (self.manhattan(itemVector, item[1]), item) for item in self.data]) def classify(self, itemVector): """Return class we think item Vector is in""" return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) def unitTest(): classifier = Classifier('athletesTrainingSet.txt') br = ('Basketball', [72, 162], ['Brittainey Raven']) nl = ('Gymnastics', [61, 76], ['Viktoria Komova']) cl = ("Basketball", [74, 190], ['Crystal Langhorne']) # first check normalize function brNorm = classifier.normalizeVector(br[1]) nlNorm = classifier.normalizeVector(nl[1]) clNorm = classifier.normalizeVector(cl[1]) assert(brNorm == classifier.data[1][1]) assert(nlNorm == classifier.data[-1][1]) print('normalizeVector fn OK') # check distance assert (round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == 1.16823) assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0) assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0) print('Manhattan distance fn OK') # Brittainey Raven's nearest neighbor should be herself result = classifier.nearestNeighbor(brNorm) assert(result[1][2]== br[2]) # Nastia Liukin's nearest neighbor should be herself result = 
classifier.nearestNeighbor(nlNorm) assert(result[1][2]== nl[2]) # Crystal Langhorne's nearest neighbor is Jennifer Lacy" assert(classifier.nearestNeighbor(clNorm)[1][2][0] == "Jennifer Lacy") print("Nearest Neighbor fn OK") # Check if classify correctly identifies sports assert(classifier.classify(br[1]) == 'Basketball') assert(classifier.classify(cl[1]) == 'Basketball') assert(classifier.classify(nl[1]) == 'Gymnastics') print('Classify fn OK') def test(training_filename, test_filename): """Test the classifier on a test set of data""" classifier = Classifier(training_filename) f = open(test_filename) lines = f.readlines() f.close() numCorrect = 0.0 for line in lines: data = line.strip().split('\t') vector = [] classInColumn = -1 for i in range(len(classifier.format)): if classifier.format[i] == 'num': vector.append(float(data[i])) elif classifier.format[i] == 'class': classInColumn = i theClass= classifier.classify(vector) prefix = '-' if theClass == data[classInColumn]: # it is correct numCorrect += 1 prefix = '+' print("%s %12s %s" % (prefix, theClass, line)) print("%4.2f%% correct" % (numCorrect * 100/ len(lines))) ## ## Here are examples of how the classifier is used on different data sets ## in the book. 
class Classifier:
    """Template classifier for the normalizeColumn exercise.

    The training file is tab separated.  Its first line names the role
    of every column ('num', 'comment' or 'class'); every following line
    is one instance.  normalizeColumn is intentionally left unfinished.
    """

    def __init__(self, filename):
        """Read the training data in filename and normalize each column."""
        self.medianAndDeviation = []
        # reading the data in from the file; the context manager closes
        # the handle even when reading raises
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                # a blank line is not an instance; without this check it
                # would be stored with the previous row's class
                continue
            fields = stripped.split('\t')
            ignore = []
            vector = []
            for i, field in enumerate(fields):
                columnType = self.format[i]
                if columnType == 'num':
                    vector.append(int(field))
                elif columnType == 'comment':
                    ignore.append(field)
                elif columnType == 'class':
                    classification = field
            self.data.append((classification, vector, ignore))
        self.rawData = list(self.data)
        # get length of instance vector
        self.vlen = len(self.data[0][1])
        # now normalize the data
        for i in range(self.vlen):
            self.normalizeColumn(i)

    def getMedian(self, alist):
        """return median of alist (an empty list gives back [])"""
        if alist == []:
            return []
        blist = sorted(alist)
        middle = len(blist) // 2
        if len(blist) % 2 == 1:
            # length of list is odd so return middle element
            return blist[middle]
        # length of list is even so compute midpoint
        return (blist[middle] + blist[middle - 1]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """given alist and median return absolute standard deviation

        This is the mean of the absolute distances from median.  An
        empty alist raises ZeroDivisionError.
        """
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)

    ##################################################
    ###
    ### FINISH WRITING THIS METHOD

    def normalizeColumn(self, columnNumber):
        """given a column number, normalize that column in self.data
        using the Modified Standard Score"""
        # TO BE DONE

    ###
    ###
    ##################################################
class Classifier:
    """Template classifier for the getMedian / getAbsoluteStandardDeviation
    exercise.

    The training file is tab separated.  Its first line names the role
    of every column ('num', 'comment' or 'class'); every following line
    is one instance.  The two statistics methods are intentionally left
    as stubs that return 0.
    """

    def __init__(self, filename):
        """Read the training data stored in filename."""
        self.medianAndDeviation = []
        # reading the data in from the file; the context manager closes
        # the handle even when reading raises
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                # a blank line is not an instance; without this check it
                # would be stored with the previous row's class
                continue
            fields = stripped.split('\t')
            ignore = []
            vector = []
            for i, field in enumerate(fields):
                columnType = self.format[i]
                if columnType == 'num':
                    vector.append(int(field))
                elif columnType == 'comment':
                    ignore.append(field)
                elif columnType == 'class':
                    classification = field
            self.data.append((classification, vector, ignore))
        self.rawData = list(self.data)

    ##################################################
    ###
    ### FINISH THE FOLLOWING TWO METHODS

    def getMedian(self, alist):
        """return median of alist"""
        # TO BE DONE
        return 0

    def getAbsoluteStandardDeviation(self, alist, median):
        """given alist and median return absolute standard deviation"""
        # TO BE DONE
        return 0

    ###
    ###
    ##################################################
def getMedian(self, alist):
    """return median of alist

    The values are sorted first.  With an odd number of values the
    middle one is returned unchanged; with an even number the result is
    the float halfway between the two middle values.  An empty list is
    handed back as [].
    """
    if alist == []:
        return []
    ordered = sorted(alist)
    middle, isOdd = divmod(len(ordered), 2)
    if isOdd:
        return ordered[middle]
    upper = ordered[middle]
    lower = ordered[middle - 1]
    return (upper + lower) / 2.0
def normalizeColumn(self, columnNumber):
    """given a column number, normalize that column in self.data

    Each value v of the column is replaced in place by
    (v - median) / asd, where median and asd come from getMedian and
    getAbsoluteStandardDeviation.  The (median, asd) pair is appended
    to self.medianAndDeviation so that normalizeVector can apply the
    same scaling to new vectors later.
    """
    # first extract values to list
    col = [v[1][columnNumber] for v in self.data]
    median = self.getMedian(col)
    asd = self.getAbsoluteStandardDeviation(col, median)
    if asd == 0:
        # every value equals the median (constant column); dividing by
        # 0 would raise here and again in normalizeVector, so use a
        # neutral scale of 1 -- the column then normalizes to 0
        asd = 1.0
    self.medianAndDeviation.append((median, asd))
    for v in self.data:
        v[1][columnNumber] = (v[1][columnNumber] - median) / asd
def buckets(filename, bucketName, separator, classColumn):
    """the original data is in the file named filename
    bucketName is the prefix for all the bucket names
    separator is the character that divides the columns
    (for ex., a tab or comma) and classColumn is the column
    that indicates the class

    The instances of every class are shuffled and dealt out round-robin
    over ten buckets, so each bucket gets a similar share of every
    class.  Bucket i is written to the file "<bucketName>-<i>" with i
    zero padded to two digits (01 .. 10); columns in the output are
    tab separated.
    """
    # put the data in 10 buckets
    numberOfBuckets = 10
    data = {}
    # first read in the data and divide by category
    with open(filename) as f:
        for line in f:
            if not line.strip():
                # blank lines carry no instance and have no class column
                continue
            if not line.endswith('\n'):
                # a last line without a newline would otherwise be glued
                # to the record written after it in its bucket
                line += '\n'
            if separator != '\t':
                line = line.replace(separator, '\t')
            # first get the category
            category = line.split()[classColumn]
            data.setdefault(category, []).append(line)
    # initialize the buckets
    bucketList = [[] for _ in range(numberOfBuckets)]
    # now for each category put the data into the buckets
    for k in data.keys():
        # randomize order of instances for each class
        random.shuffle(data[k])
        bNum = 0
        # divide into buckets
        for item in data[k]:
            bucketList[bNum].append(item)
            bNum = (bNum + 1) % numberOfBuckets
    # write to file
    for bNum in range(numberOfBuckets):
        with open("%s-%02i" % (bucketName, bNum + 1), 'w') as f:
            f.writelines(bucketList[bNum])
def getAbsoluteStandardDeviation(self, alist, median):
    """given alist and median return absolute standard deviation

    The result is the average absolute distance between the values of
    alist and median.
    """
    count = len(alist)
    accumulated = 0
    for deviation in (abs(value - median) for value in alist):
        accumulated = accumulated + deviation
    return accumulated / count
We now use them to normalize vector v""" vector = list(v) for i in range(len(vector)): (median, asd) = self.medianAndDeviation[i] vector[i] = (vector[i] - median) / asd return vector ### ### END NORMALIZATION ################################################## def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() totals = {} f.close() for line in lines: data = line.strip().split('\t') vector = [] classInColumn = -1 for i in range(len(self.format)): if self.format[i] == 'num': vector.append(float(data[i])) elif self.format[i] == 'class': classInColumn = i theRealClass = data[classInColumn] #print("REAL ", theRealClass) classifiedAs = self.classify(vector) totals.setdefault(theRealClass, {}) totals[theRealClass].setdefault(classifiedAs, 0) totals[theRealClass][classifiedAs] += 1 return totals def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" return min([ (self.manhattan(itemVector, item[1]), item) for item in self.data]) def knn(self, itemVector): """returns the predicted class of itemVector using k Nearest Neighbors""" # changed from min to heapq.nsmallest to get the # k closest neighbors neighbors = heapq.nsmallest(self.k, [(self.manhattan(itemVector, item[1]), item) for item in self.data]) # each neighbor gets a vote results = {} for neighbor in neighbors: theClass = neighbor[1][0] results.setdefault(theClass, 0) results[theClass] += 1 resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True) #get all the classes that have the maximum votes maxVotes = resultList[0][0] possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes] # randomly select one of the classes that received the max votes answer = 
def tenfold(bucketPrefix, dataFormat, k):
    """Run ten-fold cross validation and print a confusion matrix.

    For each bucket number 1..10 a Classifier is built with that bucket
    held out (using k nearest neighbors) and then evaluated on the held
    out bucket.  The per-bucket tallies are merged and printed as a
    table whose rows are the real classes and whose columns are the
    predicted classes, followed by the percentage classified correctly
    and the number of instances counted.
    """
    confusion = {}
    for heldOut in range(1, 11):
        classifier = Classifier(bucketPrefix, heldOut, dataFormat, k)
        bucketTotals = classifier.testBucket(bucketPrefix, heldOut)
        for realClass, predictions in bucketTotals.items():
            merged = confusion.setdefault(realClass, {})
            for predicted, amount in predictions.items():
                merged[predicted] = merged.get(predicted, 0) + amount
    # now print results
    categories = sorted(confusion.keys())
    print( "\n Classified as: ")
    header = " " + "".join("% 2s " % name for name in categories)
    subheader = " +" + "-----+" * len(categories)
    print (header)
    print (subheader)
    total = 0.0
    correct = 0.0
    for actual in categories:
        cells = []
        for predicted in categories:
            count = confusion[actual].get(predicted, 0)
            cells.append(" %3i |" % count)
            total += count
            if predicted == actual:
                correct += count
        print(" %s |" % actual + "".join(cells))
    print(subheader)
    print("\n%5.3f percent correct" %((correct * 100) / total))
    print("total of %i instances" % total)
For example, for the iHealth data the format is: "attr attr attr attr class" """ total = 0 classes = {} counts = {} # reading the data in from the file self.format = dataFormat.strip().split('\t') self.prior = {} self.conditional = {} # for each of the buckets numbered 1 through 10: for i in range(1, 11): # if it is not the bucket we should ignore, read in the data if i != testBucketNumber: filename = "%s-%02i" % (bucketPrefix, i) f = open(filename) lines = f.readlines() f.close() for line in lines: fields = line.strip().split('\t') ignore = [] vector = [] for i in range(len(fields)): if self.format[i] == 'num': vector.append(float(fields[i])) elif self.format[i] == 'attr': vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': category = fields[i] # now process this instance total += 1 classes.setdefault(category, 0) counts.setdefault(category, {}) classes[category] += 1 # now process each attribute of the instance col = 0 for columnValue in vector: col += 1 counts[category].setdefault(col, {}) counts[category][col].setdefault(columnValue, 0) counts[category][col][columnValue] += 1 # # ok done counting. 
def classify(self, itemVector):
    """Return class we think item Vector is in

    For every category the prior is multiplied by the conditional
    probability of each attribute value (columns are numbered from 1).
    A value never seen together with the category makes the running
    product 0.  The category with the largest product is returned.
    """
    scored = []
    for category, prior in self.prior.items():
        likelihood = prior
        for col, attrValue in enumerate(itemVector, 1):
            seenValues = self.conditional[category][col]
            if attrValue in seenValues:
                likelihood = likelihood * seenValues[attrValue]
            else:
                # this attribute value never occurred with this
                # category in the training data
                likelihood = 0
        scored.append((likelihood, category))
    # return the category with the highest probability
    best = max(scored)
    return best[1]
def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
    """ a classifier will be built from files with the bucketPrefix
    excluding the file with textBucketNumber. dataFormat is a string that
    describes how to interpret each line of the data files. For example,
    for the iHealth data the format is:
    "attr attr attr attr class"

    After construction the instance holds:
      self.prior        {category: fraction of training instances}
      self.conditional  {category: {attr column: {value: fraction}}}
      self.means        {category: {numeric column: mean}}
      self.ssd          {category: {numeric column: sample std. dev.}}
    Attribute and numeric columns are each numbered from 1.

    NOTE: a category with exactly one training instance and a numeric
    column raises ZeroDivisionError, because the sample standard
    deviation divides by (count - 1).
    """
    total = 0
    classes = {}
    # counts used for attributes that are not numeric
    counts = {}
    # totals used for attributes that are numeric
    # we will use these to compute the mean and sample standard
    # deviation for each attribute - class pair.
    totals = {}
    numericValues = {}

    # reading the data in from the file
    self.format = dataFormat.strip().split('\t')
    self.prior = {}
    self.conditional = {}

    # for each of the buckets numbered 1 through 10:
    for bucketNumber in range(1, 11):
        # if it is the bucket we should ignore, skip it
        if bucketNumber == testBucketNumber:
            continue
        filename = "%s-%02i" % (bucketPrefix, bucketNumber)
        # the context manager closes the file even if reading fails
        with open(filename) as f:
            lines = f.readlines()
        for line in lines:
            stripped = line.strip()
            if not stripped:
                # blank lines are not instances
                continue
            fields = stripped.split('\t')
            vector = []
            nums = []
            for i, field in enumerate(fields):
                columnType = self.format[i]
                if columnType == 'num':
                    nums.append(float(field))
                elif columnType == 'attr':
                    vector.append(field)
                elif columnType == 'class':
                    category = field
                # 'comment' columns are ignored
            # now process this instance
            total += 1
            classes.setdefault(category, 0)
            counts.setdefault(category, {})
            totals.setdefault(category, {})
            numericValues.setdefault(category, {})
            classes[category] += 1
            # now process each non-numeric attribute of the instance
            for col, columnValue in enumerate(vector, 1):
                counts[category].setdefault(col, {})
                counts[category][col].setdefault(columnValue, 0)
                counts[category][col][columnValue] += 1
            # process numeric attributes
            for col, columnValue in enumerate(nums, 1):
                totals[category].setdefault(col, 0)
                totals[category][col] += columnValue
                numericValues[category].setdefault(col, [])
                numericValues[category][col].append(columnValue)

    #
    # ok done counting. now compute probabilities
    #
    # first prior probabilities p(h)
    #
    for (category, count) in classes.items():
        self.prior[category] = count / total
    #
    # now compute conditional probabilities p(h|D)
    #
    for (category, columns) in counts.items():
        self.conditional.setdefault(category, {})
        for (col, valueCounts) in columns.items():
            self.conditional[category].setdefault(col, {})
            for (attrValue, count) in valueCounts.items():
                self.conditional[category][col][attrValue] = (
                    count / classes[category])
    self.tmp = counts
    #
    # now compute mean and sample standard deviation
    #
    self.means = {}
    self.totals = totals
    for (category, columns) in totals.items():
        self.means.setdefault(category, {})
        for (col, cTotal) in columns.items():
            self.means[category][col] = cTotal / classes[category]
    # standard deviation
    self.ssd = {}
    for (category, columns) in numericValues.items():
        self.ssd.setdefault(category, {})
        for (col, values) in columns.items():
            SumOfSquareDifferences = 0
            theMean = self.means[category][col]
            for value in values:
                SumOfSquareDifferences += (value - theMean)**2
            self.ssd[category][col] = math.sqrt(
                SumOfSquareDifferences / (classes[category] - 1))
def pdf(mean, ssd, x):
    """Probability Density Function computing P(x|y)
    input is the mean, sample standard deviation for all the items in y,
    and x.

    Returns the value at x of the normal (Gaussian) density with the
    given mean and standard deviation.  ssd must be non-zero, otherwise
    ZeroDivisionError is raised.
    """
    # math.exp replaces math.pow(math.e, ...); the leftover debug print
    # of the exponential term has been removed
    ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart
def __init__(self, trainingdir, stopwordlist, ignoreBucket):
    """This class implements a naive Bayes approach to text
    classification
    trainingdir is the training data. Each subdirectory of
    trainingdir is titled with the name of the classification
    category -- those subdirectories in turn contain the text
    files for that category.
    The stopwordlist is a list of words (one per line) will be
    removed before any counting takes place.
    ignoreBucket is handed to self.train for every category.

    trainingdir is joined to names by plain string concatenation, so
    it has to end with a path separator.
    """
    self.vocabulary = {}
    self.prob = {}
    self.totals = {}
    self.stopwords = {}
    # the context manager closes the file even if reading fails
    with open(stopwordlist) as f:
        for line in f:
            self.stopwords[line.strip()] = 1
    categories = os.listdir(trainingdir)
    # filter out files that are not directories
    self.categories = [filename for filename in categories
                       if os.path.isdir(trainingdir + filename)]
    print("Counting ...")
    for category in self.categories:
        (self.prob[category],
         self.totals[category]) = self.train(trainingdir, category,
                                             ignoreBucket)
    # I am going to eliminate any word in the vocabulary
    # that doesn't occur at least 3 times.  The words are collected
    # first because a dict cannot change size while it is iterated.
    toDelete = [word for word in self.vocabulary
                if self.vocabulary[word] < 3]
    for word in toDelete:
        del self.vocabulary[word]
    # now compute probabilities
    vocabLength = len(self.vocabulary)
    for category in self.categories:
        denominator = self.totals[category] + vocabLength
        for word in self.vocabulary:
            if word in self.prob[category]:
                count = self.prob[category][word]
            else:
                # NOTE(review): a word unseen in this category is
                # counted as 1 (not 0) before the +1 below -- confirm
                # this is the intended smoothing
                count = 1
            self.prob[category][word] = (float(count + 1)
                                         / denominator)
class BayesText:
    """Naive Bayes text classifier trained on a bucketed corpus.

    The training directory contains one subdirectory per category.  Each
    category directory contains numbered bucket subdirectories ("0", "1",
    ...) holding the text files.  One bucket is left out of training so
    that it can be used for testing (ten-fold cross validation).

    Attributes:
        vocabulary: word -> occurrence count over all categories.  Words
            seen fewer than 3 times are removed once counting is done.
        prob: category -> {word: value}.  For every vocabulary word the
            value is the smoothed probability P(word | category).  Entries
            for words pruned from the vocabulary keep their raw counts and
            are never read by classify().
        totals: category -> number of tokens counted for that category.
        stopwords: dict whose keys are the words that are ignored.
        categories: sorted list of category (subdirectory) names.
    """

    # Characters stripped from both ends of every token.
    _PUNCTUATION = '\'".,?:-'
    # Words occurring fewer times than this are dropped from the vocabulary.
    _MIN_OCCURRENCES = 3

    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """Train the classifier.

        trainingdir is the corpus root (with or without a trailing
        separator), stopwordlist is a file with one stop word per line and
        ignoreBucket is the number of the bucket to leave out of training.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        with open(stopwordlist) as f:
            for line in f:
                self.stopwords[line.strip()] = 1
        # Only subdirectories are categories.  Sorting makes the category
        # order (and therefore tie-breaking in classify) deterministic
        # instead of depending on os.listdir order.
        self.categories = sorted(
            name for name in os.listdir(trainingdir)
            if os.path.isdir(os.path.join(trainingdir, name)))
        print("Counting ...")
        for category in self.categories:
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # Collect rare words first: a dict cannot be modified while it is
        # being iterated over.
        rare = [word for word, seen in self.vocabulary.items()
                if seen < self._MIN_OCCURRENCES]
        for word in rare:
            del self.vocabulary[word]
        # Add-one (Laplace) smoothing: (count + 1) / (total + |vocabulary|).
        # A word never seen in a category has count 0; the previous code
        # used 1 here, which inflated unseen words to 2 / denominator.
        vocabLength = len(self.vocabulary)
        for category in self.categories:
            denominator = self.totals[category] + vocabLength
            counts = self.prob[category]
            for word in self.vocabulary:
                counts[word] = float(counts.get(word, 0) + 1) / denominator

    def _tokenize(self, filename):
        """Yield the non-empty, lower-cased, punctuation-stripped tokens
        of the iso8859-1 encoded file filename."""
        with codecs.open(filename, 'r', 'iso8859-1') as f:
            for line in f:
                for token in line.split():
                    token = token.strip(self._PUNCTUATION).lower()
                    if token:
                        yield token

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """Count word occurrences for one category.

        Every bucket of the category except bucketNumberToIgnore is read.
        Stop words are skipped.  self.vocabulary is updated as a side
        effect.  Returns (counts, total) where counts maps word -> count
        within this category and total is the number of counted tokens.
        """
        ignore = "%i" % bucketNumberToIgnore
        currentdir = os.path.join(trainingdir, category)
        counts = {}
        total = 0
        for bucket in os.listdir(currentdir):
            bucketdir = os.path.join(currentdir, bucket)
            # Skip the held-out bucket and anything that is not a bucket
            # directory (e.g. a stray file next to the buckets).
            if bucket == ignore or not os.path.isdir(bucketdir):
                continue
            for name in os.listdir(bucketdir):
                for token in self._tokenize(os.path.join(bucketdir, name)):
                    if token in self.stopwords:
                        continue
                    self.vocabulary[token] = self.vocabulary.get(token, 0) + 1
                    counts[token] = counts.get(token, 0) + 1
                    total += 1
        return (counts, total)

    def classify(self, filename):
        """Return the most probable category for the text in filename.

        Log probabilities of all vocabulary words in the file are summed
        per category; words outside the vocabulary are ignored.  On a tie
        the first category in self.categories wins.  Raises ValueError
        when the classifier was trained without any category.
        """
        if not self.categories:
            raise ValueError("classifier has no categories")
        results = dict.fromkeys(self.categories, 0)
        for token in self._tokenize(filename):
            if token in self.vocabulary:
                for category in self.categories:
                    results[category] += math.log(self.prob[category][token])
        return max(self.categories, key=lambda category: results[category])

    def testCategory(self, direc, category, bucketNumber):
        """Classify every file in bucket bucketNumber of directory direc.

        category (the expected label) is accepted for interface
        compatibility but not used.  Returns a dict mapping predicted
        category -> number of files that received that prediction.
        """
        results = {}
        directory = os.path.join(direc, "%i" % bucketNumber)
        for name in os.listdir(directory):
            result = self.classify(os.path.join(directory, name))
            results[result] = results.get(result, 0) + 1
        return results

    def test(self, testdir, bucketNumber):
        """Test all files of bucket bucketNumber in the test directory.

        testdir is organized into one subdirectory per category.  Returns
        {actual category: {predicted category: count}}.
        """
        results = {}
        for category in os.listdir(testdir):
            categorydir = os.path.join(testdir, category)
            # entries that are not directories are not categories
            if os.path.isdir(categorydir):
                results[category] = self.testCategory(categorydir, category,
                                                      bucketNumber)
        return results
def tenfold(dataPrefix, stoplist):
    """Run ten-fold cross validation and print a confusion matrix.

    dataPrefix is the corpus root (one subdirectory per category, each
    with bucket subdirectories 0-9) and stoplist is the stop word file.
    For every bucket i a BayesText classifier is trained on the other
    buckets and tested on bucket i; the per-fold results are summed.
    Nothing is returned; the matrix, the accuracy and the instance count
    are printed.
    """
    results = {}
    for i in range(10):
        bT = BayesText(dataPrefix, stoplist, i)
        # Test on the corpus passed in as dataPrefix.  The previous code
        # read the module-level global theDir here, so the function only
        # worked when called from the script at the bottom of the file.
        r = bT.test(dataPrefix, i)
        for (key, value) in r.items():
            row = results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                row[ckey] = row.get(ckey, 0) + cvalue
    categories = sorted(results)
    # NOTE(review): the padding inside the literals below is reproduced as
    # it appears in the source; confirm the columns still line up.
    print( "\n Classified as: ")
    header = " "
    subheader = " +"
    for category in categories:
        header += "% 2s " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s |" % category
        for c2 in categories:
            count = results[category].get(c2, 0)
            row += " %3i |" % count
            total += count
            # the diagonal of the matrix holds the correct classifications
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    # avoid ZeroDivisionError when no file was classified at all
    percent = (correct * 100) / total if total else 0.0
    print("\n%5.3f percent correct" % percent)
    print("total of %i instances" % total)
""" self.vocabulary = {} self.prob = {} self.totals = {} self.stopwords = {} f = open(stopwordlist) for line in f: self.stopwords[line.strip()] = 1 f.close() categories = os.listdir(trainingdir) #filter out files that are not directories self.categories = [filename for filename in categories if os.path.isdir(trainingdir + filename)] print("Counting ...") for category in self.categories: print(' ' + category) (self.prob[category], self.totals[category]) = self.train(trainingdir, category) # I am going to eliminate any word in the vocabulary # that doesn't occur at least 3 times toDelete = [] for word in self.vocabulary: if self.vocabulary[word] < 3: # mark word for deletion # can't delete now because you can't delete # from a list you are currently iterating over toDelete.append(word) # now delete for word in toDelete: del self.vocabulary[word] # now compute probabilities vocabLength = len(self.vocabulary) print("Computing probabilities:") for category in self.categories: print(' ' + category) denominator = self.totals[category] + vocabLength for word in self.vocabulary: if word in self.prob[category]: count = self.prob[category][word] else: count = 1 self.prob[category][word] = (float(count + 1) / denominator) print ("DONE TRAINING\n\n") def train(self, trainingdir, category): """counts word occurrences for a particular category""" currentdir = trainingdir + category files = os.listdir(currentdir) counts = {} total = 0 for file in files: #print(currentdir + '/' + file) f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1') for line in f: tokens = line.split() for token in tokens: # get rid of punctuation and lowercase token token = token.strip('\'".,?:-') token = token.lower() if token != '' and not token in self.stopwords: self.vocabulary.setdefault(token, 0) self.vocabulary[token] += 1 counts.setdefault(token, 0) counts[token] += 1 total += 1 f.close() return(counts, total) def classify(self, filename): results = {} for category in self.categories: 
class BayesText:
    """Naive Bayes text classifier trained on a directory of categories.

    The training directory contains one subdirectory per category and
    each category directory contains the text files of that category.

    Attributes:
        vocabulary: word -> occurrence count over all categories.  Words
            seen fewer than 3 times are removed once counting is done.
        prob: category -> {word: value}.  For every vocabulary word the
            value is the smoothed probability P(word | category).  Entries
            for words pruned from the vocabulary keep their raw counts and
            are never read by classify().
        totals: category -> number of tokens counted for that category.
        stopwords: dict whose keys are the words that are ignored.
        categories: sorted list of category (subdirectory) names.
    """

    # Characters stripped from both ends of every token.
    _PUNCTUATION = '\'".,?:-'
    # Words occurring fewer times than this are dropped from the vocabulary.
    _MIN_OCCURRENCES = 3

    def __init__(self, trainingdir, stopwordlist):
        """Train the classifier.

        trainingdir is the training root (with or without a trailing
        separator) and stopwordlist is a file with one stop word per line.
        Progress messages are printed while training.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        with open(stopwordlist) as f:
            for line in f:
                self.stopwords[line.strip()] = 1
        # Only subdirectories are categories.  Sorting makes the category
        # order (and therefore tie-breaking in classify) deterministic
        # instead of depending on os.listdir order.
        self.categories = sorted(
            name for name in os.listdir(trainingdir)
            if os.path.isdir(os.path.join(trainingdir, name)))
        print("Counting ...")
        for category in self.categories:
            # NOTE(review): indentation inside this literal is reproduced
            # as it appears in the source.
            print(' ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # Collect rare words first: a dict cannot be modified while it is
        # being iterated over.
        rare = [word for word, seen in self.vocabulary.items()
                if seen < self._MIN_OCCURRENCES]
        for word in rare:
            del self.vocabulary[word]
        # Add-one (Laplace) smoothing: (count + 1) / (total + |vocabulary|).
        # A word never seen in a category has count 0; the previous code
        # used 1 here, which inflated unseen words to 2 / denominator.
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print(' ' + category)
            denominator = self.totals[category] + vocabLength
            counts = self.prob[category]
            for word in self.vocabulary:
                counts[word] = float(counts.get(word, 0) + 1) / denominator
        print ("DONE TRAINING\n\n")

    def _tokenize(self, filename):
        """Yield the non-empty, lower-cased, punctuation-stripped tokens
        of the iso8859-1 encoded file filename."""
        with codecs.open(filename, 'r', 'iso8859-1') as f:
            for line in f:
                for token in line.split():
                    token = token.strip(self._PUNCTUATION).lower()
                    if token:
                        yield token

    def train(self, trainingdir, category):
        """Count word occurrences for one category.

        Stop words are skipped.  self.vocabulary is updated as a side
        effect.  Returns (counts, total) where counts maps word -> count
        within this category and total is the number of counted tokens.
        """
        currentdir = os.path.join(trainingdir, category)
        counts = {}
        total = 0
        for name in os.listdir(currentdir):
            for token in self._tokenize(os.path.join(currentdir, name)):
                if token in self.stopwords:
                    continue
                self.vocabulary[token] = self.vocabulary.get(token, 0) + 1
                counts[token] = counts.get(token, 0) + 1
                total += 1
        return (counts, total)

    def classify(self, filename):
        """Return the most probable category for the text in filename.

        Log probabilities of all vocabulary words in the file are summed
        per category; words outside the vocabulary are ignored.  On a tie
        the first category in self.categories wins.  Raises ValueError
        when the classifier was trained without any category.
        """
        if not self.categories:
            raise ValueError("classifier has no categories")
        results = dict.fromkeys(self.categories, 0)
        for token in self._tokenize(filename):
            if token in self.vocabulary:
                for category in self.categories:
                    results[category] += math.log(self.prob[category][token])
        return max(self.categories, key=lambda category: results[category])

    def testCategory(self, directory, category):
        """Classify every file in directory and compare with category.

        Returns (correct, total): the number of files classified as
        category and the number of files examined.
        """
        total = 0
        correct = 0
        for name in os.listdir(directory):
            total += 1
            if self.classify(os.path.join(directory, name)) == category:
                correct += 1
        return (correct, total)

    def test(self, testdir):
        """Test all files in the test directory and print the accuracy.

        testdir is organized into one subdirectory per category.  One dot
        is printed per category as progress.  Nothing is returned.
        """
        correct = 0
        total = 0
        for category in os.listdir(testdir):
            categorydir = os.path.join(testdir, category)
            # entries that are not directories are not categories
            if not os.path.isdir(categorydir):
                continue
            print(".", end="")
            (catCorrect, catTotal) = self.testCategory(categorydir, category)
            correct += catCorrect
            total += catTotal
        # avoid ZeroDivisionError when the test directory holds no files
        accuracy = (float(correct) / total) * 100 if total else 0.0
        print("\n\nAccuracy is %f%% (%i test instances)" % (accuracy, total))
baseDirectory + "stopwords174.txt") print("Running Test ...") bT.test(testDir) ================================================ FILE: chapter-8/cereal.csv ================================================ Name,Calories,Protein,Fat (g),Sodium (mg),dietary fiber (g),carbohydrates (g),sugar,x, 100% Bran,70,4,1,130,10,5,6,280,25 100% Natural Bran,120,3,5,15,2,8,8,135,0 All-Bran,70,4,1,260,9,7,5,320,25 All-Bran with Extra Fiber,50,4,0,140,14,8,0,330,25 Almond Delight,110,2,2,200,1,14,8,-1,25 Apple Cinnamon Cheerios,110,2,2,180,1.5,10.5,10,70,25 Apple Jacks,110,2,0,125,1,11,14,30,25 Basic 4,130,3,2,210,2,18,8,100,25 Bran Chex,90,2,1,200,4,15,6,125,25 Bran Flakes,90,3,0,210,5,13,5,190,25 Cap'n'Crunch,120,1,2,220,0,12,12,35,25 Cheerios,110,6,2,290,2,17,1,105,25 Cinnamon Toast Crunch,120,1,3,210,0,13,9,45,25 Clusters,110,3,2,140,2,13,7,105,25 Cocoa Puffs,110,1,1,180,0,12,13,55,25 Corn Chex,110,2,0,280,0,22,3,25,25 Corn Flakes,100,2,0,290,1,21,2,35,25 Corn Pops,110,1,0,90,1,13,12,20,25 Count Chocula,110,1,1,180,0,12,13,65,25 Cracklin' Oat Bran,110,3,3,140,4,10,7,160,25 Cream of Wheat (Quick),100,3,0,80,1,21,0,-1,0 Crispix,110,2,0,220,1,21,3,30,25 Crispy Wheat & Raisins,100,2,1,140,2,11,10,120,25 Double Chex,100,2,0,190,1,18,5,80,25 Froot Loops,110,2,1,125,1,11,13,30,25 Frosted Flakes,110,1,0,200,1,14,11,25,25 Frosted Mini-Wheats,100,3,0,0,3,14,7,100,25 Fruit & Fibre,120,3,2,160,5,12,10,200,25 Fruitful Bran,120,3,0,240,5,14,12,190,25 Fruity Pebbles,110,1,1,135,0,13,12,25,25 Golden Crisp,100,2,0,45,0,11,15,40,25 Golden Grahams,110,1,1,280,0,15,9,45,25 Grape Nuts Flakes,100,3,1,140,3,15,5,85,25 Grape-Nuts,110,3,0,170,3,17,3,90,25 Great Grains Pecan,120,3,3,75,3,13,4,100,25 Honey Graham Ohs,120,1,2,220,1,12,11,45,25 Honey Nut Cheerios,110,3,1,250,1.5,11.5,10,90,25 Honey-comb,110,1,0,180,0,14,11,35,25 Just Right Crunchy Nuggets,110,2,1,170,1,17,6,60,100 Just Right Fruit & Nut,140,3,1,170,2,20,9,95,100 Kix,110,2,1,260,0,21,3,40,25 Life,100,4,2,150,2,12,6,95,25 Lucky 
Charms,110,2,1,180,0,12,12,55,25 Maypo,100,4,1,0,0,16,3,95,25 Muesli Raisins & Almonds,150,4,3,95,3,16,11,170,25 Muesli Peaches & Pecans,150,4,3,150,3,16,11,170,25 Mueslix Crispy Blend,160,3,2,150,3,17,13,160,25 Multi-Grain Cheerios,100,2,1,220,2,15,6,90,25 Nut&Honey Crunch,120,2,1,190,0,15,9,40,25 Nutri-Grain Almond-Raisin,140,3,2,220,3,21,7,130,25 Nutri-grain Wheat,90,3,0,170,3,18,2,90,25 Oatmeal Raisin Crisp,130,3,2,170,1.5,13.5,10,120,25 Post Nat. Raisin Bran,120,3,1,200,6,11,14,260,25 Product 19,100,3,0,320,1,20,3,45,100 Puffed Rice,50,1,0,0,0,13,0,15,0 Puffed Wheat,50,2,0,0,1,10,0,50,0 Quaker Oat Squares,100,4,1,135,2,14,6,110,25 Quaker Oatmeal,100,5,2,0,2.7,-1,-1,110,0 Raisin Bran,120,3,1,210,5,14,12,240,25 Raisin Nut Bran,100,3,2,140,2.5,10.5,8,140,25 Raisin Squares,90,2,0,0,2,15,6,110,25 Rice Chex,110,1,0,240,0,23,2,30,25 Rice Krispies,110,2,0,290,0,22,3,35,25 Shredded Wheat,80,2,0,0,3,16,0,95,0 Shredded Wheat 'n'Bran,90,3,0,0,4,19,0,140,0 Shredded Wheat spoon size,90,3,0,0,3,20,0,120,0 Smacks,110,2,1,70,1,9,15,40,25 Special K,110,6,0,230,1,16,3,55,25 Strawberry Fruit Wheats,90,2,0,15,3,15,5,90,25 Total Corn Flakes,110,2,1,200,0,21,3,35,100 Total Raisin Bran,140,3,1,190,4,15,14,230,100 Total Whole Grain,100,3,1,200,3,16,3,110,100 Triples,110,2,1,250,0,21,3,60,25 Trix,110,1,1,140,0,13,12,25,25 Wheat Chex,100,3,1,230,3,17,3,115,25 Wheaties,100,3,1,200,3,17,3,110,25 Wheaties Honey Gold,110,2,1,200,1,16,8,60,25 ================================================ FILE: chapter-8/dogs.csv ================================================ breed,height (inches),weight (pounds) Border Collie,20,45 Boston Terrier,16,20 Brittany Spaniel,18,35 Bullmastiff,27,120 Chihuahua,8,8 German Shepherd,25,78 Golden Retriever,23,70 Great Dane,32,160 Portuguese Water Dog,21,50 Standard Poodle,19,65 Yorkshire Terrier,6,7 ================================================ FILE: chapter-8/enrondata.txt ================================================ 
kay.mann@enron.com,vince.kaminski@enron.com,jeff.dasovich@enron.com,pete.davis@enron.com,chris.germany@enron.com,sara.shackleton@enron.com,tana.jones@enron.com,steven.kean@enron.com,kate.symes@enron.com,matthew.lenhart@enron.com,eric.bass@enron.com,debra.perlingiere@enron.com,sally.beck@enron.com,mark.taylor@enron.com,susan.scott@enron.com,gerald.nemec@enron.com,drew.fossum@enron.com,john.arnold@enron.com,carol.clair@enron.com,benjamin.rogers@enron.com,richard.sanders@enron.com,phillip.love@enron.com,david.delainey@enron.com,darron.giron@enron.com,daren.farmer@enron.com,mike.mcconnell@enron.com,jeffrey.shankman@enron.com,elizabeth.sager@enron.com,john.lavorato@enron.com,robin.rodrigue@enron.com,phillip.allen@enron.com,mark.haedicke@enron.com,chris.dorland@enron.com,scott.neal@enron.com,michelle.cash@enron.com,louise.kitchen@enron.com,mike.grigsby@enron.com,susan.mara@enron.com,d..steffes@enron.com,mary.hain@enron.com,dan.hyvl@enron.com,larry.campbell@enron.com,james.steffes@enron.com,errol.mclaughlin@enron.com,j.kaminski@enron.com,kimberly.watson@enron.com,richard.shapiro@enron.com,lynn.blair@enron.com,maureen.mcvicker@enron.com,rosalee.fleming@enron.com,stanley.horton@enron.com,mjones7@txu.com,rod.hayslett@enron.com,marie.heard@enron.com,matt.smith@enron.com,rick.buy@enron.com,m..love@enron.com,hunter.shively@enron.com,shirley.crenshaw@enron.com,sherri.sera@enron.com,mark.guzman@enron.com,shelley.corman@enron.com,ginger.dernehl@enron.com,james.derrick@enron.com,michelle.lokay@enron.com,mary.cook@enron.com,dana.davis@enron.com,david.forster@enron.com,judy.hernandez@enron.com,m..presto@enron.com,soblander@carrfut.com,karen.denne@enron.com,christi.nicolay@enron.com,evelyn.metoyer@enron.com,perfmgmt@enron.com,leslie.hansen@enron.com,kevin.hyatt@enron.com,tori.kuykendall@enron.com,lorna.brennan@enron.com,liz.taylor@enron.com,patrice.mims@enron.com,mike.maggi@enron.com,tracy.geaccone@enron.com,jane.tholt@enron.com,rhonda.denton@enron.com,cara.semperger@enron.com,barry.ty
choliz@enron.com,mike.carson@enron.com,bill.williams@enron.com,kerri.thompson@enron.com kay.mann@enron.com,16735,0,0,0,10,20,4,0,0,0,0,7,0,6,0,6,1,0,9,0,41,0,0,0,0,0,0,94,0,0,0,16,0,0,10,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,2,0,0,71,0,4,3,0,0,0,0,0,0,0,0,12,0,0,0,56,0 vince.kaminski@enron.com,0,14368,0,0,0,0,0,14,0,0,0,0,21,0,0,0,0,8,0,4,0,0,16,0,0,0,75,0,53,0,0,19,0,0,0,54,0,0,0,0,0,0,7,0,0,28,8,0,0,0,7,0,0,0,0,42,0,8,1246,23,0,5,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,13,0,0,5,0,0,0,0,0,0,0 jeff.dasovich@enron.com,0,2,11411,0,0,0,0,117,0,0,0,0,0,0,92,0,42,0,0,0,1010,0,164,0,0,0,0,0,142,0,47,2,0,0,0,132,3,2660,442,399,0,0,2712,0,1,5,2889,0,89,48,0,0,0,0,0,0,0,0,0,0,0,50,1114,0,4,0,1,0,0,1,0,2480,27,0,0,0,2,0,0,0,0,0,0,0,0,0,106,0,1,0 pete.davis@enron.com,0,0,0,9149,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 chris.germany@enron.com,43,0,0,0,8801,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 sara.shackleton@enron.com,17,0,0,0,0,8777,569,0,0,0,0,4,0,665,0,0,0,8,436,0,22,0,0,0,0,0,0,5,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,295,0,0,0,0,0,0,0,0,0,0,0,313,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0 tana.jones@enron.com,2,0,0,0,0,575,8490,2,0,0,0,7,0,824,0,0,0,55,460,0,2,0,0,0,0,0,2,334,2,0,0,4,0,0,0,114,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,2,804,0,0,0,0,0,0,0,0,0,0,0,278,8,162,0,2,0,0,0,0,0,730,0,0,0,0,0,2,0,0,21,0,2,0,0,0 steven.kean@enron.com,0,0,408,0,0,0,0,6759,0,0,0,0,0,0,0,0,0,0,0,0,22,0,38,0,0,18,8,0,41,0,26,23,0,6,9,9,6,155,0,57,0,0,245,0,0,0,361,0,1038,126,7,0,4,0,0,21,0,0,0,62,0,52,11,34,0,0,0,0,0,0,0,156,35,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0 
kate.symes@enron.com,0,0,0,1,0,0,0,0,5438,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,0,0,0,0,0,0,0,0,0,0,0,0,939,0,0,0,0,0,0,0,0,0,0,195,85,0,0,62,888 matthew.lenhart@enron.com,0,0,0,0,0,0,0,0,0,5265,199,0,0,0,49,0,0,0,0,0,0,56,0,0,0,0,0,0,0,0,28,0,2,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60,0,0,0,0,0,13,0,0,0,0,0,0 eric.bass@enron.com,0,0,0,0,0,0,0,0,0,692,5158,0,0,0,4,0,0,0,0,0,0,413,0,12,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 debra.perlingiere@enron.com,14,0,0,0,0,0,0,0,0,0,0,4387,0,0,0,130,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,147,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0 sally.beck@enron.com,0,6,0,0,0,0,0,4,0,0,27,0,4343,6,4,0,0,16,0,16,0,19,117,17,16,19,25,1,134,8,0,0,0,18,0,177,2,0,0,0,0,16,0,16,0,0,0,0,0,1,6,0,0,0,0,28,16,0,0,0,0,0,0,6,0,0,16,9,12,1,0,0,0,8,0,0,0,0,0,5,0,16,0,0,19,0,3,16,0,8 mark.taylor@enron.com,6,0,0,0,0,297,377,0,0,0,0,8,0,4111,0,8,0,2,188,0,34,0,0,0,0,0,0,47,0,0,0,36,0,0,12,111,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,39,0,0,0,0,0,0,0,0,0,0,0,73,0,160,0,0,0,0,1,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0 susan.scott@enron.com,0,0,52,0,0,0,0,0,0,15,15,0,0,0,4000,13,81,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,6,0,0,0,4,0,0,0,0,0,0,26,0,0,3,17,0,0,0,0,0,0,0,0,0,0,0,0,0,104,0,0,131,0,5,0,0,1,0,0,14,0,0,0,164,2,0,0,0,0,0,0,0,0,1,16,0,0 gerald.nemec@enron.com,0,0,0,0,6,2,2,0,0,0,0,60,0,18,46,3888,14,0,2,0,6,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,114,0,0,0 drew.fossum@enron.com,1,0,3,0,0,0,0,0,0,0,0,0,0,2,405,0,3706,0,2,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,23,0,0,17,0,38,0,0,0,0,0,0,0,0,105,0,2,5,0,0,3,0,0,0,0,0,0,0,0,166,0,17,0,0,0,0,0,0,0,0,0,0,0 
john.arnold@enron.com,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3578,0,0,0,0,0,0,0,0,47,0,162,0,8,0,0,11,0,18,4,0,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,6,0,39,0,1,11,0,0,0,0,0,0,0,0,53,0,206,0,0,0,0,0,0,0,0 carol.clair@enron.com,4,0,0,0,0,458,323,0,0,0,0,2,0,371,0,0,0,0,3564,0,13,0,0,0,0,0,0,66,0,0,0,2,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,191,0,104,0,0,0,0,2,0,0,74,0,0,0,0,0,0,0,0,22,0,0,0,0,0 benjamin.rogers@enron.com,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3427,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 richard.sanders@enron.com,10,2,36,0,0,6,2,78,0,0,0,0,0,8,0,6,0,0,2,0,3262,0,8,0,0,4,0,32,4,0,2,159,0,0,29,4,2,9,0,21,0,0,42,0,0,0,31,0,0,0,0,0,0,4,0,2,0,0,0,0,0,0,5,33,0,0,0,0,0,0,0,9,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 phillip.love@enron.com,0,0,0,0,0,0,0,0,0,64,121,0,0,0,0,0,0,0,0,0,0,3112,0,88,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,4,0,0,15,0,0,0,0,0,0 david.delainey@enron.com,0,33,2,0,0,0,0,34,0,0,0,0,72,0,0,0,0,26,0,8,6,0,3069,0,0,11,64,4,259,0,35,140,0,39,0,21,0,0,0,0,0,0,61,0,0,0,46,0,0,0,0,0,0,0,0,54,0,39,0,4,0,0,0,3,0,0,7,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,39,0,0,0 darron.giron@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,177,0,2963,0,0,0,0,0,12,0,0,0,0,0,0,4,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 daren.farmer@enron.com,0,0,0,0,0,0,0,0,0,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 mike.mcconnell@enron.com,0,10,0,0,0,0,0,28,0,0,0,0,24,0,0,0,0,0,0,0,0,0,25,0,0,2742,249,0,21,0,0,3,0,0,0,65,0,0,0,0,0,0,0,0,0,0,0,0,0,3,11,0,0,0,0,15,0,0,0,3,0,0,0,7,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0 
jeffrey.shankman@enron.com,0,39,0,0,0,0,0,5,0,0,0,0,9,0,0,0,0,16,0,0,0,0,14,0,0,131,2681,0,29,0,6,7,0,3,5,7,6,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,38,0,9,2,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0 elizabeth.sager@enron.com,16,0,0,0,0,0,23,19,0,0,0,0,0,28,0,4,0,0,96,0,62,0,11,0,0,0,0,2636,1,0,0,97,0,0,10,12,0,0,0,0,0,0,6,0,0,0,4,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,28,2,0,0,3,0,15,59,0,0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0 john.lavorato@enron.com,0,14,0,0,18,0,0,6,0,18,18,0,25,0,0,0,0,176,0,0,0,0,169,0,0,2,46,0,2585,0,63,29,27,66,0,123,39,0,0,0,0,0,6,0,1,0,13,0,2,2,5,0,2,0,1,102,0,54,0,2,0,0,0,2,0,0,20,16,0,34,0,0,0,0,0,0,0,18,0,7,10,18,0,10,0,0,35,19,8,0 robin.rodrigue@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,8,0,12,0,0,0,0,0,2496,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 phillip.allen@enron.com,0,0,19,0,0,0,0,27,0,88,0,0,4,6,18,0,0,2,0,0,17,0,8,0,0,0,0,0,63,0,2195,0,0,2,0,0,173,17,0,17,0,0,27,0,0,0,17,0,0,0,6,0,0,0,24,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,55,0,0,0,0,0,52,0,0,7,0,0,0 mark.haedicke@enron.com,2,2,0,0,0,2,2,10,0,0,0,2,0,133,0,2,0,0,14,0,53,0,33,0,0,2,6,99,18,0,0,1941,0,0,20,27,0,0,0,0,1,0,2,0,0,0,7,0,0,0,0,0,0,2,0,9,0,0,0,0,0,0,0,17,0,10,0,2,0,0,0,0,4,0,0,6,0,0,0,6,0,0,0,0,0,0,0,0,0,0 chris.dorland@enron.com,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23,0,0,0,1840,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,13,7,0,0 scott.neal@enron.com,0,0,0,0,45,2,0,0,0,0,0,9,4,0,0,0,0,34,0,0,0,0,7,0,0,0,4,0,35,0,70,0,0,1829,0,5,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,82,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,8,0,20,0,0,0,0,0,0,0,0 michelle.cash@enron.com,0,0,0,0,0,0,0,2,0,0,0,0,3,18,0,0,0,0,0,0,17,0,0,0,0,0,1,22,5,0,0,24,0,0,1824,12,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
louise.kitchen@enron.com,0,10,1,0,0,1,29,7,0,0,0,0,115,111,0,0,0,61,2,0,12,0,5,0,0,10,6,35,175,0,7,19,0,66,7,1728,71,0,74,0,0,0,1,0,4,0,5,0,0,0,0,0,0,2,0,64,0,7,0,0,0,0,0,0,0,26,30,168,0,112,0,7,6,0,0,23,0,0,0,10,2,0,0,0,0,0,97,10,0,0 mike.grigsby@enron.com,0,0,0,0,0,0,0,0,0,338,45,0,0,0,22,0,0,13,0,0,0,0,0,4,0,0,7,0,40,0,96,0,8,10,0,8,1719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,282,0,6,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,347,0,0,0,0,0,73,0,0,245,0,0,0 susan.mara@enron.com,0,0,1200,0,0,0,0,189,0,0,0,0,0,0,0,0,0,0,2,0,533,0,50,0,0,0,0,2,46,0,383,0,0,0,0,46,0,1687,174,445,0,0,865,0,0,0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,451,0,0,0,0,0,0,0,0,726,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0 d..steffes@enron.com,0,0,186,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,12,0,0,0,0,12,30,0,0,0,0,0,0,52,0,133,1655,0,0,0,0,0,2,0,187,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,16,0,0,0,6,7,0,22,0,11,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 mary.hain@enron.com,0,0,279,0,0,0,0,205,0,0,0,0,0,0,0,0,0,0,0,0,394,0,13,0,3,0,0,14,13,0,153,156,0,0,0,0,127,425,0,1456,0,0,517,0,0,0,215,0,16,0,0,0,0,0,0,137,0,0,0,0,0,32,14,10,0,0,0,0,0,0,0,158,78,0,0,0,0,0,0,0,0,0,0,0,14,23,0,0,19,0 dan.hyvl@enron.com,0,0,0,0,0,0,10,0,0,0,5,71,0,6,0,8,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1454,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,4,0,0,6,0,0,0,0,0,29,0,0,0 larry.campbell@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1388,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0 james.steffes@enron.com,0,5,635,0,0,0,0,539,0,0,0,0,0,4,3,0,0,0,0,0,149,0,157,0,0,0,0,77,129,0,85,16,0,1,0,52,70,410,0,254,0,0,1346,0,0,0,576,0,16,0,0,0,0,0,0,0,0,3,0,0,0,34,20,0,0,0,0,0,0,0,0,107,51,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0 
errol.mclaughlin@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,168,0,0,0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1325,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,138,0,0,0,0,0,0,0,0 j.kaminski@enron.com,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,2,5,0,0,0,0,0,0,0,0,1247,2,0,0,0,0,1,0,0,0,0,1,0,0,70,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 kimberly.watson@enron.com,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1217,0,10,0,0,1,0,7,0,0,0,0,0,2,0,0,8,0,0,223,0,0,0,0,0,0,0,0,0,0,0,20,0,1,0,0,0,16,0,0,0,0,0,0,0 richard.shapiro@enron.com,0,6,237,0,0,0,0,476,0,0,0,0,0,0,0,0,0,0,0,0,2,0,26,0,0,9,5,0,48,0,0,4,0,0,0,37,0,111,96,17,0,0,144,0,0,0,1215,0,68,5,0,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,0,11,0,22,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 lynn.blair@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,53,0,1210,0,0,4,0,3,0,0,0,0,0,0,0,0,145,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0 maureen.mcvicker@enron.com,0,0,189,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,32,0,75,0,0,30,5,3,22,0,13,15,0,0,0,22,0,73,2,53,0,0,125,0,0,0,158,0,1186,25,55,0,6,0,0,25,0,0,0,27,0,0,40,55,0,0,0,0,0,0,0,66,30,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0 rosalee.fleming@enron.com,0,3,18,0,0,0,0,190,0,0,0,0,1,0,0,0,0,0,0,0,2,0,129,0,0,134,118,0,131,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,15,0,128,1119,152,0,118,0,0,153,0,0,0,133,0,2,0,153,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0 stanley.horton@enron.com,0,9,0,0,0,0,0,12,0,0,0,0,2,0,0,0,12,0,0,4,0,0,0,0,0,9,6,0,19,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,1073,0,34,0,0,2,0,2,0,0,0,28,0,0,3,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,2,0,0,0,0,0,0,0 mjones7@txu.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1063,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
rod.hayslett@enron.com,0,0,0,0,0,0,2,0,0,0,0,0,9,0,0,0,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,60,0,1061,0,0,0,0,0,0,0,0,26,0,0,11,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,225,0,0,0,0,0,0,0 marie.heard@enron.com,0,0,0,0,0,160,224,0,0,0,0,5,0,14,0,72,0,0,10,0,0,0,0,0,0,0,0,79,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1061,0,1,0,0,0,0,0,0,0,0,0,106,0,0,0,0,0,0,0,0,0,70,0,0,0,0,0,0,0,0,0,0,0,0,0,0 matt.smith@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 rick.buy@enron.com,0,3,0,0,0,0,0,1,0,0,0,0,5,0,0,0,0,0,0,0,0,0,26,0,0,5,3,0,20,0,0,2,0,0,0,4,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1053,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 m..love@enron.com,0,0,0,0,15,0,0,0,0,4,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,15,0,0,13,0,0,0,0,0,0,41,0,0,0,0,0,0,0,0,0,0,0,0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 hunter.shively@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,3,0,0,7,0,33,0,15,0,0,15,0,10,4,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1052,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0 shirley.crenshaw@enron.com,0,364,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,3,0,0,0,0,0,0,0,0,0,3,0,0,974,2,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0 sherri.sera@enron.com,0,17,0,0,0,0,0,238,0,0,0,0,7,0,0,0,0,0,0,0,0,0,56,0,0,41,28,0,29,0,0,0,0,7,0,37,0,0,0,0,0,0,7,0,0,0,4,0,4,0,52,0,34,0,0,41,0,0,0,971,0,0,4,53,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,4,0,0,0,0,0,0,7,0,0,0 mark.guzman@enron.com,0,0,0,0,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,970,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,10,0,0,4,20 
shelley.corman@enron.com,0,0,26,0,0,0,0,134,0,0,0,0,0,2,85,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,32,1,32,0,0,51,0,0,47,61,108,0,0,133,0,92,0,0,0,0,0,0,0,0,940,0,0,10,0,0,0,0,0,0,0,38,0,0,0,10,0,10,0,0,0,8,0,0,0,0,0,0,0 ginger.dernehl@enron.com,0,0,681,0,0,0,0,442,0,0,0,0,0,0,108,0,0,0,0,0,32,0,44,0,0,8,8,3,42,0,16,40,0,0,0,8,16,573,151,313,0,0,472,0,0,0,642,0,510,2,0,0,0,0,0,32,0,0,0,0,0,32,925,32,0,0,0,0,0,0,0,83,394,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 james.derrick@enron.com,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,12,0,0,0,15,0,0,0,0,0,0,0,0,0,0,13,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,909,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 michelle.lokay@enron.com,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,0,4,0,0,0,0,7,0,0,0,0,0,0,0,0,9,0,0,904,0,0,0,0,0,0,0,0,0,0,0,116,0,0,0,0,0,2,0,0,0,0,0,0,0 mary.cook@enron.com,1,0,0,0,0,445,376,0,0,0,0,0,0,179,0,156,0,5,134,0,0,0,0,0,0,0,0,202,1,0,0,3,0,0,1,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,198,0,0,0,0,0,0,0,0,0,1,0,901,0,1,0,0,0,0,0,0,0,148,0,0,0,0,0,0,0,0,0,0,2,0,0,0 dana.davis@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,899,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 david.forster@enron.com,0,0,38,38,27,0,80,56,0,59,43,0,42,291,0,0,0,40,20,2,2,0,0,0,24,56,77,45,114,0,21,59,59,61,38,189,61,0,41,0,0,2,0,38,0,0,0,0,0,0,0,0,0,38,0,56,38,29,0,0,0,0,0,0,2,0,53,891,0,53,0,0,0,38,0,13,0,21,0,0,21,59,0,27,0,0,55,59,38,0 judy.hernandez@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,888,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 m..presto@enron.com,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,20,56,0,0,0,2,0,0,46,0,0,13,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35,8,0,885,0,0,0,0,0,2,0,0,0,5,0,0,0,0,0,0,0,7,0,0 
soblander@carrfut.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,863,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 karen.denne@enron.com,0,0,478,0,0,0,0,327,0,0,0,0,12,0,0,0,0,5,0,0,9,0,24,0,0,12,0,0,43,0,0,2,0,0,2,45,0,283,9,2,0,0,238,0,0,0,283,0,3,9,21,0,8,0,0,15,0,0,0,9,0,6,7,72,0,0,0,0,0,1,0,851,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0 christi.nicolay@enron.com,93,3,14,0,0,0,0,363,0,0,0,0,0,15,21,0,0,0,0,3,30,0,102,0,0,0,0,168,64,0,9,47,0,1,0,21,0,144,1,200,0,0,362,0,0,0,428,0,15,0,0,0,0,0,0,0,0,0,0,7,0,75,7,0,0,0,0,3,0,1,0,0,836,0,0,2,0,0,0,0,0,0,0,0,0,0,0,29,0,0 evelyn.metoyer@enron.com,0,0,0,0,0,0,0,0,826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,830,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0 perfmgmt@enron.com,23,0,30,0,12,9,32,0,0,2,0,0,34,8,0,28,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,18,1,3,6,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,7,0,6,0,0,0,0,0,15,0,0,0,30,0,12,0,0,0,0,0,0,0,830,0,0,3,0,0,0,0,0,0,0,0,1,0,1,0 leslie.hansen@enron.com,3,0,0,0,0,19,514,0,0,0,0,13,0,74,0,7,0,0,5,0,0,0,0,0,0,0,0,11,0,0,0,2,0,0,3,6,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,2,0,12,0,1,0,0,0,0,0,829,0,0,0,0,0,0,0,0,4,0,0,0,0,0 kevin.hyatt@enron.com,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,10,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,68,0,6,0,0,1,0,5,0,0,0,0,0,0,0,0,1,0,0,68,0,0,0,0,0,0,0,0,0,0,0,821,0,4,0,0,0,6,0,0,0,0,0,0,0 tori.kuykendall@enron.com,0,0,0,0,0,0,0,0,0,24,1,0,0,2,10,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,6,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,810,0,0,0,0,0,9,0,0,0,0,0,0 lorna.brennan@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,277,0,332,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,589,0,336,0,0,0,0,14,0,0,0,0,0,0,0,0,193,0,0,745,0,0,0,0,0,0,0,0,0,0,0,750,0,806,0,0,0,0,0,0,0,0,0,0,0 
liz.taylor@enron.com,0,6,0,0,0,0,38,21,0,32,32,0,21,6,3,0,0,16,0,0,0,0,40,0,0,118,64,41,159,0,0,10,32,40,6,101,39,0,3,0,0,0,0,0,1,0,4,0,0,3,46,0,5,32,0,28,0,3,3,0,0,0,0,28,0,41,39,13,0,42,0,0,0,0,0,47,0,0,0,805,0,32,0,0,0,0,41,32,0,0 patrice.mims@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,776,0,0,0,0,0,0,0,0,0 mike.maggi@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,772,0,0,0,0,0,0,0,0 tracy.geaccone@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,7,0,160,0,0,0,0,0,0,0,0,5,0,0,6,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,770,0,0,0,0,0,0,0 jane.tholt@enron.com,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40,0,0,0,0,0,767,0,0,4,0,0,0 rhonda.denton@enron.com,12,0,0,0,0,2,12,0,566,0,0,0,0,0,0,0,0,0,11,330,0,0,0,0,0,0,0,502,0,0,0,0,249,0,0,0,0,0,0,339,0,485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,501,0,0,0,0,0,484,0,0,8,0,0,498,249,0,502,0,0,0,0,0,0,0,0,760,498,0,495,19,249 cara.semperger@enron.com,0,0,0,0,0,0,0,0,70,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,736,0,0,17,0 barry.tycholiz@enron.com,0,0,44,0,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,4,3,0,0,0,0,11,0,2,0,1,1,0,18,21,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,732,0,0,0 mike.carson@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,721,0,0 
def getMedian(alist):
    """Return the median of the numbers in alist.

    The input is copied before sorting, so the caller's list is left
    untouched.  For an even number of values the mean of the two middle
    values is returned.
    """
    tmp = sorted(alist)
    alen = len(tmp)
    if (alen % 2) == 1:
        return tmp[alen // 2]
    return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2


def normalizeColumn(column):
    """Normalize column using the Modified Standard Score.

    Every value x is replaced by (x - median) / asd, where asd (the
    absolute standard deviation) is the mean of |x - median| over the
    column.

    Args:
        column: sequence of numbers.

    Returns:
        A new list with one normalized value per input value, in the same
        order.  An empty column gives an empty list.  A column whose
        values are all identical gives all zeros.
    """
    values = list(column)
    if not values:
        # Nothing to normalize; the median of no values is undefined.
        return []
    median = getMedian(values)
    asd = sum([abs(x - median) for x in values]) / len(values)
    if asd == 0:
        # asd is zero only when every value equals the median, so each
        # deviation is zero too.  Return zeros instead of dividing by zero.
        return [0.0] * len(values)
    return [(x - median) / asd for x in values]
def distance(self, i, j):
    """Return the Euclidean distance between rows i and j.

    self.data is stored by column; column 0 holds the labels, so only
    columns 1 .. self.cols - 1 take part in the computation.
    """
    sumSquares = 0
    for column in self.data[1:self.cols]:
        sumSquares += (column[i] - column[j]) ** 2
    return math.sqrt(sumSquares)
def printDendrogram(T, sep=3):
    """Print a text dendrogram of the binary tree T.

    Every internal node of the tree is a length-2 tuple; anything else is
    a leaf and is printed through str().  sep is the number of columns
    added for each level of the tree.

    printDendrogram is written and provided by David Eppstein 2002.
    Accessed on 14 April 2014:
    http://code.activestate.com/recipes/139422-dendrogram-drawing/
    """

    def isPair(node):
        return type(node) is tuple and len(node) == 2

    def maxHeight(node):
        # Width needed to draw the subtree rooted at node.
        if not isPair(node):
            return len(str(node)) + sep
        return max(maxHeight(node[0]), maxHeight(node[1])) + sep

    # Columns that currently carry a vertical connector ('|').
    activeLevels = {}

    def traverse(node, h, isFirst):
        # isFirst: 1 for a first child, 0 for a second child, -1 for the root.
        pair = isPair(node)
        if pair:
            traverse(node[0], h - sep, 1)
            s = [' '] * (h - sep) + ['|']
        else:
            s = list(str(node)) + [' ']

        # Pad with dashes up to column h (no-op when already that wide).
        s.extend('-' * (h - len(s)))

        if isFirst >= 0:
            s.append('+')
            if isFirst:
                # A first child opens the connector at this column ...
                activeLevels[h] = 1
            else:
                # ... and the matching second child closes it.
                del activeLevels[h]

        for level in sorted(activeLevels):
            if len(s) < level:
                s.extend(' ' * (level - len(s)))
                s.append('|')

        print(''.join(s))

        if pair:
            traverse(node[1], h - sep, 0)

    traverse(T, maxHeight(T), -1)
def __init__(self, filename):
    """Read the comma-separated file filename and normalize its columns.

    The first line of the file is a header; only its number of fields is
    used (stored in self.cols).  In every following line the first field
    is kept as a string label and the remaining fields are converted with
    float().  The data is stored by column in self.data, so self.data[0]
    is the list of labels and self.data[c][r] is the value of column c
    for row r.  Every numeric column is then replaced by its
    normalizeColumn() result.

    self.counter and self.queue (a PriorityQueue) are initialised for the
    clustering step described in the comments at the end of this method.
    """
    self.counter = 0
    self.queue = PriorityQueue()
    # 'with' closes the file even if reading raises.
    with open(filename) as infile:
        # Ignore blank lines (typically a trailing newline at the end of
        # the file); they would otherwise fail with an IndexError below.
        lines = [line for line in infile if line.strip()]
    header = lines[0].split(',')
    self.cols = len(header)
    self.data = [[] for _ in range(self.cols)]
    for line in lines[1:]:
        cells = line.split(',')
        # column 0 is the label, every other column is numeric
        self.data[0].append(cells[0])
        for col in range(1, self.cols):
            self.data[col].append(float(cells[col]))
    # now normalize number columns (that is, skip the first column)
    for col in range(1, self.cols):
        self.data[col] = normalizeColumn(self.data[col])

    ###
    ### I have read in the data and normalized the
    ### columns. Now for each element i in the data, I am going to
    ###    1. compute the Euclidean Distance from element i to all the
    ###       other elements.  This data will be placed in neighbors, which
    ###       is a Python dictionary. Let's say i = 1, and I am computing
    ###       the distance to the neighbor j and let's say j is 2. The
    ###       neighbors dictionary for i will look like
    ###       {2: ((1,2), 1.23),  3: ((1, 3), 2.3)... }
    ###
    ###    2. find the closest neighbor
    ###
    ###    3. place the element on a priority queue, called simply queue,
    ###       based on the distance to the nearest neighbor (and a counter
    ###       used to break ties).

    # TO DO
def getMedian(alist):
    """Return the median of alist without modifying it.

    With an odd number of values the middle value is returned as is; with
    an even number the mean of the two middle values is returned.
    """
    ordered = sorted(alist)
    middle, isOdd = divmod(len(ordered), 2)
    if isOdd:
        return ordered[middle]
    return (ordered[middle] + ordered[middle - 1]) / 2
def updateCentroids(self):
    """Move every centroid to the mean point of its cluster.

    self.memberOf[r] is the cluster number of row r and self.data is
    stored by column with the labels in column 0, so the new centroid of
    a cluster is the per-column mean (columns 1 onwards) of the rows
    assigned to it.

    A cluster that currently has no members keeps its previous centroid;
    computing its mean would divide by zero.
    """
    # Collect the row indices of every cluster in a single pass instead of
    # rescanning memberOf for each cluster and each column.
    rowsByCluster = [[] for _ in self.centroids]
    for row, cluster in enumerate(self.memberOf):
        # rows not assigned to a valid cluster (e.g. the initial -1) are skipped
        if 0 <= cluster < len(rowsByCluster):
            rowsByCluster[cluster].append(row)

    newCentroids = []
    for cluster, rows in enumerate(rowsByCluster):
        if rows:
            newCentroids.append([sum([column[r] for r in rows]) / len(rows)
                                 for column in self.data[1:]])
        else:
            newCentroids.append(list(self.centroids[cluster]))
    self.centroids = newCentroids
def kCluster(self):
    """Run the k-means iterations until the clustering settles.

    Each pass increments self.iterationNumber, recomputes the centroids
    with updateCentroids() and reassigns every point with
    assignPointsToCluster().  Iteration stops as soon as fewer than 1% of
    the points changed cluster in a pass; the final sum of squared error
    is then printed.
    """
    while True:
        self.iterationNumber += 1
        self.updateCentroids()
        self.assignPointsToCluster()
        # fraction of points that moved to another cluster in this pass
        changedFraction = float(self.pointsChanged) / len(self.memberOf)
        if changedFraction < 0.01:
            break
    print("Final SSE: %f" % self.sse)
def selectInitialCentroids(self):
    """Select the self.k initial centroids with the k-means++ method.

    The first centroid is a data point picked uniformly at random.  Each
    further centroid is picked by a roulette wheel in which a point's
    chance is proportional to its distance to the closest centroid chosen
    so far (self.distanceToClosestCentroid), so points far from the
    existing centroids are preferred.

    The result is stored in self.centroids as one list of numeric column
    values (columns 1 onwards of self.data) per chosen point.

    NOTE(review): the weight is the plain distance, as in the original
    code; the published k-means++ algorithm weights by the squared
    distance - confirm which one is intended.
    """
    pointCount = len(self.data[0])
    # first step is to select a random first centroid
    chosen = [random.choice(range(pointCount))]
    # loop to select the rest of the centroids, one at a time
    for _ in range(self.k - 1):
        # for every point in the data find its distance to
        # the closest centroid
        weights = [self.distanceToClosestCentroid(p, chosen)
                   for p in range(pointCount)]
        total = sum(weights)
        if total <= 0:
            # Every point coincides with a chosen centroid, so the wheel
            # has no area; fall back to a uniform pick instead of
            # dividing by zero.
            chosen.append(random.choice(range(pointCount)))
            continue
        # now roll virtual die
        num = random.random()
        cumulative = 0
        pick = None
        # the roulette wheel simulation
        for point, weight in enumerate(weights):
            if weight <= 0:
                # zero-weight points (already chosen) can never win
                continue
            # Remember the last candidate so that a floating-point
            # shortfall in the cumulative sum cannot run past the end
            # of the list.
            pick = point
            cumulative += weight / total
            if cumulative >= num:
                break
        chosen.append(pick)
    self.centroids = [[column[r] for column in self.data[1:]]
                      for r in chosen]
def showMembers(self):
    """Print the members of every cluster.

    For each cluster number a "Class <n>" heading is printed, followed by
    the label (column 0 of self.data) of every row whose entry in
    self.memberOf equals that cluster number.
    """
    labels = self.data[0]
    for clusterId, _ in enumerate(self.centroids):
        print("\n\nClass %i\n========" % clusterId)
        members = [label for row, label in enumerate(labels)
                   if self.memberOf[row] == clusterId]
        for label in members:
            print(label)