[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*,cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n#Ipython Notebook\n.ipynb_checkpoints\n"
  },
  {
    "path": "README.md",
    "content": "# DataminingGuideBook-Codes\n\n[《面向程序员的数据挖掘指南》](http://dataminingguide.books.yourtion.com) 源码\n\n## 目录\n\n### [第一章：简介](http://dataminingguide.books.yourtion.com/chapter-1.html)\n\n讲述什么是数据挖掘，它所能解决的问题的是什么，以及在阅读完本书后，你可以做些什么。\n\n### [第二章：推荐系统入门](http://dataminingguide.books.yourtion.com/chapter-2.html)\n\n介绍协同过滤，基本的距离算法，包括曼哈顿距离、欧几里得距离、闵科夫斯基距离、皮尔森相关系数。使用Python实现一个基本的推荐算法。\n\n### [第三章：隐式评价和基于物品的过滤算法](http://dataminingguide.books.yourtion.com/chapter-3.html)\n\n这章开始讨论可供选择的用户评价体系。用户能够显示地給于评价（好、差、五星评价等），或者隐式地給于评价——如果用户在亚马逊购买了一个MP3，我们则认为他是“喜欢”这件商品的。\n\n### [第四章：分类](http://dataminingguide.books.yourtion.com/chapter-4.html)\n\n上一章中我们使用用户对商品的评价来进行推荐，这一章我们会使用商品本身的特性来进行推荐。这种算法在潘多拉等网站中采用。\n\n### [第五章：进一步探索分类](http://dataminingguide.books.yourtion.com/chapter-5.html)\n\n本章会讨论如何评价分类器的效果，方法包括十折交叉验证、留一法、以及Kappa检验等，同时还会引入kNN算法。\n\n### [第六章：朴素贝叶斯](http://dataminingguide.books.yourtion.com/chapter-6.html)\n\n我们会在这章探索朴素贝叶斯分类算法，使用概率密度函数来处理数值型数据。\n\n### [第七章：朴素贝叶斯算法和非结构化文本](http://dataminingguide.books.yourtion.com/chapter-7.html)\n\n这一章我们会尝试使用朴素贝叶斯算法来对非结构化文本进行分类。我们是否能够判断出Twitter上的一片影评是正面评价还是负面的呢？\n\n### [第八章：聚类](http://dataminingguide.books.yourtion.com/chapter-8.html)\n\n我们会讨论层次聚类和kmeans聚类。\n\n"
  },
  {
    "path": "chapter-2/filteringdata.py",
    "content": "#\n#  FILTERINGDATA.py\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#  Ron Zacharski\n#\n\nfrom math import sqrt\n\nusers = {\"Angelica\": {\"Blues Traveler\": 3.5, \"Broken Bells\": 2.0, \"Norah Jones\": 4.5, \"Phoenix\": 5.0, \"Slightly Stoopid\": 1.5, \"The Strokes\": 2.5, \"Vampire Weekend\": 2.0},\n         \"Bill\":{\"Blues Traveler\": 2.0, \"Broken Bells\": 3.5, \"Deadmau5\": 4.0, \"Phoenix\": 2.0, \"Slightly Stoopid\": 3.5, \"Vampire Weekend\": 3.0},\n         \"Chan\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 1.0, \"Deadmau5\": 1.0, \"Norah Jones\": 3.0, \"Phoenix\": 5, \"Slightly Stoopid\": 1.0},\n         \"Dan\": {\"Blues Traveler\": 3.0, \"Broken Bells\": 4.0, \"Deadmau5\": 4.5, \"Phoenix\": 3.0, \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0, \"Vampire Weekend\": 2.0},\n         \"Hailey\": {\"Broken Bells\": 4.0, \"Deadmau5\": 1.0, \"Norah Jones\": 4.0, \"The Strokes\": 4.0, \"Vampire Weekend\": 1.0},\n         \"Jordyn\":  {\"Broken Bells\": 4.5, \"Deadmau5\": 4.0, \"Norah Jones\": 5.0, \"Phoenix\": 5.0, \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0, \"Vampire Weekend\": 4.0},\n         \"Sam\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 2.0, \"Norah Jones\": 3.0, \"Phoenix\": 5.0, \"Slightly Stoopid\": 4.0, \"The Strokes\": 5.0},\n         \"Veronica\": {\"Blues Traveler\": 3.0, \"Norah Jones\": 5.0, \"Phoenix\": 4.0, \"Slightly Stoopid\": 2.5, \"The Strokes\": 3.0}\n        }\n\n\n\ndef manhattan(rating1, rating2):\n    \"\"\"Computes the Manhattan distance. 
Both rating1 and rating2 are dictionaries\n       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}\"\"\"\n    distance = 0\n    commonRatings = False \n    for key in rating1:\n        if key in rating2:\n            distance += abs(rating1[key] - rating2[key])\n            commonRatings = True\n    if commonRatings:\n        return distance\n    else:\n        return -1 #Indicates no ratings in common\n\n\ndef computeNearestNeighbor(username, users):\n    \"\"\"creates a sorted list of users based on their distance to username\"\"\"\n    distances = []\n    for user in users:\n        if user != username:\n            distance = manhattan(users[user], users[username])\n            distances.append((distance, user))\n    # sort based on distance -- closest first\n    distances.sort()\n    return distances\n\ndef recommend(username, users):\n    \"\"\"Give list of recommendations\"\"\"\n    # first find nearest neighbor\n    nearest = computeNearestNeighbor(username, users)[0][1]\n\n    recommendations = []\n    # now find bands neighbor rated that user didn't\n    neighborRatings = users[nearest]\n    userRatings = users[username]\n    for artist in neighborRatings:\n        if not artist in userRatings:\n            recommendations.append((artist, neighborRatings[artist]))\n    # using the fn sorted for variety - sort is more efficient\n    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)\n\n# examples - uncomment to run\n\nprint( recommend('Hailey', users))\n#print( recommend('Chan', users))\n"
  },
  {
    "path": "chapter-2/filteringdataPearson.py",
    "content": "#\n#  FILTERINGDATA.py\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#  Ron Zacharski\n#\n\nfrom math import sqrt\n\nusers = {\"Angelica\": {\"Blues Traveler\": 3.5, \"Broken Bells\": 2.0, \"Norah Jones\": 4.5, \"Phoenix\": 5.0, \"Slightly Stoopid\": 1.5, \"The Strokes\": 2.5, \"Vampire Weekend\": 2.0},\n         \"Bill\":{\"Blues Traveler\": 2.0, \"Broken Bells\": 3.5, \"Deadmau5\": 4.0, \"Phoenix\": 2.0, \"Slightly Stoopid\": 3.5, \"Vampire Weekend\": 3.0},\n         \"Chan\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 1.0, \"Deadmau5\": 1.0, \"Norah Jones\": 3.0, \"Phoenix\": 5, \"Slightly Stoopid\": 1.0},\n         \"Dan\": {\"Blues Traveler\": 3.0, \"Broken Bells\": 4.0, \"Deadmau5\": 4.5, \"Phoenix\": 3.0, \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0, \"Vampire Weekend\": 2.0},\n         \"Hailey\": {\"Broken Bells\": 4.0, \"Deadmau5\": 1.0, \"Norah Jones\": 4.0, \"The Strokes\": 4.0, \"Vampire Weekend\": 1.0},\n         \"Jordyn\":  {\"Broken Bells\": 4.5, \"Deadmau5\": 4.0, \"Norah Jones\": 5.0, \"Phoenix\": 5.0, \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0, \"Vampire Weekend\": 4.0},\n         \"Sam\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 2.0, \"Norah Jones\": 3.0, \"Phoenix\": 5.0, \"Slightly Stoopid\": 4.0, \"The Strokes\": 5.0},\n         \"Veronica\": {\"Blues Traveler\": 3.0, \"Norah Jones\": 5.0, \"Phoenix\": 4.0, \"Slightly Stoopid\": 2.5, \"The Strokes\": 3.0}\n        }\n\n\n\ndef manhattan(rating1, rating2):\n    \"\"\"Computes the Manhattan distance. 
Both rating1 and rating2 are dictionaries\n       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}\"\"\"\n    distance = 0\n    total = 0\n    for key in rating1:\n        if key in rating2:\n            distance += abs(rating1[key] - rating2[key])\n            total += 1\n    if total > 0:\n        return distance / total\n    else:\n        return -1 #Indicates no ratings in common\n\n\n\ndef pearson(rating1, rating2):\n    sum_xy = 0\n    sum_x = 0\n    sum_y = 0\n    sum_x2 = 0\n    sum_y2 = 0\n    n = 0\n    for key in rating1:\n        if key in rating2:\n            n += 1\n            x = rating1[key]\n            y = rating2[key]\n            sum_xy += x * y\n            sum_x += x\n            sum_y += y\n            sum_x2 += pow(x, 2)\n            sum_y2 += pow(y, 2)\n    # now compute denominator\n    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * sqrt(sum_y2 - pow(sum_y, 2) / n)\n    if denominator == 0:\n        return 0\n    else:\n        return (sum_xy - (sum_x * sum_y) / n) / denominator\n            \n\ndef computeNearestNeighbor(username, users):\n    \"\"\"creates a sorted list of users based on their distance to username\"\"\"\n    distances = []\n    for user in users:\n        if user != username:\n            distance = manhattan(users[user], users[username])\n            distances.append((distance, user))\n    # sort based on distance -- closest first\n    distances.sort()\n    return distances\n\ndef recommend(username, users):\n    \"\"\"Give list of recommendations\"\"\"\n    # first find nearest neighbor\n    nearest = computeNearestNeighbor(username, users)[0][1]\n\n    recommendations = []\n    # now find bands neighbor rated that user didn't\n    neighborRatings = users[nearest]\n    userRatings = users[username]\n    for artist in neighborRatings:\n        if not artist in userRatings:\n            recommendations.append((artist, neighborRatings[artist]))\n    # using the fn sorted for variety - sort is more efficient\n 
   return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)\n\n"
  },
  {
    "path": "chapter-2/recommender.py",
    "content": "import codecs \nfrom math import sqrt\n\nusers = {\"Angelica\": {\"Blues Traveler\": 3.5, \"Broken Bells\": 2.0,\n                      \"Norah Jones\": 4.5, \"Phoenix\": 5.0,\n                      \"Slightly Stoopid\": 1.5,\n                      \"The Strokes\": 2.5, \"Vampire Weekend\": 2.0},\n         \n         \"Bill\":{\"Blues Traveler\": 2.0, \"Broken Bells\": 3.5,\n                 \"Deadmau5\": 4.0, \"Phoenix\": 2.0,\n                 \"Slightly Stoopid\": 3.5, \"Vampire Weekend\": 3.0},\n         \n         \"Chan\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 1.0,\n                  \"Deadmau5\": 1.0, \"Norah Jones\": 3.0, \"Phoenix\": 5,\n                  \"Slightly Stoopid\": 1.0},\n         \n         \"Dan\": {\"Blues Traveler\": 3.0, \"Broken Bells\": 4.0,\n                 \"Deadmau5\": 4.5, \"Phoenix\": 3.0,\n                 \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0,\n                 \"Vampire Weekend\": 2.0},\n         \n         \"Hailey\": {\"Broken Bells\": 4.0, \"Deadmau5\": 1.0,\n                    \"Norah Jones\": 4.0, \"The Strokes\": 4.0,\n                    \"Vampire Weekend\": 1.0},\n         \n         \"Jordyn\":  {\"Broken Bells\": 4.5, \"Deadmau5\": 4.0,\n                     \"Norah Jones\": 5.0, \"Phoenix\": 5.0,\n                     \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0,\n                     \"Vampire Weekend\": 4.0},\n         \n         \"Sam\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 2.0,\n                 \"Norah Jones\": 3.0, \"Phoenix\": 5.0,\n                 \"Slightly Stoopid\": 4.0, \"The Strokes\": 5.0},\n         \n         \"Veronica\": {\"Blues Traveler\": 3.0, \"Norah Jones\": 5.0,\n                      \"Phoenix\": 4.0, \"Slightly Stoopid\": 2.5,\n                      \"The Strokes\": 3.0}\n        }\n\n\n\nclass recommender:\n\n    def __init__(self, data, k=1, metric='pearson', n=5):\n        \"\"\" initialize recommender\n        currently, if data is dictionary 
the recommender is initialized\n        to it.\n        For all other data types of data, no initialization occurs\n        k is the k value for k nearest neighbor\n        metric is which distance formula to use\n        n is the maximum number of recommendations to make\"\"\"\n        self.k = k\n        self.n = n\n        self.username2id = {}\n        self.userid2name = {}\n        self.productid2name = {}\n        # for some reason I want to save the name of the metric\n        self.metric = metric\n        if self.metric == 'pearson':\n            self.fn = self.pearson\n        #\n        # if data is dictionary set recommender data to it\n        #\n        if type(data).__name__ == 'dict':\n            self.data = data\n\n    def convertProductID2name(self, id):\n        \"\"\"Given product id number return product name\"\"\"\n        if id in self.productid2name:\n            return self.productid2name[id]\n        else:\n            return id\n\n\n    def userRatings(self, id, n):\n        \"\"\"Return n top ratings for user with id\"\"\"\n        print (\"Ratings for \" + self.userid2name[id])\n        ratings = self.data[id]\n        print(len(ratings))\n        ratings = list(ratings.items())\n        ratings = [(self.convertProductID2name(k), v)\n                   for (k, v) in ratings]\n        # finally sort and return\n        ratings.sort(key=lambda artistTuple: artistTuple[1],\n                     reverse = True)\n        ratings = ratings[:n]\n        for rating in ratings:\n            print(\"%s\\t%i\" % (rating[0], rating[1]))\n        \n\n        \n\n    def loadBookDB(self, path=''):\n        \"\"\"loads the BX book dataset. 
Path is where the BX files are\n        located\"\"\"\n        self.data = {}\n        i = 0\n        #\n        # First load book ratings into self.data\n        #\n        f = codecs.open(path + \"BX-Book-Ratings.csv\", 'r', 'utf8')\n        for line in f:\n            i += 1\n            #separate line into fields\n            fields = line.split(';')\n            user = fields[0].strip('\"')\n            book = fields[1].strip('\"')\n            rating = int(fields[2].strip().strip('\"'))\n            if user in self.data:\n                currentRatings = self.data[user]\n            else:\n                currentRatings = {}\n            currentRatings[book] = rating\n            self.data[user] = currentRatings\n        f.close()\n        #\n        # Now load books into self.productid2name\n        # Books contains isbn, title, and author among other fields\n        #\n        f = codecs.open(path + \"BX-Books.csv\", 'r', 'utf8')\n        for line in f:\n            i += 1\n            #separate line into fields\n            fields = line.split(';')\n            isbn = fields[0].strip('\"')\n            title = fields[1].strip('\"')\n            author = fields[2].strip().strip('\"')\n            title = title + ' by ' + author\n            self.productid2name[isbn] = title\n        f.close()\n        #\n        #  Now load user info into both self.userid2name and\n        #  self.username2id\n        #\n        f = codecs.open(path + \"BX-Users.csv\", 'r', 'utf8')\n        for line in f:\n            i += 1\n            #print(line)\n            #separate line into fields\n            fields = line.split(';')\n            userid = fields[0].strip('\"')\n            location = fields[1].strip('\"')\n            if len(fields) > 3:\n                age = fields[2].strip().strip('\"')\n            else:\n                age = 'NULL'\n            if age != 'NULL':\n                value = location + '  (age: ' + age + ')'\n            else:\n                
value = location\n            self.userid2name[userid] = value\n            self.username2id[location] = userid\n        f.close()\n        print(i)\n                \n        \n    def pearson(self, rating1, rating2):\n        sum_xy = 0\n        sum_x = 0\n        sum_y = 0\n        sum_x2 = 0\n        sum_y2 = 0\n        n = 0\n        for key in rating1:\n            if key in rating2:\n                n += 1\n                x = rating1[key]\n                y = rating2[key]\n                sum_xy += x * y\n                sum_x += x\n                sum_y += y\n                sum_x2 += pow(x, 2)\n                sum_y2 += pow(y, 2)\n        if n == 0:\n            return 0\n        # now compute denominator\n        denominator = (sqrt(sum_x2 - pow(sum_x, 2) / n)\n                       * sqrt(sum_y2 - pow(sum_y, 2) / n))\n        if denominator == 0:\n            return 0\n        else:\n            return (sum_xy - (sum_x * sum_y) / n) / denominator\n\n\n    def computeNearestNeighbor(self, username):\n        \"\"\"creates a sorted list of users based on their distance to\n        username\"\"\"\n        distances = []\n        for instance in self.data:\n            if instance != username:\n                distance = self.fn(self.data[username],\n                                   self.data[instance])\n                distances.append((instance, distance))\n        # sort based on distance -- closest first\n        distances.sort(key=lambda artistTuple: artistTuple[1],\n                       reverse=True)\n        return distances\n\n    def recommend(self, user):\n       \"\"\"Give list of recommendations\"\"\"\n       recommendations = {}\n       # first get list of users  ordered by nearness\n       nearest = self.computeNearestNeighbor(user)\n       #\n       # now get the ratings for the user\n       #\n       userRatings = self.data[user]\n       #\n       # determine the total distance\n       totalDistance = 0.0\n       for i in 
range(self.k):\n          totalDistance += nearest[i][1]\n       # now iterate through the k nearest neighbors\n       # accumulating their ratings\n       for i in range(self.k):\n          # compute slice of pie \n          weight = nearest[i][1] / totalDistance\n          # get the name of the person\n          name = nearest[i][0]\n          # get the ratings for this person\n          neighborRatings = self.data[name]\n          # get the name of the person\n          # now find bands neighbor rated that user didn't\n          for artist in neighborRatings:\n             if not artist in userRatings:\n                if artist not in recommendations:\n                   recommendations[artist] = (neighborRatings[artist]\n                                              * weight)\n                else:\n                   recommendations[artist] = (recommendations[artist]\n                                              + neighborRatings[artist]\n                                              * weight)\n       # now make list from dictionary\n       recommendations = list(recommendations.items())\n       recommendations = [(self.convertProductID2name(k), v)\n                          for (k, v) in recommendations]\n       # finally sort and return\n       recommendations.sort(key=lambda artistTuple: artistTuple[1],\n                            reverse = True)\n       # Return the first n items\n       return recommendations[:self.n]\n\n"
  },
  {
    "path": "chapter-3/adjusted_cosine_similarity.py",
    "content": "# -*- coding: utf-8 -*-\n\nfrom math import sqrt\n\nusers3 = {\"David\": {\"Imagine Dragons\": 3, \"Daft Punk\": 5,\n                    \"Lorde\": 4, \"Fall Out Boy\": 1},\n          \"Matt\": {\"Imagine Dragons\": 3, \"Daft Punk\": 4,\n                   \"Lorde\": 4, \"Fall Out Boy\": 1},\n          \"Ben\": {\"Kacey Musgraves\": 4, \"Imagine Dragons\": 3,\n                  \"Lorde\": 3, \"Fall Out Boy\": 1},\n          \"Chris\": {\"Kacey Musgraves\": 4, \"Imagine Dragons\": 4,\n                    \"Daft Punk\": 4, \"Lorde\": 3, \"Fall Out Boy\": 1},\n          \"Tori\": {\"Kacey Musgraves\": 5, \"Imagine Dragons\": 4,\n                   \"Daft Punk\": 5, \"Fall Out Boy\": 3}}\n\n\ndef computeSimilarity(band1, band2, userRatings):\n    averages = {}\n    for (key, ratings) in userRatings.items():\n        averages[key] = (float(sum(ratings.values())) / len(ratings.values()))\n\n    num = 0 # 分子\n    dem1 = 0 # 分母的第一部分\n    dem2 = 0\n    for (user, ratings) in userRatings.items():\n        if band1 in ratings and band2 in ratings:\n            avg = averages[user]\n            num += (ratings[band1] - avg) * (ratings[band2] - avg)\n            dem1 += (ratings[band1] - avg) ** 2\n            dem2 += (ratings[band2] - avg) ** 2\n    return num / (sqrt(dem1) * sqrt(dem2))\n\nprint(computeSimilarity('Kacey Musgraves', 'Lorde', users3))\nprint(computeSimilarity('Imagine Dragons', 'Lorde', users3))\nprint(computeSimilarity('Daft Punk', 'Lorde', users3))\n\n"
  },
  {
    "path": "chapter-3/recommender3.py",
    "content": "import codecs \nfrom math import sqrt\n\nusers2 = {\"Amy\": {\"Taylor Swift\": 4, \"PSY\": 3, \"Whitney Houston\": 4},\n          \"Ben\": {\"Taylor Swift\": 5, \"PSY\": 2},\n          \"Clara\": {\"PSY\": 3.5, \"Whitney Houston\": 4},\n          \"Daisy\": {\"Taylor Swift\": 5, \"Whitney Houston\": 3}}\n\nusers = {\"Angelica\": {\"Blues Traveler\": 3.5, \"Broken Bells\": 2.0,\n                      \"Norah Jones\": 4.5, \"Phoenix\": 5.0,\n                      \"Slightly Stoopid\": 1.5, \"The Strokes\": 2.5,\n                      \"Vampire Weekend\": 2.0},\n         \"Bill\":{\"Blues Traveler\": 2.0, \"Broken Bells\": 3.5,\n                 \"Deadmau5\": 4.0, \"Phoenix\": 2.0,\n                 \"Slightly Stoopid\": 3.5, \"Vampire Weekend\": 3.0},\n         \"Chan\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 1.0,\n                  \"Deadmau5\": 1.0, \"Norah Jones\": 3.0,\n                  \"Phoenix\": 5, \"Slightly Stoopid\": 1.0},\n         \"Dan\": {\"Blues Traveler\": 3.0, \"Broken Bells\": 4.0,\n                 \"Deadmau5\": 4.5, \"Phoenix\": 3.0,\n                 \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0,\n                 \"Vampire Weekend\": 2.0},\n         \"Hailey\": {\"Broken Bells\": 4.0, \"Deadmau5\": 1.0,\n                    \"Norah Jones\": 4.0, \"The Strokes\": 4.0,\n                    \"Vampire Weekend\": 1.0},\n         \"Jordyn\":  {\"Broken Bells\": 4.5, \"Deadmau5\": 4.0,\n                     \"Norah Jones\": 5.0, \"Phoenix\": 5.0,\n                     \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0,\n                     \"Vampire Weekend\": 4.0},\n         \"Sam\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 2.0,\n                 \"Norah Jones\": 3.0, \"Phoenix\": 5.0,\n                 \"Slightly Stoopid\": 4.0, \"The Strokes\": 5.0},\n         \"Veronica\": {\"Blues Traveler\": 3.0, \"Norah Jones\": 5.0,\n                      \"Phoenix\": 4.0, \"Slightly Stoopid\": 2.5,\n                      \"The 
Strokes\": 3.0}\n        }\n\n\n\nclass recommender:\n\n   def __init__(self, data, k=1, metric='pearson', n=5):\n      \"\"\" initialize recommender\n      currently, if data is dictionary the recommender is initialized\n      to it.\n      For all other data types of data, no initialization occurs\n      k is the k value for k nearest neighbor\n      metric is which distance formula to use\n      n is the maximum number of recommendations to make\"\"\"\n      self.k = k\n      self.n = n\n      self.username2id = {}\n      self.userid2name = {}\n      self.productid2name = {}\n      #\n      # The following two variables are used for Slope One\n      # \n      self.frequencies = {}\n      self.deviations = {}\n      # for some reason I want to save the name of the metric\n      self.metric = metric\n      if self.metric == 'pearson':\n         self.fn = self.pearson\n      #\n      # if data is dictionary set recommender data to it\n      #\n      if type(data).__name__ == 'dict':\n         self.data = data\n\n   def convertProductID2name(self, id):\n      \"\"\"Given product id number return product name\"\"\"\n      if id in self.productid2name:\n         return self.productid2name[id]\n      else:\n         return id\n\n\n   def userRatings(self, id, n):\n      \"\"\"Return n top ratings for user with id\"\"\"\n      print (\"Ratings for \" + self.userid2name[id])\n      ratings = self.data[id]\n      print(len(ratings))\n      ratings = list(ratings.items())[:n]\n      ratings = [(self.convertProductID2name(k), v)\n                 for (k, v) in ratings]\n      # finally sort and return\n      ratings.sort(key=lambda artistTuple: artistTuple[1],\n                   reverse = True)      \n      for rating in ratings:\n         print(\"%s\\t%i\" % (rating[0], rating[1]))\n\n\n   def showUserTopItems(self, user, n):\n      \"\"\" show top n items for user\"\"\"\n      items = list(self.data[user].items())\n      items.sort(key=lambda itemTuple: itemTuple[1], 
reverse=True)\n      for i in range(n):\n         print(\"%s\\t%i\" % (self.convertProductID2name(items[i][0]),\n                           items[i][1]))\n            \n   def loadMovieLens(self, path=''):\n      self.data = {}\n      #\n      # first load movie ratings\n      #\n      i = 0\n      #\n      # First load book ratings into self.data\n      #\n      #f = codecs.open(path + \"u.data\", 'r', 'utf8')\n      f = codecs.open(path + \"u.data\", 'r', 'ascii')\n      #  f = open(path + \"u.data\")\n      for line in f:\n         i += 1\n         #separate line into fields\n         fields = line.split('\\t')\n         user = fields[0]\n         movie = fields[1]\n         rating = int(fields[2].strip().strip('\"'))\n         if user in self.data:\n            currentRatings = self.data[user]\n         else:\n            currentRatings = {}\n         currentRatings[movie] = rating\n         self.data[user] = currentRatings\n      f.close()\n      #\n      # Now load movie into self.productid2name\n      # the file u.item contains movie id, title, release date among\n      # other fields\n      #\n      #f = codecs.open(path + \"u.item\", 'r', 'utf8')\n      f = codecs.open(path + \"u.item\", 'r', 'iso8859-1', 'ignore')\n      #f = open(path + \"u.item\")\n      for line in f:\n         i += 1\n         #separate line into fields\n         fields = line.split('|')\n         mid = fields[0].strip()\n         title = fields[1].strip()\n         self.productid2name[mid] = title\n      f.close()\n      #\n      #  Now load user info into both self.userid2name\n      #  and self.username2id\n      #\n      #f = codecs.open(path + \"u.user\", 'r', 'utf8')\n      f = open(path + \"u.user\")\n      for line in f:\n         i += 1\n         fields = line.split('|')\n         userid = fields[0].strip('\"')\n         self.userid2name[userid] = line\n         self.username2id[line] = userid\n      f.close()\n      print(i)\n\n\n\n\n   def loadBookDB(self, path=''):\n      
\"\"\"loads the BX book dataset. Path is where the BX files are\n      located\"\"\"\n      self.data = {}\n      i = 0\n      #\n      # First load book ratings into self.data\n      #\n      f = codecs.open(path + \"u.data\", 'r', 'utf8')\n      for line in f:\n         i += 1\n         # separate line into fields\n         fields = line.split(';')\n         user = fields[0].strip('\"')\n         book = fields[1].strip('\"')\n         rating = int(fields[2].strip().strip('\"'))\n         if rating > 5:\n            print(\"EXCEEDING \", rating)\n         if user in self.data:\n            currentRatings = self.data[user]\n         else:\n            currentRatings = {}\n         currentRatings[book] = rating\n         self.data[user] = currentRatings\n      f.close()\n      #\n      # Now load books into self.productid2name\n      # Books contains isbn, title, and author among other fields\n      #\n      f = codecs.open(path + \"BX-Books.csv\", 'r', 'utf8')\n      for line in f:\n         i += 1\n         # separate line into fields\n         fields = line.split(';')\n         isbn = fields[0].strip('\"')\n         title = fields[1].strip('\"')\n         author = fields[2].strip().strip('\"')\n         title = title + ' by ' + author\n         self.productid2name[isbn] = title\n      f.close()\n      #\n      #  Now load user info into both self.userid2name and\n      #  self.username2id\n      #\n      f = codecs.open(path + \"BX-Users.csv\", 'r', 'utf8')\n      for line in f:\n         i += 1\n         # separate line into fields\n         fields = line.split(';')\n         userid = fields[0].strip('\"')\n         location = fields[1].strip('\"')\n         if len(fields) > 3:\n            age = fields[2].strip().strip('\"')\n         else:\n            age = 'NULL'\n         if age != 'NULL':\n            value = location + '  (age: ' + age + ')'\n         else:\n            value = location\n         self.userid2name[userid] = value\n         
self.username2id[location] = userid\n      f.close()\n      print(i)\n                \n        \n   def computeDeviations(self):\n      # for each person in the data:\n      #    get their ratings\n      for ratings in self.data.values():\n         # for each item & rating in that set of ratings:\n         for (item, rating) in ratings.items():\n            self.frequencies.setdefault(item, {})\n            self.deviations.setdefault(item, {})                    \n            # for each item2 & rating2 in that set of ratings:\n            for (item2, rating2) in ratings.items():\n               if item != item2:\n                  # add the difference between the ratings to our\n                  # computation\n                  self.frequencies[item].setdefault(item2, 0)\n                  self.deviations[item].setdefault(item2, 0.0)\n                  self.frequencies[item][item2] += 1\n                  self.deviations[item][item2] += rating - rating2\n        \n      for (item, ratings) in self.deviations.items():\n         for item2 in ratings:\n            ratings[item2] /= self.frequencies[item][item2]\n\n\n   def slopeOneRecommendations(self, userRatings):\n      recommendations = {}\n      frequencies = {}\n      # for every item and rating in the user's recommendations\n      for (userItem, userRating) in userRatings.items():\n         # for every item in our dataset that the user didn't rate\n         for (diffItem, diffRatings) in self.deviations.items():\n            if diffItem not in userRatings and \\\n               userItem in self.deviations[diffItem]:\n               freq = self.frequencies[diffItem][userItem]\n               recommendations.setdefault(diffItem, 0.0)\n               frequencies.setdefault(diffItem, 0)\n               # add to the running sum representing the numerator\n               # of the formula\n               recommendations[diffItem] += (diffRatings[userItem] +\n                                             userRating) * 
freq\n               # keep a running sum of the frequency of diffitem\n               frequencies[diffItem] += freq\n      recommendations =  [(self.convertProductID2name(k),\n                           v / frequencies[k])\n                          for (k, v) in recommendations.items()]\n      # finally sort and return\n      recommendations.sort(key=lambda artistTuple: artistTuple[1],\n                           reverse = True)\n      # I am only going to return the first 50 recommendations\n      return recommendations[:50]\n        \n   def pearson(self, rating1, rating2):\n      sum_xy = 0\n      sum_x = 0\n      sum_y = 0\n      sum_x2 = 0\n      sum_y2 = 0\n      n = 0\n      for key in rating1:\n         if key in rating2:\n            n += 1\n            x = rating1[key]\n            y = rating2[key]\n            sum_xy += x * y\n            sum_x += x\n            sum_y += y\n            sum_x2 += pow(x, 2)\n            sum_y2 += pow(y, 2)\n      if n == 0:\n         return 0\n      # now compute denominator\n      denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \\\n                    sqrt(sum_y2 - pow(sum_y, 2) / n)\n      if denominator == 0:\n         return 0\n      else:\n         return (sum_xy - (sum_x * sum_y) / n) / denominator\n\n\n   def computeNearestNeighbor(self, username):\n      \"\"\"creates a sorted list of users based on their distance\n      to username\"\"\"\n      distances = []\n      for instance in self.data:\n         if instance != username:\n            distance = self.fn(self.data[username],\n                               self.data[instance])\n            distances.append((instance, distance))\n      # sort based on distance -- closest first\n      distances.sort(key=lambda artistTuple: artistTuple[1],\n                     reverse=True)\n      return distances\n\n   def recommend(self, user):\n      \"\"\"Give list of recommendations\"\"\"\n      recommendations = {}\n      # first get list of users  ordered by nearness\n 
     nearest = self.computeNearestNeighbor(user)\n      #\n      # now get the ratings for the user\n      #\n      userRatings = self.data[user]\n      #\n      # determine the total distance\n      totalDistance = 0.0\n      for i in range(self.k):\n         totalDistance += nearest[i][1]\n      # now iterate through the k nearest neighbors\n      # accumulating their ratings\n      for i in range(self.k):\n         # compute slice of pie \n         weight = nearest[i][1] / totalDistance\n         # get the name of the person\n         name = nearest[i][0]\n         # get the ratings for this person\n         neighborRatings = self.data[name]\n         # get the name of the person\n         # now find bands neighbor rated that user didn't\n         for artist in neighborRatings:\n            if not artist in userRatings:\n               if artist not in recommendations:\n                  recommendations[artist] = neighborRatings[artist] * \\\n                                            weight\n               else:\n                  recommendations[artist] = recommendations[artist] + \\\n                                            neighborRatings[artist] * \\\n                                            weight\n      # now make list from dictionary and only get the first n items\n      recommendations = list(recommendations.items())[:self.n]\n      recommendations = [(self.convertProductID2name(k), v)\n                         for (k, v) in recommendations]\n      # finally sort and return\n      recommendations.sort(key=lambda artistTuple: artistTuple[1],\n                           reverse = True)\n      return recommendations\n\n"
  },
  {
    "path": "chapter-4/athletesTestSet.txt",
    "content": "Aly Raisman\tGymnastics\t62\t115\nCrystal Langhorne\tBasketball\t74\t190\nDiana Taurasi\tBasketball\t72\t163\nErin Thorn\tBasketball\t69\t144\nHannah Whelan\tGymnastics\t63\t117\nJaycie Phelps\tGymnastics\t60\t97\nKelly Miller\tBasketball\t70\t140\nKerri Strug\tGymnastics\t57\t87\nKoko Tsurumi\tGymnastics\t55\t75\nLi Shanshan\tGymnastics\t64\t101\nLindsay Whalen\tBasketball\t69\t169\nLisa Jane Weightman\tTrack\t62\t97\nMaya Moore\tBasketball\t72\t174\nPaula Radcliffe\tTrack\t68\t120\nPenny Taylor\tBasketball\t73\t165\nPriscah Jeptoo\tTrack\t65\t108\nShalane Flanagan\tTrack\t65\t106\nXiaolin Zhu\tTrack\t67\t121\nXueqin Wang\tTrack\t64\t110\nZhu Xiaolin\tTrack\t67\t123\n"
  },
  {
    "path": "chapter-4/athletesTrainingSet.txt",
    "content": "comment\tclass\tnum\tnum\nAsuka Teramoto\tGymnastics\t54\t66\nBrittainey Raven\tBasketball\t72\t162\nChen Nan\tBasketball\t78\t204\nGabby Douglas\tGymnastics\t49\t90\nHelalia Johannes\tTrack\t65\t99\nIrina Miketenko\tTrack\t63\t106\nJennifer Lacy\tBasketball\t75\t175\nKara Goucher\tTrack\t67\t123\nLinlin Deng\tGymnastics\t54\t68\nNakia Sanford\tBasketball\t76\t200\nNikki Blue\tBasketball\t68\t163\nQiushuang Huang\tGymnastics\t61\t95\nRebecca Tunney\tGymnastics\t58\t77\nRene Kalmer\tTrack\t70\t108\nShanna Crossley\tBasketball\t70\t155\nShavonte Zellous\tBasketball\t70\t155\nTatyana Petrova\tTrack\t63\t108\nTiki Gelana\tTrack\t65\t106\nValeria Straneo\tTrack\t66\t97\nViktoria Komova\tGymnastics\t61\t76"
  },
  {
    "path": "chapter-4/classifyTemplate.py",
    "content": "#\n#  Classify Template \n#\n#  Finish the code for the method, nearestNeighbor\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#\n#  Ron Zacharski\n#\n\n\n\nclass Classifier:\n\n    def __init__(self, filename):\n\n        self.medianAndDeviation = []\n        \n        # reading the data in from the file\n        f = open(filename)\n        lines = f.readlines()\n        f.close()\n        self.format = lines[0].strip().split('\\t')\n        self.data = []\n        for line in lines[1:]:\n            fields = line.strip().split('\\t')\n            ignore = []\n            vector = []\n            for i in range(len(fields)):\n                if self.format[i] == 'num':\n                    vector.append(int(fields[i]))\n                elif self.format[i] == 'comment':\n                    ignore.append(fields[i])\n                elif self.format[i] == 'class':\n                    classification = fields[i]\n            self.data.append((classification, vector, ignore))\n        self.rawData = list(self.data)\n        # get length of instance vector\n        self.vlen = len(self.data[0][1])\n        # now normalize the data\n        for i in range(self.vlen):\n            self.normalizeColumn(i)\n        \n\n        \n    \n    ##################################################\n    ###\n    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE\n\n    def getMedian(self, alist):\n        \"\"\"return median of alist\"\"\"\n        if alist == []:\n            return []\n        blist = sorted(alist)\n        length = len(alist)\n        if length % 2 == 1:\n            # length of list is odd so return middle element\n            return blist[int(((length + 1) / 2) -  1)]\n        else:\n            # length of list is even so compute midpoint\n            v1 = blist[int(length / 2)]\n            v2 =blist[(int(length / 2) - 1)]\n            return (v1 + v2) / 2.0\n        \n\n    def 
getAbsoluteStandardDeviation(self, alist, median):\n        \"\"\"given alist and median return absolute standard deviation\"\"\"\n        sum = 0\n        for item in alist:\n            sum += abs(item - median)\n        return sum / len(alist)\n\n\n    def normalizeColumn(self, columnNumber):\n       \"\"\"given a column number, normalize that column in self.data\"\"\"\n       # first extract values to list\n       col = [v[1][columnNumber] for v in self.data]\n       median = self.getMedian(col)\n       asd = self.getAbsoluteStandardDeviation(col, median)\n       #print(\"Median: %f   ASD = %f\" % (median, asd))\n       self.medianAndDeviation.append((median, asd))\n       for v in self.data:\n           v[1][columnNumber] = (v[1][columnNumber] - median) / asd\n\n\n    def normalizeVector(self, v):\n        \"\"\"We have stored the median and asd for each column.\n        We now use them to normalize vector v\"\"\"\n        vector = list(v)\n        for i in range(len(vector)):\n            (median, asd) = self.medianAndDeviation[i]\n            vector[i] = (vector[i] - median) / asd\n        return vector\n\n    \n    ###\n    ### END NORMALIZATION\n    ##################################################\n\n\n\n    def manhattan(self, vector1, vector2):\n        \"\"\"Computes the Manhattan distance.\"\"\"\n        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))\n\n\n    def nearestNeighbor(self, itemVector):\n        \"\"\"return nearest neighbor to itemVector\"\"\"\n        \n        return ((0, (\"REPLACE THIS LINE WITH CORRECT RETURN\", [0], [])))\n    \n    def classify(self, itemVector):\n        \"\"\"Return class we think item Vector is in\"\"\"\n        return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])\n \n\ndef unitTest():\n    classifier = Classifier('athletesTrainingSet.txt')\n    br = ('Basketball', [72, 162], ['Brittainey Raven'])\n    nl = ('Gymnastics', [61, 76], ['Viktoria Komova'])\n    cl = (\"Basketball\", 
[74, 190], ['Crystal Langhorne'])\n    # first check normalize function\n    brNorm = classifier.normalizeVector(br[1])\n    nlNorm = classifier.normalizeVector(nl[1])\n    clNorm = classifier.normalizeVector(cl[1])\n    assert(brNorm == classifier.data[1][1])\n    assert(nlNorm == classifier.data[-1][1])\n    print('normalizeVector fn OK')\n    # check distance\n    assert (round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == 1.16823)\n    assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0)\n    assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0)\n    print('Manhattan distance fn OK')\n    # Brittainey Raven's nearest neighbor should be herself\n    result = classifier.nearestNeighbor(brNorm)\n    assert(result[1][2]== br[2])\n    # Nastia Liukin's nearest neighbor should be herself\n    result = classifier.nearestNeighbor(nlNorm)\n    assert(result[1][2]== nl[2])\n    # Crystal Langhorne's nearest neighbor is Jennifer Lacy\"\n    assert(classifier.nearestNeighbor(clNorm)[1][2][0] == \"Jennifer Lacy\")\n    print(\"Nearest Neighbor fn OK\")\n    # Check if classify correctly identifies sports\n    assert(classifier.classify(br[1]) == 'Basketball')\n    assert(classifier.classify(cl[1]) == 'Basketball')\n    assert(classifier.classify(nl[1]) == 'Gymnastics')\n    print('Classify fn OK')\n\n\n\nunitTest()\n    \n"
  },
  {
    "path": "chapter-4/filteringdata.py",
    "content": "#\n#  ch4-filteringdata.py\n#\n#  Code for the first example from chapter 4.\n#  The only change from the original filteringdata.py is the addition of the music dictionary.\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#  Ron Zacharski\n#\n\nfrom math import sqrt\n\nusers = {\"Angelica\": {\"Blues Traveler\": 3.5, \"Broken Bells\": 2.0, \"Norah Jones\": 4.5, \"Phoenix\": 5.0, \"Slightly Stoopid\": 1.5, \"The Strokes\": 2.5, \"Vampire Weekend\": 2.0},\n         \"Bill\":{\"Blues Traveler\": 2.0, \"Broken Bells\": 3.5, \"Deadmau5\": 4.0, \"Phoenix\": 2.0, \"Slightly Stoopid\": 3.5, \"Vampire Weekend\": 3.0},\n         \"Chan\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 1.0, \"Deadmau5\": 1.0, \"Norah Jones\": 3.0, \"Phoenix\": 5, \"Slightly Stoopid\": 1.0},\n         \"Dan\": {\"Blues Traveler\": 3.0, \"Broken Bells\": 4.0, \"Deadmau5\": 4.5, \"Phoenix\": 3.0, \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0, \"Vampire Weekend\": 2.0},\n         \"Hailey\": {\"Broken Bells\": 4.0, \"Deadmau5\": 1.0, \"Norah Jones\": 4.0, \"The Strokes\": 4.0, \"Vampire Weekend\": 1.0},\n         \"Jordyn\":  {\"Broken Bells\": 4.5, \"Deadmau5\": 4.0, \"Norah Jones\": 5.0, \"Phoenix\": 5.0, \"Slightly Stoopid\": 4.5, \"The Strokes\": 4.0, \"Vampire Weekend\": 4.0},\n         \"Sam\": {\"Blues Traveler\": 5.0, \"Broken Bells\": 2.0, \"Norah Jones\": 3.0, \"Phoenix\": 5.0, \"Slightly Stoopid\": 4.0, \"The Strokes\": 5.0},\n         \"Veronica\": {\"Blues Traveler\": 3.0, \"Norah Jones\": 5.0, \"Phoenix\": 4.0, \"Slightly Stoopid\": 2.5, \"The Strokes\": 3.0}\n        }\n\nmusic = {\"Dr Dog/Fate\": {\"piano\": 2.5, \"vocals\": 4, \"beat\": 3.5, \"blues\": 3, \"guitar\": 5, \"backup vocals\": 4, \"rap\": 1},\n         \"Phoenix/Lisztomania\": {\"piano\": 2, \"vocals\": 5, \"beat\": 5, \"blues\": 3, \"guitar\": 2, \"backup vocals\": 1, \"rap\": 1},\n         \"Heartless Bastards/Out at Sea\": {\"piano\": 1, \"vocals\": 5, 
\"beat\": 4, \"blues\": 2, \"guitar\": 4, \"backup vocals\": 1, \"rap\": 1},\n         \"Todd Snider/Don't Tempt Me\": {\"piano\": 4, \"vocals\": 5, \"beat\": 4, \"blues\": 4, \"guitar\": 1, \"backup vocals\": 5, \"rap\": 1},\n         \"The Black Keys/Magic Potion\": {\"piano\": 1, \"vocals\": 4, \"beat\": 5, \"blues\": 3.5, \"guitar\": 5, \"backup vocals\": 1, \"rap\": 1},\n         \"Glee Cast/Jessie's Girl\": {\"piano\": 1, \"vocals\": 5, \"beat\": 3.5, \"blues\": 3, \"guitar\":4, \"backup vocals\": 5, \"rap\": 1},\n         \"La Roux/Bulletproof\": {\"piano\": 5, \"vocals\": 5, \"beat\": 4, \"blues\": 2, \"guitar\": 1, \"backup vocals\": 1, \"rap\": 1},\n         \"Mike Posner\": {\"piano\": 2.5, \"vocals\": 4, \"beat\": 4, \"blues\": 1, \"guitar\": 1, \"backup vocals\": 1, \"rap\": 1},\n         \"Black Eyed Peas/Rock That Body\": {\"piano\": 2, \"vocals\": 5, \"beat\": 5, \"blues\": 1, \"guitar\": 2, \"backup vocals\": 2, \"rap\": 4},\n         \"Lady Gaga/Alejandro\": {\"piano\": 1, \"vocals\": 5, \"beat\": 3, \"blues\": 2, \"guitar\": 1, \"backup vocals\": 2, \"rap\": 1}}\n\ndef manhattan(rating1, rating2):\n    \"\"\"Computes the Manhattan distance. 
Both rating1 and rating2 are dictionaries\n       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}\"\"\"\n    distance = 0\n    total = 0\n    for key in rating1:\n        if key in rating2:\n            distance += abs(rating1[key] - rating2[key])\n            total += 1\n    return distance\n\n\n\ndef computeNearestNeighbor(username, users):\n    \"\"\"creates a sorted list of users based on their distance to username\"\"\"\n    distances = []\n    for user in users:\n        if user != username:\n            distance = manhattan(users[user], users[username])\n            distances.append((distance, user))\n    # sort based on distance -- closest first\n    distances.sort()\n    return distances\n\ndef recommend(username, users):\n    \"\"\"Give list of recommendations\"\"\"\n    # first find nearest neighbor\n    nearest = computeNearestNeighbor(username, users)[0][1]\n\n    recommendations = []\n    # now find bands neighbor rated that user didn't\n    neighborRatings = users[nearest]\n    userRatings = users[username]\n    for artist in neighborRatings:\n        if not artist in userRatings:\n            recommendations.append((artist, neighborRatings[artist]))\n    # using the fn sorted for variety - sort is more efficient\n    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)\n\n"
  },
  {
    "path": "chapter-4/irisTestSet.data",
    "content": "5.1\t3.5\t1.4\t0.2\tIris-setosa\n4.9\t3.0\t1.4\t0.2\tIris-setosa\n4.7\t3.2\t1.3\t0.2\tIris-setosa\n4.6\t3.1\t1.5\t0.2\tIris-setosa\n5.0\t3.6\t1.4\t0.2\tIris-setosa\n5.4\t3.9\t1.7\t0.4\tIris-setosa\n4.6\t3.4\t1.4\t0.3\tIris-setosa\n5.0\t3.4\t1.5\t0.2\tIris-setosa\n4.4\t2.9\t1.4\t0.2\tIris-setosa\n4.9\t3.1\t1.5\t0.1\tIris-setosa\n7.0\t3.2\t4.7\t1.4\tIris-versicolor\n6.4\t3.2\t4.5\t1.5\tIris-versicolor\n6.9\t3.1\t4.9\t1.5\tIris-versicolor\n5.5\t2.3\t4.0\t1.3\tIris-versicolor\n6.5\t2.8\t4.6\t1.5\tIris-versicolor\n5.7\t2.8\t4.5\t1.3\tIris-versicolor\n6.3\t3.3\t4.7\t1.6\tIris-versicolor\n4.9\t2.4\t3.3\t1.0\tIris-versicolor\n6.6\t2.9\t4.6\t1.3\tIris-versicolor\n5.2\t2.7\t3.9\t1.4\tIris-versicolor\n6.7\t3.1\t5.6\t2.4\tIris-virginica\n6.9\t3.1\t5.1\t2.3\tIris-virginica\n5.8\t2.7\t5.1\t1.9\tIris-virginica\n6.8\t3.2\t5.9\t2.3\tIris-virginica\n6.7\t3.3\t5.7\t2.5\tIris-virginica\n6.7\t3.0\t5.2\t2.3\tIris-virginica\n6.3\t2.5\t5.0\t1.9\tIris-virginica\n6.5\t3.0\t5.2\t2.0\tIris-virginica\n6.2\t3.4\t5.4\t2.3\tIris-virginica\n5.9\t3.0\t5.1\t1.8\tIris-virginica"
  },
  {
    "path": "chapter-4/irisTrainingSet.data",
    "content": "num\tnum\tnum\tnum\tclass\n5.4\t3.7\t1.5\t0.2\tIris-setosa\n4.8\t3.4\t1.6\t0.2\tIris-setosa\n4.8\t3.0\t1.4\t0.1\tIris-setosa\n4.3\t3.0\t1.1\t0.1\tIris-setosa\n5.8\t4.0\t1.2\t0.2\tIris-setosa\n5.7\t4.4\t1.5\t0.4\tIris-setosa\n5.4\t3.9\t1.3\t0.4\tIris-setosa\n5.1\t3.5\t1.4\t0.3\tIris-setosa\n5.7\t3.8\t1.7\t0.3\tIris-setosa\n5.1\t3.8\t1.5\t0.3\tIris-setosa\n5.4\t3.4\t1.7\t0.2\tIris-setosa\n5.1\t3.7\t1.5\t0.4\tIris-setosa\n4.6\t3.6\t1.0\t0.2\tIris-setosa\n5.1\t3.3\t1.7\t0.5\tIris-setosa\n4.8\t3.4\t1.9\t0.2\tIris-setosa\n5.0\t3.0\t1.6\t0.2\tIris-setosa\n5.0\t3.4\t1.6\t0.4\tIris-setosa\n5.2\t3.5\t1.5\t0.2\tIris-setosa\n5.2\t3.4\t1.4\t0.2\tIris-setosa\n4.7\t3.2\t1.6\t0.2\tIris-setosa\n4.8\t3.1\t1.6\t0.2\tIris-setosa\n5.4\t3.4\t1.5\t0.4\tIris-setosa\n5.2\t4.1\t1.5\t0.1\tIris-setosa\n5.5\t4.2\t1.4\t0.2\tIris-setosa\n4.9\t3.1\t1.5\t0.1\tIris-setosa\n5.0\t3.2\t1.2\t0.2\tIris-setosa\n5.5\t3.5\t1.3\t0.2\tIris-setosa\n4.9\t3.1\t1.5\t0.1\tIris-setosa\n4.4\t3.0\t1.3\t0.2\tIris-setosa\n5.1\t3.4\t1.5\t0.2\tIris-setosa\n5.0\t3.5\t1.3\t0.3\tIris-setosa\n4.5\t2.3\t1.3\t0.3\tIris-setosa\n4.4\t3.2\t1.3\t0.2\tIris-setosa\n5.0\t3.5\t1.6\t0.6\tIris-setosa\n5.1\t3.8\t1.9\t0.4\tIris-setosa\n4.8\t3.0\t1.4\t0.3\tIris-setosa\n5.1\t3.8\t1.6\t0.2\tIris-setosa\n4.6\t3.2\t1.4\t0.2\tIris-setosa\n5.3\t3.7\t1.5\t0.2\tIris-setosa\n5.0\t3.3\t1.4\t0.2\tIris-setosa\n5.0\t2.0\t3.5\t1.0\tIris-versicolor\n5.9\t3.0\t4.2\t1.5\tIris-versicolor\n6.0\t2.2\t4.0\t1.0\tIris-versicolor\n6.1\t2.9\t4.7\t1.4\tIris-versicolor\n5.6\t2.9\t3.6\t1.3\tIris-versicolor\n6.7\t3.1\t4.4\t1.4\tIris-versicolor\n5.6\t3.0\t4.5\t1.5\tIris-versicolor\n5.8\t2.7\t4.1\t1.0\tIris-versicolor\n6.2\t2.2\t4.5\t1.5\tIris-versicolor\n5.6\t2.5\t3.9\t1.1\tIris-versicolor\n5.9\t3.2\t4.8\t1.8\tIris-versicolor\n6.1\t2.8\t4.0\t1.3\tIris-versicolor\n6.3\t2.5\t4.9\t1.5\tIris-versicolor\n6.1\t2.8\t4.7\t1.2\tIris-versicolor\n6.4\t2.9\t4.3\t1.3\tIris-versicolor\n6.6\t3.0\t4.4\t1.4\tIris-versicolor\n6.8\t2.8\t4.8\t1.4\tIris-versicolor\n6.7\t3.0
\t5.0\t1.7\tIris-versicolor\n6.0\t2.9\t4.5\t1.5\tIris-versicolor\n5.7\t2.6\t3.5\t1.0\tIris-versicolor\n5.5\t2.4\t3.8\t1.1\tIris-versicolor\n5.5\t2.4\t3.7\t1.0\tIris-versicolor\n5.8\t2.7\t3.9\t1.2\tIris-versicolor\n6.0\t2.7\t5.1\t1.6\tIris-versicolor\n5.4\t3.0\t4.5\t1.5\tIris-versicolor\n6.0\t3.4\t4.5\t1.6\tIris-versicolor\n6.7\t3.1\t4.7\t1.5\tIris-versicolor\n6.3\t2.3\t4.4\t1.3\tIris-versicolor\n5.6\t3.0\t4.1\t1.3\tIris-versicolor\n5.5\t2.5\t4.0\t1.3\tIris-versicolor\n5.5\t2.6\t4.4\t1.2\tIris-versicolor\n6.1\t3.0\t4.6\t1.4\tIris-versicolor\n5.8\t2.6\t4.0\t1.2\tIris-versicolor\n5.0\t2.3\t3.3\t1.0\tIris-versicolor\n5.6\t2.7\t4.2\t1.3\tIris-versicolor\n5.7\t3.0\t4.2\t1.2\tIris-versicolor\n5.7\t2.9\t4.2\t1.3\tIris-versicolor\n6.2\t2.9\t4.3\t1.3\tIris-versicolor\n5.1\t2.5\t3.0\t1.1\tIris-versicolor\n5.7\t2.8\t4.1\t1.3\tIris-versicolor\n6.3\t3.3\t6.0\t2.5\tIris-virginica\n5.8\t2.7\t5.1\t1.9\tIris-virginica\n7.1\t3.0\t5.9\t2.1\tIris-virginica\n6.3\t2.9\t5.6\t1.8\tIris-virginica\n6.5\t3.0\t5.8\t2.2\tIris-virginica\n7.6\t3.0\t6.6\t2.1\tIris-virginica\n4.9\t2.5\t4.5\t1.7\tIris-virginica\n7.3\t2.9\t6.3\t1.8\tIris-virginica\n6.7\t2.5\t5.8\t1.8\tIris-virginica\n7.2\t3.6\t6.1\t2.5\tIris-virginica\n6.5\t3.2\t5.1\t2.0\tIris-virginica\n6.4\t2.7\t5.3\t1.9\tIris-virginica\n6.8\t3.0\t5.5\t2.1\tIris-virginica\n5.7\t2.5\t5.0\t2.0\tIris-virginica\n5.8\t2.8\t5.1\t2.4\tIris-virginica\n6.4\t3.2\t5.3\t2.3\tIris-virginica\n6.5\t3.0\t5.5\t1.8\tIris-virginica\n7.7\t3.8\t6.7\t2.2\tIris-virginica\n7.7\t2.6\t6.9\t2.3\tIris-virginica\n6.0\t2.2\t5.0\t1.5\tIris-virginica\n6.9\t3.2\t5.7\t2.3\tIris-virginica\n5.6\t2.8\t4.9\t2.0\tIris-virginica\n7.7\t2.8\t6.7\t2.0\tIris-virginica\n6.3\t2.7\t4.9\t1.8\tIris-virginica\n6.7\t3.3\t5.7\t2.1\tIris-virginica\n7.2\t3.2\t6.0\t1.8\tIris-virginica\n6.2\t2.8\t4.8\t1.8\tIris-virginica\n6.1\t3.0\t4.9\t1.8\tIris-virginica\n6.4\t2.8\t5.6\t2.1\tIris-virginica\n7.2\t3.0\t5.8\t1.6\tIris-virginica\n7.4\t2.8\t6.1\t1.9\tIris-virginica\n7.9\t3.8\t6.4\t2.0\tIris-virginica\n6.4\t
2.8\t5.6\t2.2\tIris-virginica\n6.3\t2.8\t5.1\t1.5\tIris-virginica\n6.1\t2.6\t5.6\t1.4\tIris-virginica\n7.7\t3.0\t6.1\t2.3\tIris-virginica\n6.3\t3.4\t5.6\t2.4\tIris-virginica\n6.4\t3.1\t5.5\t1.8\tIris-virginica\n6.0\t3.0\t4.8\t1.8\tIris-virginica\n6.9\t3.1\t5.4\t2.1\tIris-virginica"
  },
  {
    "path": "chapter-4/mpgTestSet.txt",
    "content": "15\t8\t390.0\t190.0\t3850\t8.5\tamc ambassador dpl\n15\t8\t383.0\t170.0\t3563\t10.0\tdodge challenger se\n15\t8\t340.0\t160.0\t3609\t8.0\tplymouth 'cuda 340\n15\t8\t400.0\t150.0\t3761\t9.5\tchevrolet monte carlo\n15\t8\t455.0\t225.0\t3086\t10.0\tbuick estate wagon (sw)\n25\t4\t113.0\t95.00\t2372\t15.0\ttoyota corona mark ii\n20\t6\t198.0\t95.00\t2833\t15.5\tplymouth duster\n20\t6\t199.0\t97.00\t2774\t15.5\tamc hornet\n20\t6\t200.0\t85.00\t2587\t16.0\tford maverick\n25\t4\t97.00\t88.00\t2130\t14.5\tdatsun pl510\n25\t4\t97.00\t46.00\t1835\t20.5\tvolkswagen 1131 deluxe sedan\n25\t4\t110.0\t87.00\t2672\t17.5\tpeugeot 504\n25\t4\t107.0\t90.00\t2430\t14.5\taudi 100 ls\n25\t4\t104.0\t95.00\t2375\t17.5\tsaab 99e\n25\t4\t121.0\t113.0\t2234\t12.5\tbmw 2002\n20\t6\t199.0\t90.00\t2648\t15.0\tamc gremlin\n10\t8\t360.0\t215.0\t4615\t14.0\tford f250\n10\t8\t307.0\t200.0\t4376\t15.0\tchevy c20\n10\t8\t318.0\t210.0\t4382\t13.5\tdodge d200\n10\t8\t304.0\t193.0\t4732\t18.5\thi 1200d\n25\t4\t97.00\t88.00\t2130\t14.5\tdatsun pl510\n30\t4\t140.0\t90.00\t2264\t15.5\tchevrolet vega 2300\n25\t4\t113.0\t95.00\t2228\t14.0\ttoyota corona\n20\t6\t232.0\t100.0\t2634\t13.0\tamc gremlin\n15\t6\t225.0\t105.0\t3439\t15.5\tplymouth satellite custom\n15\t6\t250.0\t100.0\t3329\t15.5\tchevrolet chevelle malibu\n20\t6\t250.0\t88.00\t3302\t15.5\tford torino 500\n20\t6\t232.0\t100.0\t3288\t15.5\tamc matador\n15\t8\t350.0\t165.0\t4209\t12.0\tchevrolet impala\n15\t8\t400.0\t175.0\t4464\t11.5\tpontiac catalina brougham\n15\t8\t351.0\t153.0\t4154\t13.5\tford galaxie 500\n15\t8\t318.0\t150.0\t4096\t13.0\tplymouth fury iii\n10\t8\t383.0\t180.0\t4955\t11.5\tdodge monaco (sw)\n15\t8\t400.0\t170.0\t4746\t12.0\tford country squire (sw)\n15\t8\t400.0\t175.0\t5140\t12.0\tpontiac safari (sw)\n20\t6\t258.0\t110.0\t2962\t13.5\tamc hornet sportabout (sw)\n20\t4\t140.0\t72.00\t2408\t19.0\tchevrolet vega (sw)\n20\t6\t250.0\t100.0\t3282\t15.0\tpontiac firebird\n20\t6\t250.0\t88.00\t3139\t14.5\tford 
mustang\n25\t4\t122.0\t86.00\t2220\t14.0\tmercury capri 2000\n30\t4\t116.0\t90.00\t2123\t14.0\topel 1900\n30\t4\t79.00\t70.00\t2074\t19.5\tpeugeot 304\n30\t4\t88.00\t76.00\t2065\t14.5\tfiat 124b\n30\t4\t71.00\t65.00\t1773\t19.0\ttoyota corolla 1200\n35\t4\t72.00\t69.00\t1613\t18.0\tdatsun 1200\n25\t4\t97.00\t60.00\t1834\t19.0\tvolkswagen model 111\n25\t4\t91.00\t70.00\t1955\t20.5\tplymouth cricket\n25\t4\t113.0\t95.00\t2278\t15.5\ttoyota corona hardtop\n25\t4\t97.50\t80.00\t2126\t17.0\tdodge colt hardtop\n25\t4\t97.00\t54.00\t2254\t23.5\tvolkswagen type 3\n"
  },
  {
    "path": "chapter-4/mpgTrainingSet.txt",
    "content": "class\tnum\tnum\tnum\tnum\tnum\tcomment\n20\t8\t307.0\t130.0\t3504\t12.0\tchevrolet chevelle malibu\n15\t8\t350.0\t165.0\t3693\t11.5\tbuick skylark 320\n20\t8\t318.0\t150.0\t3436\t11.0\tplymouth satellite\n15\t8\t304.0\t150.0\t3433\t12.0\tamc rebel sst\n15\t8\t302.0\t140.0\t3449\t10.5\tford torino\n15\t8\t429.0\t198.0\t4341\t10.0\tford galaxie 500\n15\t8\t454.0\t220.0\t4354\t9.0\tchevrolet impala\n15\t8\t440.0\t215.0\t4312\t8.5\tplymouth fury iii\n15\t8\t455.0\t225.0\t4425\t10.0\tpontiac catalina\n20\t4\t140.0\t90.00\t2408\t19.5\tchevrolet vega\n20\t4\t122.0\t86.00\t2226\t16.5\tford pinto runabout\n15\t8\t350.0\t165.0\t4274\t12.0\tchevrolet impala\n15\t8\t400.0\t175.0\t4385\t12.0\tpontiac catalina\n15\t8\t318.0\t150.0\t4135\t13.5\tplymouth fury iii\n15\t8\t351.0\t153.0\t4129\t13.0\tford galaxie 500\n15\t8\t304.0\t150.0\t3672\t11.5\tamc ambassador sst\n10\t8\t429.0\t208.0\t4633\t11.0\tmercury marquis\n15\t8\t350.0\t155.0\t4502\t13.5\tbuick lesabre custom\n10\t8\t350.0\t160.0\t4456\t13.5\toldsmobile delta 88 royale\n15\t8\t400.0\t190.0\t4422\t12.5\tchrysler newport royal\n20\t3\t70.00\t97.00\t2330\t13.5\tmazda rx2 coupe\n15\t8\t304.0\t150.0\t3892\t12.5\tamc matador (sw)\n15\t8\t307.0\t130.0\t4098\t14.0\tchevrolet chevelle concours (sw)\n15\t8\t302.0\t140.0\t4294\t16.0\tford gran torino (sw)\n15\t8\t318.0\t150.0\t4077\t14.0\tplymouth satellite custom (sw)\n20\t4\t121.0\t112.0\t2933\t14.5\tvolvo 145e (sw)\n20\t4\t121.0\t76.00\t2511\t18.0\tvolkswagen 411 (sw)\n20\t4\t120.0\t87.00\t2979\t19.5\tpeugeot 504 (sw)\n25\t4\t96.00\t69.00\t2189\t18.0\trenault 12 (sw)\n20\t4\t122.0\t86.00\t2395\t16.0\tford pinto (sw)\n30\t4\t97.00\t92.00\t2288\t17.0\tdatsun 510 (sw)\n25\t4\t120.0\t97.00\t2506\t14.5\ttoyouta corona mark ii (sw)\n30\t4\t98.00\t80.00\t2164\t15.0\tdodge colt (sw)\n25\t4\t97.00\t88.00\t2100\t16.5\ttoyota corolla 1600 (sw)\n15\t8\t350.0\t175.0\t4100\t13.0\tbuick century 350\n15\t8\t304.0\t150.0\t3672\t11.5\tamc 
matador\n15\t8\t350.0\t145.0\t3988\t13.0\tchevrolet malibu\n15\t8\t302.0\t137.0\t4042\t14.5\tford gran torino\n15\t8\t318.0\t150.0\t3777\t12.5\tdodge coronet custom\n10\t8\t429.0\t198.0\t4952\t11.5\tmercury marquis brougham\n15\t8\t400.0\t150.0\t4464\t12.0\tchevrolet caprice classic\n15\t8\t351.0\t158.0\t4363\t13.0\tford ltd\n15\t8\t318.0\t150.0\t4237\t14.5\tplymouth fury gran sedan\n15\t8\t440.0\t215.0\t4735\t11.0\tchrysler new yorker brougham\n10\t8\t455.0\t225.0\t4951\t11.0\tbuick electra 225 custom\n15\t8\t360.0\t175.0\t3821\t11.0\tamc ambassador brougham\n20\t6\t225.0\t105.0\t3121\t16.5\tplymouth valiant\n15\t6\t250.0\t100.0\t3278\t18.0\tchevrolet nova custom\n20\t6\t232.0\t100.0\t2945\t16.0\tamc hornet\n20\t6\t250.0\t88.00\t3021\t16.5\tford maverick\n25\t6\t198.0\t95.00\t2904\t16.0\tplymouth duster\n25\t4\t97.00\t46.00\t1950\t21.0\tvolkswagen super beetle\n10\t8\t400.0\t150.0\t4997\t14.0\tchevrolet impala\n10\t8\t400.0\t167.0\t4906\t12.5\tford country\n15\t8\t360.0\t170.0\t4654\t13.0\tplymouth custom suburb\n10\t8\t350.0\t180.0\t4499\t12.5\toldsmobile vista cruiser\n20\t6\t232.0\t100.0\t2789\t15.0\tamc gremlin\n20\t4\t97.00\t88.00\t2279\t19.0\ttoyota carina\n20\t4\t140.0\t72.00\t2401\t19.5\tchevrolet vega\n20\t4\t108.0\t94.00\t2379\t16.5\tdatsun 610\n20\t3\t70.00\t90.00\t2124\t13.5\tmaxda rx3\n20\t4\t122.0\t85.00\t2310\t18.5\tford pinto\n20\t6\t155.0\t107.0\t2472\t14.0\tmercury capri v6\n25\t4\t98.00\t90.00\t2265\t15.5\tfiat 124 sport coupe\n15\t8\t350.0\t145.0\t4082\t13.0\tchevrolet monte carlo s\n15\t8\t400.0\t230.0\t4278\t9.50\tpontiac grand prix\n30\t4\t68.00\t49.00\t1867\t19.5\tfiat 128\n25\t4\t116.0\t75.00\t2158\t15.5\topel manta\n20\t4\t114.0\t91.00\t2582\t14.0\taudi 100ls\n20\t4\t121.0\t112.0\t2868\t15.5\tvolvo 144ea\n15\t8\t318.0\t150.0\t3399\t11.0\tdodge dart custom\n25\t4\t121.0\t110.0\t2660\t14.0\tsaab 99le\n20\t6\t156.0\t122.0\t2807\t13.5\ttoyota mark ii\n10\t8\t350.0\t180.0\t3664\t11.0\toldsmobile omega\n20\t6\t198.0\t95.00\t3102\t16.5\tplymouth 
duster\n20\t6\t232.0\t100.0\t2901\t16.0\tamc hornet\n15\t6\t250.0\t100.0\t3336\t17.0\tchevrolet nova\n30\t4\t79.00\t67.00\t1950\t19.0\tdatsun b210\n25\t4\t122.0\t80.00\t2451\t16.5\tford pinto\n30\t4\t71.00\t65.00\t1836\t21.0\ttoyota corolla 1200\n25\t4\t140.0\t75.00\t2542\t17.0\tchevrolet vega\n15\t6\t250.0\t100.0\t3781\t17.0\tchevrolet chevelle malibu classic\n15\t6\t258.0\t110.0\t3632\t18.0\tamc matador\n20\t6\t225.0\t105.0\t3613\t16.5\tplymouth satellite sebring\n15\t8\t302.0\t140.0\t4141\t14.0\tford gran torino\n15\t8\t350.0\t150.0\t4699\t14.5\tbuick century luxus (sw)\n15\t8\t318.0\t150.0\t4457\t13.5\tdodge coronet custom (sw)\n15\t8\t302.0\t140.0\t4638\t16.0\tford gran torino (sw)\n15\t8\t304.0\t150.0\t4257\t15.5\tamc matador (sw)\n30\t4\t98.00\t83.00\t2219\t16.5\taudi fox\n25\t4\t79.00\t67.00\t1963\t15.5\tvolkswagen dasher\n25\t4\t97.00\t78.00\t2300\t14.5\topel manta\n30\t4\t76.00\t52.00\t1649\t16.5\ttoyota corona\n30\t4\t83.00\t61.00\t2003\t19.0\tdatsun 710\n30\t4\t90.00\t75.00\t2125\t14.5\tdodge colt\n25\t4\t90.00\t75.00\t2108\t15.5\tfiat 128\n25\t4\t116.0\t75.00\t2246\t14.0\tfiat 124 tc\n25\t4\t120.0\t97.00\t2489\t15.0\thonda civic\n25\t4\t108.0\t93.00\t2391\t15.5\tsubaru\n30\t4\t79.00\t67.00\t2000\t16.0\tfiat x1.9\n20\t6\t225.0\t95.00\t3264\t16.0\tplymouth valiant custom\n20\t6\t250.0\t105.0\t3459\t16.0\tchevrolet nova\n15\t6\t250.0\t72.00\t3432\t21.0\tmercury monarch\n15\t6\t250.0\t72.00\t3158\t19.5\tford maverick\n15\t8\t400.0\t170.0\t4668\t11.5\tpontiac catalina\n15\t8\t350.0\t145.0\t4440\t14.0\tchevrolet bel air\n15\t8\t318.0\t150.0\t4498\t14.5\tplymouth grand fury\n15\t8\t351.0\t148.0\t4657\t13.5\tford ltd\n15\t6\t231.0\t110.0\t3907\t21.0\tbuick century\n15\t6\t250.0\t105.0\t3897\t18.5\tchevroelt chevelle malibu\n15\t6\t258.0\t110.0\t3730\t19.0\tamc matador\n20\t6\t225.0\t95.00\t3785\t19.0\tplymouth fury\n20\t6\t231.0\t110.0\t3039\t15.0\tbuick skyhawk\n20\t8\t262.0\t110.0\t3221\t13.5\tchevrolet monza 2+2\n15\t8\t302.0\t129.0\t3169\t12.0\tford 
mustang ii\n30\t4\t97.00\t75.00\t2171\t16.0\ttoyota corolla\n25\t4\t140.0\t83.00\t2639\t17.0\tford pinto\n20\t6\t232.0\t100.0\t2914\t16.0\tamc gremlin\n25\t4\t140.0\t78.00\t2592\t18.5\tpontiac astro\n25\t4\t134.0\t96.00\t2702\t13.5\ttoyota corona\n25\t4\t90.00\t71.00\t2223\t16.5\tvolkswagen dasher\n25\t4\t119.0\t97.00\t2545\t17.0\tdatsun 710\n20\t6\t171.0\t97.00\t2984\t14.5\tford pinto\n30\t4\t90.00\t70.00\t1937\t14.0\tvolkswagen rabbit\n20\t6\t232.0\t90.00\t3211\t17.0\tamc pacer\n25\t4\t115.0\t95.00\t2694\t15.0\taudi 100ls\n25\t4\t120.0\t88.00\t2957\t17.0\tpeugeot 504\n20\t4\t121.0\t98.00\t2945\t14.5\tvolvo 244dl\n25\t4\t121.0\t115.0\t2671\t13.5\tsaab 99le\n35\t4\t91.00\t53.00\t1795\t17.5\thonda civic cvcc\n30\t4\t107.0\t86.00\t2464\t15.5\tfiat 131\n25\t4\t116.0\t81.00\t2220\t16.9\topel 1900\n25\t4\t140.0\t92.00\t2572\t14.9\tcapri ii\n25\t4\t98.00\t79.00\t2255\t17.7\tdodge colt\n25\t4\t101.0\t83.00\t2202\t15.3\trenault 12tl\n20\t8\t305.0\t140.0\t4215\t13.0\tchevrolet chevelle malibu classic\n15\t8\t318.0\t150.0\t4190\t13.0\tdodge coronet brougham\n15\t8\t304.0\t120.0\t3962\t13.9\tamc matador\n15\t8\t351.0\t152.0\t4215\t12.8\tford gran torino\n20\t6\t225.0\t100.0\t3233\t15.4\tplymouth valiant\n20\t6\t250.0\t105.0\t3353\t14.5\tchevrolet nova\n25\t6\t200.0\t81.00\t3012\t17.6\tford maverick\n25\t6\t232.0\t90.00\t3085\t17.6\tamc hornet\n30\t4\t85.00\t52.00\t2035\t22.2\tchevrolet chevette\n25\t4\t98.00\t60.00\t2164\t22.1\tchevrolet woody\n30\t4\t90.00\t70.00\t1937\t14.2\tvw rabbit\n35\t4\t91.00\t53.00\t1795\t17.4\thonda civic\n20\t6\t225.0\t100.0\t3651\t17.7\tdodge aspen se\n20\t6\t250.0\t78.00\t3574\t21.0\tford granada ghia\n20\t6\t250.0\t110.0\t3645\t16.2\tpontiac ventura sj\n20\t6\t258.0\t95.00\t3193\t17.8\tamc pacer d/l\n30\t4\t97.00\t71.00\t1825\t12.2\tvolkswagen rabbit\n30\t4\t85.00\t70.00\t1990\t17.0\tdatsun b-210\n30\t4\t97.00\t75.00\t2155\t16.4\ttoyota corolla\n25\t4\t140.0\t72.00\t2565\t13.6\tford pinto\n20\t4\t130.0\t102.0\t3150\t15.7\tvolvo 
245\n15\t8\t318.0\t150.0\t3940\t13.2\tplymouth volare premier v8\n20\t4\t120.0\t88.00\t3270\t21.9\tpeugeot 504\n20\t6\t156.0\t108.0\t2930\t15.5\ttoyota mark ii\n15\t6\t168.0\t120.0\t3820\t16.7\tmercedes-benz 280s\n15\t8\t350.0\t180.0\t4380\t12.1\tcadillac seville\n15\t8\t350.0\t145.0\t4055\t12.0\tchevy c10\n15\t8\t302.0\t130.0\t3870\t15.0\tford f108\n15\t8\t318.0\t150.0\t3755\t14.0\tdodge d100\n30\t4\t98.00\t68.00\t2045\t18.5\thonda accord cvcc\n30\t4\t111.0\t80.00\t2155\t14.8\tbuick opel isuzu deluxe\n35\t4\t79.00\t58.00\t1825\t18.6\trenault 5 gtl\n25\t4\t122.0\t96.00\t2300\t15.5\tplymouth arrow gs\n35\t4\t85.00\t70.00\t1945\t16.8\tdatsun f-10 hatchback\n20\t8\t305.0\t145.0\t3880\t12.5\tchevrolet caprice classic\n15\t8\t260.0\t110.0\t4060\t19.0\toldsmobile cutlass supreme\n15\t8\t318.0\t145.0\t4140\t13.7\tdodge monaco brougham\n15\t8\t302.0\t130.0\t4295\t14.9\tmercury cougar brougham\n20\t6\t250.0\t110.0\t3520\t16.4\tchevrolet concours\n20\t6\t231.0\t105.0\t3425\t16.9\tbuick skylark\n20\t6\t225.0\t100.0\t3630\t17.7\tplymouth volare custom\n20\t6\t250.0\t98.00\t3525\t19.0\tford granada\n15\t8\t400.0\t180.0\t4220\t11.1\tpontiac grand prix lj\n15\t8\t350.0\t170.0\t4165\t11.4\tchevrolet monte carlo landau\n15\t8\t400.0\t190.0\t4325\t12.2\tchrysler cordoba\n15\t8\t351.0\t149.0\t4335\t14.5\tford thunderbird\n30\t4\t97.00\t78.00\t1940\t14.5\tvolkswagen rabbit custom\n25\t4\t151.0\t88.00\t2740\t16.0\tpontiac sunbird coupe\n25\t4\t97.00\t75.00\t2265\t18.2\ttoyota corolla liftback\n25\t4\t140.0\t89.00\t2755\t15.8\tford mustang ii 2+2\n30\t4\t98.00\t63.00\t2051\t17.0\tchevrolet chevette\n35\t4\t98.00\t83.00\t2075\t15.9\tdodge colt m/m\n30\t4\t97.00\t67.00\t1985\t16.4\tsubaru dl\n30\t4\t97.00\t78.00\t2190\t14.1\tvolkswagen dasher\n20\t6\t146.0\t97.00\t2815\t14.5\tdatsun 810\n20\t4\t121.0\t110.0\t2600\t12.8\tbmw 320i\n20\t3\t80.00\t110.0\t2720\t13.5\tmazda rx-4\n45\t4\t90.00\t48.00\t1985\t21.5\tvolkswagen rabbit custom diesel\n35\t4\t98.00\t66.00\t1800\t14.4\tford 
fiesta\n35\t4\t78.00\t52.00\t1985\t19.4\tmazda glc deluxe\n40\t4\t85.00\t70.00\t2070\t18.6\tdatsun b210 gx\n35\t4\t91.00\t60.00\t1800\t16.4\thonda civic cvcc\n20\t8\t260.0\t110.0\t3365\t15.5\toldsmobile cutlass salon brougham\n20\t8\t318.0\t140.0\t3735\t13.2\tdodge diplomat\n20\t8\t302.0\t139.0\t3570\t12.8\tmercury monarch ghia\n20\t6\t231.0\t105.0\t3535\t19.2\tpontiac phoenix lj\n20\t6\t200.0\t95.00\t3155\t18.2\tchevrolet malibu\n20\t6\t200.0\t85.00\t2965\t15.8\tford fairmont (auto)\n25\t4\t140.0\t88.00\t2720\t15.4\tford fairmont (man)\n20\t6\t225.0\t100.0\t3430\t17.2\tplymouth volare\n20\t6\t232.0\t90.00\t3210\t17.2\tamc concord\n20\t6\t231.0\t105.0\t3380\t15.8\tbuick century special\n20\t6\t200.0\t85.00\t3070\t16.7\tmercury zephyr\n20\t6\t225.0\t110.0\t3620\t18.7\tdodge aspen\n20\t6\t258.0\t120.0\t3410\t15.1\tamc concord d/l\n20\t8\t305.0\t145.0\t3425\t13.2\tchevrolet monte carlo landau\n20\t6\t231.0\t165.0\t3445\t13.4\tbuick regal sport coupe (turbo)\n20\t8\t302.0\t139.0\t3205\t11.2\tford futura\n20\t8\t318.0\t140.0\t4080\t13.7\tdodge magnum xe\n30\t4\t98.00\t68.00\t2155\t16.5\tchevrolet chevette\n30\t4\t134.0\t95.00\t2560\t14.2\ttoyota corona\n25\t4\t119.0\t97.00\t2300\t14.7\tdatsun 510\n30\t4\t105.0\t75.00\t2230\t14.5\tdodge omni\n20\t4\t134.0\t95.00\t2515\t14.8\ttoyota celica gt liftback\n25\t4\t156.0\t105.0\t2745\t16.7\tplymouth sapporo\n25\t4\t151.0\t85.00\t2855\t17.6\toldsmobile starfire sx\n25\t4\t119.0\t97.00\t2405\t14.9\tdatsun 200-sx\n20\t5\t131.0\t103.0\t2830\t15.9\taudi 5000\n15\t6\t163.0\t125.0\t3140\t13.6\tvolvo 264gl\n20\t4\t121.0\t115.0\t2795\t15.7\tsaab 99gle\n15\t6\t163.0\t133.0\t3410\t15.8\tpeugeot 604sl\n30\t4\t89.00\t71.00\t1990\t14.9\tvolkswagen scirocco\n30\t4\t98.00\t68.00\t2135\t16.6\thonda accord lx\n20\t6\t231.0\t115.0\t3245\t15.4\tpontiac lemans v6\n20\t6\t200.0\t85.00\t2990\t18.2\tmercury zephyr 6\n20\t4\t140.0\t88.00\t2890\t17.3\tford fairmont 4\n20\t6\t232.0\t90.00\t3265\t18.2\tamc concord dl 
6\n20\t6\t225.0\t110.0\t3360\t16.6\tdodge aspen 6\n15\t8\t305.0\t130.0\t3840\t15.4\tchevrolet caprice classic\n20\t8\t302.0\t129.0\t3725\t13.4\tford ltd landau\n15\t8\t351.0\t138.0\t3955\t13.2\tmercury grand marquis\n20\t8\t318.0\t135.0\t3830\t15.2\tdodge st. regis\n15\t8\t350.0\t155.0\t4360\t14.9\tbuick estate wagon (sw)\n15\t8\t351.0\t142.0\t4054\t14.3\tford country squire (sw)\n20\t8\t267.0\t125.0\t3605\t15.0\tchevrolet malibu classic (sw)\n20\t8\t360.0\t150.0\t3940\t13.0\tchrysler lebaron town @ country (sw)\n30\t4\t89.00\t71.00\t1925\t14.0\tvw rabbit custom\n35\t4\t86.00\t65.00\t1975\t15.2\tmaxda glc deluxe\n35\t4\t98.00\t80.00\t1915\t14.4\tdodge colt hatchback custom\n25\t4\t121.0\t80.00\t2670\t15.0\tamc spirit dl\n25\t5\t183.0\t77.00\t3530\t20.1\tmercedes benz 300d\n25\t8\t350.0\t125.0\t3900\t17.4\tcadillac eldorado\n25\t4\t141.0\t71.00\t3190\t24.8\tpeugeot 504\n25\t8\t260.0\t90.00\t3420\t22.2\toldsmobile cutlass salon brougham\n35\t4\t105.0\t70.00\t2200\t13.2\tplymouth horizon\n35\t4\t105.0\t70.00\t2150\t14.9\tplymouth horizon tc3\n30\t4\t85.00\t65.00\t2020\t19.2\tdatsun 210\n35\t4\t91.00\t69.00\t2130\t14.7\tfiat strada custom\n30\t4\t151.0\t90.00\t2670\t16.0\tbuick skylark limited\n30\t6\t173.0\t115.0\t2595\t11.3\tchevrolet citation\n25\t6\t173.0\t115.0\t2700\t12.9\toldsmobile omega brougham\n35\t4\t151.0\t90.00\t2556\t13.2\tpontiac phoenix\n40\t4\t98.00\t76.00\t2144\t14.7\tvw rabbit\n40\t4\t89.00\t60.00\t1968\t18.8\ttoyota corolla tercel\n30\t4\t98.00\t70.00\t2120\t15.5\tchevrolet chevette\n35\t4\t86.00\t65.00\t2019\t16.4\tdatsun 310\n30\t4\t151.0\t90.00\t2678\t16.5\tchevrolet citation\n25\t4\t140.0\t88.00\t2870\t18.1\tford fairmont\n25\t4\t151.0\t90.00\t3003\t20.1\tamc concord\n20\t6\t225.0\t90.00\t3381\t18.7\tdodge aspen\n35\t4\t97.00\t78.00\t2188\t15.8\taudi 4000\n30\t4\t134.0\t90.00\t2711\t15.5\ttoyota corona liftback\n30\t4\t120.0\t75.00\t2542\t17.5\tmazda 626\n35\t4\t119.0\t92.00\t2434\t15.0\tdatsun 510 
hatchback\n30\t4\t108.0\t75.00\t2265\t15.2\ttoyota corolla\n45\t4\t86.00\t65.00\t2110\t17.9\tmazda glc\n30\t4\t156.0\t105.0\t2800\t14.4\tdodge colt\n40\t4\t85.00\t65.00\t2110\t19.2\tdatsun 210\n45\t4\t90.00\t48.00\t2085\t21.7\tvw rabbit c (diesel)\n45\t4\t90.00\t48.00\t2335\t23.7\tvw dasher (diesel)\n35\t5\t121.0\t67.00\t2950\t19.9\taudi 5000s (diesel)\n30\t4\t146.0\t67.00\t3250\t21.8\tmercedes-benz 240d\n45\t4\t91.00\t67.00\t1850\t13.8\thonda civic 1500 gl\n35\t4\t97.00\t67.00\t2145\t18.0\tsubaru dl\n30\t4\t89.00\t62.00\t1845\t15.3\tvokswagen rabbit\n35\t6\t168.0\t132.0\t2910\t11.4\tdatsun 280-zx\n25\t3\t70.00\t100.0\t2420\t12.5\tmazda rx-7 gs\n35\t4\t122.0\t88.00\t2500\t15.1\ttriumph tr7 coupe\n30\t4\t107.0\t72.00\t2290\t17.0\thonda accord\n25\t4\t135.0\t84.00\t2490\t15.7\tplymouth reliant\n25\t4\t151.0\t84.00\t2635\t16.4\tbuick skylark\n25\t4\t156.0\t92.00\t2620\t14.4\tdodge aries wagon (sw)\n25\t6\t173.0\t110.0\t2725\t12.6\tchevrolet citation\n30\t4\t135.0\t84.00\t2385\t12.9\tplymouth reliant\n40\t4\t79.00\t58.00\t1755\t16.9\ttoyota starlet\n40\t4\t86.00\t64.00\t1875\t16.4\tplymouth champ\n35\t4\t81.00\t60.00\t1760\t16.1\thonda civic 1300\n30\t4\t97.00\t67.00\t2065\t17.8\tsubaru\n35\t4\t85.00\t65.00\t1975\t19.4\tdatsun 210 mpg\n40\t4\t89.00\t62.00\t2050\t17.3\ttoyota tercel\n35\t4\t91.00\t68.00\t1985\t16.0\tmazda glc 4\n35\t4\t105.0\t63.00\t2215\t14.9\tplymouth horizon 4\n35\t4\t98.00\t65.00\t2045\t16.2\tford escort 4w\n30\t4\t98.00\t65.00\t2380\t20.7\tford escort 2h\n35\t4\t105.0\t74.00\t2190\t14.2\tvolkswagen jetta\n35\t4\t107.0\t75.00\t2210\t14.4\thonda prelude\n30\t4\t108.0\t75.00\t2350\t16.8\ttoyota corolla\n35\t4\t119.0\t100.0\t2615\t14.8\tdatsun 200sx\n30\t4\t120.0\t74.00\t2635\t18.3\tmazda 626\n30\t4\t141.0\t80.00\t3230\t20.4\tpeugeot 505s turbo diesel\n30\t6\t145.0\t76.00\t3160\t19.6\tvolvo diesel\n25\t6\t168.0\t116.0\t2900\t12.6\ttoyota cressida\n25\t6\t146.0\t120.0\t2930\t13.8\tdatsun 810 maxima\n20\t6\t231.0\t110.0\t3415\t15.8\tbuick 
century\n25\t8\t350.0\t105.0\t3725\t19.0\toldsmobile cutlass ls\n20\t6\t200.0\t88.00\t3060\t17.1\tford granada gl\n20\t6\t225.0\t85.00\t3465\t16.6\tchrysler lebaron salon\n30\t4\t112.0\t88.00\t2605\t19.6\tchevrolet cavalier\n25\t4\t112.0\t88.00\t2640\t18.6\tchevrolet cavalier wagon\n35\t4\t112.0\t88.00\t2395\t18.0\tchevrolet cavalier 2-door\n30\t4\t112.0\t85.00\t2575\t16.2\tpontiac j2000 se hatchback\n30\t4\t135.0\t84.00\t2525\t16.0\tdodge aries se\n25\t4\t151.0\t90.00\t2735\t18.0\tpontiac phoenix\n25\t4\t140.0\t92.00\t2865\t16.4\tford fairmont futura\n35\t4\t105.0\t74.00\t1980\t15.3\tvolkswagen rabbit l\n35\t4\t91.00\t68.00\t2025\t18.2\tmazda glc custom l\n30\t4\t91.00\t68.00\t1970\t17.6\tmazda glc custom\n40\t4\t105.0\t63.00\t2125\t14.7\tplymouth horizon miser\n35\t4\t98.00\t70.00\t2125\t17.3\tmercury lynx l\n35\t4\t120.0\t88.00\t2160\t14.5\tnissan stanza xe\n35\t4\t107.0\t75.00\t2205\t14.5\thonda accord\n35\t4\t108.0\t70.00\t2245\t16.9\ttoyota corolla\n40\t4\t91.00\t67.00\t1965\t15.0\thonda civic\n30\t4\t91.00\t67.00\t1965\t15.7\thonda civic (auto)\n40\t4\t91.00\t67.00\t1995\t16.2\tdatsun 310 gx\n25\t6\t181.0\t110.0\t2945\t16.4\tbuick century limited\n40\t6\t262.0\t85.00\t3015\t17.0\toldsmobile cutlass ciera (diesel)\n25\t4\t156.0\t92.00\t2585\t14.5\tchrysler lebaron medallion\n20\t6\t232.0\t112.0\t2835\t14.7\tford granada l\n30\t4\t144.0\t96.00\t2665\t13.9\ttoyota celica gt\n35\t4\t135.0\t84.00\t2370\t13.0\tdodge charger 2.2\n25\t4\t151.0\t90.00\t2950\t17.3\tchevrolet camaro\n25\t4\t140.0\t86.00\t2790\t15.6\tford mustang gl\n45\t4\t97.00\t52.00\t2130\t24.6\tvw pickup\n30\t4\t135.0\t84.00\t2295\t11.6\tdodge rampage\n30\t4\t120.0\t79.00\t2625\t18.6\tford ranger\n30\t4\t119.0\t82.00\t2720\t19.4\tchevy s-10"
  },
  {
    "path": "chapter-4/nearestNeighborClassifier.py",
    "content": "#\n#  Nearest Neighbor Classifier \n#\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#\n#  Ron Zacharski\n#\n\n\n##   I am trying to make the classifier more general purpose\n##   by reading the data from a file.\n##   Each line of the file contains tab separated fields.\n##   The first line of the file describes how those fields (columns) should\n##   be interpreted. The descriptors in the fields of the first line are:\n##\n##        comment   -  this field should be interpreted as a comment\n##        class     -  this field describes the class of the field\n##        num       -  this field describes an integer attribute that should \n##                     be included in the computation.\n##\n##        more to be described as needed\n## \n##\n##    So, for example, if our file describes athletes and is of the form:\n##    Shavonte Zellous   basketball  70  155\n##    The first line might be:\n##    comment   class  num   num\n##\n##    Meaning the first column (name of the player) should be considered a comment; \n##    the next column represents the class of the entry (the sport); \n##    and the next 2 represent attributes to use in the calculations.\n##\n##    The classifer reads this file into the list called data.\n##    The format of each entry in that list is a tuple\n##  \n##    (class, normalized attribute-list, comment-list)\n##\n##    so, for example\n##\n##   [('basketball', [1.28, 1.71], ['Brittainey Raven']),\n##    ('basketball', [0.89, 1.47], ['Shavonte Zellous']),\n##    ('gymnastics', [-1.68, -0.75], ['Shawn Johnson']),\n##    ('gymnastics', [-2.27, -1.2], ['Ksenia Semenova']),\n##    ('track', [0.09, -0.06], ['Blake Russell'])]\n##\n   \n            \n\nclass Classifier:\n\n    def __init__(self, filename):\n\n        self.medianAndDeviation = []\n        \n        # reading the data in from the file\n        f = open(filename)\n        lines = f.readlines()\n        
f.close()\n        self.format = lines[0].strip().split('\\t')\n        self.data = []\n        for line in lines[1:]:\n            fields = line.strip().split('\\t')\n            ignore = []\n            vector = []\n            for i in range(len(fields)):\n                if self.format[i] == 'num':\n                    vector.append(float(fields[i]))\n                elif self.format[i] == 'comment':\n                    ignore.append(fields[i])\n                elif self.format[i] == 'class':\n                    classification = fields[i]\n            self.data.append((classification, vector, ignore))\n        self.rawData = list(self.data)\n        # get length of instance vector\n        self.vlen = len(self.data[0][1])\n        # now normalize the data\n        for i in range(self.vlen):\n            self.normalizeColumn(i)\n        \n\n        \n    \n    ##################################################\n    ###\n    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE\n\n    def getMedian(self, alist):\n        \"\"\"return median of alist\"\"\"\n        if alist == []:\n            return []\n        blist = sorted(alist)\n        length = len(alist)\n        if length % 2 == 1:\n            # length of list is odd so return middle element\n            return blist[int(((length + 1) / 2) -  1)]\n        else:\n            # length of list is even so compute midpoint\n            v1 = blist[int(length / 2)]\n            v2 =blist[(int(length / 2) - 1)]\n            return (v1 + v2) / 2.0\n        \n\n    def getAbsoluteStandardDeviation(self, alist, median):\n        \"\"\"given alist and median return absolute standard deviation\"\"\"\n        sum = 0\n        for item in alist:\n            sum += abs(item - median)\n        return sum / len(alist)\n\n\n    def normalizeColumn(self, columnNumber):\n       \"\"\"given a column number, normalize that column in self.data\"\"\"\n       # first extract values to list\n       col = [v[1][columnNumber] for v in 
self.data]\n       median = self.getMedian(col)\n       asd = self.getAbsoluteStandardDeviation(col, median)\n       #print(\"Median: %f   ASD = %f\" % (median, asd))\n       self.medianAndDeviation.append((median, asd))\n       for v in self.data:\n           v[1][columnNumber] = (v[1][columnNumber] - median) / asd\n\n\n    def normalizeVector(self, v):\n        \"\"\"We have stored the median and asd for each column.\n        We now use them to normalize vector v\"\"\"\n        vector = list(v)\n        for i in range(len(vector)):\n            (median, asd) = self.medianAndDeviation[i]\n            vector[i] = (vector[i] - median) / asd\n        return vector\n\n    \n    ###\n    ### END NORMALIZATION\n    ##################################################\n\n\n\n    def manhattan(self, vector1, vector2):\n        \"\"\"Computes the Manhattan distance.\"\"\"\n        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))\n\n\n    def nearestNeighbor(self, itemVector):\n        \"\"\"return nearest neighbor to itemVector\"\"\"\n        return min([ (self.manhattan(itemVector, item[1]), item)\n                     for item in self.data])\n    \n    def classify(self, itemVector):\n        \"\"\"Return class we think item Vector is in\"\"\"\n        return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])\n \n\ndef unitTest():\n    classifier = Classifier('athletesTrainingSet.txt')\n    br = ('Basketball', [72, 162], ['Brittainey Raven'])\n    nl = ('Gymnastics', [61, 76], ['Viktoria Komova'])\n    cl = (\"Basketball\", [74, 190], ['Crystal Langhorne'])\n    # first check normalize function\n    brNorm = classifier.normalizeVector(br[1])\n    nlNorm = classifier.normalizeVector(nl[1])\n    clNorm = classifier.normalizeVector(cl[1])\n    assert(brNorm == classifier.data[1][1])\n    assert(nlNorm == classifier.data[-1][1])\n    print('normalizeVector fn OK')\n    # check distance\n    assert (round(classifier.manhattan(clNorm, 
classifier.data[1][1]), 5) == 1.16823)\n    assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0)\n    assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0)\n    print('Manhattan distance fn OK')\n    # Brittainey Raven's nearest neighbor should be herself\n    result = classifier.nearestNeighbor(brNorm)\n    assert(result[1][2]== br[2])\n    # Nastia Liukin's nearest neighbor should be herself\n    result = classifier.nearestNeighbor(nlNorm)\n    assert(result[1][2]== nl[2])\n    # Crystal Langhorne's nearest neighbor is Jennifer Lacy\"\n    assert(classifier.nearestNeighbor(clNorm)[1][2][0] == \"Jennifer Lacy\")\n    print(\"Nearest Neighbor fn OK\")\n    # Check if classify correctly identifies sports\n    assert(classifier.classify(br[1]) == 'Basketball')\n    assert(classifier.classify(cl[1]) == 'Basketball')\n    assert(classifier.classify(nl[1]) == 'Gymnastics')\n    print('Classify fn OK')\n\ndef test(training_filename, test_filename):\n    \"\"\"Test the classifier on a test set of data\"\"\"\n    classifier = Classifier(training_filename)\n    f = open(test_filename)\n    lines = f.readlines()\n    f.close()\n    numCorrect = 0.0\n    for line in lines:\n        data = line.strip().split('\\t')\n        vector = []\n        classInColumn = -1\n        for i in range(len(classifier.format)):\n              if classifier.format[i] == 'num':\n                  vector.append(float(data[i]))\n              elif classifier.format[i] == 'class':\n                  classInColumn = i\n        theClass= classifier.classify(vector)\n        prefix = '-'\n        if theClass == data[classInColumn]:\n            # it is correct\n            numCorrect += 1\n            prefix = '+'\n        print(\"%s  %12s  %s\" % (prefix, theClass, line))\n    print(\"%4.2f%% correct\" % (numCorrect * 100/ len(lines)))\n        \n\n##\n##  Here are examples of how the classifier is used on different data sets\n##  in the book.\n#  
test('athletesTrainingSet.txt', 'athletesTestSet.txt')\n#  test(\"irisTrainingSet.data\", \"irisTestSet.data\")\n#  test(\"mpgTrainingSet.txt\", \"mpgTestSet.txt\")\n    \n"
  },
  {
    "path": "chapter-4/normalizeColumnTemplate.py",
    "content": "#\n#  normalize column \n#\n#  This is the template for you to write and test the method\n#\n#  normalizeColumn\n#\n#  You will also need the file athletesTrainingSet.txt\n#\n#  For use with the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#\n#  Ron Zacharski\n#\n\n   \n            \n\nclass Classifier:\n\n    def __init__(self, filename):\n\n        self.medianAndDeviation = []       \n        # reading the data in from the file\n        f = open(filename)\n        lines = f.readlines()\n        f.close()\n        self.format = lines[0].strip().split('\\t')\n        self.data = []\n        for line in lines[1:]:\n            fields = line.strip().split('\\t')\n            ignore = []\n            vector = []\n            for i in range(len(fields)):\n                if self.format[i] == 'num':\n                    vector.append(int(fields[i]))\n                elif self.format[i] == 'comment':\n                    ignore.append(fields[i])\n                elif self.format[i] == 'class':\n                    classification = fields[i]\n            self.data.append((classification, vector, ignore))\n        self.rawData = list(self.data)\n        # get length of instance vector\n        self.vlen = len(self.data[0][1])\n        # now normalize the data\n        for i in range(self.vlen):\n            self.normalizeColumn(i)\n        \n\n    \n\n    def getMedian(self, alist):\n        \"\"\"return median of alist\"\"\"\n        if alist == []:\n            return []\n        blist = sorted(alist)\n        length = len(alist)\n        if length % 2 == 1:\n            # length of list is odd so return middle element\n            return blist[int(((length + 1) / 2) -  1)]\n        else:\n            # length of list is even so compute midpoint\n            v1 = blist[int(length / 2)]\n            v2 =blist[(int(length / 2) - 1)]\n            return (v1 + v2) / 2.0\n        \n\n    def getAbsoluteStandardDeviation(self, alist, 
median):\n        \"\"\"given alist and median return absolute standard deviation\"\"\"\n        sum = 0\n        for item in alist:\n            sum += abs(item - median)\n        return sum / len(alist)\n\n\n    ##################################################\n    ###\n    ### FINISH WRITING THIS METHOD\n\n    \n    def normalizeColumn(self, columnNumber):\n       \"\"\"given a column number, normalize that column in self.data\n       using the Modified Standard Score\"\"\"\n\n       \"\"\" TO BE DONE\"\"\"\n\n\n    \n    ###\n    ### \n    ##################################################\n\n\n\ndef unitTest():\n    classifier = Classifier('athletesTrainingSet.txt')\n    #\n    #  test median and absolute standard deviation methods\n    list1 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 76, 68,\n             61, 58, 70, 70, 70, 63, 65, 66, 61]\n    list2 = [66, 162, 204, 90, 99, 106, 175, 123, 68,\n             200, 163, 95, 77, 108, 155, 155, 108, 106, 97, 76]\n    m1 = classifier.getMedian(list1)\n    assert(round(m1, 3) == 65.5)\n    m2 = classifier.getMedian(list2)\n    assert(round(m2, 3) == 107)\n    assert(round(classifier.getAbsoluteStandardDeviation(list1, m1),3) == 5.95)\n    assert(round(classifier.getAbsoluteStandardDeviation(list2, m2),3) == 33.65)\n    print(\"getMedian and getAbsoluteStandardDeviation are OK\")\n\n    # test normalizeColumn\n    list1 = [[-1.9328, -1.2184], [1.0924, 1.6345], [2.1008, 2.8826],\n             [-2.7731, -0.5052], [-0.084, -0.2377], [-0.4202, -0.0297],\n             [1.5966, 2.0208], [0.2521, 0.4755], [-1.9328, -1.159],\n             [1.7647, 2.7637], [0.4202, 1.6642], [-0.7563, -0.3566],\n             [-1.2605, -0.8915], [0.7563, 0.0297], [0.7563, 1.4264],\n             [0.7563, 1.4264], [-0.4202, 0.0297], [-0.084, -0.0297],\n             [0.084, -0.2972], [-0.7563, -0.9212]]\n    \n\n    for i in range(len(list1)):\n        assert(round(classifier.data[i][1][0],4) == list1[i][0])\n        
assert(round(classifier.data[i][1][1],4) == list1[i][1])\n    print(\"normalizeColumn is OK\")\n     \nunitTest()\n"
  },
  {
    "path": "chapter-4/testMedianAndASD.py",
    "content": "#\n#  Template -- please add code for the two functions\n#              getMedian\n#              getAbsoluteStandardDeviation\n#\n# also download the file athletesTrainingSet.txt, which you should\n# put in the same folder as this file.\n   \n            \n\nclass Classifier:\n\n    def __init__(self, filename):\n\n        self.medianAndDeviation = []\n        \n        # reading the data in from the file\n        f = open(filename)\n        lines = f.readlines()\n        f.close()\n        self.format = lines[0].strip().split('\\t')\n        self.data = []\n        for line in lines[1:]:\n            fields = line.strip().split('\\t')\n            ignore = []\n            vector = []\n            for i in range(len(fields)):\n                if self.format[i] == 'num':\n                    vector.append(int(fields[i]))\n                elif self.format[i] == 'comment':\n                    ignore.append(fields[i])\n                elif self.format[i] == 'class':\n                    classification = fields[i]\n            self.data.append((classification, vector, ignore))\n        self.rawData = list(self.data)\n        \n\n        \n    \n    ##################################################\n    ###\n    ###  FINISH THE FOLLOWING TWO METHODS\n\n    def getMedian(self, alist):\n        \"\"\"return median of alist\"\"\"\n\n        \"\"\"TO BE DONE\"\"\"\n        return 0\n        \n\n    def getAbsoluteStandardDeviation(self, alist, median):\n        \"\"\"given alist and median return absolute standard deviation\"\"\"\n\n        \"\"\"TO BE DONE\"\"\"\n        return 0\n\n    \n    ###\n    ### \n    ##################################################\n\n\n\ndef unitTest():\n    list1 = [54, 72, 78, 49, 65, 63, 75, 67, 54]\n    list2 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 68]\n    list3 = [69]\n    list4 = [69, 72]\n    classifier = Classifier('athletesTrainingSet.txt')\n    m1 = classifier.getMedian(list1)\n    m2 = 
classifier.getMedian(list2)\n    m3 = classifier.getMedian(list3)\n    m4 = classifier.getMedian(list4)\n    asd1 = classifier.getAbsoluteStandardDeviation(list1, m1)\n    asd2 = classifier.getAbsoluteStandardDeviation(list2, m2)\n    asd3 = classifier.getAbsoluteStandardDeviation(list3, m3)\n    asd4 = classifier.getAbsoluteStandardDeviation(list4, m4)\n    assert(round(m1, 3) == 65)\n    assert(round(m2, 3) == 66)\n    assert(round(m3, 3) == 69)\n    assert(round(m4, 3) == 70.5)\n    assert(round(asd1, 3) == 8)\n    assert(round(asd2, 3) == 7.5)\n    assert(round(asd3, 3) == 0)\n    assert(round(asd4, 3) == 1.5)\n    \n    print(\"getMedian and getAbsoluteStandardDeviation work correctly\")\n\nunitTest()\n    \n"
  },
  {
    "path": "chapter-5/crossValidation.py",
    "content": "#  \n# \n#  Nearest Neighbor Classifier for mpg dataset \n#\n#  for chapter 5 page 14\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#\n#  Ron Zacharski\n#\n\nclass Classifier:\n    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):\n\n        \"\"\" a classifier will be built from files with the bucketPrefix\n        excluding the file with textBucketNumber. dataFormat is a string that\n        describes how to interpret each line of the data files. For example,\n        for the mpg data the format is:\n\n        \"class\tnum\tnum\tnum\tnum\tnum\tcomment\"\n        \"\"\"\n   \n        self.medianAndDeviation = []\n        \n        # reading the data in from the file\n \n        self.format = dataFormat.strip().split('\\t')\n        self.data = []\n        # for each of the buckets numbered 1 through 10:\n        for i in range(1, 11):\n            # if it is not the bucket we should ignore, read in the data\n            if i != testBucketNumber:\n                filename = \"%s-%02i\" % (bucketPrefix, i)\n                f = open(filename)\n                lines = f.readlines()\n                f.close()\n                for line in lines[1:]:\n                    fields = line.strip().split('\\t')\n                    ignore = []\n                    vector = []\n                    for i in range(len(fields)):\n                        if self.format[i] == 'num':\n                            vector.append(float(fields[i]))\n                        elif self.format[i] == 'comment':\n                            ignore.append(fields[i])\n                        elif self.format[i] == 'class':\n                            classification = fields[i]\n                    self.data.append((classification, vector, ignore))\n        self.rawData = list(self.data)\n        # get length of instance vector\n        self.vlen = len(self.data[0][1])\n        # now normalize the data\n        for 
i in range(self.vlen):\n            self.normalizeColumn(i)\n        \n\n        \n    \n    ##################################################\n    ###\n    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE\n\n    def getMedian(self, alist):\n        \"\"\"return median of alist\"\"\"\n        if alist == []:\n            return []\n        blist = sorted(alist)\n        length = len(alist)\n        if length % 2 == 1:\n            # length of list is odd so return middle element\n            return blist[int(((length + 1) / 2) -  1)]\n        else:\n            # length of list is even so compute midpoint\n            v1 = blist[int(length / 2)]\n            v2 =blist[(int(length / 2) - 1)]\n            return (v1 + v2) / 2.0\n        \n\n    def getAbsoluteStandardDeviation(self, alist, median):\n        \"\"\"given alist and median return absolute standard deviation\"\"\"\n        sum = 0\n        for item in alist:\n            sum += abs(item - median)\n        return sum / len(alist)\n\n\n    def normalizeColumn(self, columnNumber):\n       \"\"\"given a column number, normalize that column in self.data\"\"\"\n       # first extract values to list\n       col = [v[1][columnNumber] for v in self.data]\n       median = self.getMedian(col)\n       asd = self.getAbsoluteStandardDeviation(col, median)\n       #print(\"Median: %f   ASD = %f\" % (median, asd))\n       self.medianAndDeviation.append((median, asd))\n       for v in self.data:\n           v[1][columnNumber] = (v[1][columnNumber] - median) / asd\n\n\n    def normalizeVector(self, v):\n        \"\"\"We have stored the median and asd for each column.\n        We now use them to normalize vector v\"\"\"\n        vector = list(v)\n        for i in range(len(vector)):\n            (median, asd) = self.medianAndDeviation[i]\n            vector[i] = (vector[i] - median) / asd\n        return vector\n    ###\n    ### END NORMALIZATION\n    ##################################################\n\n    def 
testBucket(self, bucketPrefix, bucketNumber):\n        \"\"\"Evaluate the classifier with data from the file\n        bucketPrefix-bucketNumber\"\"\"\n        \n        filename = \"%s-%02i\" % (bucketPrefix, bucketNumber)\n        f = open(filename)\n        lines = f.readlines()\n        totals = {}\n        f.close()\n        for line in lines:\n            data = line.strip().split('\\t')\n            vector = []\n            classInColumn = -1\n            for i in range(len(self.format)):\n                  if self.format[i] == 'num':\n                      vector.append(float(data[i]))\n                  elif self.format[i] == 'class':\n                      classInColumn = i\n            theRealClass = data[classInColumn]\n            classifiedAs = self.classify(vector)\n            totals.setdefault(theRealClass, {})\n            totals[theRealClass].setdefault(classifiedAs, 0)\n            totals[theRealClass][classifiedAs] += 1\n        return totals\n\n\n\n    def manhattan(self, vector1, vector2):\n        \"\"\"Computes the Manhattan distance.\"\"\"\n        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))\n\n\n    def nearestNeighbor(self, itemVector):\n        \"\"\"return nearest neighbor to itemVector\"\"\"\n        return min([ (self.manhattan(itemVector, item[1]), item)\n                     for item in self.data])\n    \n    def classify(self, itemVector):\n        \"\"\"Return class we think item Vector is in\"\"\"\n        return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])\n \n\n       \ndef tenfold(bucketPrefix, dataFormat):\n    results = {}\n    for i in range(1, 11):\n        c = Classifier(bucketPrefix, i, dataFormat)\n        t = c.testBucket(bucketPrefix, i)\n        for (key, value) in t.items():\n            results.setdefault(key, {})\n            for (ckey, cvalue) in value.items():\n                results[key].setdefault(ckey, 0)\n                results[key][ckey] += cvalue\n                \n    
# now print results\n    categories = list(results.keys())\n    categories.sort()\n    print(   \"\\n       Classified as: \")\n    header =    \"        \"\n    subheader = \"      +\"\n    for category in categories:\n        header += category + \"   \"\n        subheader += \"----+\"\n    print (header)\n    print (subheader)\n    total = 0.0\n    correct = 0.0\n    for category in categories:\n        row = category + \"    |\"\n        for c2 in categories:\n            if c2 in results[category]:\n                count = results[category][c2]\n            else:\n                count = 0\n            row += \" %2i |\" % count\n            total += count\n            if c2 == category:\n                correct += count\n        print(row)\n    print(subheader)\n    print(\"\\n%5.3f percent correct\" %((correct * 100) / total))\n    print(\"total of %i instances\" % total)\n\n\ntenfold(\"mpgData/mpgData\",        \"class\tnum\tnum\tnum\tnum\tnum\tcomment\")\n"
  },
  {
    "path": "chapter-5/divide.py",
    "content": "# divide data into 10 buckets\nimport random\n\ndef buckets(filename, bucketName, separator, classColumn):\n    \"\"\"the original data is in the file named filename\n    bucketName is the prefix for all the bucket names\n    separator is the character that divides the columns\n    (for ex., a tab or comma and classColumn is the column\n    that indicates the class\"\"\"\n\n    # put the data in 10 buckets\n    numberOfBuckets = 10\n    data = {}\n    # first read in the data and divide by category\n    with open(filename) as f:\n        lines = f.readlines()\n    for line in lines:\n        if separator != '\\t':\n            line = line.replace(separator, '\\t')\n        # first get the category\n        category = line.split()[classColumn]\n        data.setdefault(category, [])\n        data[category].append(line)\n    # initialize the buckets\n    buckets = []\n    for i in range(numberOfBuckets):\n        buckets.append([])       \n    # now for each category put the data into the buckets\n    for k in data.keys():\n        #randomize order of instances for each class\n        random.shuffle(data[k])\n        bNum = 0\n        # divide into buckets\n        for item in data[k]:\n            buckets[bNum].append(item)\n            bNum = (bNum + 1) % numberOfBuckets\n\n    # write to file\n    for bNum in range(numberOfBuckets):\n        f = open(\"%s-%02i\" % (bucketName, bNum + 1), 'w')\n        for item in buckets[bNum]:\n            f.write(item)\n        f.close()\n\n# example of how to use this code          \nbuckets(\"pimaSmall.txt\", 'pimaSmall',',',8)\n"
  },
  {
    "path": "chapter-5/pimaKNN.py",
    "content": "#  \n# \n#  Nearest Neighbor Classifier for Pima dataset\n#\n#\n#  Code file for the book Programmer's Guide to Data Mining\n#  http://guidetodatamining.com\n#\n#  Ron Zacharski\n#\nimport heapq\nimport random\n\nclass Classifier:\n    def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k):\n\n        \"\"\" a classifier will be built from files with the bucketPrefix\n        excluding the file with textBucketNumber. dataFormat is a string that\n        describes how to interpret each line of the data files. For example,\n        for the mpg data the format is:\n\n        \"class\tnum\tnum\tnum\tnum\tnum\tcomment\"\n        \"\"\"\n   \n        self.medianAndDeviation = []\n        self.k = k\n        # reading the data in from the file\n \n        self.format = dataFormat.strip().split('\\t')\n        self.data = []\n        # for each of the buckets numbered 1 through 10:\n        for i in range(1, 11):\n            # if it is not the bucket we should ignore, read in the data\n            if i != testBucketNumber:\n                filename = \"%s-%02i\" % (bucketPrefix, i)\n                f = open(filename)\n                lines = f.readlines()\n                f.close()\n                for line in lines[1:]:\n                    fields = line.strip().split('\\t')\n                    ignore = []\n                    vector = []\n                    for i in range(len(fields)):\n                        if self.format[i] == 'num':\n                            vector.append(float(fields[i]))\n                        elif self.format[i] == 'comment':\n                            ignore.append(fields[i])\n                        elif self.format[i] == 'class':\n                            classification = fields[i]\n                    self.data.append((classification, vector, ignore))\n        self.rawData = list(self.data)\n        # get length of instance vector\n        self.vlen = len(self.data[0][1])\n        # now normalize the 
data\n        for i in range(self.vlen):\n            self.normalizeColumn(i)\n        \n\n        \n    \n    ##################################################\n    ###\n    ###  CODE TO COMPUTE THE MODIFIED STANDARD SCORE\n\n    def getMedian(self, alist):\n        \"\"\"return median of alist\"\"\"\n        if alist == []:\n            return []\n        blist = sorted(alist)\n        length = len(alist)\n        if length % 2 == 1:\n            # length of list is odd so return middle element\n            return blist[int(((length + 1) / 2) -  1)]\n        else:\n            # length of list is even so compute midpoint\n            v1 = blist[int(length / 2)]\n            v2 =blist[(int(length / 2) - 1)]\n            return (v1 + v2) / 2.0\n        \n\n    def getAbsoluteStandardDeviation(self, alist, median):\n        \"\"\"given alist and median return absolute standard deviation\"\"\"\n        sum = 0\n        for item in alist:\n            sum += abs(item - median)\n        return sum / len(alist)\n\n\n    def normalizeColumn(self, columnNumber):\n       \"\"\"given a column number, normalize that column in self.data\"\"\"\n       # first extract values to list\n       col = [v[1][columnNumber] for v in self.data]\n       median = self.getMedian(col)\n       asd = self.getAbsoluteStandardDeviation(col, median)\n       #print(\"Median: %f   ASD = %f\" % (median, asd))\n       self.medianAndDeviation.append((median, asd))\n       for v in self.data:\n           v[1][columnNumber] = (v[1][columnNumber] - median) / asd\n\n\n    def normalizeVector(self, v):\n        \"\"\"We have stored the median and asd for each column.\n        We now use them to normalize vector v\"\"\"\n        vector = list(v)\n        for i in range(len(vector)):\n            (median, asd) = self.medianAndDeviation[i]\n            vector[i] = (vector[i] - median) / asd\n        return vector\n    ###\n    ### END NORMALIZATION\n    ##################################################\n\n 
   def testBucket(self, bucketPrefix, bucketNumber):\n        \"\"\"Evaluate the classifier with data from the file\n        bucketPrefix-bucketNumber\"\"\"\n        \n        filename = \"%s-%02i\" % (bucketPrefix, bucketNumber)\n        f = open(filename)\n        lines = f.readlines()\n        totals = {}\n        f.close()\n        for line in lines:\n            data = line.strip().split('\\t')\n            vector = []\n            classInColumn = -1\n            for i in range(len(self.format)):\n                  if self.format[i] == 'num':\n                      vector.append(float(data[i]))\n                  elif self.format[i] == 'class':\n                      classInColumn = i\n            theRealClass = data[classInColumn]\n            #print(\"REAL \", theRealClass)\n            classifiedAs = self.classify(vector)\n            totals.setdefault(theRealClass, {})\n            totals[theRealClass].setdefault(classifiedAs, 0)\n            totals[theRealClass][classifiedAs] += 1\n        return totals\n\n\n\n    def manhattan(self, vector1, vector2):\n        \"\"\"Computes the Manhattan distance.\"\"\"\n        return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))\n\n\n    def nearestNeighbor(self, itemVector):\n        \"\"\"return nearest neighbor to itemVector\"\"\"\n        return min([ (self.manhattan(itemVector, item[1]), item)\n                     for item in self.data])\n    \n    def knn(self, itemVector):\n        \"\"\"returns the predicted class of itemVector using k\n        Nearest Neighbors\"\"\"\n        # changed from min to heapq.nsmallest to get the\n        # k closest neighbors\n        neighbors = heapq.nsmallest(self.k,\n                                   [(self.manhattan(itemVector, item[1]), item)\n                     for item in self.data])\n        # each neighbor gets a vote\n        results = {}\n        for neighbor in neighbors: \n            theClass = neighbor[1][0]\n            results.setdefault(theClass, 
0)\n            results[theClass] += 1\n        resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True)\n        #get all the classes that have the maximum votes\n        maxVotes = resultList[0][0]\n        possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes]\n        # randomly select one of the classes that received the max votes\n        answer = random.choice(possibleAnswers)\n        return( answer)\n    \n    def classify(self, itemVector):\n        \"\"\"Return class we think item Vector is in\"\"\"\n        # k represents how many nearest neighbors to use\n        return(self.knn(self.normalizeVector(itemVector)))                             \n \n\n       \ndef tenfold(bucketPrefix, dataFormat, k):\n    results = {}\n    for i in range(1, 11):\n        c = Classifier(bucketPrefix, i, dataFormat, k)\n        t = c.testBucket(bucketPrefix, i)\n        for (key, value) in t.items():\n            results.setdefault(key, {})\n            for (ckey, cvalue) in value.items():\n                results[key].setdefault(ckey, 0)\n                results[key][ckey] += cvalue\n                \n    # now print results\n    categories = list(results.keys())\n    categories.sort()\n    print(   \"\\n       Classified as: \")\n    header =    \"        \"\n    subheader = \"      +\"\n    for category in categories:\n        header += \"% 2s   \" % category\n        subheader += \"-----+\"\n    print (header)\n    print (subheader)\n    total = 0.0\n    correct = 0.0\n    for category in categories:\n        row = \" %s    |\" % category \n        for c2 in categories:\n            if c2 in results[category]:\n                count = results[category][c2]\n            else:\n                count = 0\n            row += \" %3i |\" % count\n            total += count\n            if c2 == category:\n                correct += count\n        print(row)\n    print(subheader)\n    print(\"\\n%5.3f percent correct\" %((correct * 100) / total))\n  
  print(\"total of %i instances\" % total)\n\nprint(\"SMALL DATA SET\")\ntenfold(\"pimaSmall/pimaSmall\",\n        \"num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass\", 3)\nprint(\"\\n\\nLARGE DATA SET\")\n\ntenfold(\"pima/pima\",\n        \"num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass\", 3)\n"
  },
  {
    "path": "chapter-6/naiveBayes.py",
    "content": "  \n# \n#  Naive Bayes Classifier chapter 6\n#\n\n\n# _____________________________________________________________________\n\nclass Classifier:\n    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):\n\n        \"\"\" a classifier will be built from files with the bucketPrefix\n        excluding the file with textBucketNumber. dataFormat is a string that\n        describes how to interpret each line of the data files. For example,\n        for the iHealth data the format is:\n        \"attr\tattr\tattr\tattr\tclass\"\n        \"\"\"\n   \n        total = 0\n        classes = {}\n        counts = {}\n        \n        \n        # reading the data in from the file\n        \n        self.format = dataFormat.strip().split('\\t')\n        self.prior = {}\n        self.conditional = {}\n        # for each of the buckets numbered 1 through 10:\n        for i in range(1, 11):\n            # if it is not the bucket we should ignore, read in the data\n            if i != testBucketNumber:\n                filename = \"%s-%02i\" % (bucketPrefix, i)\n                f = open(filename)\n                lines = f.readlines()\n                f.close()\n                for line in lines:\n                    fields = line.strip().split('\\t')\n                    ignore = []\n                    vector = []\n                    for i in range(len(fields)):\n                        if self.format[i] == 'num':\n                            vector.append(float(fields[i]))\n                        elif self.format[i] == 'attr':\n                            vector.append(fields[i])                           \n                        elif self.format[i] == 'comment':\n                            ignore.append(fields[i])\n                        elif self.format[i] == 'class':\n                            category = fields[i]\n                    # now process this instance\n                    total += 1\n                    classes.setdefault(category, 
0)\n                    counts.setdefault(category, {})\n                    classes[category] += 1\n                    # now process each attribute of the instance\n                    col = 0\n                    for columnValue in vector:\n                        col += 1\n                        counts[category].setdefault(col, {})\n                        counts[category][col].setdefault(columnValue, 0)\n                        counts[category][col][columnValue] += 1\n        \n        #\n        # ok done counting. now compute probabilities\n        #\n        # first prior probabilities p(h)\n        #\n        for (category, count) in classes.items():\n            self.prior[category] = count / total\n        #\n        # now compute conditional probabilities p(h|D)\n        #\n        for (category, columns) in counts.items():\n              self.conditional.setdefault(category, {})\n              for (col, valueCounts) in columns.items():\n                  self.conditional[category].setdefault(col, {})\n                  for (attrValue, count) in valueCounts.items():\n                      self.conditional[category][col][attrValue] = (\n                          count / classes[category])\n        self.tmp =  counts               \n        \n\n           \n    def testBucket(self, bucketPrefix, bucketNumber):\n        \"\"\"Evaluate the classifier with data from the file\n        bucketPrefix-bucketNumber\"\"\"\n        \n        filename = \"%s-%02i\" % (bucketPrefix, bucketNumber)\n        f = open(filename)\n        lines = f.readlines()\n        totals = {}\n        f.close()\n        loc = 1\n        for line in lines:\n            loc += 1\n            data = line.strip().split('\\t')\n            vector = []\n            classInColumn = -1\n            for i in range(len(self.format)):\n                  if self.format[i] == 'num':\n                      vector.append(float(data[i]))\n                  elif self.format[i] == 'attr':\n             
         vector.append(data[i])\n                  elif self.format[i] == 'class':\n                      classInColumn = i\n            theRealClass = data[classInColumn]\n            classifiedAs = self.classify(vector)\n            totals.setdefault(theRealClass, {})\n            totals[theRealClass].setdefault(classifiedAs, 0)\n            totals[theRealClass][classifiedAs] += 1\n        return totals\n\n\n    \n    def classify(self, itemVector):\n        \"\"\"Return class we think item Vector is in\"\"\"\n        results = []\n        for (category, prior) in self.prior.items():\n            prob = prior\n            col = 1\n            for attrValue in itemVector:\n                if not attrValue in self.conditional[category][col]:\n                    # we did not find any instances of this attribute value\n                    # occurring with this category so prob = 0\n                    prob = 0\n                else:\n                    prob = prob * self.conditional[category][col][attrValue]\n                col += 1\n            results.append((prob, category))\n        # return the category with the highest probability\n        return(max(results)[1])\n \n\ndef tenfold(bucketPrefix, dataFormat):\n    results = {}\n    for i in range(1, 11):\n        c = Classifier(bucketPrefix, i, dataFormat)\n        t = c.testBucket(bucketPrefix, i)\n        for (key, value) in t.items():\n            results.setdefault(key, {})\n            for (ckey, cvalue) in value.items():\n                results[key].setdefault(ckey, 0)\n                results[key][ckey] += cvalue\n                \n    # now print results\n    categories = list(results.keys())\n    categories.sort()\n    print(   \"\\n            Classified as: \")\n    header =    \"             \"\n    subheader = \"               +\"\n    for category in categories:\n        header += \"% 10s   \" % category\n        subheader += \"-------+\"\n    print (header)\n    print (subheader)\n    total = 
0.0\n    correct = 0.0\n    for category in categories:\n        row = \" %10s    |\" % category \n        for c2 in categories:\n            if c2 in results[category]:\n                count = results[category][c2]\n            else:\n                count = 0\n            row += \" %5i |\" % count\n            total += count\n            if c2 == category:\n                correct += count\n        print(row)\n    print(subheader)\n    print(\"\\n%5.3f percent correct\" %((correct * 100) / total))\n    print(\"total of %i instances\" % total)\n\ntenfold(\"house-votes/hv\", \"class\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\")\n#c = Classifier(\"house-votes/hv\", 0,\n#                       \"class\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\")\n\n#c = Classifier(\"iHealth/i\", 10,\n#                       \"attr\\tattr\\tattr\\tattr\\tclass\")\n#print(c.classify(['health', 'moderate', 'moderate', 'yes']))\n\n#c = Classifier(\"house-votes-filtered/hv\", 5, \"class\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\")\n#t = c.testBucket(\"house-votes-filtered/hv\", 5)\n#print(t)\n\n"
  },
  {
    "path": "chapter-6/naiveBayesDensityFunction.py",
    "content": "  \n# \n#  Naive Bayes Classifier chapter 6\n#\n\n\n# _____________________________________________________________________\n\nimport math\n\nclass Classifier:\n    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):\n\n        \"\"\" a classifier will be built from files with the bucketPrefix\n        excluding the file with textBucketNumber. dataFormat is a string that\n        describes how to interpret each line of the data files. For example,\n        for the iHealth data the format is:\n        \"attr\tattr\tattr\tattr\tclass\"\n        \"\"\"\n   \n        total = 0\n        classes = {}\n        # counts used for attributes that are not numeric\n        counts = {}\n        # totals used for attributes that are numereric\n        # we will use these to compute the mean and sample standard deviation for\n        # each attribute - class pair.\n        totals = {}\n        numericValues = {}\n        \n        \n        # reading the data in from the file\n        \n        self.format = dataFormat.strip().split('\\t')\n        # \n        self.prior = {}\n        self.conditional = {}\n \n        # for each of the buckets numbered 1 through 10:\n        for i in range(1, 11):\n            # if it is not the bucket we should ignore, read in the data\n            if i != testBucketNumber:\n                filename = \"%s-%02i\" % (bucketPrefix, i)\n                f = open(filename)\n                lines = f.readlines()\n                f.close()\n                for line in lines:\n                    fields = line.strip().split('\\t')\n                    ignore = []\n                    vector = []\n                    nums = []\n                    for i in range(len(fields)):\n                        if self.format[i] == 'num':\n                            nums.append(float(fields[i]))\n                        elif self.format[i] == 'attr':\n                            vector.append(fields[i])                           \n      
                  elif self.format[i] == 'comment':\n                            ignore.append(fields[i])\n                        elif self.format[i] == 'class':\n                            category = fields[i]\n                    # now process this instance\n                    total += 1\n                    classes.setdefault(category, 0)\n                    counts.setdefault(category, {})\n                    totals.setdefault(category, {})\n                    numericValues.setdefault(category, {})\n                    classes[category] += 1\n                    # now process each non-numeric attribute of the instance\n                    col = 0\n                    for columnValue in vector:\n                        col += 1\n                        counts[category].setdefault(col, {})\n                        counts[category][col].setdefault(columnValue, 0)\n                        counts[category][col][columnValue] += 1\n                    # process numeric attributes\n                    col = 0\n                    for columnValue in nums:\n                        col += 1\n                        totals[category].setdefault(col, 0)\n                        #totals[category][col].setdefault(columnValue, 0)\n                        totals[category][col] += columnValue\n                        numericValues[category].setdefault(col, [])\n                        numericValues[category][col].append(columnValue)\n                    \n        \n        #\n        # ok done counting. 
now compute probabilities\n        #\n        # first prior probabilities p(h)\n        #\n        for (category, count) in classes.items():\n            self.prior[category] = count / total\n        #\n        # now compute conditional probabilities p(h|D)\n        #\n        for (category, columns) in counts.items():\n              self.conditional.setdefault(category, {})\n              for (col, valueCounts) in columns.items():\n                  self.conditional[category].setdefault(col, {})\n                  for (attrValue, count) in valueCounts.items():\n                      self.conditional[category][col][attrValue] = (\n                          count / classes[category])\n        self.tmp =  counts               \n        #\n        # now compute mean and sample standard deviation\n        #\n        self.means = {}\n        self.totals = totals\n        for (category, columns) in totals.items():\n            self.means.setdefault(category, {})\n            for (col, cTotal) in columns.items():\n                self.means[category][col] = cTotal / classes[category]\n        # standard deviation\n        self.ssd = {}\n        for (category, columns) in numericValues.items():\n            \n            self.ssd.setdefault(category, {})\n            for (col, values) in columns.items():\n                SumOfSquareDifferences = 0\n                theMean = self.means[category][col]\n                for value in values:\n                    SumOfSquareDifferences += (value - theMean)**2\n                columns[col] = 0\n                self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category]  - 1))      \n        \n\n           \n    def testBucket(self, bucketPrefix, bucketNumber):\n        \"\"\"Evaluate the classifier with data from the file\n        bucketPrefix-bucketNumber\"\"\"\n        \n        filename = \"%s-%02i\" % (bucketPrefix, bucketNumber)\n        f = open(filename)\n        lines = f.readlines()\n        totals = 
{}\n        f.close()\n        loc = 1\n        for line in lines:\n            loc += 1\n            data = line.strip().split('\\t')\n            vector = []\n            numV = []\n            classInColumn = -1\n            for i in range(len(self.format)):\n                  if self.format[i] == 'num':\n                      numV.append(float(data[i]))\n                  elif self.format[i] == 'attr':\n                      vector.append(data[i])\n                  elif self.format[i] == 'class':\n                      classInColumn = i\n            theRealClass = data[classInColumn]\n            classifiedAs = self.classify(vector, numV)\n            totals.setdefault(theRealClass, {})\n            totals[theRealClass].setdefault(classifiedAs, 0)\n            totals[theRealClass][classifiedAs] += 1\n        return totals\n\n\n    \n    def classify(self, itemVector, numVector):\n        \"\"\"Return class we think item Vector is in\"\"\"\n        results = []\n        sqrt2pi = math.sqrt(2 * math.pi)\n        for (category, prior) in self.prior.items():\n            prob = prior\n            col = 1\n            for attrValue in itemVector:\n                if not attrValue in self.conditional[category][col]:\n                    # we did not find any instances of this attribute value\n                    # occurring with this category so prob = 0\n                    prob = 0\n                else:\n                    prob = prob * self.conditional[category][col][attrValue]\n                col += 1\n            col = 1\n            for x in  numVector:\n                mean = self.means[category][col]\n                ssd = self.ssd[category][col]\n                ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))\n                prob = prob * ((1.0 / (sqrt2pi*ssd)) * ePart)\n                col += 1\n            results.append((prob, category))\n        # return the category with the highest probability\n        #print(results)\n        
return(max(results)[1])\n \n\ndef tenfold(bucketPrefix, dataFormat):\n    results = {}\n    for i in range(1, 11):\n        c = Classifier(bucketPrefix, i, dataFormat)\n        t = c.testBucket(bucketPrefix, i)\n        for (key, value) in t.items():\n            results.setdefault(key, {})\n            for (ckey, cvalue) in value.items():\n                results[key].setdefault(ckey, 0)\n                results[key][ckey] += cvalue\n                \n    # now print results\n    categories = list(results.keys())\n    categories.sort()\n    print(   \"\\n            Classified as: \")\n    header =    \"             \"\n    subheader = \"               +\"\n    for category in categories:\n        header += \"% 10s   \" % category\n        subheader += \"-------+\"\n    print (header)\n    print (subheader)\n    total = 0.0\n    correct = 0.0\n    for category in categories:\n        row = \" %10s    |\" % category \n        for c2 in categories:\n            if c2 in results[category]:\n                count = results[category][c2]\n            else:\n                count = 0\n            row += \" %5i |\" % count\n            total += count\n            if c2 == category:\n                correct += count\n        print(row)\n    print(subheader)\n    print(\"\\n%5.3f percent correct\" %((correct * 100) / total))\n    print(\"total of %i instances\" % total)\n\n\ndef pdf(mean, ssd, x):\n   \"\"\"Probability Density Function  computing P(x|y)\n   input is the mean, sample standard deviation for all the items in y,\n   and x.\"\"\"\n   ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2))\n   print (ePart)\n   return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart\n\n#tenfold(\"house-votes/hv\", \"class\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\")\n#c = Classifier(\"house-votes/hv\", 0,\n#                       
\"class\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\")\ntenfold(\"pimaSmall/pimaSmall\",  \"num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass\")\ntenfold(\"pima/pima\",  \"num\tnum\tnum\tnum\tnum\tnum\tnum\tnum\tclass\")\n\n#c = Classifier(\"iHealth/i\", 10,\n#                       \"attr\\tattr\\tattr\\tattr\\tclass\")\n#print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26]))\n\n#c = Classifier(\"house-votes-filtered/hv\", 5, \"class\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\\tattr\")\n#t = c.testBucket(\"house-votes-filtered/hv\", 5)\n#print(t)\n\n"
  },
  {
    "path": "chapter-7/bayesSentiment.py",
    "content": "from __future__ import print_function\nimport os, codecs, math\n\nclass BayesText:\n\n    def __init__(self, trainingdir, stopwordlist, ignoreBucket):\n        \"\"\"This class implements a naive Bayes approach to text\n        classification\n        trainingdir is the training data. Each subdirectory of\n        trainingdir is titled with the name of the classification\n        category -- those subdirectories in turn contain the text\n        files for that category.\n        The stopwordlist is a list of words (one per line) will be\n        removed before any counting takes place.\n        \"\"\"\n        self.vocabulary = {}\n        self.prob = {}\n        self.totals = {}\n        self.stopwords = {}\n        f = open(stopwordlist)\n        for line in f:\n            self.stopwords[line.strip()] = 1\n        f.close()\n        categories = os.listdir(trainingdir)\n        #filter out files that are not directories\n        self.categories = [filename for filename in categories\n                           if os.path.isdir(trainingdir + filename)]\n        print(\"Counting ...\")\n        for category in self.categories:\n            #print('    ' + category)\n            (self.prob[category],\n             self.totals[category]) = self.train(trainingdir, category,\n                                                 ignoreBucket)\n        # I am going to eliminate any word in the vocabulary\n        # that doesn't occur at least 3 times\n        toDelete = []\n        for word in self.vocabulary:\n            if self.vocabulary[word] < 3:\n                # mark word for deletion\n                # can't delete now because you can't delete\n                # from a list you are currently iterating over\n                toDelete.append(word)\n        # now delete\n        for word in toDelete:\n            del self.vocabulary[word]\n        # now compute probabilities\n        vocabLength = len(self.vocabulary)\n        #print(\"Computing 
probabilities:\")\n        for category in self.categories:\n            #print('    ' + category)\n            denominator = self.totals[category] + vocabLength\n            for word in self.vocabulary:\n                if word in self.prob[category]:\n                    count = self.prob[category][word]\n                else:\n                    count = 1\n                self.prob[category][word] = (float(count + 1)\n                                             / denominator)\n        #print (\"DONE TRAINING\\n\\n\")\n                    \n\n    def train(self, trainingdir, category, bucketNumberToIgnore):\n        \"\"\"counts word occurrences for a particular category\"\"\"\n        ignore = \"%i\" % bucketNumberToIgnore\n        currentdir = trainingdir + category\n        directories = os.listdir(currentdir)\n        counts = {}\n        total = 0\n        for directory in directories:\n            if directory != ignore:\n                currentBucket = trainingdir + category + \"/\" + directory\n                files = os.listdir(currentBucket)\n                #print(\"   \" + currentBucket)\n                for file in files:\n                    f = codecs.open(currentBucket + '/' + file, 'r', 'iso8859-1')\n                    for line in f:\n                        tokens = line.split()\n                        for token in tokens:\n                            # get rid of punctuation and lowercase token\n                            token = token.strip('\\'\".,?:-')\n                            token = token.lower()\n                            if token != '' and not token in self.stopwords:\n                                self.vocabulary.setdefault(token, 0)\n                                self.vocabulary[token] += 1\n                                counts.setdefault(token, 0)\n                                counts[token] += 1\n                                total += 1\n                    f.close()\n        return(counts, total)\n               
     \n                    \n    def classify(self, filename):\n        results = {}\n        for category in self.categories:\n            results[category] = 0\n        f = codecs.open(filename, 'r', 'iso8859-1')\n        for line in f:\n            tokens = line.split()\n            for token in tokens:\n                #print(token)\n                token = token.strip('\\'\".,?:-').lower()\n                if token in self.vocabulary:\n                    for category in self.categories:\n                        if self.prob[category][token] == 0:\n                            print(\"%s %s\" % (category, token))\n                        results[category] += math.log(\n                            self.prob[category][token])\n        f.close()\n        results = list(results.items())\n        results.sort(key=lambda tuple: tuple[1], reverse = True)\n        # for debugging I can change this to give me the entire list\n        return results[0][0]\n\n    def testCategory(self, direc, category, bucketNumber):\n        results = {}\n        directory = direc + (\"%i/\" % bucketNumber)\n        #print(\"Testing \" + directory)\n        files = os.listdir(directory)\n        total = 0\n        correct = 0\n        for file in files:\n            total += 1\n            result = self.classify(directory + file)\n            results.setdefault(result, 0)\n            results[result] += 1\n            #if result == category:\n            #               correct += 1\n        return results\n\n    def test(self, testdir, bucketNumber):\n        \"\"\"Test all files in the test directory--that directory is\n        organized into subdirectories--each subdir is a classification\n        category\"\"\"\n        results = {}\n        categories = os.listdir(testdir)\n        #filter out files that are not directories\n        categories = [filename for filename in categories if\n                      os.path.isdir(testdir + filename)]\n        correct = 0\n        total = 0\n 
       for category in categories:\n            #print(\".\", end=\"\")\n            results[category] = self.testCategory(\n                testdir + category + '/', category, bucketNumber)\n        return results\n\ndef tenfold(dataPrefix, stoplist):\n    results = {}\n    for i in range(0,10):\n        bT = BayesText(dataPrefix, stoplist, i)\n        r = bT.test(theDir, i)\n        for (key, value) in r.items():\n            results.setdefault(key, {})\n            for (ckey, cvalue) in value.items():\n                results[key].setdefault(ckey, 0)\n                results[key][ckey] += cvalue\n                categories = list(results.keys())\n    categories.sort()\n    print(   \"\\n       Classified as: \")\n    header =    \"          \"\n    subheader = \"        +\"\n    for category in categories:\n        header += \"% 2s   \" % category\n        subheader += \"-----+\"\n    print (header)\n    print (subheader)\n    total = 0.0\n    correct = 0.0\n    for category in categories:\n        row = \" %s    |\" % category \n        for c2 in categories:\n            if c2 in results[category]:\n                count = results[category][c2]\n            else:\n                count = 0\n            row += \" %3i |\" % count\n            total += count\n            if c2 == category:\n                correct += count\n        print(row)\n    print(subheader)\n    print(\"\\n%5.3f percent correct\" %((correct * 100) / total))\n    print(\"total of %i instances\" % total)\n\n# change these to match your directory structure\nprefixPath = \"/Users/raz/Dropbox/guide/data/review_polarity_buckets/\"\ntheDir = prefixPath + \"/txt_sentoken/\"\nstoplistfile = prefixPath + \"stopwords25.txt\"\ntenfold(theDir, stoplistfile)\n"
  },
  {
    "path": "chapter-7/bayesText.py",
    "content": "from __future__ import print_function\nimport os, codecs, math\n\nclass BayesText:\n\n    def __init__(self, trainingdir, stopwordlist):\n        \"\"\"This class implements a naive Bayes approach to text\n        classification\n        trainingdir is the training data. Each subdirectory of\n        trainingdir is titled with the name of the classification\n        category -- those subdirectories in turn contain the text\n        files for that category.\n        The stopwordlist is a list of words (one per line) will be\n        removed before any counting takes place.\n        \"\"\"\n        self.vocabulary = {}\n        self.prob = {}\n        self.totals = {}\n        self.stopwords = {}\n        f = open(stopwordlist)\n        for line in f:\n            self.stopwords[line.strip()] = 1\n        f.close()\n        categories = os.listdir(trainingdir)\n        #filter out files that are not directories\n        self.categories = [filename for filename in categories\n                           if os.path.isdir(trainingdir + filename)]\n        print(\"Counting ...\")\n        for category in self.categories:\n            print('    ' + category)\n            (self.prob[category],\n             self.totals[category]) = self.train(trainingdir, category)\n        # I am going to eliminate any word in the vocabulary\n        # that doesn't occur at least 3 times\n        toDelete = []\n        for word in self.vocabulary:\n            if self.vocabulary[word] < 3:\n                # mark word for deletion\n                # can't delete now because you can't delete\n                # from a list you are currently iterating over\n                toDelete.append(word)\n        # now delete\n        for word in toDelete:\n            del self.vocabulary[word]\n        # now compute probabilities\n        vocabLength = len(self.vocabulary)\n        print(\"Computing probabilities:\")\n        for category in self.categories:\n            print('    ' 
+ category)\n            denominator = self.totals[category] + vocabLength\n            for word in self.vocabulary:\n                if word in self.prob[category]:\n                    count = self.prob[category][word]\n                else:\n                    count = 1\n                self.prob[category][word] = (float(count + 1)\n                                             / denominator)\n        print (\"DONE TRAINING\\n\\n\")\n                    \n\n    def train(self, trainingdir, category):\n        \"\"\"counts word occurrences for a particular category\"\"\"\n        currentdir = trainingdir + category\n        files = os.listdir(currentdir)\n        counts = {}\n        total = 0\n        for file in files:\n            #print(currentdir + '/' + file)\n            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')\n            for line in f:\n                tokens = line.split()\n                for token in tokens:\n                    # get rid of punctuation and lowercase token\n                    token = token.strip('\\'\".,?:-')\n                    token = token.lower()\n                    if token != '' and not token in self.stopwords:\n                        self.vocabulary.setdefault(token, 0)\n                        self.vocabulary[token] += 1\n                        counts.setdefault(token, 0)\n                        counts[token] += 1\n                        total += 1\n            f.close()\n        return(counts, total)\n                    \n                    \n    def classify(self, filename):\n        results = {}\n        for category in self.categories:\n            results[category] = 0\n        f = codecs.open(filename, 'r', 'iso8859-1')\n        for line in f:\n            tokens = line.split()\n            for token in tokens:\n                #print(token)\n                token = token.strip('\\'\".,?:-').lower()\n                if token in self.vocabulary:\n                    for category in 
self.categories:\n                        if self.prob[category][token] == 0:\n                            print(\"%s %s\" % (category, token))\n                        results[category] += math.log(\n                            self.prob[category][token])\n        f.close()\n        results = list(results.items())\n        results.sort(key=lambda tuple: tuple[1], reverse = True)\n        # for debugging I can change this to give me the entire list\n        return results[0][0]\n\n    def testCategory(self, directory, category):\n        files = os.listdir(directory)\n        total = 0\n        correct = 0\n        for file in files:\n            total += 1\n            result = self.classify(directory + file)\n            if result == category:\n                correct += 1\n        return (correct, total)\n\n    def test(self, testdir):\n        \"\"\"Test all files in the test directory--that directory is\n        organized into subdirectories--each subdir is a classification\n        category\"\"\"\n        categories = os.listdir(testdir)\n        #filter out files that are not directories\n        categories = [filename for filename in categories if\n                      os.path.isdir(testdir + filename)]\n        correct = 0\n        total = 0\n        for category in categories:\n            print(\".\", end=\"\")\n            (catCorrect, catTotal) = self.testCategory(\n                testdir + category + '/', category)\n            correct += catCorrect\n            total += catTotal\n        print(\"\\n\\nAccuracy is  %f%%  (%i test instances)\" %\n              ((float(correct) / total) * 100, total))\n            \n# change these to match your directory structure\nbaseDirectory = \"/Users/raz/Dropbox/guide/data/20news-bydate/\"\ntrainingDir = baseDirectory + \"20news-bydate-train/\"\ntestDir = baseDirectory + \"20news-bydate-test/\"\n\n\nstoplistfile = \"/Users/raz/Downloads/20news-bydate/stopwords0.txt\"\nprint(\"Reg stoplist 0 \")\nbT = 
BayesText(trainingDir, baseDirectory + \"stopwords0.txt\")\nprint(\"Running Test ...\")\nbT.test(testDir)\n\nprint(\"\\n\\nReg stoplist 25 \")\nbT = BayesText(trainingDir, baseDirectory + \"stopwords25.txt\")\nprint(\"Running Test ...\")\nbT.test(testDir)\n\nprint(\"\\n\\nReg stoplist 174 \")\nbT = BayesText(trainingDir, baseDirectory + \"stopwords174.txt\")\nprint(\"Running Test ...\")\nbT.test(testDir)\n"
  },
  {
    "path": "chapter-8/cereal.csv",
    "content": "Name,Calories,Protein,Fat (g),Sodium (mg),dietary fiber (g),carbohydrates (g),sugar,x,\n100% Bran,70,4,1,130,10,5,6,280,25\n100% Natural Bran,120,3,5,15,2,8,8,135,0\nAll-Bran,70,4,1,260,9,7,5,320,25\nAll-Bran with Extra Fiber,50,4,0,140,14,8,0,330,25\nAlmond Delight,110,2,2,200,1,14,8,-1,25\nApple Cinnamon Cheerios,110,2,2,180,1.5,10.5,10,70,25\nApple Jacks,110,2,0,125,1,11,14,30,25\nBasic 4,130,3,2,210,2,18,8,100,25\nBran Chex,90,2,1,200,4,15,6,125,25\nBran Flakes,90,3,0,210,5,13,5,190,25\nCap'n'Crunch,120,1,2,220,0,12,12,35,25\nCheerios,110,6,2,290,2,17,1,105,25\nCinnamon Toast Crunch,120,1,3,210,0,13,9,45,25\nClusters,110,3,2,140,2,13,7,105,25\nCocoa Puffs,110,1,1,180,0,12,13,55,25\nCorn Chex,110,2,0,280,0,22,3,25,25\nCorn Flakes,100,2,0,290,1,21,2,35,25\nCorn Pops,110,1,0,90,1,13,12,20,25\nCount Chocula,110,1,1,180,0,12,13,65,25\nCracklin' Oat Bran,110,3,3,140,4,10,7,160,25\nCream of Wheat (Quick),100,3,0,80,1,21,0,-1,0\nCrispix,110,2,0,220,1,21,3,30,25\nCrispy Wheat & Raisins,100,2,1,140,2,11,10,120,25\nDouble Chex,100,2,0,190,1,18,5,80,25\nFroot Loops,110,2,1,125,1,11,13,30,25\nFrosted Flakes,110,1,0,200,1,14,11,25,25\nFrosted Mini-Wheats,100,3,0,0,3,14,7,100,25\nFruit & Fibre,120,3,2,160,5,12,10,200,25\nFruitful Bran,120,3,0,240,5,14,12,190,25\nFruity Pebbles,110,1,1,135,0,13,12,25,25\nGolden Crisp,100,2,0,45,0,11,15,40,25\nGolden Grahams,110,1,1,280,0,15,9,45,25\nGrape Nuts Flakes,100,3,1,140,3,15,5,85,25\nGrape-Nuts,110,3,0,170,3,17,3,90,25\nGreat Grains Pecan,120,3,3,75,3,13,4,100,25\nHoney Graham Ohs,120,1,2,220,1,12,11,45,25\nHoney Nut Cheerios,110,3,1,250,1.5,11.5,10,90,25\nHoney-comb,110,1,0,180,0,14,11,35,25\nJust Right Crunchy  Nuggets,110,2,1,170,1,17,6,60,100\nJust Right Fruit & Nut,140,3,1,170,2,20,9,95,100\nKix,110,2,1,260,0,21,3,40,25\nLife,100,4,2,150,2,12,6,95,25\nLucky Charms,110,2,1,180,0,12,12,55,25\nMaypo,100,4,1,0,0,16,3,95,25\nMuesli Raisins & Almonds,150,4,3,95,3,16,11,170,25\nMuesli Peaches & 
Pecans,150,4,3,150,3,16,11,170,25\nMueslix Crispy Blend,160,3,2,150,3,17,13,160,25\nMulti-Grain Cheerios,100,2,1,220,2,15,6,90,25\nNut&Honey Crunch,120,2,1,190,0,15,9,40,25\nNutri-Grain Almond-Raisin,140,3,2,220,3,21,7,130,25\nNutri-grain Wheat,90,3,0,170,3,18,2,90,25\nOatmeal Raisin Crisp,130,3,2,170,1.5,13.5,10,120,25\nPost Nat. Raisin Bran,120,3,1,200,6,11,14,260,25\nProduct 19,100,3,0,320,1,20,3,45,100\nPuffed Rice,50,1,0,0,0,13,0,15,0\nPuffed Wheat,50,2,0,0,1,10,0,50,0\nQuaker Oat Squares,100,4,1,135,2,14,6,110,25\nQuaker Oatmeal,100,5,2,0,2.7,-1,-1,110,0\nRaisin Bran,120,3,1,210,5,14,12,240,25\nRaisin Nut Bran,100,3,2,140,2.5,10.5,8,140,25\nRaisin Squares,90,2,0,0,2,15,6,110,25\nRice Chex,110,1,0,240,0,23,2,30,25\nRice Krispies,110,2,0,290,0,22,3,35,25\nShredded Wheat,80,2,0,0,3,16,0,95,0\nShredded Wheat 'n'Bran,90,3,0,0,4,19,0,140,0\nShredded Wheat spoon size,90,3,0,0,3,20,0,120,0\nSmacks,110,2,1,70,1,9,15,40,25\nSpecial K,110,6,0,230,1,16,3,55,25\nStrawberry Fruit Wheats,90,2,0,15,3,15,5,90,25\nTotal Corn Flakes,110,2,1,200,0,21,3,35,100\nTotal Raisin Bran,140,3,1,190,4,15,14,230,100\nTotal Whole Grain,100,3,1,200,3,16,3,110,100\nTriples,110,2,1,250,0,21,3,60,25\nTrix,110,1,1,140,0,13,12,25,25\nWheat Chex,100,3,1,230,3,17,3,115,25\nWheaties,100,3,1,200,3,17,3,110,25\nWheaties Honey Gold,110,2,1,200,1,16,8,60,25\n"
  },
  {
    "path": "chapter-8/dogs.csv",
    "content": "breed,height (inches),weight (pounds)\r\nBorder Collie,20,45\r\nBoston Terrier,16,20\r\nBrittany Spaniel,18,35\r\nBullmastiff,27,120\r\nChihuahua,8,8\r\nGerman Shepherd,25,78\r\nGolden Retriever,23,70\r\nGreat Dane,32,160\r\nPortuguese Water Dog,21,50\r\nStandard Poodle,19,65\r\nYorkshire Terrier,6,7"
  },
  {
    "path": "chapter-8/enrondata.txt",
    "content": "kay.mann@enron.com,vince.kaminski@enron.com,jeff.dasovich@enron.com,pete.davis@enron.com,chris.germany@enron.com,sara.shackleton@enron.com,tana.jones@enron.com,steven.kean@enron.com,kate.symes@enron.com,matthew.lenhart@enron.com,eric.bass@enron.com,debra.perlingiere@enron.com,sally.beck@enron.com,mark.taylor@enron.com,susan.scott@enron.com,gerald.nemec@enron.com,drew.fossum@enron.com,john.arnold@enron.com,carol.clair@enron.com,benjamin.rogers@enron.com,richard.sanders@enron.com,phillip.love@enron.com,david.delainey@enron.com,darron.giron@enron.com,daren.farmer@enron.com,mike.mcconnell@enron.com,jeffrey.shankman@enron.com,elizabeth.sager@enron.com,john.lavorato@enron.com,robin.rodrigue@enron.com,phillip.allen@enron.com,mark.haedicke@enron.com,chris.dorland@enron.com,scott.neal@enron.com,michelle.cash@enron.com,louise.kitchen@enron.com,mike.grigsby@enron.com,susan.mara@enron.com,d..steffes@enron.com,mary.hain@enron.com,dan.hyvl@enron.com,larry.campbell@enron.com,james.steffes@enron.com,errol.mclaughlin@enron.com,j.kaminski@enron.com,kimberly.watson@enron.com,richard.shapiro@enron.com,lynn.blair@enron.com,maureen.mcvicker@enron.com,rosalee.fleming@enron.com,stanley.horton@enron.com,mjones7@txu.com,rod.hayslett@enron.com,marie.heard@enron.com,matt.smith@enron.com,rick.buy@enron.com,m..love@enron.com,hunter.shively@enron.com,shirley.crenshaw@enron.com,sherri.sera@enron.com,mark.guzman@enron.com,shelley.corman@enron.com,ginger.dernehl@enron.com,james.derrick@enron.com,michelle.lokay@enron.com,mary.cook@enron.com,dana.davis@enron.com,david.forster@enron.com,judy.hernandez@enron.com,m..presto@enron.com,soblander@carrfut.com,karen.denne@enron.com,christi.nicolay@enron.com,evelyn.metoyer@enron.com,perfmgmt@enron.com,leslie.hansen@enron.com,kevin.hyatt@enron.com,tori.kuykendall@enron.com,lorna.brennan@enron.com,liz.taylor@enron.com,patrice.mims@enron.com,mike.maggi@enron.com,tracy.geaccone@enron.com,jane.tholt@enron.com,rhonda.denton@enron.com,cara.semperger@en
ron.com,barry.tycholiz@enron.com,mike.carson@enron.com,bill.williams@enron.com,kerri.thompson@enron.com\nkay.mann@enron.com,16735,0,0,0,10,20,4,0,0,0,0,7,0,6,0,6,1,0,9,0,41,0,0,0,0,0,0,94,0,0,0,16,0,0,10,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,2,0,0,71,0,4,3,0,0,0,0,0,0,0,0,12,0,0,0,56,0\nvince.kaminski@enron.com,0,14368,0,0,0,0,0,14,0,0,0,0,21,0,0,0,0,8,0,4,0,0,16,0,0,0,75,0,53,0,0,19,0,0,0,54,0,0,0,0,0,0,7,0,0,28,8,0,0,0,7,0,0,0,0,42,0,8,1246,23,0,5,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,13,0,0,5,0,0,0,0,0,0,0\njeff.dasovich@enron.com,0,2,11411,0,0,0,0,117,0,0,0,0,0,0,92,0,42,0,0,0,1010,0,164,0,0,0,0,0,142,0,47,2,0,0,0,132,3,2660,442,399,0,0,2712,0,1,5,2889,0,89,48,0,0,0,0,0,0,0,0,0,0,0,50,1114,0,4,0,1,0,0,1,0,2480,27,0,0,0,2,0,0,0,0,0,0,0,0,0,106,0,1,0\npete.davis@enron.com,0,0,0,9149,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0\nchris.germany@enron.com,43,0,0,0,8801,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0\nsara.shackleton@enron.com,17,0,0,0,0,8777,569,0,0,0,0,4,0,665,0,0,0,8,436,0,22,0,0,0,0,0,0,5,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,295,0,0,0,0,0,0,0,0,0,0,0,313,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0\ntana.jones@enron.com,2,0,0,0,0,575,8490,2,0,0,0,7,0,824,0,0,0,55,460,0,2,0,0,0,0,0,2,334,2,0,0,4,0,0,0,114,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,2,804,0,0,0,0,0,0,0,0,0,0,0,278,8,162,0,2,0,0,0,0,0,730,0,0,0,0,0,2,0,0,21,0,2,0,0,0\nsteven.kean@enron.com,0,0,408,0,0,0,0,6759,0,0,0,0,0,0,0,0,0,0,0,0,22,0,38,0,0,18,8,0,41,0,26,23,0,6,9,9,6,155,0,57,0,0,245,0,0,0,361,0,1038,126,7,0,4,0,0,21,0,0,0,62,0,52,11,34,0,0,0,0,0,0,0,156,35,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0\nkate.symes@enron.com,0,0,0,1,0,0,0,0,5438,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,0,0,0,0,0,0,0,0,0,0,0,0,939,0,0,0,0,0,0,0,0,0,0,195,85,0,0,62,888\nmatthew.lenhart@enron.com,0,0,0,0,0,0,0,0,0,5265,199,0,0,0,49,0,0,0,0,0,0,56,0,0,0,0,0,0,0,0,28,0,2,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60,0,0,0,0,0,13,0,0,0,0,0,0\neric.bass@enron.com,0,0,0,0,0,0,0,0,0,692,5158,0,0,0,4,0,0,0,0,0,0,413,0,12,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\ndebra.perlingiere@enron.com,14,0,0,0,0,0,0,0,0,0,0,4387,0,0,0,130,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,147,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nsally.beck@enron.com,0,6,0,0,0,0,0,4,0,0,27,0,4343,6,4,0,0,16,0,16,0,19,117,17,16,19,25,1,134,8,0,0,0,18,0,177,2,0,0,0,0,16,0,16,0,0,0,0,0,1,6,0,0,0,0,28,16,0,0,0,0,0,0,6,0,0,16,9,12,1,0,0,0,8,0,0,0,0,0,5,0,16,0,0,19,0,3,16,0,8\nmark.taylor@enron.com,6,0,0,0,0,297,377,0,0,0,0,8,0,4111,0,8,0,2,188,0,34,0,0,0,0,0,0,47,0,0,0,36,0,0,12,111,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,39,0,0,0,0,0,0,0,0,0,0,0,73,0,160,0,0,0,0,1,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nsusan.scott@enron.com,0,0,52,0,0,0,0,0,0,15,15,0,0,0,4000,13,81,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,6,0,0,0,4,0,0,0,0,0,0,26,0,0,3,17,0,0,0,0,0,0,0,0,0,0,0,0,0,104,0,0,131,0,5,0,0,1,0,0,14,0,0,0,164,2,0,0,0,0,0,0,0,0,1,16,0,0\ngerald.nemec@enron.com,0,0,0,0,6,2,2,0,0,0,0,60,0,18,46,3888,14,0,2,0,6,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,114,0,0,0\ndrew.fossum@enron.com,1,0,3,0,0,0,0,0,0,0,0,0,0,2,405,0,3706,0,2,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,23,0,0,17,0,38,0,0,0,0,0,0,0,0,105,0,2,5,0,0,3,0,0,0,0,0,0,0,0,166,0,17,0,0,0,0,0,0,0,0,0,0,0\njohn.arnold@enron.com,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3578,0,0,0,0,0,0,0,0,47,0,162,0,8,0,0,11,0,18,4,0,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,
8,0,0,0,0,0,0,0,6,0,39,0,1,11,0,0,0,0,0,0,0,0,53,0,206,0,0,0,0,0,0,0,0\ncarol.clair@enron.com,4,0,0,0,0,458,323,0,0,0,0,2,0,371,0,0,0,0,3564,0,13,0,0,0,0,0,0,66,0,0,0,2,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,191,0,104,0,0,0,0,2,0,0,74,0,0,0,0,0,0,0,0,22,0,0,0,0,0\nbenjamin.rogers@enron.com,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3427,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nrichard.sanders@enron.com,10,2,36,0,0,6,2,78,0,0,0,0,0,8,0,6,0,0,2,0,3262,0,8,0,0,4,0,32,4,0,2,159,0,0,29,4,2,9,0,21,0,0,42,0,0,0,31,0,0,0,0,0,0,4,0,2,0,0,0,0,0,0,5,33,0,0,0,0,0,0,0,9,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nphillip.love@enron.com,0,0,0,0,0,0,0,0,0,64,121,0,0,0,0,0,0,0,0,0,0,3112,0,88,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,4,0,0,15,0,0,0,0,0,0\ndavid.delainey@enron.com,0,33,2,0,0,0,0,34,0,0,0,0,72,0,0,0,0,26,0,8,6,0,3069,0,0,11,64,4,259,0,35,140,0,39,0,21,0,0,0,0,0,0,61,0,0,0,46,0,0,0,0,0,0,0,0,54,0,39,0,4,0,0,0,3,0,0,7,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,39,0,0,0\ndarron.giron@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,177,0,2963,0,0,0,0,0,12,0,0,0,0,0,0,4,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\ndaren.farmer@enron.com,0,0,0,0,0,0,0,0,0,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nmike.mcconnell@enron.com,0,10,0,0,0,0,0,28,0,0,0,0,24,0,0,0,0,0,0,0,0,0,25,0,0,2742,249,0,21,0,0,3,0,0,0,65,0,0,0,0,0,0,0,0,0,0,0,0,0,3,11,0,0,0,0,15,0,0,0,3,0,0,0,7,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0\njeffrey.shankman@enron.com,0,39,0,0,0,0,0,5,0,0,0,0,9,0,0,0,0,16,0,0,0,0,14,0,0,131,2681,0,29,0,6,7,0,3,5,7,6,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,38,0,9,2,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,2,0,0,0,0,0,0,0,0,0,0\nelizabeth.sager@enron.com,16,0,0,0,0,0,23,19,0,0,0,0,0,28,0,4,0,0,96,0,62,0,11,0,0,0,0,2636,1,0,0,97,0,0,10,12,0,0,0,0,0,0,6,0,0,0,4,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,28,2,0,0,3,0,15,59,0,0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0\njohn.lavorato@enron.com,0,14,0,0,18,0,0,6,0,18,18,0,25,0,0,0,0,176,0,0,0,0,169,0,0,2,46,0,2585,0,63,29,27,66,0,123,39,0,0,0,0,0,6,0,1,0,13,0,2,2,5,0,2,0,1,102,0,54,0,2,0,0,0,2,0,0,20,16,0,34,0,0,0,0,0,0,0,18,0,7,10,18,0,10,0,0,35,19,8,0\nrobin.rodrigue@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,8,0,12,0,0,0,0,0,2496,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nphillip.allen@enron.com,0,0,19,0,0,0,0,27,0,88,0,0,4,6,18,0,0,2,0,0,17,0,8,0,0,0,0,0,63,0,2195,0,0,2,0,0,173,17,0,17,0,0,27,0,0,0,17,0,0,0,6,0,0,0,24,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,55,0,0,0,0,0,52,0,0,7,0,0,0\nmark.haedicke@enron.com,2,2,0,0,0,2,2,10,0,0,0,2,0,133,0,2,0,0,14,0,53,0,33,0,0,2,6,99,18,0,0,1941,0,0,20,27,0,0,0,0,1,0,2,0,0,0,7,0,0,0,0,0,0,2,0,9,0,0,0,0,0,0,0,17,0,10,0,2,0,0,0,0,4,0,0,6,0,0,0,6,0,0,0,0,0,0,0,0,0,0\nchris.dorland@enron.com,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23,0,0,0,1840,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,13,7,0,0\nscott.neal@enron.com,0,0,0,0,45,2,0,0,0,0,0,9,4,0,0,0,0,34,0,0,0,0,7,0,0,0,4,0,35,0,70,0,0,1829,0,5,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,82,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,8,0,20,0,0,0,0,0,0,0,0\nmichelle.cash@enron.com,0,0,0,0,0,0,0,2,0,0,0,0,3,18,0,0,0,0,0,0,17,0,0,0,0,0,1,22,5,0,0,24,0,0,1824,12,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nlouise.kitchen@enron.com,0,10,1,0,0,1,29,7,0,0,0,0,115,111,0,0,0,61,2,0,12,0,5,0,0,10,6,35,175,0,7,19,0,66,7,1728,71,0,74,0,0,0,1,0,4,0,5,0,0,0,0,0,0,2,0,64,0,7,0,0,0,0,0,0,0,26,30,168,0,112,0,7,6,0,0,23,0,0,0,10,2,0,0,
0,0,0,97,10,0,0\nmike.grigsby@enron.com,0,0,0,0,0,0,0,0,0,338,45,0,0,0,22,0,0,13,0,0,0,0,0,4,0,0,7,0,40,0,96,0,8,10,0,8,1719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,282,0,6,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,347,0,0,0,0,0,73,0,0,245,0,0,0\nsusan.mara@enron.com,0,0,1200,0,0,0,0,189,0,0,0,0,0,0,0,0,0,0,2,0,533,0,50,0,0,0,0,2,46,0,383,0,0,0,0,46,0,1687,174,445,0,0,865,0,0,0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,451,0,0,0,0,0,0,0,0,726,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0\nd..steffes@enron.com,0,0,186,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,12,0,0,0,0,12,30,0,0,0,0,0,0,52,0,133,1655,0,0,0,0,0,2,0,187,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,16,0,0,0,6,7,0,22,0,11,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nmary.hain@enron.com,0,0,279,0,0,0,0,205,0,0,0,0,0,0,0,0,0,0,0,0,394,0,13,0,3,0,0,14,13,0,153,156,0,0,0,0,127,425,0,1456,0,0,517,0,0,0,215,0,16,0,0,0,0,0,0,137,0,0,0,0,0,32,14,10,0,0,0,0,0,0,0,158,78,0,0,0,0,0,0,0,0,0,0,0,14,23,0,0,19,0\ndan.hyvl@enron.com,0,0,0,0,0,0,10,0,0,0,5,71,0,6,0,8,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1454,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,4,0,0,6,0,0,0,0,0,29,0,0,0\nlarry.campbell@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1388,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0\njames.steffes@enron.com,0,5,635,0,0,0,0,539,0,0,0,0,0,4,3,0,0,0,0,0,149,0,157,0,0,0,0,77,129,0,85,16,0,1,0,52,70,410,0,254,0,0,1346,0,0,0,576,0,16,0,0,0,0,0,0,0,0,3,0,0,0,34,20,0,0,0,0,0,0,0,0,107,51,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0\nerrol.mclaughlin@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,168,0,0,0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1325,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,138,0,0,0,0,0,0,0,0\nj.kaminski@enron.com,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,2,5,0,0,0,0,0,0,0,0,1247,2,0,0,0,0,1,0,0,0,0,1,0,0,70,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nk
imberly.watson@enron.com,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1217,0,10,0,0,1,0,7,0,0,0,0,0,2,0,0,8,0,0,223,0,0,0,0,0,0,0,0,0,0,0,20,0,1,0,0,0,16,0,0,0,0,0,0,0\nrichard.shapiro@enron.com,0,6,237,0,0,0,0,476,0,0,0,0,0,0,0,0,0,0,0,0,2,0,26,0,0,9,5,0,48,0,0,4,0,0,0,37,0,111,96,17,0,0,144,0,0,0,1215,0,68,5,0,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,0,11,0,22,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nlynn.blair@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,53,0,1210,0,0,4,0,3,0,0,0,0,0,0,0,0,145,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0\nmaureen.mcvicker@enron.com,0,0,189,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,32,0,75,0,0,30,5,3,22,0,13,15,0,0,0,22,0,73,2,53,0,0,125,0,0,0,158,0,1186,25,55,0,6,0,0,25,0,0,0,27,0,0,40,55,0,0,0,0,0,0,0,66,30,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0\nrosalee.fleming@enron.com,0,3,18,0,0,0,0,190,0,0,0,0,1,0,0,0,0,0,0,0,2,0,129,0,0,134,118,0,131,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,15,0,128,1119,152,0,118,0,0,153,0,0,0,133,0,2,0,153,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0\nstanley.horton@enron.com,0,9,0,0,0,0,0,12,0,0,0,0,2,0,0,0,12,0,0,4,0,0,0,0,0,9,6,0,19,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,1073,0,34,0,0,2,0,2,0,0,0,28,0,0,3,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,2,0,0,0,0,0,0,0\nmjones7@txu.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1063,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nrod.hayslett@enron.com,0,0,0,0,0,0,2,0,0,0,0,0,9,0,0,0,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,60,0,1061,0,0,0,0,0,0,0,0,26,0,0,11,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,225,0,0,0,0,0,0,0\nmarie.heard@enron.com,0,0,0,0,0,160,224,0,0,0,0,5,0,14,0,72,0,0,10,0,0,0,0,0,0,0,0,79,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1061,0,1,0,0,0,0,0,0,0,0,0,106,0,0,0,0,0,0,0,0,0,70,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nmatt.smith@enron.com,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nrick.buy@enron.com,0,3,0,0,0,0,0,1,0,0,0,0,5,0,0,0,0,0,0,0,0,0,26,0,0,5,3,0,20,0,0,2,0,0,0,4,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1053,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nm..love@enron.com,0,0,0,0,15,0,0,0,0,4,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,15,0,0,13,0,0,0,0,0,0,41,0,0,0,0,0,0,0,0,0,0,0,0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nhunter.shively@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,3,0,0,7,0,33,0,15,0,0,15,0,10,4,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1052,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0\nshirley.crenshaw@enron.com,0,364,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,3,0,0,0,0,0,0,0,0,0,3,0,0,974,2,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0\nsherri.sera@enron.com,0,17,0,0,0,0,0,238,0,0,0,0,7,0,0,0,0,0,0,0,0,0,56,0,0,41,28,0,29,0,0,0,0,7,0,37,0,0,0,0,0,0,7,0,0,0,4,0,4,0,52,0,34,0,0,41,0,0,0,971,0,0,4,53,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,4,0,0,0,0,0,0,7,0,0,0\nmark.guzman@enron.com,0,0,0,0,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,970,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,10,0,0,4,20\nshelley.corman@enron.com,0,0,26,0,0,0,0,134,0,0,0,0,0,2,85,0,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,32,1,32,0,0,51,0,0,47,61,108,0,0,133,0,92,0,0,0,0,0,0,0,0,940,0,0,10,0,0,0,0,0,0,0,38,0,0,0,10,0,10,0,0,0,8,0,0,0,0,0,0,0\nginger.dernehl@enron.com,0,0,681,0,0,0,0,442,0,0,0,0,0,0,108,0,0,0,0,0,32,0,44,0,0,8,8,3,42,0,16,40,0,0,0,8,16,573,151,313,0,0,472,0,0,0,642,0,510,2,0,0,0,0,0,32,0,0,0,0,0,32,925,32,0,0,0,0,0,0,0,83,394,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\njames.derrick@enron.com,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,12,0,0,0,15,0,0,0,0,0,0,0,0,0,0,13,0,0,6
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,909,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nmichelle.lokay@enron.com,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,0,4,0,0,0,0,7,0,0,0,0,0,0,0,0,9,0,0,904,0,0,0,0,0,0,0,0,0,0,0,116,0,0,0,0,0,2,0,0,0,0,0,0,0\nmary.cook@enron.com,1,0,0,0,0,445,376,0,0,0,0,0,0,179,0,156,0,5,134,0,0,0,0,0,0,0,0,202,1,0,0,3,0,0,1,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,198,0,0,0,0,0,0,0,0,0,1,0,901,0,1,0,0,0,0,0,0,0,148,0,0,0,0,0,0,0,0,0,0,2,0,0,0\ndana.davis@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,899,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\ndavid.forster@enron.com,0,0,38,38,27,0,80,56,0,59,43,0,42,291,0,0,0,40,20,2,2,0,0,0,24,56,77,45,114,0,21,59,59,61,38,189,61,0,41,0,0,2,0,38,0,0,0,0,0,0,0,0,0,38,0,56,38,29,0,0,0,0,0,0,2,0,53,891,0,53,0,0,0,38,0,13,0,21,0,0,21,59,0,27,0,0,55,59,38,0\njudy.hernandez@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,888,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nm..presto@enron.com,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,20,56,0,0,0,2,0,0,46,0,0,13,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35,8,0,885,0,0,0,0,0,2,0,0,0,5,0,0,0,0,0,0,0,7,0,0\nsoblander@carrfut.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,863,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\nkaren.denne@enron.com,0,0,478,0,0,0,0,327,0,0,0,0,12,0,0,0,0,5,0,0,9,0,24,0,0,12,0,0,43,0,0,2,0,0,2,45,0,283,9,2,0,0,238,0,0,0,283,0,3,9,21,0,8,0,0,15,0,0,0,9,0,6,7,72,0,0,0,0,0,1,0,851,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0\nchristi.nicolay@enron.com,93,3,14,0,0,0,0,363,0,0,0,0,0,15,21,0,0,0,0,3,30,0,102,0,0,0,0,168,64,0,9,47,0,1,0,21,0,144,1,200,0,0,362,0,0,0,428,0,15,0,0,0,0,0
,0,0,0,0,0,7,0,75,7,0,0,0,0,3,0,1,0,0,836,0,0,2,0,0,0,0,0,0,0,0,0,0,0,29,0,0\nevelyn.metoyer@enron.com,0,0,0,0,0,0,0,0,826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,830,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0\nperfmgmt@enron.com,23,0,30,0,12,9,32,0,0,2,0,0,34,8,0,28,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,18,1,3,6,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,7,0,6,0,0,0,0,0,15,0,0,0,30,0,12,0,0,0,0,0,0,0,830,0,0,3,0,0,0,0,0,0,0,0,1,0,1,0\nleslie.hansen@enron.com,3,0,0,0,0,19,514,0,0,0,0,13,0,74,0,7,0,0,5,0,0,0,0,0,0,0,0,11,0,0,0,2,0,0,3,6,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,2,0,12,0,1,0,0,0,0,0,829,0,0,0,0,0,0,0,0,4,0,0,0,0,0\nkevin.hyatt@enron.com,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,10,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,68,0,6,0,0,1,0,5,0,0,0,0,0,0,0,0,1,0,0,68,0,0,0,0,0,0,0,0,0,0,0,821,0,4,0,0,0,6,0,0,0,0,0,0,0\ntori.kuykendall@enron.com,0,0,0,0,0,0,0,0,0,24,1,0,0,2,10,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,6,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,810,0,0,0,0,0,9,0,0,0,0,0,0\nlorna.brennan@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,277,0,332,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,589,0,336,0,0,0,0,14,0,0,0,0,0,0,0,0,193,0,0,745,0,0,0,0,0,0,0,0,0,0,0,750,0,806,0,0,0,0,0,0,0,0,0,0,0\nliz.taylor@enron.com,0,6,0,0,0,0,38,21,0,32,32,0,21,6,3,0,0,16,0,0,0,0,40,0,0,118,64,41,159,0,0,10,32,40,6,101,39,0,3,0,0,0,0,0,1,0,4,0,0,3,46,0,5,32,0,28,0,3,3,0,0,0,0,28,0,41,39,13,0,42,0,0,0,0,0,47,0,0,0,805,0,32,0,0,0,0,41,32,0,0\npatrice.mims@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,776,0,0,0,0,0,0,0,0,0\nmike.maggi@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,772,0,0,0,0,0,0,
0,0\ntracy.geaccone@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,7,0,160,0,0,0,0,0,0,0,0,5,0,0,6,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,770,0,0,0,0,0,0,0\njane.tholt@enron.com,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40,0,0,0,0,0,767,0,0,4,0,0,0\nrhonda.denton@enron.com,12,0,0,0,0,2,12,0,566,0,0,0,0,0,0,0,0,0,11,330,0,0,0,0,0,0,0,502,0,0,0,0,249,0,0,0,0,0,0,339,0,485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,501,0,0,0,0,0,484,0,0,8,0,0,498,249,0,502,0,0,0,0,0,0,0,0,760,498,0,495,19,249\ncara.semperger@enron.com,0,0,0,0,0,0,0,0,70,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,736,0,0,17,0\nbarry.tycholiz@enron.com,0,0,44,0,0,0,0,0,0,0,0,0,0,0,0,71,0,1,0,0,0,0,4,3,0,0,0,0,11,0,2,0,1,1,0,18,21,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,732,0,0,0\nmike.carson@enron.com,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,721,0,0\nbill.williams@enron.com,20,0,0,1,0,0,0,0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,12,0,0,716,0\nkerri.thompson@enron.com,0,0,0,0,0,0,0,0,693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,711\n"
  },
  {
    "path": "chapter-8/hierarchicalClusterer.py",
    "content": "from queue import PriorityQueue\nimport math\n\n\n\"\"\"\nExample code for hierarchical clustering\n\"\"\"\n\ndef getMedian(alist):\n    \"\"\"get median value of list alist\"\"\"\n    tmp = list(alist)\n    tmp.sort()\n    alen = len(tmp)\n    if (alen % 2) == 1:\n        return tmp[alen // 2]\n    else:\n        return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2\n    \n\ndef normalizeColumn(column):\n    \"\"\"Normalize column using Modified Standard Score\"\"\"\n    median = getMedian(column)\n    asd = sum([abs(x - median) for x in column]) / len(column)\n    result = [(x - median) / asd for x in column]\n    return result\n\nclass hClusterer:\n    \"\"\" this clusterer assumes that the first column of the data is a label\n    not used in the clustering. The other columns contain numeric data\"\"\"\n    \n    def __init__(self, filename):\n        file = open(filename)\n        self.data = {}\n        self.counter = 0\n        self.queue = PriorityQueue()\n        lines = file.readlines()\n        file.close()\n        header = lines[0].split(',')\n        self.cols = len(header)\n        self.data = [[] for i in range(len(header))]\n        for line in lines[1:]:\n            cells = line.split(',')\n            toggle = 0\n            for cell in range(self.cols):\n                if toggle == 0:\n                   self.data[cell].append(cells[cell])\n                   toggle = 1\n                else:\n                    self.data[cell].append(float(cells[cell]))\n        # now normalize number columns (that is, skip the first column)\n        for i in range(1, self.cols):\n                self.data[i] = normalizeColumn(self.data[i])\n\n        ###\n        ###  I have read in the data and normalized the \n        ###  columns. Now for each element i in the data, I am going to\n        ###     1. compute the Euclidean Distance from element i to all the \n        ###        other elements.  
This data will be placed in neighbors,\n        ###        which is a Python dictionary. Let's say i = 1, and I am \n        ###        computing the distance to the neighbor j and let's say j \n        ###        is 2. The neighbors dictionary for i will look like\n        ###        {2: ((1,2), 1.23),  3: ((1, 3), 2.3)... }\n        ###\n        ###     2. find the closest neighbor\n        ###\n        ###     3. place the element on a priority queue, called simply queue,\n        ###        based on the distance to the nearest neighbor (and a counter\n        ###        used to break ties.\n\n\n\n        # now push distances on queue        \n        rows = len(self.data[0])              \n\n        for i in range(rows):\n            minDistance = 99999\n            nearestNeighbor = 0\n            neighbors = {}\n            for j in range(rows):\n                if i != j:\n                    dist = self.distance(i, j)\n                    if i < j:\n                        pair = (i,j)\n                    else:\n                        pair = (j,i)\n                    neighbors[j] = (pair, dist)\n                    if dist < minDistance:\n                        minDistance = dist\n                        nearestNeighbor = j\n                        nearestNum = j\n            # create nearest Pair\n            if i < nearestNeighbor:\n                nearestPair = (i, nearestNeighbor)\n            else:\n                nearestPair = (nearestNeighbor, i)\n                \n            # put instance on priority queue    \n            self.queue.put((minDistance, self.counter,\n                            [[self.data[0][i]], nearestPair, neighbors]))\n            self.counter += 1\n    \n\n    def distance(self, i, j):\n        sumSquares = 0\n        for k in range(1, self.cols):\n            sumSquares += (self.data[k][i] - self.data[k][j])**2\n        return math.sqrt(sumSquares)\n            \n\n    def cluster(self):\n         done = False\n         
while not done:\n             topOne = self.queue.get()\n             nearestPair = topOne[2][1]\n             if not self.queue.empty():\n                 nextOne = self.queue.get()\n                 nearPair = nextOne[2][1]\n                 tmp = []\n                 ##\n                 ##  I have just popped two elements off the queue,\n                 ##  topOne and nextOne. I need to check whether nextOne\n                 ##  is topOne's nearest neighbor and vice versa.\n                 ##  If not, I will pop another element off the queue\n                 ##  until I find topOne's nearest neighbor. That is what\n                 ##  this while loop does.\n                 ##\n\n                 while nearPair != nearestPair:\n                     tmp.append((nextOne[0], self.counter, nextOne[2]))\n                     self.counter += 1\n                     nextOne = self.queue.get()\n                     nearPair = nextOne[2][1]\n                 ##\n                 ## this for loop pushes the elements I popped off in the\n                 ## above while loop.\n                 ##                 \n                 for item in tmp:\n                     self.queue.put(item)\n                     \n                 if len(topOne[2][0]) == 1:\n                    item1 = topOne[2][0][0]\n                 else:\n                     item1 = topOne[2][0]\n                 if len(nextOne[2][0]) == 1:\n                    item2 = nextOne[2][0][0]\n                 else:\n                     item2 = nextOne[2][0]\n                 ##  curCluster is, perhaps obviously, the new cluster\n                 ##  which combines cluster item1 with cluster item2.\n                 curCluster = (item1, item2)\n\n                 ## Now I am doing two things. First, finding the nearest\n                 ## neighbor to this new cluster. Second, building a new\n                 ## neighbors list by merging the neighbors lists of item1\n                 ## and item2. 
If the distance between item1 and element 23\n                 ## is 2 and the distance betweeen item2 and element 23 is 4\n                 ## the distance between element 23 and the new cluster will\n                 ## be 2 (i.e., the shortest distance).\n                 ##\n\n                 minDistance = 99999\n                 nearestPair = ()\n                 nearestNeighbor = ''\n                 merged = {}\n                 nNeighbors = nextOne[2][2]\n                 for (key, value) in topOne[2][2].items():\n                    if key in nNeighbors:\n                        if nNeighbors[key][1] < value[1]:\n                             dist =  nNeighbors[key]\n                        else:\n                            dist = value\n                        if dist[1] < minDistance:\n                             minDistance =  dist[1]\n                             nearestPair = dist[0]\n                             nearestNeighbor = key\n                        merged[key] = dist\n                    \n                 if merged == {}:\n                    return curCluster\n                 else:\n                    self.queue.put( (minDistance, self.counter,\n                                     [curCluster, nearestPair, merged]))\n                    self.counter += 1\n                               \n                        \n                         \n\n\ndef printDendrogram(T, sep=3):\n    \"\"\"Print dendrogram of a binary tree.  Each tree node is represented by a\n    length-2 tuple. printDendrogram is written and provided by David Eppstein\n    2002. 
Accessed on 14 April 2014:\n    http://code.activestate.com/recipes/139422-dendrogram-drawing/ \"\"\"\n\t\n    def isPair(T):\n        return type(T) == tuple and len(T) == 2\n    \n    def maxHeight(T):\n        if isPair(T):\n            h = max(maxHeight(T[0]), maxHeight(T[1]))\n        else:\n            h = len(str(T))\n        return h + sep\n        \n    activeLevels = {}\n\n    def traverse(T, h, isFirst):\n        if isPair(T):\n            traverse(T[0], h-sep, 1)\n            s = [' ']*(h-sep)\n            s.append('|')\n        else:\n            s = list(str(T))\n            s.append(' ')\n\n        while len(s) < h:\n            s.append('-')\n        \n        if (isFirst >= 0):\n            s.append('+')\n            if isFirst:\n                activeLevels[h] = 1\n            else:\n                del activeLevels[h]\n        \n        A = list(activeLevels)\n        A.sort()\n        for L in A:\n            if len(s) < L:\n                while len(s) < L:\n                    s.append(' ')\n                s.append('|')\n\n        print (''.join(s))    \n        \n        if isPair(T):\n            traverse(T[1], h-sep, 0)\n\n    traverse(T, maxHeight(T), -1)\n\n\n\n\nfilename = '//Users/raz/Dropbox/guide/data/dogs.csv'\n\nhg = hClusterer(filename)\ncluster = hg.cluster()\nprintDendrogram(cluster)\n\n"
  },
  {
    "path": "chapter-8/hierarchicalClustererTemplate.py",
    "content": "from queue import PriorityQueue\nimport math\n\n\n\"\"\"\nExample code for hierarchical clustering\n\"\"\"\n\ndef getMedian(alist):\n    \"\"\"get median value of list alist\"\"\"\n    tmp = list(alist)\n    tmp.sort()\n    alen = len(tmp)\n    if (alen % 2) == 1:\n        return tmp[alen // 2]\n    else:\n        return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2\n    \n\ndef normalizeColumn(column):\n    \"\"\"Normalize column using Modified Standard Score\"\"\"\n    median = getMedian(column)\n    asd = sum([abs(x - median) for x in column]) / len(column)\n    result = [(x - median) / asd for x in column]\n    return result\n\nclass hClusterer:\n    \"\"\" this clusterer assumes that the first column of the data is a label\n    not used in the clustering. The other columns contain numeric data\"\"\"\n    \n    def __init__(self, filename):\n        file = open(filename)\n        self.data = {}\n        self.counter = 0\n        self.queue = PriorityQueue()\n        lines = file.readlines()\n        file.close()\n        header = lines[0].split(',')\n        self.cols = len(header)\n        self.data = [[] for i in range(len(header))]\n        for line in lines[1:]:\n            cells = line.split(',')\n            toggle = 0\n            for cell in range(self.cols):\n                if toggle == 0:\n                   self.data[cell].append(cells[cell])\n                   toggle = 1\n                else:\n                    self.data[cell].append(float(cells[cell]))\n        # now normalize number columns (that is, skip the first column)\n        for i in range(1, self.cols):\n                self.data[i] = normalizeColumn(self.data[i])\n\n        ###\n        ###  I have read in the data and normalized the \n        ###  columns. Now for each element i in the data, I am going to\n        ###     1. compute the Euclidean Distance from element i to all the \n        ###        other elements.  
This data will be placed in neighbors, which\n        ###        is a Python dictionary. Let's say i = 1, and I am computing\n        ###        the distance to the neighbor j and let's say j is 2. The\n        ###        neighbors dictionary for i will look like\n        ###        {2: ((1,2), 1.23),  3: ((1, 3), 2.3)... }\n        ###\n        ###     2. find the closest neighbor\n        ###\n        ###     3. place the element on a priority queue, called simply queue,\n        ###        based on the distance to the nearest neighbor (and a counter\n        ###        used to break ties).\n\n\n\n        # TO DO        \n    \n\n    def distance(self, i, j):\n        sumSquares = 0\n        for k in range(1, self.cols):\n            sumSquares += (self.data[k][i] - self.data[k][j])**2\n        return math.sqrt(sumSquares)\n            \n\n    def cluster(self):\n        # TODO\n        return \"TO DO\"\n                         \n\n\ndef printDendrogram(T, sep=3):\n    \"\"\"Print dendrogram of a binary tree.  Each tree node is represented by a length-2 tuple.\n    printDendrogram is written and provided by David Eppstein 2002. 
Accessed on 14 April 2014:\n    http://code.activestate.com/recipes/139422-dendrogram-drawing/ \"\"\"\n\t\n    def isPair(T):\n        return type(T) == tuple and len(T) == 2\n    \n    def maxHeight(T):\n        if isPair(T):\n            h = max(maxHeight(T[0]), maxHeight(T[1]))\n        else:\n            h = len(str(T))\n        return h + sep\n        \n    activeLevels = {}\n\n    def traverse(T, h, isFirst):\n        if isPair(T):\n            traverse(T[0], h-sep, 1)\n            s = [' ']*(h-sep)\n            s.append('|')\n        else:\n            s = list(str(T))\n            s.append(' ')\n\n        while len(s) < h:\n            s.append('-')\n        \n        if (isFirst >= 0):\n            s.append('+')\n            if isFirst:\n                activeLevels[h] = 1\n            else:\n                del activeLevels[h]\n        \n        A = list(activeLevels)\n        A.sort()\n        for L in A:\n            if len(s) < L:\n                while len(s) < L:\n                    s.append(' ')\n                s.append('|')\n\n        print (''.join(s))    \n        \n        if isPair(T):\n            traverse(T[1], h-sep, 0)\n\n    traverse(T, maxHeight(T), -1)\n\n\n\n\nfilename = '//Users/raz/Dropbox/guide/pg2dm-python/ch8/dogs.csv'\n#filename = '//Users/raz/Dropbox/guide/pg2dm-python/ch8/cerealTemp.csv'\n\nhg = hClusterer(filename)\ncluster = hg.cluster()\nprintDendrogram(cluster)\n\n"
  },
  {
    "path": "chapter-8/kmeans.py",
    "content": "import math\nimport random \n\n\n\"\"\"\nImplementation of the K-means algorithm\nfor the book A Programmer's Guide to Data Mining\"\nhttp://www.guidetodatamining.com\n\n\"\"\"\n\ndef getMedian(alist):\n    \"\"\"get median of list\"\"\"\n    tmp = list(alist)\n    tmp.sort()\n    alen = len(tmp)\n    if (alen % 2) == 1:\n        return tmp[alen // 2]\n    else:\n        return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2\n    \n\ndef normalizeColumn(column):\n    \"\"\"normalize the values of a column using Modified Standard Score\n    that is (each value - median) / (absolute standard deviation)\"\"\"\n    median = getMedian(column)\n    asd = sum([abs(x - median) for x in column]) / len(column)\n    result = [(x - median) / asd for x in column]\n    return result\n\n\nclass kClusterer:\n    \"\"\" Implementation of kMeans Clustering\n    This clusterer assumes that the first column of the data is a label\n    not used in the clustering. The other columns contain numeric data\n    \"\"\"\n    \n    def __init__(self, filename, k):\n        \"\"\" k is the number of clusters to make\n        This init method:\n           1. reads the data from the file named filename\n           2. stores that data by column in self.data\n           3. normalizes the data using Modified Standard Score\n           4. randomly selects the initial centroids\n           5. 
assigns points to clusters associated with those centroids\n        \"\"\"\n        file = open(filename)\n        self.data = {}\n        self.k = k\n        self.counter = 0\n        self.iterationNumber = 0\n        # used to keep track of % of points that change cluster membership\n        # in an iteration\n        self.pointsChanged = 0\n        # Sum of Squared Error\n        self.sse = 0\n        #\n        # read data from file\n        #\n        lines = file.readlines()\n        file.close()\n        header = lines[0].split(',')\n        self.cols = len(header)\n        self.data = [[] for i in range(len(header))]\n        # we are storing the data by column.\n        # For example, self.data[0] is the data from column 0.\n        # self.data[0][10] is the column 0 value of item 10.\n        for line in lines[1:]:\n            cells = line.split(',')\n            toggle = 0\n            for cell in range(self.cols):\n                if toggle == 0:\n                   self.data[cell].append(cells[cell])\n                   toggle = 1\n                else:\n                    self.data[cell].append(float(cells[cell]))\n                    \n        self.datasize = len(self.data[1])\n        self.memberOf = [-1 for x in range(len(self.data[1]))]\n        #\n        # now normalize number columns\n        #\n        for i in range(1, self.cols):\n                self.data[i] = normalizeColumn(self.data[i])\n\n        # select random centroids from existing points\n        random.seed()\n        self.centroids = [[self.data[i][r]  for i in range(1, len(self.data))]\n                           for r in random.sample(range(len(self.data[0])),\n                                                 self.k)]\n        self.assignPointsToCluster()\n\n            \n\n    def updateCentroids(self):\n        \"\"\"Using the points in the clusters, determine the centroid\n        (mean point) of each cluster\"\"\"\n        members = [self.memberOf.count(i) for i in 
range(len(self.centroids))]\n        self.centroids = [[sum([self.data[k][i]\n                                for i in range(len(self.data[0]))\n                                if self.memberOf[i] == centroid])/members[centroid]\n                           for k in range(1, len(self.data))]\n                          for centroid in range(len(self.centroids))] \n            \n        \n    \n    def assignPointToCluster(self, i):\n        \"\"\" assign point to cluster based on distance from centroids\"\"\"\n        min = 999999\n        clusterNum = -1\n        for centroid in range(self.k):\n            dist = self.euclideanDistance(i, centroid)\n            if dist < min:\n                min = dist\n                clusterNum = centroid\n        # here is where I will keep track of changing points\n        if clusterNum != self.memberOf[i]:\n            self.pointsChanged += 1\n        # add square of distance to running sum of squared error\n        self.sse += min**2\n        return clusterNum\n\n    def assignPointsToCluster(self):\n        \"\"\" assign each data point to a cluster\"\"\"\n        self.pointsChanged = 0\n        self.sse = 0\n        self.memberOf = [self.assignPointToCluster(i)\n                         for i in range(len(self.data[1]))]\n        \n\n        \n    def euclideanDistance(self, i, j):\n        \"\"\" compute distance of point i from centroid j\"\"\"\n        sumSquares = 0\n        for k in range(1, self.cols):\n            sumSquares += (self.data[k][i] - self.centroids[j][k-1])**2\n        return math.sqrt(sumSquares)\n\n    def kCluster(self):\n        \"\"\"the method that actually performs the clustering\n        As you can see this method repeatedly\n            updates the centroids by computing the mean point of each cluster\n            re-assign the points to clusters based on these new centroids\n        until the number of points that change cluster membership is less than 1%.\n        \"\"\"\n        done = 
False\n \n        while not done:\n            self.iterationNumber += 1\n            self.updateCentroids()\n            self.assignPointsToCluster()\n            #\n            # we are done if fewer than 1% of the points change clusters\n            #\n            if float(self.pointsChanged) / len(self.memberOf) <  0.01:\n                done = True\n        print(\"Final SSE: %f\" % self.sse)\n\n    def showMembers(self):\n        \"\"\"Display the results\"\"\"\n        for centroid in range(len(self.centroids)):\n             print (\"\\n\\nClass %i\\n========\" % centroid)\n             for name in [self.data[0][i]  for i in range(len(self.data[0]))\n                          if self.memberOf[i] == centroid]:\n                 print (name)\n        \n##\n## RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3\n###\n# change the path in the following to match where dogs.csv is on your machine\nkm = kClusterer('../../data/dogs.csv', 3)\nkm.kCluster()\nkm.showMembers()\n"
  },
  {
    "path": "chapter-8/kmeansPlusPlus.py",
    "content": "import math\nimport random \n\n\n\"\"\"\nImplementation of the K-means++ algorithm\nfor the book A Programmer's Guide to Data Mining\"\nhttp://www.guidetodatamining.com\n\n\"\"\"\n\ndef getMedian(alist):\n    \"\"\"get median of list\"\"\"\n    tmp = list(alist)\n    tmp.sort()\n    alen = len(tmp)\n    if (alen % 2) == 1:\n        return tmp[alen // 2]\n    else:\n        return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2\n    \n\ndef normalizeColumn(column):\n    \"\"\"normalize the values of a column using Modified Standard Score\n    that is (each value - median) / (absolute standard deviation)\"\"\"\n    median = getMedian(column)\n    asd = sum([abs(x - median) for x in column]) / len(column)\n    result = [(x - median) / asd for x in column]\n    return result\n\n\nclass kClusterer:\n    \"\"\" Implementation of kMeans Clustering\n    This clusterer assumes that the first column of the data is a label\n    not used in the clustering. The other columns contain numeric data\n    \"\"\"\n    \n    def __init__(self, filename, k):\n        \"\"\" k is the number of clusters to make\n        This init method:\n           1. reads the data from the file named filename\n           2. stores that data by column in self.data\n           3. normalizes the data using Modified Standard Score\n           4. randomly selects the initial centroids\n           5. 
assigns points to clusters associated with those centroids\n        \"\"\"\n        file = open(filename)\n        self.data = {}\n        self.k = k\n        self.counter = 0\n        self.iterationNumber = 0\n        # used to keep track of % of points that change cluster membership\n        # in an iteration\n        self.pointsChanged = 0\n        # Sum of Squared Error\n        self.sse = 0\n        #\n        # read data from file\n        #\n        lines = file.readlines()\n        file.close()\n        header = lines[0].split(',')\n        self.cols = len(header)\n        self.data = [[] for i in range(len(header))]\n        # we are storing the data by column.\n        # For example, self.data[0] is the data from column 0.\n        # self.data[0][10] is the column 0 value of item 10.\n        for line in lines[1:]:\n            cells = line.split(',')\n            toggle = 0\n            for cell in range(self.cols):\n                if toggle == 0:\n                   self.data[cell].append(cells[cell])\n                   toggle = 1\n                else:\n                    self.data[cell].append(float(cells[cell]))\n                    \n        self.datasize = len(self.data[1])\n        self.memberOf = [-1 for x in range(len(self.data[1]))]\n        #\n        # now normalize number columns\n        #\n        for i in range(1, self.cols):\n                self.data[i] = normalizeColumn(self.data[i])\n\n        # select random centroids from existing points\n        random.seed()\n        self.selectInitialCentroids()\n        self.assignPointsToCluster()\n\n\n    def showData(self):\n        for i in range(len(self.data[0])):\n            print(\"%20s   %8.4f  %8.4f\" %\n                (self.data[0][i], self.data[1][i], self.data[2][i]))\n\n    def distanceToClosestCentroid(self, point, centroidList):\n        result = self.eDistance(point, centroidList[0])\n        for centroid in centroidList[1:]:\n            distance = self.eDistance(point, 
centroid)\n            if distance < result:\n                result = distance\n        return result\n\n\n    def selectInitialCentroids(self):\n        \"\"\"implement the k-means++ method of selecting\n        the set of initial centroids\"\"\"\n        centroids = []\n        total = 0\n        # first step is to select a random first centroid\n        current = random.choice(range(len(self.data[0])))\n        centroids.append(current)\n        # loop to select the rest of the centroids, one at a time\n        for i in range(0, self.k - 1):\n            # for every point in the data find its distance to\n            # the closest centroid\n            weights = [self.distanceToClosestCentroid(x, centroids) \n                       for x in range(len(self.data[0]))]\n            total = sum(weights)\n            # instead of raw distances, convert so sum of weight = 1\n            weights = [x / total for x in weights]\n            #\n            # now roll virtual die\n            num = random.random()\n            total = 0\n            x = -1\n            # the roulette wheel simulation\n            while total < num:\n                x += 1\n                total += weights[x]\n            centroids.append(x)\n        self.centroids = [[self.data[i][r]  for i in range(1, len(self.data))]\n                            for r in centroids]\n                \n            \n    \n \n    def updateCentroids(self):\n        \"\"\"Using the points in the clusters, determine the centroid\n        (mean point) of each cluster\"\"\"\n        members = [self.memberOf.count(i) for i in range(len(self.centroids))]\n        \n        self.centroids = [[sum([self.data[k][i]\n                            for i in range(len(self.data[0]))\n                            if self.memberOf[i] == centroid])/members[centroid]\n                           for k in range(1, len(self.data))]\n                          for centroid in range(len(self.centroids))] \n            \n        \n  
  \n    def assignPointToCluster(self, i):\n        \"\"\" assign point to cluster based on distance from centroids\"\"\"\n        min = 999999\n        clusterNum = -1\n        for centroid in range(self.k):\n            dist = self.euclideanDistance(i, centroid)\n            if dist < min:\n                min = dist\n                clusterNum = centroid\n        # here is where I will keep track of changing points\n        if clusterNum != self.memberOf[i]:\n            self.pointsChanged += 1\n        # add square of distance to running sum of squared error\n        self.sse += min**2\n        return clusterNum\n\n    def assignPointsToCluster(self):\n        \"\"\" assign each data point to a cluster\"\"\"\n        self.pointsChanged = 0\n        self.sse = 0\n        self.memberOf = [self.assignPointToCluster(i)\n                         for i in range(len(self.data[1]))]\n        \n\n    def eDistance(self, i, j):\n        \"\"\" compute distance of point i from centroid j\"\"\"\n        sumSquares = 0\n        for k in range(1, self.cols):\n            sumSquares += (self.data[k][i] - self.data[k][j])**2\n        return math.sqrt(sumSquares)\n      \n    def euclideanDistance(self, i, j):\n        \"\"\" compute distance of point i from centroid j\"\"\"\n        sumSquares = 0\n        for k in range(1, self.cols):\n            sumSquares += (self.data[k][i] - self.centroids[j][k-1])**2\n        return math.sqrt(sumSquares)\n\n    def kCluster(self):\n        \"\"\"the method that actually performs the clustering\n        As you can see this method repeatedly\n            updates the centroids by computing the mean point of each cluster\n            re-assign the points to clusters based on these new centroids\n        until the number of points that change cluster membership is less than 1%.\n        \"\"\"\n        done = False\n \n        while not done:\n            self.iterationNumber += 1\n            self.updateCentroids()\n            
self.assignPointsToCluster()\n            #\n            # we are done if fewer than 1% of the points change clusters\n            #\n            if float(self.pointsChanged) / len(self.memberOf) <  0.01:\n                done = True\n        print(\"Final SSE: %f\" % self.sse)\n\n    def showMembers(self):\n        \"\"\"Display the results\"\"\"\n        for centroid in range(len(self.centroids)):\n             print (\"\\n\\nClass %i\\n========\" % centroid)\n             for name in [self.data[0][i]  for i in range(len(self.data[0]))\n                          if self.memberOf[i] == centroid]:\n                 print (name)\n        \n##\n## RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3\n###\nkm = kClusterer('../../data/dogs.csv', 3)\nkm.kCluster()\nkm.showMembers()\n"
  }
]