[
  {
    "path": ".gitignore",
    "content": "images/like/Image*\nimages/dislike/Image*\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n"
  },
  {
    "path": "README.md",
    "content": "Eigenstyle\n======\nPrincipal Component Analysis and Fashion\n\n### To Use\n\n- Find a bunch of images (I used images of dresses from Amazon).\n- Put the ones that match your style in the \"like\" folder, and the others in the \"dislike\" folder\n- In terminal, run \n```bash\npython visuals.py\n```\n\n### Results\n\nYou'll see the principal components in the \"eigendresses\" folder (examples shown are from my dataset; yours will be different).\n\n![Eigendress](http://graceavery.com/eigenstyle/4_eigendress.png)![Eigendress](http://graceavery.com/eigenstyle/0_eigendress.png)\n\nIn the \"history\" folder, you'll see a known dress being rebuilt from its components.\n\n![Dress from one component](http://graceavery.com/eigenstyle/dress_763_1.png)![Dress from four components](http://graceavery.com/eigenstyle/dress_763_4.png)![Dress from ten components](http://graceavery.com/eigenstyle/dress_763_10.png)![Dress from fifteen components](http://graceavery.com/eigenstyle/dress_763_15.png)![Dress from thirty components](http://graceavery.com/eigenstyle/dress_763_30.png)![Dress from seventy components](http://graceavery.com/eigenstyle/dress_763_70.png)\n\nIn the \"recreatedDresses\" folder, you can see just the end product of this process for different dresses.\n\n![Original](http://graceavery.com/eigenstyle/6_original.png)![Recreated](http://graceavery.com/eigenstyle/6_recreated.png)\n\nIn the \"notableDresses\" folder, you'll see the prettiest dresses, the ugliest dresses, the most extreme dresses (those that had high scores on many components), etc.\n\n![Prettiest 1](http://graceavery.com/eigenstyle/prettiest_pretty_1.png)![Ugliest 2](http://graceavery.com/eigenstyle/ugliest_ugly_2.png)\n\n\nIn the \"createdDresses\" folder, you'll find completely new dresses that were made from choosing random values for the principal components.\n\n![New Dress](http://graceavery.com/eigenstyle/RandomDress5.png)![New 
Dress](http://graceavery.com/eigenstyle/RandomDress18.png)\n\n\n### More Info\n[Blog post](http://blog.thehackerati.com/post/126701202241/eigenstyle)\n\n[Joel Grus's blog post](http://joelgrus.com/2013/06/24/t-shirts-feminism-parenting-and-data-science-part-2-eigenshirts/)\n"
  },
  {
    "path": "images/dislike/.gitignore",
    "content": "# Ignore everything in this directory\n*\n# Except this file\n!.gitignore"
  },
  {
    "path": "images/like/.gitignore",
    "content": "# Ignore everything in this directory\n*\n# Except this file\n!.gitignore"
  },
  {
    "path": "statistics.py",
    "content": "from collections import defaultdict\nfrom random import shuffle, seed\nimport numpy as np\nimport math\n\n# The following methods are from Joel Grus\n# https://github.com/joelgrus/data-science-from-scratch\ndef mean(x): \n    return sum(x) / (len(x) * 1.0)\n\ndef median(v):\n    \"\"\"finds the 'middle-most' value of v\"\"\"\n    n = len(v)\n    sorted_v = sorted(v)\n    midpoint = n // 2\n    if n % 2 == 1:\n        # if odd, return the middle value\n        return sorted_v[midpoint]\n    else:\n        # if even, return the average of the middle values\n        lo = midpoint - 1\n        hi = midpoint\n        return (sorted_v[lo] + sorted_v[hi]) / 2\n        \ndef data_range(x):\n    return max(x) - min(x)\n\ndef de_mean(x):\n    \"\"\"translate x by subtracting its mean (so the result has mean 0)\"\"\"\n    x_bar = mean(x)\n    return [x_i - x_bar for x_i in x]\n\ndef variance(x):\n    \"\"\"assumes x has at least two elements\"\"\"\n    n = len(x)\n    deviations = de_mean(x)\n    return sum_of_squares(deviations) / (n - 1)\n    \ndef standard_deviation(x):\n    return math.sqrt(variance(x))\n\ndef quantile(x, p):\n    \"\"\"returns the pth-percentile value in x\"\"\"\n    p_index = int(p * len(x))\n    return sorted(x)[p_index]\n\ndef interquartile_range(x):\n    return quantile(x, 0.75) - quantile(x, 0.25)\n\ndef dot(v, w):\n    \"\"\"v_1 * w_1 + ... + v_n * w_n\"\"\"\n    return sum(v_i * w_i for v_i, w_i in zip(v, w))\n\ndef sum_of_squares(v):\n    \"\"\"v_1 * v_1 + ... 
+ v_n * v_n\"\"\"\n    return dot(v, v)\n\ndef normal_cdf(x, mu=0,sigma=1):\n    return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2  \n\ndef inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001):\n    \"\"\"find approximate inverse using binary search\"\"\"\n    # if not standard, compute standard and rescale\n    if mu != 0 or sigma != 1:\n        return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance)\n    low_z, low_p = -10.0, 0            # normal_cdf(-10) is (very close to) 0\n    hi_z,  hi_p  =  10.0, 1            # normal_cdf(10)  is (very close to) 1\n    while hi_z - low_z > tolerance:\n        mid_z = (low_z + hi_z) / 2     # consider the midpoint\n        mid_p = normal_cdf(mid_z)      # and the cdf's value there\n        if mid_p < p:\n            # midpoint is still too low, search above it\n            low_z, low_p = mid_z, mid_p\n        elif mid_p > p:\n            # midpoint is still too high, search below it\n            hi_z, hi_p = mid_z, mid_p\n        else:\n            break\n    return mid_z\n\n"
  },
  {
    "path": "visuals.py",
    "content": "from PIL import Image\nimport PIL.ImageOps\nfrom collections import defaultdict\nfrom glob import glob\nfrom random import shuffle, seed\nimport numpy as np\nimport pylab as pl\nimport pandas as pd\nimport re\nfrom sklearn.decomposition import RandomizedPCA\nfrom sklearn.linear_model import LogisticRegression\nimport math\nimport random\nimport os\nfrom statistics import mean, median, standard_deviation, inverse_normal_cdf, interquartile_range\n\nN_COMPONENTS = 50\nN_COMPONENTS_TO_SHOW = 10\nN_DRESSES_TO_SHOW = 5\nN_NEW_DRESSES_TO_CREATE = 20\n\n# this is the size of all the Amazon.com images\n# If you are using a different source, change the size here \nSTANDARD_SIZE = (200,260)\n\ndef img_to_array(filename):\n    \"\"\"takes a filename and turns it into a numpy array of RGB pixels\"\"\"\n    img = Image.open(filename)\n    img = img.resize(STANDARD_SIZE)\n    img = list(img.getdata())\n    img = map(list, img)\n    img = np.array(img)\n    s = img.shape[0] * img.shape[1]\n    img_wide = img.reshape(1, s)\n    return img_wide[0]\n\ndef makeFolder(directory):\n    if not os.path.exists(directory):\n        os.makedirs(directory)\n\n# write out each eigendress and the dresses that most and least match it\n# the file names here are chosen because of the order i wanna look at the results\n# (when displayed alphabetically in finder)\ndef createEigendressPictures():\n    print(\"creating eigendress pictures\")\n    directory = \"results/eigendresses/\"\n    makeFolder(directory)\n    for i in range(N_COMPONENTS_TO_SHOW):\n        component = pca.components_[i]\n        img = image_from_component_values(component)\n        img.save(directory + str(i) + \"_eigendress___.png\")\n        reverse_img = PIL.ImageOps.invert(img)\n        reverse_img.save(directory + str(i) + \"_eigendress_inverted.png\")\n        ranked_dresses = sorted(enumerate(X),\n               key=lambda (a,x): x[i])\n        most_i = ranked_dresses[-1][0]\n        least_i = 
ranked_dresses[0][0]\n\n        for j in range(N_DRESSES_TO_SHOW):\n            most_j = j * -1 - 1\n            Image.open(raw_data[ranked_dresses[most_j][0]][2]).save(directory + str(i) + \"_eigendress__most\" + str(j) + \".png\")\n            Image.open(raw_data[ranked_dresses[j][0]][2]).save(directory + str(i) + \"_eigendress_least\" + str(j) + \".png\")\n\ndef indexesForImageName(imageName):\n    return [i for (i,(cd,_y,f)) in enumerate(raw_data) if imageName in f]\n\ndef predictiveModeling():\n    print(\"logistic regression...\")\n    directory = \"results/notableDresses/\"\n    makeFolder(directory)\n\n    # split the data into a training set and a test set\n    train_split = int(len(data) * 4.0 / 5.0)\n\n    X_train = X[:train_split]\n    X_test = X[train_split:]\n    y_train = y[:train_split]\n    y_test = y[train_split:]\n\n    # if you wanted to use a different model, you'd specify that here\n    clf = LogisticRegression(penalty='l2')\n    clf.fit(X_train,y_train)\n\n    print \"score\",clf.score(X_test,y_test)\n        \n    # first, let's find the model score for every dress in our dataset\n    probs = zip(clf.decision_function(X),raw_data)\n\n    prettiest_liked_things = sorted(probs,key=lambda (p,(cd,g,f)): (0 if g == 'like' else 1,p))\n    prettiest_disliked_things = sorted(probs,key=lambda (p,(cd,g,f)): (0 if g == 'dislike' else 1,p))\n    ugliest_liked_things = sorted(probs,key=lambda (p,(cd,g,f)): (0 if g == 'like' else 1,-p))\n    ugliest_disliked_things = sorted(probs,key=lambda (p,(cd,g,f)): (0 if g == 'dislike' else 1,-p))\n    in_between_things = sorted(probs,key=lambda (p,(cd,g,f)): abs(p))\n\n    # and let's look at the most and least extreme dresses\n    cd = zip(X,raw_data)\n    least_extreme_things = sorted(cd,key=lambda (x,(d,g,f)): sum([abs(c) for c in x]))\n    most_extreme_things =  sorted(cd,key=lambda (x,(d,g,f)): sum([abs(c) for c in x]),reverse=True)\n\n    least_interesting_things = sorted(cd,key=lambda (x,(d,g,f)): 
max([abs(c) for c in x]))\n    most_interesting_things =  sorted(cd,key=lambda (x,(d,g,f)): min([abs(c) for c in x]),reverse=True)\n\n    for i in range(10):\n        Image.open(prettiest_liked_things[i][1][2]).save(directory + \"prettiest_pretty_\" + str(i) + \".png\")\n        Image.open(prettiest_disliked_things[i][1][2]).save(directory + \"prettiest_ugly_\" + str(i) + \".png\")\n        Image.open(ugliest_liked_things[i][1][2]).save(directory + \"ugliest_pretty_\" + str(i) + \".png\")\n        Image.open(ugliest_disliked_things[i][1][2]).save(directory + \"ugliest_ugly_\" + str(i) + \".png\")\n        Image.open(in_between_things[i][1][2]).save(directory + \"neither_pretty_nor_ugly_\" + str(i) + \".png\")\n        Image.open(least_extreme_things[i][1][2]).save(directory + \"least_extreme_\" + str(i) + \".png\")\n        Image.open(most_extreme_things[i][1][2]).save(directory + \"most_extreme_\" + str(i) + \".png\")\n        Image.open(least_interesting_things[i][1][2]).save(directory + \"least_interesting_\" + str(i) + \".png\")\n        Image.open(most_interesting_things[i][1][2]).save(directory + \"most_interesting_\" + str(i) + \".png\")\n\n    # and now let's look at precision-recall\n    probs = zip(clf.decision_function(X_test),raw_data[train_split:])\n    num_dislikes = len([c for c in y_test if c == 1])\n    num_likes = len([c for c in y_test if c == 0])\n    lowest_score = round(min([p[0] for p in probs]),1) - 0.1\n    highest_score = round(max([p[0] for p in probs]),1) + 0.1\n    INTERVAL = 0.1\n\n    # first do the likes\n    score = lowest_score\n    while score <= highest_score:\n        true_positives  = len([p for p in probs if p[0] <= score and p[1][1] == 'like'])\n        false_positives = len([p for p in probs if p[0] <= score and p[1][1] == 'dislike'])\n        positives = true_positives + false_positives\n        if positives > 0:\n            precision = 1.0 * true_positives / positives\n            recall = 1.0 * true_positives / 
num_likes\n            print \"likes\",score,precision,recall\n        score += INTERVAL\n\n    # then do the dislikes\n    score = highest_score\n    while score >= lowest_score:\n        true_positives  = len([p for p in probs if p[0] >= score and p[1][1] == 'dislike'])\n        false_positives = len([p for p in probs if p[0] >= score and p[1][1] == 'like'])\n        positives = true_positives + false_positives\n        if positives > 0:\n            precision = 1.0 * true_positives / positives\n            recall = 1.0 * true_positives / num_dislikes\n            print \"dislikes\",score,precision,recall\n        score -= INTERVAL\n\n    # now do both\n    score = lowest_score\n    while score <= highest_score:\n        likes  = len([p for p in probs if p[0] <= score and p[1][1] == 'like'])\n        dislikes = len([p for p in probs if p[0] <= score and p[1][1] == 'dislike'])\n        print score, likes, dislikes\n        score += INTERVAL\n\ndef showHistoryOfDress(dressName):\n    index = indexesForImageName(dressName)[0]\n    directory = \"results/history/dress\" + str(index) + \"/\"\n    makeFolder(directory)\n    dress = X[index]\n    origImage = raw_data[index][2]\n    Image.open(origImage).save(directory + \"dress_\" + str(index) + \"_original.png\")\n    for i in range(1,len(dress)):\n        reduced = dress[:i]\n        construct(reduced, directory + \"dress_\" + str(index) + \"_\" + str(i))\n\ndef bulkShowDressHistories(lo, hi):\n    for index in range(lo, hi):\n        directory = \"results/history/dress\" + str(index) + \"/\"\n        makeFolder(directory)\n        dress = X[index]\n        origImage = raw_data[index][2]\n        Image.open(origImage).save(directory + \"dress_\" + str(index) + \"_original.png\")\n        for i in range(1,len(dress)):\n            reduced = dress[:i]\n            construct(reduced, directory + \"dress_\" + str(index) + \"_\" + str(i))\n\ndef reconstruct(dress_number, saveName = 'reconstruct'):\n    eigenvalues = 
X[dress_number]\n    construct(eigenvalues, saveName)\n\ndef construct(eigenvalues, saveName = 'reconstruct'):\n    components = pca.components_\n    eigenzip = zip(eigenvalues,components)\n    N = len(components[0])   \n    r = [int(sum([w * c[i] for (w,c) in eigenzip]))\n                     for i in range(N)]\n    img = image_from_component_values(r)\n    img.save(saveName + '.png')\n\ndef image_from_component_values(component):\n    \"\"\"takes one of the principal components and turns it into an image\"\"\"\n    hi = max(component)\n    lo = min(component)\n    n = len(component) / 3\n    divisor = hi - lo\n    if divisor == 0:\n        divisor = 1\n    def rescale(x):\n        return int(255 * (x - lo) / divisor)\n    d = [(rescale(component[3 * i]),\n          rescale(component[3 * i + 1]),\n          rescale(component[3 * i + 2])) for i in range(n)]\n    im = Image.new('RGB',STANDARD_SIZE)\n    im.putdata(d)\n    return im\n\ndef makeRandomDress(saveName, liked):\n    randomArr = []\n    base = likesByComponent if liked else dislikesByComponent\n    for c in base[:100]:\n        mu = mean(c)\n        sigma = standard_deviation(c)\n        p = random.uniform(0.0, 1.0)\n        num = inverse_normal_cdf(p, mu, sigma)\n        randomArr.append(num)\n    construct(randomArr, 'results/createdDresses/' + saveName)\n\ndef reconstructKnownDresses():\n    print(\"reconstructing dresses...\")\n    directory = \"results/recreatedDresses/\"\n    makeFolder(directory)\n    for i in range(N_DRESSES_TO_SHOW):\n        Image.open(raw_data[i][2]).save(directory + str(i) + \"_original.png\")\n        saveName = directory + str(i) \n        reconstruct(i, saveName)\n\ndef createNewDresses():\n    print(\"creating brand new dresses...\")\n    directory = \"results/createdDresses/\"\n    makeFolder(directory)\n    for i in range(N_NEW_DRESSES_TO_CREATE):\n        saveNameLike = \"newLikeDress\" + str(i)\n        saveNameDislike = \"newDislikeDress\" + str(i)\n        
makeRandomDress(saveNameLike, True)\n        makeRandomDress(saveNameDislike, False)\n\ndef printComponentStatistics():\n    print(\"component statistics:\\n\")\n    for i in range(N_COMPONENTS_TO_SHOW):\n        print(\"component \" + str(i) + \":\")\n        likeComp = likesByComponent[i]\n        dislikeComp = dislikesByComponent[i]\n        print(\"means:                     like = \" + str(mean(likeComp)) + \"     dislike = \" + str(mean(dislikeComp)))\n        print(\"medians:                   like = \" + str(median(likeComp)) + \"     dislike = \" + str(median(dislikeComp)))\n        print(\"stdevs:                    like = \" + str(standard_deviation(likeComp)) + \"     dislike = \" + str(standard_deviation(dislikeComp)))\n        print(\"interquartile range:       like = \" + str(interquartile_range(likeComp)) + \"     dislike = \" + str(interquartile_range(dislikeComp)))\n        print(\"\\n\")\n\n\n\nlike_files = glob('images/like/Image*')\ndislike_files = glob('images/dislike/Image*')\n\nprocess_file = img_to_array\n\nprint('processing images...')\nprint('(this takes a long time if you have a lot of images)')\nraw_data = [(process_file(filename),'like',filename) for filename in like_files] + \\\n           [(process_file(filename),'dislike',filename) for filename in dislike_files]\n\n# randomly order the data\n#seed(0)\nshuffle(raw_data)\n\n# pull out the features and the labels\ndata = np.array([cd for (cd,_y,f) in raw_data])\nlabels = np.array([_y for (cd,_y,f) in raw_data])\n\nprint('finding principal components...')\npca = RandomizedPCA(n_components=N_COMPONENTS, random_state=0)\nX = pca.fit_transform(data)\ny = [1 if label == 'dislike' else 0 for label in labels]\n\nzipped = zip(X, raw_data)\nlikes = [x[0] for x in zipped if x[1][1] == \"like\"]\ndislikes = [x[0] for x in zipped if x[1][1] == \"dislike\"]\n\nlikesByComponent = zip(*likes)\ndislikesByComponent = zip(*dislikes)\nallByComponent = 
zip(*X)\n\n\n\nprintComponentStatistics()\n\ncreateEigendressPictures()\n\npredictiveModeling()\n\nreconstructKnownDresses()\n\nbulkShowDressHistories(0,1)\n\ncreateNewDresses()\n\n\n\n\n"
  }
]