Repository: graceavery/Eigenstyle
Branch: master
Commit: 4ae1e0cb7fc9
Files: 6
Total size: 16.1 KB

Directory structure:
gitextract_mxqrzhpu/
├── .gitignore
├── README.md
├── images/
│   ├── dislike/
│   │   └── .gitignore
│   └── like/
│       └── .gitignore
├── statistics.py
└── visuals.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
images/like/Image*
images/dislike/Image*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

================================================
FILE: README.md
================================================
Eigenstyle
======
Principal Component Analysis and Fashion

### To Use
- Find a bunch of images (I used images of dresses from Amazon).
- Put the ones that match your style in the "like" folder, and the others in the "dislike" folder
- In terminal, run
```bash
python visuals.py
```

### Results
You'll see the principal components in the "eigendresses" folder (examples shown are from my dataset; yours will be different).

![Eigendress](http://graceavery.com/eigenstyle/4_eigendress.png)![Eigendress](http://graceavery.com/eigenstyle/0_eigendress.png)

In the "history" folder, you'll see a known dress being rebuilt from its components.

![Dress from one component](http://graceavery.com/eigenstyle/dress_763_1.png)![Dress from four components](http://graceavery.com/eigenstyle/dress_763_4.png)![Dress from ten components](http://graceavery.com/eigenstyle/dress_763_10.png)![Dress from fifteen components](http://graceavery.com/eigenstyle/dress_763_15.png)![Dress from thirty components](http://graceavery.com/eigenstyle/dress_763_30.png)![Dress from seventy components](http://graceavery.com/eigenstyle/dress_763_70.png)

In the "recreatedDresses" folder, you can see just the end product of this process for different dresses.
![Original](http://graceavery.com/eigenstyle/6_original.png)![Recreated](http://graceavery.com/eigenstyle/6_recreated.png) In the "notableDresses" folder, you'll see the prettiest dresses, the ugliest dresses, the most extreme dresses (those that had high scores on many components), etc. ![Prettiest 1](http://graceavery.com/eigenstyle/prettiest_pretty_1.png)![Ugliest 2](http://graceavery.com/eigenstyle/ugliest_ugly_2.png) In the "createdDresses" folder, you'll find completely new dresses that were made from choosing random values for the principal components. ![New Dress](http://graceavery.com/eigenstyle/RandomDress5.png)![New Dress](http://graceavery.com/eigenstyle/RandomDress18.png) ### More Info [Blog post](http://blog.thehackerati.com/post/126701202241/eigenstyle) [Joel Grus's blog post](http://joelgrus.com/2013/06/24/t-shirts-feminism-parenting-and-data-science-part-2-eigenshirts/) ================================================ FILE: images/dislike/.gitignore ================================================ # Ignore everything in this directory * # Except this file !.gitignore ================================================ FILE: images/like/.gitignore ================================================ # Ignore everything in this directory * # Except this file !.gitignore ================================================ FILE: statistics.py ================================================ from collections import defaultdict from random import shuffle, seed import numpy as np import math # The following methods are from Joel Grus # https://github.com/joelgrus/data-science-from-scratch def mean(x): return sum(x) / (len(x) * 1.0) def median(v): """finds the 'middle-most' value of v""" n = len(v) sorted_v = sorted(v) midpoint = n // 2 if n % 2 == 1: # if odd, return the middle value return sorted_v[midpoint] else: # if even, return the average of the middle values lo = midpoint - 1 hi = midpoint return (sorted_v[lo] + sorted_v[hi]) / 2 def data_range(x): return 
max(x) - min(x) def de_mean(x): """translate x by subtracting its mean (so the result has mean 0)""" x_bar = mean(x) return [x_i - x_bar for x_i in x] def variance(x): """assumes x has at least two elements""" n = len(x) deviations = de_mean(x) return sum_of_squares(deviations) / (n - 1) def standard_deviation(x): return math.sqrt(variance(x)) def quantile(x, p): """returns the pth-percentile value in x""" p_index = int(p * len(x)) return sorted(x)[p_index] def interquartile_range(x): return quantile(x, 0.75) - quantile(x, 0.25) def dot(v, w): """v_1 * w_1 + ... + v_n * w_n""" return sum(v_i * w_i for v_i, w_i in zip(v, w)) def sum_of_squares(v): """v_1 * v_1 + ... + v_n * v_n""" return dot(v, v) def normal_cdf(x, mu=0,sigma=1): return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001): """find approximate inverse using binary search""" # if not standard, compute standard and rescale if mu != 0 or sigma != 1: return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) low_z, low_p = -10.0, 0 # normal_cdf(-10) is (very close to) 0 hi_z, hi_p = 10.0, 1 # normal_cdf(10) is (very close to) 1 while hi_z - low_z > tolerance: mid_z = (low_z + hi_z) / 2 # consider the midpoint mid_p = normal_cdf(mid_z) # and the cdf's value there if mid_p < p: # midpoint is still too low, search above it low_z, low_p = mid_z, mid_p elif mid_p > p: # midpoint is still too high, search below it hi_z, hi_p = mid_z, mid_p else: break return mid_z ================================================ FILE: visuals.py ================================================ from PIL import Image import PIL.ImageOps from collections import defaultdict from glob import glob from random import shuffle, seed import numpy as np import pylab as pl import pandas as pd import re from sklearn.decomposition import RandomizedPCA from sklearn.linear_model import LogisticRegression import math import random import os from statistics import mean, median, 
N_COMPONENTS = 50
N_COMPONENTS_TO_SHOW = 10
N_DRESSES_TO_SHOW = 5
N_NEW_DRESSES_TO_CREATE = 20

# this is the size of all the Amazon.com images
# If you are using a different source, change the size here
STANDARD_SIZE = (200, 260)


def img_to_array(filename):
    """takes a filename and turns it into a numpy array of RGB pixels"""
    img = Image.open(filename)
    img = img.resize(STANDARD_SIZE)
    # a real list of lists is required: on Python 3, map() is a lazy iterator
    # and np.array(map(...)) would build a useless 0-d object array
    pixels = [list(px) for px in img.getdata()]
    arr = np.array(pixels)
    s = arr.shape[0] * arr.shape[1]
    return arr.reshape(1, s)[0]


def makeFolder(directory):
    """create directory (and parents) if it does not already exist"""
    if not os.path.exists(directory):
        os.makedirs(directory)


# write out each eigendress and the dresses that most and least match it
# the file names here are chosen because of the order i wanna look at the results
# (when displayed alphabetically in finder)
def createEigendressPictures():
    print("creating eigendress pictures")
    directory = "results/eigendresses/"
    makeFolder(directory)
    for i in range(N_COMPONENTS_TO_SHOW):
        component = pca.components_[i]
        img = image_from_component_values(component)
        img.save(directory + str(i) + "_eigendress___.png")
        reverse_img = PIL.ImageOps.invert(img)
        reverse_img.save(directory + str(i) + "_eigendress_inverted.png")
        # rank every dress by its score on component i
        # (tuple-parameter lambdas were removed in Python 3; index explicitly)
        ranked_dresses = sorted(enumerate(X), key=lambda pair: pair[1][i])
        for j in range(N_DRESSES_TO_SHOW):
            most_j = -1 - j  # j-th from the top of the ranking
            Image.open(raw_data[ranked_dresses[most_j][0]][2]).save(
                directory + str(i) + "_eigendress__most" + str(j) + ".png")
            Image.open(raw_data[ranked_dresses[j][0]][2]).save(
                directory + str(i) + "_eigendress_least" + str(j) + ".png")


def indexesForImageName(imageName):
    """indexes into raw_data of every image whose filename contains imageName"""
    return [i for (i, (cd, _y, f)) in enumerate(raw_data) if imageName in f]


def predictiveModeling():
    """fit a logistic-regression like/dislike model, save the notable dresses,
    and print precision/recall along the decision-function score axis"""
    print("logistic regression...")
    directory = "results/notableDresses/"
    makeFolder(directory)

    # split the data into a training set and a test set
    train_split = int(len(data) * 4.0 / 5.0)
    X_train = X[:train_split]
    X_test = X[train_split:]
    y_train = y[:train_split]
    y_test = y[train_split:]

    # if you wanted to use a different model, you'd specify that here
    clf = LogisticRegression(penalty='l2')
    clf.fit(X_train, y_train)
    print("score", clf.score(X_test, y_test))

    # first, let's find the model score for every dress in our dataset
    # (list() is essential: Python 3's zip is a single-use iterator and we
    # sort this five times)
    probs = list(zip(clf.decision_function(X), raw_data))
    # each entry is (score, (component_data, group, filename));
    # sort the target group first, then by score
    prettiest_liked_things = sorted(probs, key=lambda pg: (0 if pg[1][1] == 'like' else 1, pg[0]))
    prettiest_disliked_things = sorted(probs, key=lambda pg: (0 if pg[1][1] == 'dislike' else 1, pg[0]))
    ugliest_liked_things = sorted(probs, key=lambda pg: (0 if pg[1][1] == 'like' else 1, -pg[0]))
    ugliest_disliked_things = sorted(probs, key=lambda pg: (0 if pg[1][1] == 'dislike' else 1, -pg[0]))
    in_between_things = sorted(probs, key=lambda pg: abs(pg[0]))

    # and let's look at the most and least extreme dresses
    cd = list(zip(X, raw_data))
    least_extreme_things = sorted(cd, key=lambda xr: sum(abs(c) for c in xr[0]))
    most_extreme_things = sorted(cd, key=lambda xr: sum(abs(c) for c in xr[0]), reverse=True)
    least_interesting_things = sorted(cd, key=lambda xr: max(abs(c) for c in xr[0]))
    most_interesting_things = sorted(cd, key=lambda xr: min(abs(c) for c in xr[0]), reverse=True)

    for i in range(10):
        Image.open(prettiest_liked_things[i][1][2]).save(directory + "prettiest_pretty_" + str(i) + ".png")
        Image.open(prettiest_disliked_things[i][1][2]).save(directory + "prettiest_ugly_" + str(i) + ".png")
        Image.open(ugliest_liked_things[i][1][2]).save(directory + "ugliest_pretty_" + str(i) + ".png")
        # fixed: the original accidentally wrote "directoryugliest_ugly_",
        # breaking the naming scheme the README documents
        Image.open(ugliest_disliked_things[i][1][2]).save(directory + "ugliest_ugly_" + str(i) + ".png")
        Image.open(in_between_things[i][1][2]).save(directory + "neither_pretty_nor_ugly_" + str(i) + ".png")
        Image.open(least_extreme_things[i][1][2]).save(directory + "least_extreme_" + str(i) + ".png")
        Image.open(most_extreme_things[i][1][2]).save(directory + "most_extreme_" + str(i) + ".png")
        Image.open(least_interesting_things[i][1][2]).save(directory + "least_interesting_" + str(i) + ".png")
        Image.open(most_interesting_things[i][1][2]).save(directory + "most_interesting_" + str(i) + ".png")

    # and now let's look at precision-recall
    probs = list(zip(clf.decision_function(X_test), raw_data[train_split:]))
    num_dislikes = len([c for c in y_test if c == 1])
    num_likes = len([c for c in y_test if c == 0])
    lowest_score = round(min(p[0] for p in probs), 1) - 0.1
    highest_score = round(max(p[0] for p in probs), 1) + 0.1
    INTERVAL = 0.1

    # first do the likes (low scores predict 'like')
    score = lowest_score
    while score <= highest_score:
        true_positives = len([p for p in probs if p[0] <= score and p[1][1] == 'like'])
        false_positives = len([p for p in probs if p[0] <= score and p[1][1] == 'dislike'])
        positives = true_positives + false_positives
        if positives > 0:
            precision = 1.0 * true_positives / positives
            recall = 1.0 * true_positives / num_likes
            print("likes", score, precision, recall)
        score += INTERVAL

    # then do the dislikes (high scores predict 'dislike')
    score = highest_score
    while score >= lowest_score:
        true_positives = len([p for p in probs if p[0] >= score and p[1][1] == 'dislike'])
        false_positives = len([p for p in probs if p[0] >= score and p[1][1] == 'like'])
        positives = true_positives + false_positives
        if positives > 0:
            precision = 1.0 * true_positives / positives
            recall = 1.0 * true_positives / num_dislikes
            print("dislikes", score, precision, recall)
        score -= INTERVAL

    # now do both
    score = lowest_score
    while score <= highest_score:
        likes = len([p for p in probs if p[0] <= score and p[1][1] == 'like'])
        dislikes = len([p for p in probs if p[0] <= score and p[1][1] == 'dislike'])
        print(score, likes, dislikes)
        score += INTERVAL


def _saveDressHistory(index):
    """save dress `index`'s original image plus one reconstruction per
    component-count prefix (shared by the two history entry points below)"""
    directory = "results/history/dress" + str(index) + "/"
    makeFolder(directory)
    dress = X[index]
    origImage = raw_data[index][2]
    Image.open(origImage).save(directory + "dress_" + str(index) + "_original.png")
    for i in range(1, len(dress)):
        reduced = dress[:i]
        construct(reduced, directory + "dress_" + str(index) + "_" + str(i))


def showHistoryOfDress(dressName):
    """history for the first dress whose filename contains dressName"""
    index = indexesForImageName(dressName)[0]
    _saveDressHistory(index)


def bulkShowDressHistories(lo, hi):
    """histories for every dress index in range(lo, hi)"""
    for index in range(lo, hi):
        _saveDressHistory(index)


def reconstruct(dress_number, saveName='reconstruct'):
    """rebuild a known dress from its full vector of component scores"""
    eigenvalues = X[dress_number]
    construct(eigenvalues, saveName)


def construct(eigenvalues, saveName='reconstruct'):
    """weighted sum of principal components -> image saved as saveName.png;
    eigenvalues may be a prefix of the component scores"""
    components = pca.components_
    # materialize: a bare zip object would be exhausted after the first pass
    # of the comprehension below on Python 3
    eigenzip = list(zip(eigenvalues, components))
    N = len(components[0])
    r = [int(sum(w * c[i] for (w, c) in eigenzip)) for i in range(N)]
    img = image_from_component_values(r)
    img.save(saveName + '.png')


def image_from_component_values(component):
    """takes one of the principal components and turns it into an image"""
    hi = max(component)
    lo = min(component)
    # floor division: plain `/` yields a float on Python 3 and breaks range(n)
    n = len(component) // 3
    divisor = hi - lo
    if divisor == 0:
        divisor = 1  # flat component: avoid division by zero

    def rescale(x):
        # map [lo, hi] linearly onto the 0-255 channel range
        return int(255 * (x - lo) / divisor)

    d = [(rescale(component[3 * i]),
          rescale(component[3 * i + 1]),
          rescale(component[3 * i + 2]))
         for i in range(n)]
    im = Image.new('RGB', STANDARD_SIZE)
    im.putdata(d)
    return im


def makeRandomDress(saveName, liked):
    """invent a dress: sample each of the first 100 components from a normal
    distribution fitted to the liked (or disliked) dresses' scores"""
    randomArr = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        randomArr.append(num)
    construct(randomArr, 'results/createdDresses/' + saveName)


def reconstructKnownDresses():
    """save side-by-side originals and full PCA reconstructions"""
    print("reconstructing dresses...")
    directory = "results/recreatedDresses/"
    makeFolder(directory)
    for i in range(N_DRESSES_TO_SHOW):
        Image.open(raw_data[i][2]).save(directory + str(i) + "_original.png")
        saveName = directory + str(i)
        reconstruct(i, saveName)


def createNewDresses():
    """generate brand-new liked-style and disliked-style dresses"""
    print("creating brand new dresses...")
    directory = "results/createdDresses/"
    makeFolder(directory)
    for i in range(N_NEW_DRESSES_TO_CREATE):
        saveNameLike = "newLikeDress" + str(i)
        saveNameDislike = "newDislikeDress" + str(i)
        makeRandomDress(saveNameLike, True)
        makeRandomDress(saveNameDislike, False)


def printComponentStatistics():
    """print summary statistics of each shown component, split like/dislike"""
    print("component statistics:\n")
    for i in range(N_COMPONENTS_TO_SHOW):
        print("component " + str(i) + ":")
        likeComp = likesByComponent[i]
        dislikeComp = dislikesByComponent[i]
        print("means: like = " + str(mean(likeComp)) + " dislike = " + str(mean(dislikeComp)))
        print("medians: like = " + str(median(likeComp)) + " dislike = " + str(median(dislikeComp)))
        print("stdevs: like = " + str(standard_deviation(likeComp)) + " dislike = " + str(standard_deviation(dislikeComp)))
        print("interquartile range: like = " + str(interquartile_range(likeComp)) + " dislike = " + str(interquartile_range(dislikeComp)))
        print("\n")


like_files = glob('images/like/Image*')
dislike_files = glob('images/dislike/Image*')
process_file = img_to_array

print('processing images...')
print('(this takes a long time if you have a lot of images)')
raw_data = [(process_file(filename), 'like', filename) for filename in like_files] + \
           [(process_file(filename), 'dislike', filename) for filename in dislike_files]

# randomly order the data
# seed(0)
shuffle(raw_data)

# pull out the features and the labels
data = np.array([cd for (cd, _y, f) in raw_data])
labels = np.array([_y for (cd, _y, f) in raw_data])

print('finding principal components...')
pca = RandomizedPCA(n_components=N_COMPONENTS, random_state=0)
X = pca.fit_transform(data)
y = [1 if label == 'dislike' else 0 for label in labels]

# materialize every zip: Python 3 zip objects are single-use iterators, and
# these are indexed and iterated repeatedly by the functions above
zipped = list(zip(X, raw_data))
likes = [x[0] for x in zipped if x[1][1] == "like"]
dislikes = [x[0] for x in zipped if x[1][1] == "dislike"]
likesByComponent = list(zip(*likes))
dislikesByComponent = list(zip(*dislikes))
allByComponent = list(zip(*X))

printComponentStatistics()
createEigendressPictures()
predictiveModeling()
reconstructKnownDresses()
bulkShowDressHistories(0, 1)
createNewDresses()