Repository: ScarletPan/Kaggle-Rental-Listing-Inquireies Branch: master Commit: 4b177f8dffed Files: 10 Total size: 60.3 KB Directory structure: gitextract_aw7n92_y/ ├── README.md ├── classifiers.py ├── modelTraining.py ├── ppt/ │ └── AIC-Sharing-11-19.pptx ├── preprocess.py └── stack/ ├── StackNet.jar ├── params.txt ├── parse.py ├── start.sh └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # README * ```preprocess.py```: data cleaning, feature engineering * ```modelTraining.py```: cross validation, submission generating, stacking preparing * ```classifiers.py```: my encapsulation of xgboost * stack * ```StackNet.jar```: stacking tools shared by KazAnova, repo is [here](https://github.com/kaz-Anova/StackNet) * ```parse.py```: tools for evaluate the cv scores during stacking. * ```utils.py```: generating submission after StackNet * ```start.sh```: commands for executing StackNet * ```params.txt```: my params for stacking ### links: * [Kaggle:Rental Listing Inquireies](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries) * [Summary of getting a silver medal in kaggle](http://scarletpan.github.io/summary-of-get-a-silver-medal-in-kaggle/) * [Kaggle 首战拿银总结 | 入门指导 (长文、干货) -- 知乎专栏](https://zhuanlan.zhihu.com/p/26645088) * [AI Challenge 分享会PPT](https://github.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/blob/master/ppt/AIC-Sharing-11-19.pptx) ================================================ FILE: classifiers.py ================================================ import xgboost as xgb import numpy as np from sklearn.metrics import log_loss class xgboostClassifier(): def __init__(self, **params): self.clf = None self.progress = {} self.params = params def fit(self, X, y): xg_train = xgb.DMatrix(X, label=y) self.clf = xgb.train(self.params, xg_train, self.params['num_rounds']) def fit_CV(self, X_train, X_val, y_train, y_val): xg_train = xgb.DMatrix(X_train, label=y_train) xg_val = xgb.DMatrix(X_val, label=y_val) watchlist = [(xg_train, 'train'), (xg_val, 'eval')] self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'], watchlist, early_stopping_rounds=200, evals_result=self.progress) def get_eval_res(self): return self.progress def score(self, X, y): Y = self.predict_proba(X) return 1 / log_loss(y, Y) def predict_proba(self, X_test): res = self.clf.predict(xgb.DMatrix(X_test)) return res.astype(np.float32) def predict(self, X_test): res = np.argmax(self.clf.predict(xgb.DMatrix(X_test)), axis=1) return res def get_params(self, **params): return self.params def set_params(self, **params): self.params.update(params) def getSortedImportance(self, features): with open('xgb.fmap', 'w') as f: for i in range(len(features)): f.write('{0}\t{1}\tq\n'.format(i, features[i])) importance = self.clf.get_fscore(fmap='xgb.fmap') importance = sorted(importance.items(), key=operator.itemgetter(1)) #print(importance) return importance class BaseClassifier(object): def __init__(self, clf, seed=0, params=None): params['random_state'] = seed self.clf = clf(**params) def train(self, x_train, y_train): self.clf.fit(x_train, y_train) def predict(self, x): return self.clf.predict(x) def predict_proba(self, x): return self.clf.predict_proba(x) def fit(self,x,y): return self.clf.fit(x,y) def set_params(self, **params): self.params.update(params) ================================================ FILE: modelTraining.py ================================================ import sys import time import random from collections import defaultdict import numpy as np import pandas as pd from sklearn.model_selection import KFold, StratifiedKFold, train_test_split from sklearn.metrics import log_loss from preprocess import coreProcess from classifiers import xgboostClassifier TRAIN_FILE_NAME = '~/Kaggle/RLI/input/train.json' TEST_FILE_NAME = '~/Kaggle/RLI/input/test.json' target_num_map = {'high': 0, 'medium': 1, 'low': 2} train_data = pd.read_json(TRAIN_FILE_NAME).reset_index() test_data = pd.read_json(TEST_FILE_NAME).reset_index() list_img_time = pd.read_csv("~/Kaggle/RLI/input/listing_image_time.csv") train_data = train_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner') test_data = test_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner') RS = 2016 random.seed(RS) np.random.seed(RS) # RS = 0 def validation_score(early_stop=False): clf = xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 3, eta = 0.04, max_depth = 6, subsample = 0.7, colsample_bytree = 1.0, colsample_bylevel = 0.7, min_child_weight=1, silent = 1, num_rounds = 700, seed = RS, ) print("*** Validation start ***") data = train_data.copy() y = data["interest_level"].apply(lambda x: target_num_map[x]) del data["interest_level"] # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True) skf = StratifiedKFold(n_splits=3, shuffle=False) cv_scores = [] i = 0 for train_idx, val_idx in skf.split(data, y): i += 1 X = data.copy() y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) clf.fit(X_train, y_train) # clf.fit_CV(X_train, X_val, y_train, y_val) y_val_pred = clf.predict_proba(X_val) loss = log_loss(y_val, y_val_pred) print("Iteration {}'s loss: {}".format(i, loss)) cv_scores.append(loss) if early_stop: break print("*** Validation finished ***\n") return cv_scores def validation_avg_score(clfs): print("*** Validation start ***") data = train_data.copy() y = data["interest_level"].apply(lambda x: target_num_map[x]) del data["interest_level"] # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True) skf = StratifiedKFold(n_splits=3) cv_scores = {i:[] for i in range(len(clfs))} cv_scores["Avg"] = [] i = 0 for train_idx, val_idx in skf.split(data, y): i += 1 X = data.copy() y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) tmp = [] preds = [] j = 0 for clf in clfs: clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) tmp.append(y_val_pred) loss = log_loss(y_val, y_val_pred) cv_scores[j].append(loss) preds.append(y_val_pred) j += 1 print("clf_{}, Iteration {}'s loss: {}".format(j, i, loss)) preds = np.array(preds) avg_pred = np.mean(preds, axis=0) loss = log_loss(y_val, avg_pred) cv_scores["Avg"].append(loss) print("Iteration {}'s Avg loss: {}".format(i, loss)) for i in range(len(clfs)): print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i]))) print("Average validation loss : {}".format(np.mean(cv_scores["Avg"]))) print("*** Validation finished ***\n") return cv_scores["Avg"] def paramSearch(clf, param_dict): def outer_join(left, right): if left == []: return right if right == []: return left res = [] for i in left: for j in right: if isinstance(i, list): tmp = i[:] tmp.append(j) res.append(tmp) else: res.append([i, j]) return res # Creating list of param_dict param_list = sorted(param_dict.items(), key=lambda x: x[0]) param_keys = [ item[0] for item in param_list ] param_vals = [ item[1] for item in param_list ] all_vals = [] for val in param_vals: all_vals = outer_join(all_vals, val) all_param_lists = [] for vals in all_vals: all_param_lists.append(dict(zip(param_keys, vals))) # for item in all_param_lists: # print(item) # Searching best_score = float('inf') best_params = None scores = [] i = 0 for params in all_param_lists: print("\n" + "-" * 70) for param_name in params.keys(): print("{} : {}".format(param_name, params[param_name])) clf.set_params(**params) score = np.mean(validation_score(clf)) if score < best_score: best_score = score best_params = params i += 1 print("{} / {}, Done".format(i, len(all_param_lists))) print("Score: ", score) scores.append(score) print(scores) print("Best parameters:") for param_name in best_params.keys(): print("{} : {}".format(param_name, best_params[param_name])) print("Score: ", best_score) def gen_sub(): train = train_data.copy() train_idx = [i for i in range(train.shape[0])] test = test_data.copy() test_idx = [i + train.shape[0] for i in range(test.shape[0])] y = train["interest_level"].apply(lambda x: target_num_map[x]) del train["interest_level"] data = pd.concat([train, test]).reset_index() X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) xgb_clf = xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 12, eta = 0.02, max_depth = 6, subsample = 0.8, colsample_bytree = 1.0, colsample_bylevel = 0.8, min_child_weight=1, silent = 1, num_rounds = 1700, seed = RS, ) print("Trainning:...") xgb_clf.fit(X_train, y) preds = xgb_clf.predict_proba(X_test) sub = pd.DataFrame(preds) # sub.columns = ["high", "medium", "low"] sub.columns = [ "high", "medium", "low"] sub["listing_id"] = test.listing_id.values sub.to_csv("submission.csv", index=False) def genAvgSub(clfs): train = train_data.copy() train_idx = [i for i in range(train.shape[0])] test = test_data.copy() test_idx = [i + train.shape[0] for i in range(test.shape[0])] y = train["interest_level"].apply(lambda x: target_num_map[x]) del train["interest_level"] data = pd.concat([train, test]).reset_index() X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) print("Trainning:...") preds = [] for i in range(len(clfs)): print("Clf_{} fiting".format(i)) clfs[i].fit(X_train, y) print("Clf_{} predicting".format(i)) pred = clfs[i].predict_proba(X_test) preds.append(pred) sub = pd.DataFrame(np.mean(preds, axis=0)) # sub.columns = ["high", "medium", "low"] sub.columns = [ "high", "medium", "low"] sub["listing_id"] = test.listing_id.values sub.to_csv("submission.csv", index=False) print("Train done.") def validate(clfs): cv_scores = validation_avg_score(clfs) return cv_scores def search(): param_dict = { 'eta' : [0.02], 'max_depth' : [6], 'subsample' : [0.8], 'colsample_bylevel' : [0.7], 'num_rounds' : [1400, 1500, 1600, 1650], } clf = xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 12, eta = 0.04, max_depth = 6, subsample = 0.7, colsample_bytree = 1.0, colsample_bylevel = 1.0, min_child_weight=1, silent = 1, num_rounds = 700, seed = RS, ) paramSearch(clf, param_dict) def write2file(cv_scores, val_desc=None): print("*" * 50) print("Cross validation loss: ", np.mean(cv_scores)) with open("results.log", "a") as fp: fp.write(time.strftime("%m/%d/%Y %H:%M") + '\n') if(val_desc is not None): fp.write(val_desc + '\n') for score in cv_scores: fp.write(str(score) + " ") fp.write("\nCross Validation: {}\n".format(np.array(cv_scores).mean())) fp.write("*" * 50 + "\n") def stacking(clfs): print("Stacking") train = train_data.copy() test = test_data.copy() y = train["interest_level"].apply(lambda x: target_num_map[x]) del train["interest_level"] train_stackers = [] for RS in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]: skf = StratifiedKFold(n_splits=10, random_state=RS, shuffle=True) #Create Arrays for meta train_stacker = [[0.0 for s in range(3)] for k in range (0,(train.shape[0]))] cv_scores = {i:[] for i in range(len(clfs))} cv_scores["Avg"] = [] print("Begin 10-flod cross validation") cnt = 0 for train_idx, val_idx in skf.split(train, y): cnt += 1 X = train.copy() y_train, y_val = y.iloc[train_idx], y.iloc[val_idx] X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx) X_train.toarray() preds = [] k = 0 for clf in clfs: clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) loss = log_loss(y_val, y_val_pred) preds.append(y_val_pred) cv_scores[k].append(loss) k += 1 print("Clf_{} iteration {}'s loss: {}".format(k, cnt, loss)) preds = np.array(preds) avg_pred = np.mean(preds, axis=0) avg_loss = log_loss(y_val, avg_pred) cv_scores["Avg"].append(avg_loss) print("Iteration {}'s Avg loss: {}".format(cnt, avg_loss)) no = 0 for real_idx in val_idx: for i in range(3): train_stacker[real_idx][i] = avg_pred[no][i] no += 1 for i in range(len(clfs)): print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i]))) print("Average validation loss : {}".format(np.mean(cv_scores["Avg"]))) train_stackers.append(train_stacker) train_stacker = np.mean(train_stackers, axis=0) print("*** Validation finished ***\n") test_stacker = [[0.0 for s in range(3)] for k in range (0,(test.shape[0]))] train_idx = [i for i in range(train.shape[0])] test_idx = [i + train.shape[0] for i in range(test.shape[0])] data = pd.concat([train, test]).reset_index() X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx) print(X_train.shape, len(train_stacker)) print("Begin predicting") preds = [] for i in range(len(clfs)): print("Clf_{} fiting".format(i)) clfs[i].fit(X_train, y) print("Clf_{} predicting".format(i)) pred = clfs[i].predict_proba(X_test) preds.append(pred) preds = np.mean(preds, axis=0) for pr in range (0, len(preds)): for d in range (0,3): test_stacker[pr][d]=(preds[pr][d]) print ("merging columns") #stack xgboost predictions X_train = np.column_stack((X_train.toarray(),train_stacker)) # stack id to test X_test = np.column_stack((X_test.toarray(),test_stacker)) # stack target to train X = np.column_stack((y,X_train)) ids = test.listing_id.values X_test = np.column_stack((ids, X_test)) np.savetxt("./train_stacknet.csv", X, delimiter=",", fmt='%.5f') np.savetxt("./test_stacknet.csv", X_test, delimiter=",", fmt='%.5f') print("Write results...") output_file = "submission_{}.csv".format(np.mean(cv_scores["Avg"])) print("Writing submission to %s" % output_file) f = open(output_file, "w") f.write("listing_id,high,medium,low\n")# the header for g in range(0, len(test_stacker)) : f.write("%s" % (ids[g])) for prediction in test_stacker[g]: f.write(",%f" % (prediction)) f.write("\n") f.close() print("Done.") if __name__ == "__main__": clfs = [] # clfs.append(xgboostClassifier( # objective = 'multi:softprob', # eval_metric = 'mlogloss', # num_class = 3, # nthread = 6, # eta = 0.04, # max_depth = 6, # subsample = 0.7, # colsample_bytree = 1.0, # colsample_bylevel = 0.7, # min_child_weight=1, # silent = 1, # num_rounds = 700, # seed = 0, # )) # clfs.append(xgboostClassifier( # objective = 'multi:softprob', # eval_metric = 'mlogloss', # num_class = 3, # nthread = 6, # eta = 0.02, # max_depth = 6, # subsample = 0.8, # colsample_bytree = 1.0, # colsample_bylevel = 0.8, # min_child_weight=1, # silent = 1, # num_rounds = 1700, # seed = 0, # )) clfs.append(xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 9, eta = 0.02, max_depth = 6, subsample = 0.8, colsample_bytree = 1.0, colsample_bylevel = 0.7, min_child_weight=1, silent = 1, num_rounds = 1500, seed = 0, )) clfs.append(xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 9, eta = 0.02, max_depth = 6, subsample = 0.8, colsample_bytree = 1.0, colsample_bylevel = 0.8, min_child_weight=1, silent = 1, num_rounds = 1500, seed = 128, )) clfs.append(xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 9, eta = 0.02, max_depth = 6, subsample = 0.8, colsample_bytree = 1.0, colsample_bylevel = 0.8, min_child_weight=1, silent = 1, num_rounds = 1500, seed = 512, )) clfs.append(xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 9, eta = 0.02, max_depth = 6, subsample = 0.8, colsample_bytree = 1.0, colsample_bylevel = 0.8, min_child_weight=1, silent = 1, num_rounds = 1500, seed = 1024, )) clfs.append(xgboostClassifier( objective = 'multi:softprob', eval_metric = 'mlogloss', num_class = 3, nthread = 9, eta = 0.02, max_depth = 6, subsample = 0.8, colsample_bytree = 1.0, colsample_bylevel = 0.8, min_child_weight=1, silent = 1, num_rounds = 1500, seed = 2048, )) if len(sys.argv) == 1: cv_scores = validate(clfs) write2file(cv_scores) elif len(sys.argv) == 2: if sys.argv[1] == '-v': cv_scores = validate(clfs) write2file(cv_scores) elif sys.argv[1] == '-g': gen_sub() elif sys.argv[1] == '-s': search() elif sys.argv[1] == '-ga': genAvgSub(clfs) elif sys.argv[1] == '-stack': stacking(clfs) elif sys.argv[1] == '-v3': cv_scores = validate(clfs) val_desc = sys.argv[2] write2file(cv_scores, val_desc) elif len(sys.argv) == 3: if sys.argv[1] == '-v': cv_scores = validate(clfs) val_desc = sys.argv[2] write2file(cv_scores, val_desc) elif sys.argv[1] == '-g': gen_sub() elif sys.argv[1] == '-v3': cv_scores = validation_score() val_desc = sys.argv[2] write2file(cv_scores, val_desc) ================================================ FILE: preprocess.py ================================================ #!/usr/bin/python3 #-*- encoding: utf-8 -*- import sys import random import operator import datetime import time from collections import defaultdict, Counter import pandas as pd import numpy as np from scipy import sparse import xgboost as xgb from sklearn import preprocessing from sklearn.model_selection import train_test_split, GridSearchCV, KFold from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.cluster import KMeans from sklearn.cross_validation import StratifiedKFold from sklearn.preprocessing import StandardScaler from nltk.metrics import distance as distance FEATURE_NOT_USE = ['created','description','features','photos', 'index']# ,'bathrooms', 'bedrooms''listing_id', FEATURE_NOT_USE.append('display_address') FEATURE_NOT_USE.extend(['low_build_frac', 'high_build_frac', 'medium_build_frac', 'build_count'])# FEATURE_NOT_USE.extend(['low_manager_frac', 'high_manager_frac', 'medium_manager_frac','manager_count'])# FEATURE_NOT_USE.extend(['Listing_Id', 'img_created']) # , 'time_stamp' def bedroomProcess(data, train_idx, test_idx): # Some basic feature from bedrooms data["no_bedroom"] = data["bedrooms"].apply(lambda x: 1 if x == 0 else 0) data["more_than_5_bedroom"] = data["bedrooms"].apply(lambda x: 1 if x >= 5 else 0) data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bedrooms"] = 0.001 train = data.iloc[train_idx, :].copy() test = data.iloc[test_idx, :].copy() # remove null value (ugly code) train.loc[data["bedrooms"] == 0.001, "bathrooms"] = train["bathrooms"].mean() test.loc[data["bedrooms"] == 0.001, "bathrooms"] = test["bathrooms"].mean() data.iloc[train_idx, :] = train data.iloc[test_idx, :] = test data["bedroom_per_room"] = data["bedrooms"] / (data["bedrooms"] + data["bathrooms"]) data.loc[data["bedrooms"] == 0.001, "bathrooms"] = 0 data.loc[data["bedrooms"] == 0.001, "bedrooms"] = 0 return data def bathroomProcess(data, train_idx, test_idx): # Some basic feature from bathrooms data.loc[data["bathrooms"] == 112, "bathrooms"] = 1.5 data.loc[data["bathrooms"] == 10, "bathrooms"] = 1 data.loc[data["bathrooms"] == 20, "bathrooms"] = 2 data["1_to_2_bathrooms"] = data["bathrooms"].apply(lambda x : 1if x != 0 and x <= 2 else 0) data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bathrooms"] = 0.001 train = data.iloc[train_idx, :].copy() test = data.iloc[test_idx, :].copy() # remove null value (ugly code) train.loc[data["bathrooms"] == 0.001, "bedrooms"] = train["bedrooms"].mean() test.loc[data["bathrooms"] == 0.001, "bedrooms"] = test["bedrooms"].mean() data.iloc[train_idx, :] = train data.iloc[test_idx, :] = test data["bathoom_per_room"] = data["bathrooms"] / (data["bedrooms"] + data["bathrooms"]) data.loc[data["bathrooms"] == 0.001, "bedrooms"] = 0 data.loc[data["bathrooms"] == 0.001, "bathrooms"] = 0 return data def buildingIdProcess(data, y, train_idx, test_idx): # Have tried some ideas but failed return data def createdProcess(data): # Some basic features from created data["created"] = pd.to_datetime(data['created']) data["latest"] = (data["created"]- data["created"].min()) data["latest"] = data["latest"].apply(lambda x: x.total_seconds()) data["passed"] = (data["created"].max()- data["created"]) data["passed"] = data["passed"].apply(lambda x: x.total_seconds()) # year is weird data["year"] = data["created"].dt.year data['month'] = data['created'].dt.month data['day'] = data['created'].dt.day data['hour'] = data['created'].dt.hour data['weekday'] = data['created'].dt.weekday data['week'] = data['created'].dt.week data['quarter'] = data['created'].dt.quarter data['weekend'] = ((data['weekday'] == 5) & (data['weekday'] == 6)) data['weekend'] = data['weekend'].apply(int) # data["created_stamp"] = data["created"].apply(lambda x: time.mktime(x.timetuple())) #* data["latest_list_rank"] = data["latest"] / data["listing_id"] # data["diff_rank_2"] = data["passed"] / data["listing_id"] #* # image time after leak data.loc[data["time_stamp"] > 1490000000, "time_stamp"] = 1478524550 data["img_created"] = data["time_stamp"].apply(lambda x: datetime.datetime.fromtimestamp(x)) data["img_latest"] = (data["img_created"]- data["img_created"].min()) data["img_latest"] = data["img_latest"].apply(lambda x: x.total_seconds()) data["img_passed"] = (data["img_created"].max()- data["img_created"]) data["img_passed"] = data["img_passed"].apply(lambda x: x.total_seconds()) data["img_year"] = data["img_created"].dt.year data['img_month'] = data['img_created'].dt.month data['img_day'] = data['img_created'].dt.day data['img_hour'] = data['img_created'].dt.hour # data['img_weekday'] = data['img_created'].dt.weekday # data['img_week'] = data['img_created'].dt.week # data['img_quarter'] = data['img_created'].dt.quarter # data['img_weekend'] = ((data['img_weekday'] == 5) & (data['img_weekday'] == 6)) # data['img_weekend'] = data['img_weekend'].apply(int) data["img_latest_list_rank"] = data["img_latest"] / data["listing_id"] return data def descriptionProcess(data, train_idx, test_idx): data["description_words_num"] = data["description"].apply(lambda x: len(x.split(' '))) data["description_len"] = data["description"].apply(len) # Some info from descriptions desc_feats = { 'bedroom_mentions': ['br ', '---', "", "

"], 'html_tag_1':["", "

  • ", "
  • ", "", "-->", "2 else "nulldesc") # Tf-idf Encode tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, strip_accents='unicode',lowercase =True, analyzer='word', token_pattern=r'\w{16,}', ngram_range=(1, 2), use_idf=False,smooth_idf=False, sublinear_tf=True, stop_words = 'english') tr_sparsed = tfidfdesc.fit_transform (data.iloc[train_idx, :]["description"]) te_sparsed = tfidfdesc.transform(data.iloc[test_idx, :]["description"]) feats_names = ["desc_" + x for x in tfidfdesc.get_feature_names()] return data, tr_sparsed, te_sparsed, feats_names def displayAddrProcess(data): # disp_price_dict = dict(data.groupby('display_address')['price'].mean()) # data["mean_disp_price"] = data.apply(lambda row: disp_price_dict[row["display_address"]], axis=1) # data["addr_sim"] = data.apply(lambda row: distance.edit_distance(row["display_address"].lower(), row["street_address"].lower()), axis=1) return data def featuresProcess(data, train_idx, test_idx): def afterRemoveStr(l, s): while s in l: l.remove(s) return l def afterRemoveFirstSpace(l): res = [] for s in l: res.append(s.strip()) return res data["features_num"] = data["features"].apply(len) mark = "#+-+#" data["features"] = data["features"].apply(lambda x: mark.join([i for i in x])) data["features"] = data["features"].apply(lambda x: x.lower()) # Deal with list like data data["features"] = data["features"].apply(lambda x: mark.join([i for i in x.split(" * ")])) data["features"] = data["features"].apply(lambda x: mark.join([i for i in x.split("**")])) data['features']=data['features'].str.replace("✓ hardwood floor ✓ high ceilings ✓ dishwasher", "hardwood floor" + mark + "high ceilings" + mark + "dishwasher") data['features']=data['features'].str.replace( "• on-site lifestyle concierge by luxury attaché " + "•24/7 doorman " + "• state of the art cardiovascular and weight training equipment " + "• 24-hour valet parking garage " + "• valet services including dry cleaning", "on-site lifestyle concierge by luxury attaché" + mark + "24/7 doorman" + mark + "state of the art cardiovascular and weight training equipment" + mark + "24-hour valet parking garage" + mark + "valet services including dry cleaning") data['features']=data['features'].str.replace( '{ 0 = "laundry in unit"; ' + '1 = "cats allowed"; '+ '10 = hardwood; '+ '11 = "high ceilings"; '+ '12 = renovated; '+ '13 = "marble bath"; '+ '14 = "granite kitchen"; '+ '15 = light; '+ '16 = "no fee"; '+ '17 = "walk-in closet"; '+ '2 = "dogs allowed"; '+ '3 = elevator; '+ '4 = exclusive; '+ '6 = laundry; '+ '7 = subway; '+ '8 = dishwasher; '+ '9 = washer; }', "laundry in unit" + mark + "cats allowed" + mark + "hardwood" + "high ceilings" + mark + "renovated" + mark + "marble bath" + "granite kitchen" + mark + "light" + mark + "no fee" + "walk-in closet" + mark + "dogs allowed" + mark + "elevator" + "exclusive" + mark + "laundry" + mark + "subway"+ "dishwasher" + mark + "washer") data['features']=data['features'].str.replace("windowed air-conditioned and monitored laundry room", "windowed air-conditioned" + mark + "monitored laundry room") data['features']=data['features'].str.replace("wall of windows. huge bedrooms", "wall of windows" + mark + "huge bedrooms") data['features']=data['features'].str.replace("to relax and recharge. this spacious 3 bedroom/2 bath residence also features oak hardwood flooring", "spacious" + mark + "3 bedroom" + mark + "2 bath" + mark + "residence" + mark + "oak hardwood flooring") data['features']=data['features'].str.replace("stunning 3 bedroom apartment with a terrace! east harlem! the best deal out now! get it now!!!!", "stunning" + mark + "3 bedroom" + mark + "a terrace" + mark + "east harlem" + mark + "the best deal out now! get it now!!!!") data['features']=data['features'].str.replace("ss appliances - d/w - m/w - recessed lighting - hardwood floors - high ceilings - marble bath", "ss appliances - d/w - m/w - " + mark + "recessed lighting" + mark + "hardwood floors" + mark + "high ceilings" + mark + "marble bath") data['features']=data['features'].str.replace("spacious living room for any kind of entertainment. prime location in theater distric", "spacious living room for any kind of entertainment." + mark + "prime location in theater distric") data['features']=data['features'].str.replace("spacious living room + home office", "spacious living room" + mark + "home office") data['features']=data['features'].str.replace("spacious and sunny 1st floor apartment "+ "overlooking the garden " + "*great williamsburg location* "+ "steps from shopping and cafes "+ "and 5 minute walk to graham avenue l train (3rd stop from manhattan) "+ "*shared back yard * "+ "large box style rooms * "+ "huge living room with high ceilings * "+ "nice bathroom with granite floor & ceramic tile * "+ "beautiful kitchen with granite counter tops lots of closet spacehardwood floors *"+ " heat included in the rent "+ "clean quiet building "+ "cat ok "+ "great location close to shopping", "spacious"+ mark +"sunny 1st floor"+ mark+ "overlooking the garden" + mark+ "great williamsburg location"+ mark+ "steps from shopping and cafes"+ mark+ "5 minute walk to graham avenue"+ mark +"train (3rd stop from manhattan)"+ mark+ "shared back yard"+mark+ "large box style rooms"+mark+ "huge living room " + mark + "high ceilings"+ mark+ "nice bathroom" + mark +"granite floor" + mark +"ceramic tile * "+mark+ "beautiful kitchen" + mark +"granite counter tops" + mark +"closet " + mark +"spacehardwood floors"+mark+ "heat included in the rent"+mark+ "clean quiet building"+mark+ "cat ok"+mark+ "close to shopping") data['features']=data['features'].str.replace("residents-only " + "fitness center " + "and aerobic room " + "professionally outfitted with a full complement of strength and cardio-training equipment", "residents-only"+ mark +"itness center"+ mark+ "and aerobic room" + mark+ "cardio-training equipment") data['features']=data['features'].str.replace("owner occupied - " + "3 family townhouse - " + "no realtor fees -"+ " this beautiful apt is offered below market rate", "owner occupied"+ mark +"3 family townhouse"+ mark+ "no realtor fees" + mark+ "this beautiful apt is offered below market rate") data['features']=data['features'].str.replace("newly renovated "+ "w/ oak wood floors "+ "mid century modern style interior "+ "large closets in every bedroom "+ "extra storage space in hall. "+ "large living room", "newly renovated"+ mark +"oak wood floors"+ mark+ "mid century modern style interior" + mark+ "large closets in every bedroom" + mark+ "extra storage space in hall"+ mark +"large living room") data['features']=data['features'].str.replace("live-in super package room "+ "smoke-free "+ "storage available "+ "virtual doorman "+ "guarantors accepted", "live-in super package room"+ mark +"smoke-free"+ mark+ "storage available" + mark+ "virtual doorman" + mark+ "guarantors accepted") data['features']=data['features'].str.replace("live-in super package room "+ "smoke-free "+ "storage available "+ "virtual doorman "+ "guarantors accepted", "live-in super package room"+ mark +"smoke-free"+ mark+ "storage available" + mark+ "virtual doorman" + mark+ "guarantors accepted") # Merging some features data['features']=data['features'].str.replace("washer/dyer combo","washer/dyer") data['features']=data['features'].str.replace("washer/dryer inside the unit","washer/dyer") data['features']=data['features'].str.replace("washer/dryer in-unit","washer/dyer") data['features']=data['features'].str.replace("washer/dryer in unit","washer/dyer") data['features']=data['features'].str.replace("washer/dryer in building","washer/dyer") data['features']=data['features'].str.replace("washer/dryer in bldg","washer/dyer") data['features']=data['features'].str.replace("washer/dryer hookup","washer/dyer") data['features']=data['features'].str.replace("washer/dryer stove/oven","washer/dyer") data['features']=data['features'].str.replace("washer/drier hookups","washer/dyer") data['features']=data['features'].str.replace("washer/ dryer in unit","washer/dyer") data['features']=data['features'].str.replace("washer/ dryer hookups","washer/dyer") data['features']=data['features'].str.replace("washer-dryer in unit","washer/dyer") data['features']=data['features'].str.replace("washer-dryer hookups","washer/dyer") data['features']=data['features'].str.replace("washer in unit","washer/dyer") data['features']=data['features'].str.replace("washer dryer in unit","washer/dyer") data['features']=data['features'].str.replace("washer dryer hookup","washer/dyer") data['features']=data['features'].str.replace("washer dryer hook up","washer/dyer") data['features']=data['features'].str.replace("washer and dryer in unit","washer/dyer") data['features']=data['features'].str.replace("washer and dryer in the unit","washer/dyer") data['features']=data['features'].str.replace("washer and dryer","washer/dyer") data['features']=data['features'].str.replace("washer / dryer in unit","washer/dyer") data['features']=data['features'].str.replace("washer / dryer (hookup only)","washer/dyer") data['features']=data['features'].str.replace("washer / dryer","washer/dyer") data['features']=data['features'].str.replace("washer & dryer.","washer/dyer") data['features']=data['features'].str.replace("washer","washer/dyer") data['features']=data['features'].str.replace("wash/dryer","washer/dyer") data['features']=data['features'].str.replace("pets: cats/small dogs","pet-friendly") data['features']=data['features'].str.replace("pets welcome","pet-friendly") data['features']=data['features'].str.replace("pets upon approval","pet-friendly") data['features']=data['features'].str.replace("pets on approval","pet-friendly") data['features']=data['features'].str.replace("pets ok.","pet-friendly") data['features']=data['features'].str.replace("pets ok","pet-friendly") data['features']=data['features'].str.replace("pets are welcome","pet-friendly") data['features']=data['features'].str.replace("pets allowed","pet-friendly") data['features']=data['features'].str.replace("pets accepted (on approval)","pet-friendly") data['features']=data['features'].str.replace("pets","pet-friendly") data['features']=data['features'].str.replace("pet grooming room","pet-friendly") data['features']=data['features'].str.replace("pet friendly building","pet-friendly") data['features']=data['features'].str.replace("pet friendly ( case by case )","pet-friendly") data['features']=data['features'].str.replace("pet friendly","pet-friendly") data['features']=data['features'].str.replace("pet friendly building","pet-friendly") data['features']=data['features'].str.replace("pet friendly building","pet-friendly") data['features']=data['features'].str.replace("garden/patio","garden") data['features']=data['features'].str.replace("patio","garden") data['features']=data['features'].str.replace("residents_garden","garden") data['features']=data['features'].str.replace("common garden","garden") data['features']=data['features'].str.replace("wifi access","wifi") data['features']=data['features'].str.replace("wifi included","wifi") data['features']=data['features'].str.replace("wifi in resident lounge","wifi") data['features']=data['features'].str.replace("wifi + utilities","wifi") data['features']=data['features'].str.replace("wi fi work lounge","wifi") data['features']=data['features'].str.replace("wi-fi access","wifi") data['features']=data['features'].str.replace("24/7","24") data['features']=data['features'].str.replace("24-hour","24") data['features']=data['features'].str.replace("24hr","24") data['features']=data['features'].str.replace("concierge","doorman") data['features']=data['features'].str.replace("ft doorman","doorman") data['features']=data['features'].str.replace("24 doorman","doorman") data['features']=data['features'].str.replace("24 hr doorman","doorman") data['features']=data['features'].str.replace("doorman service","doorman") data['features']=data['features'].str.replace("full-time doorman","doorman") data['features']=data['features'].str.replace("gym/fitness","fitness") data['features']=data['features'].str.replace("fitness room","fitness") data['features']=data['features'].str.replace("washer","laundry") data['features']=data['features'].str.replace("laundry in bldg","laundry") data['features']=data['features'].str.replace("laundry in building","laundry") data['features']=data['features'].str.replace("laundry in building/dryer","laundry") data['features']=data['features'].str.replace("laundry in building_&_dryer","laundry") data['features']=data['features'].str.replace("laundry room","laundry") data['features']=data['features'].str.replace("laundry & housekeeping","laundry") data['features']=data['features'].str.replace("laundry in unit","laundry") data['features']=data['features'].str.replace("laundry in-unit","laundry") data['features']=data['features'].str.replace("laundry on every floor","laundry") data['features']=data['features'].str.replace("laundry on floor","laundry") data['features']=data['features'].str.replace("in-unit laundry/dryer","laundry") data['features']=data['features'].str.replace("on-site laundry","laundry") data['features']=data['features'].str.replace("laundry/dryer","laundry") data['features']=data['features'].str.replace("high-speed internet","high_speed_internet") data['features']=data['features'].str.replace("high speed internet available","high_speed_internet") data['features']=data['features'].str.replace("parking available","parking") data['features']=data['features'].str.replace("parking space","parking") data['features']=data['features'].str.replace("on-site garage","parking") data['features']=data['features'].str.replace("on-site parking","parking") data['features']=data['features'].str.replace("on-site parking lot","parking") data['features']=data['features'].str.replace("full service garage","parking") data['features']=data['features'].str.replace("common parking/garage","parking") data['features']=data['features'].str.replace("garage","parking") data['features']=data['features'].str.replace("assigned-parking-space","private_parking") data['features']=data['features'].str.replace("storage available","storage") data['features']=data['features'].str.replace("storage facilities available","storage") data['features']=data['features'].str.replace("storage space","storage") data['features']=data['features'].str.replace("storage room","storage") data['features']=data['features'].str.replace("common storage","storage") data['features']=data['features'].str.replace("central a/c","central_air") data['features']=data['features'].str.replace("central ac","central_air") data['features']=data['features'].str.replace("air conditioning","central_air") data['features']=data['features'].str.replace("close to subway","subway") data['features']=data['features'].str.replace("roofdeck","roof-deck") data['features']=data['features'].str.replace("roof-deck","roof-deck") data['features']=data['features'].str.replace("rooftop terrace","roof-deck") data['features']=data['features'].str.replace("rooftop deck","roof-deck") data['features']=data['features'].str.replace("roof access","roof-deck") data['features']=data['features'].str.replace("common roof deck","roof-deck") data['features']=data['features'].str.replace("roof decks","roof-deck") data['features']=data['features'].str.replace("roof grilling area","roof-deck") data['features']=data['features'].str.replace("roof garden and lounge","roof-deck") data['features']=data['features'].str.replace("roof deck with stunning view","roof-deck") data['features']=data['features'].str.replace("roof deck with real grass","roof-deck") data['features']=data['features'].str.replace("roof deck with grills","roof-deck") data['features']=data['features'].str.replace("roof deck w/ grills","roof-deck") data['features']=data['features'].str.replace("roof deck / sun deck","roof-deck") data['features']=data['features'].str.replace("roof deck","roof-deck") data['features']=data['features'].str.replace("swimming pool","pool") data['features']=data['features'].str.replace("indoor pool","pool") data['features']=data['features'].str.replace("deco fireplace","fireplaces") data['features']=data['features'].str.replace("decorative fireplace","fireplaces") data['features']=data['features'].str.replace("yoga/pilates studio","yoga") data['features']=data['features'].str.replace("yoga studio","yoga") data['features']=data['features'].str.replace("yoga room","yoga") data['features']=data['features'].str.replace("yoga classes","yoga") data['features']=data['features'].str.replace("yoga and spin studios","yoga") data['features']=data['features'].str.replace("yoga an pilates class","yoga") data['features']=data['features'].str.replace("yoga / dance studio","yoga") # data["features"] = data["features"].apply(lambda x: afterRemoveStr(x, '')) # data["features"] = data["features"].apply(lambda x: afterRemoveFirstSpace(x)) data["features"] = data["features"].apply(lambda x: x.split(mark)) data["features"] = data["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x])) tfidf = CountVectorizer(stop_words="english", max_features=200) tr_sparse_feats = tfidf.fit_transform(data.iloc[train_idx, :]["features"]) te_sparse_feats = tfidf.transform(data.iloc[test_idx, :]["features"]) feats_names = ["features_" + x for x in tfidf.get_feature_names()] return data, tr_sparse_feats, te_sparse_feats, feats_names def locationProcess(data, train_idx, test_idx): # Clustering # train_x = data.iloc[train_idx,:][['new_latitude', 'new_longitude']] # stest_x = data.iloc[test_idx,:][['new_latitude', 'new_longitude']] train_x = data.iloc[train_idx, :][['latitude', 'longitude']] test_x = data.iloc[test_idx, :][['latitude', 'longitude']] kmeans_cluster = KMeans(n_clusters=20) res = kmeans_cluster.fit(train_x) res = kmeans_cluster.predict(pd.concat([train_x, test_x])) d = dict(zip(data['listing_id'], res)) data['cenroid'] = data['listing_id'].apply(lambda x: d[x]) # Manhattan distance center = [data.iloc[train_idx, :]['latitude'].mean(), data.iloc[train_idx, :]['longitude'].mean()] data['distance'] = abs(data['latitude'] - center[0]) + abs(data['longitude'] - center[1]) # data['distance_2'] = np.sqrt((data['latitude'] - center[0]) ** 2 + (data['longitude'] - center[1]) ** 2) return data def managerIdProcess(data, y, train_idx, test_idx): manager_lgt_dict = dict(data.groupby('manager_id')['longitude'].mean()) manager_ltt_dict = dict(data.groupby('manager_id')['latitude'].mean()) # Group manager_id with location info data["mean_man_longitude"] = data.apply(lambda row: manager_lgt_dict[row["manager_id"]], axis=1) data["mean_man_latitude"] = data.apply(lambda row: manager_ltt_dict[row["manager_id"]], axis=1) # Group manager_id with time info data = group_with_time_features(data, "manager_id") data = group_with_img_time_features(data, "manager_id") manager_stamp_dict = dict(data.groupby('manager_id')['time_stamp'].mean()) data["mean_man_timestamp"] = data.apply(lambda row: manager_stamp_dict[row["manager_id"]], axis=1) # manager_stamp_dict = dict(data.groupby('manager_id')['created_stamp'].mean()) # data["mean_man_createdstamp"] = data.apply(lambda row: manager_stamp_dict[row["manager_id"]], axis=1) return data def photoProcess(data): data["photo_num"] = data["photos"].apply(len) return data def priceProcess(data): #data["out_price"] = data["price"].apply(lambda x: 1 if x < 700 or x > 15000 else 0) # Clean the outlier ulimit = 15000#np.percentile(data.price.values, 99) data.loc[data["price"] > ulimit, "price"] = ulimit dlimit = 350 data.loc[data["price"] < dlimit, "price"] = dlimit data["price_per_room"] = data["price"] / (data["bedrooms"] + data["bathrooms"] + 1.0) data["price_per_bed"] = data["price"] / (data["bedrooms"] + 1.0) #* # data.loc[~np.isfinite(data["price_per_room"]), "price_per_room"] = 0 # data.loc[~np.isfinite(data["price_per_bed"]), "price_per_bed"] = 0 data["price_latitude"] = data["price"] / (data["latitude"] + 1.0) data["price_longitude"] = data["price"] / (data["longitude"] + 1.0) # Grouping price with size or build median_list = ['bedrooms', 'bathrooms', 'building_id'] # median_list = ['month', 'day', 'hour', 'weekday', 'quarter', 'week', 'passed', 'latest'] for col in median_list: median_price = data[[col, 'price']].groupby(col)['price'].median() median_price = median_price[data[col]].values.astype(float) data['median_' + col] = median_price data['ratio_' + col] = data['price'] / median_price data['median_' + col] = data['median_' + col].apply(lambda x: np.log(x)) # data["price"] = data["price"].apply(lambda x: np.log(x)) return data def streetAddrProcess(data): #data["new_addr"] = data["street_address"].apply(lambda x: ' '.join([x.split()[i] for i in range(1, len(x.split()))])) #data["new_addr"] = preprocessing.LabelEncoder().fit_transform(data["new_addr"]) # data["street_address"] = data["street_address"].apply(lambda x: x.replace('\u00a0', '').strip().lower) return data def listingIdProcess(data): # It's weird。 data["listing_id"] = data["listing_id"] - 68119576.0 return data def coreProcess(data, y_train, train_idx, test_idx): data = listingIdProcess(data) data = bedroomProcess(data, train_idx, test_idx) data = bathroomProcess(data, train_idx, test_idx) data["room_diff"] = data["bathrooms"] - data["bedrooms"] data["room_num"] = data["bedrooms"] + data["bathrooms"] data = createdProcess(data) data = buildingIdProcess(data, y_train, train_idx, test_idx) data, tr_sparsed, te_sparsed, feats_sparsed = descriptionProcess(data, train_idx, test_idx) data = displayAddrProcess(data) data, tr_sparse, te_sparse, feats_sparse = featuresProcess(data, train_idx, test_idx) data = locationProcess(data, train_idx, test_idx) data = managerIdProcess(data, y_train, train_idx, test_idx) data = photoProcess(data) data = priceProcess(data) data = streetAddrProcess(data) categorical = ["display_address", "manager_id", "building_id", "street_address"] for f in categorical: if data[f].dtype=='object': cases=defaultdict(int) temp=np.array(data[f]).tolist() for k in temp: cases[k]+=1 # print(f, len(cases)) data[f] = data[f].apply(lambda x: cases[x]) feats_in_use = [col for col in data.columns if col not in FEATURE_NOT_USE] data_train = np.array(data.iloc[train_idx, :][feats_in_use]) data_test = np.array(data.iloc[test_idx, :][feats_in_use]) # Feature Scaling stda = StandardScaler() data_test = stda.fit_transform(data_test) data_train = stda.transform(data_train) # High cardinality feature high_card_feats = ["building_id", "manager_id", "longitude", "room_diff"] # "building_id", "manager_id", # C0 = [3, 12, 0, 4] C0 = [feats_in_use.index(f) for f in high_card_feats] W_train, W_cv = convert_to_avg(data_train, y_train, data_test, seed=1, cvals=5, roundings=2, columns=C0) # Add Sparse feature data_train = sparse.hstack([data_train, tr_sparse, tr_sparsed, W_train[:, C0]]).tocsr() data_test = sparse.hstack([data_test, te_sparse, te_sparsed, W_cv[:, C0]]).tocsr() feats_in_use.extend(feats_sparse) feats_in_use.extend(feats_sparsed) feats_in_use.extend(["build_high_card", "manager_high_card"]) # print(len(feats_in_use)) # print(tr_sparse.toarray().shape, tr_sparsed.toarray().shape, len(feats_in_use), data_train.shape) return data_train, data_test, feats_in_use # Copy from KazAnova's starter code def convert_dataset_to_avg(xc,yc,xt, rounding=2,cols=None): xc = xc.tolist() xt = xt.tolist() yc = yc.tolist() if cols == None: cols =[k for k in range(0,len(xc[0]))] woe=[ [0.0 for k in range(0,len(cols))] for g in range(0,len(xt))] good=[] bads=[] for col in cols: dictsgoouds=defaultdict(int) dictsbads=defaultdict(int) good.append(dictsgoouds) bads.append(dictsbads) total_count=0.0 total_sum =0.0 for a in range (0,len(xc)): target=yc[a] total_sum+=target total_count+=1.0 for j in range(0,len(cols)): col=cols[j] good[j][round(xc[a][col],rounding)]+=target bads[j][round(xc[a][col],rounding)]+=1.0 #print(total_goods,total_bads) for a in range (0,len(xt)): for j in range(0,len(cols)): col=cols[j] if round(xt[a][col],rounding) in good[j]: woe[a][j]=float(good[j][round(xt[a][col],rounding)])/float(bads[j][round(xt[a][col],rounding)]) else : woe[a][j]=round(total_sum/total_count) return woe def convert_to_avg(X,y, Xt, seed=1, cvals=5, roundings=2, columns=None): if columns==None: columns=[k for k in range(0,(X.shape[1]))] #print("it is not!!") X=X.tolist() Xt=Xt.tolist() woetrain=[ [0.0 for k in range(0,len(X[0]))] for g in range(0,len(X))] woetest=[ [0.0 for k in range(0,len(X[0]))] for g in range(0,len(Xt))] kfolder=StratifiedKFold(y, n_folds=cvals,shuffle=True, random_state=seed) for train_index, test_index in kfolder: # creaning and validation sets X_train, X_cv = np.array(X)[train_index], np.array(X)[test_index] y_train = np.array(y)[train_index] woecv=convert_dataset_to_avg(X_train,y_train,X_cv, rounding=roundings,cols=columns) X_cv=X_cv.tolist() no=0 for real_index in test_index: for j in range(0,len(X_cv[0])): woetrain[real_index][j]=X_cv[no][j] no+=1 no=0 for real_index in test_index: for j in range(0,len(columns)): col=columns[j] woetrain[real_index][col]=woecv[no][j] no+=1 woefinal=convert_dataset_to_avg(np.array(X),np.array(y),np.array(Xt), rounding=roundings,cols=columns) for real_index in range(0,len(Xt)): for j in range(0,len(Xt[0])): woetest[real_index][j]=Xt[real_index][j] for real_index in range(0,len(Xt)): for j in range(0,len(columns)): col=columns[j] woetest[real_index][col]=woefinal[real_index][j] return np.array(woetrain), np.array(woetest) # Grouping (Very important) def group_with_time_features(data, g_feat): mean_month_dict = dict(data.groupby(g_feat)['month'].mean()) data["mean_" + g_feat + "_month"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1) mean_day_dict = dict(data.groupby(g_feat)['day'].mean()) data["mean_" + g_feat + "_day"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1) mean_hour_dict = dict(data.groupby(g_feat)['hour'].mean()) data["mean_" + g_feat + "_hour"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1) mean_weekday_dict = dict(data.groupby(g_feat)['weekday'].mean()) data["mean_" + g_feat + "_weekday"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1) mean_quarter_dict = dict(data.groupby(g_feat)['quarter'].mean()) data["mean_" + g_feat + "_quater"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1) mean_week_dict = dict(data.groupby(g_feat)['week'].mean()) data["mean_" + g_feat + "_week"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1) mean_passed_dict = dict(data.groupby(g_feat)['passed'].mean()) data["mean_" + g_feat + "_passed"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1) mean_latest_dict = dict(data.groupby(g_feat)['latest'].mean()) data["mean_" + g_feat + "_latest"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1) return data def group_with_img_time_features(data, g_feat): mean_month_dict = dict(data.groupby(g_feat)['img_month'].mean()) data["mean_" + g_feat + "_img_month"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1) mean_day_dict = dict(data.groupby(g_feat)['img_day'].mean()) data["mean_" + g_feat + "_img_day"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1) mean_hour_dict = dict(data.groupby(g_feat)['img_hour'].mean()) data["mean_" + g_feat + "_img_hour"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1) # mean_weekday_dict = dict(data.groupby(g_feat)['img_weekday'].mean()) # data["mean_" + g_feat + "_img_weekday"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1) # mean_quarter_dict = dict(data.groupby(g_feat)['img_quarter'].mean()) # data["mean_" + g_feat + "_img_quater"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1) # mean_week_dict = dict(data.groupby(g_feat)['img_week'].mean()) # data["mean_" + g_feat + "_img_week"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1) mean_passed_dict = dict(data.groupby(g_feat)['img_passed'].mean()) data["mean_" + g_feat + "_img_passed"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1) mean_latest_dict = dict(data.groupby(g_feat)['img_latest'].mean()) data["mean_" + g_feat + "_img_latest"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1) return data ================================================ FILE: stack/params.txt ================================================ LogisticRegression Type:Liblinear C:6.1 threads:1 usescale:True maxim_Iteration:200 seed:1 verbose:false GradientBoostingForestClassifier estimators:300 shrinkage:0.18 threads:1 offset:0.00001 max_depth:3 max_features:0.65 min_leaf:2.0 min_split:7.0 Objective:RMSE row_subsample:1.0 seed:1 verbose:false LibFmClassifier maxim_Iteration:70 C:0.0041 C2:0.00120 lfeatures:1 seed:1 usescale:True init_values:0.046 learn_rate:0.05 smooth:0.1 threads:1 verbose:false softmaxnnclassifier usescale:True seed:1 Type:SGD maxim_Iteration:50 C:0.0000008 shuffle:false tolerance:0.01 learn_rate:0.0065 smooth:0.1 h1:40 h2:35 connection_nonlinearity:Relu init_values:0.020 verbose:false RandomForestClassifier bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.95 seed:1 verbose:false AdaboostRandomForestClassifier bootsrap:false weight_thresold:0.95 estimators:100 threads:1 max_depth:6 max_features:0.5 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.9 seed:1 verbose:false GradientBoostingForestRegressor bootsrap:false estimators:300 shrinkage:0.1 threads:1 offset:0.00001 max_depth:3 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.9 seed:1 verbose:false RandomForestRegressor bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.95 seed:1 verbose:false LibFmRegressor maxim_Iteration:70 C:0.0001 C2:0.0009 lfeatures:2 seed:1 usescale:True init_values:0.1 learn_rate:0.1 threads:1 verbose:false RandomForestClassifier bootsrap:false estimators:500 threads:3 offset:0.00001 max_depth:5 max_features:0.3 min_leaf:1.0 min_split:5.0 Objective:ENTROPY row_subsample:0.8 seed:1 verbose:false ================================================ FILE: stack/parse.py ================================================ import re import numpy as np with open("result.txt", "r") as f: raw = "".join(f.readlines()) str_res = re.findall(pattern="logloss : 0\.[0-9]+", string=raw) res = [float(x.split(" : ")[1]) for x in str_res] results = {i: [] for i in range(len(res) // 5)} for i in range(len(res)): results[i % (len(res) // 5)].append(res[i]) results = {i: np.mean(results[i]) for i in results} for item in sorted(results.items(), key=lambda x: x[1]): print(item) ================================================ FILE: stack/start.sh ================================================ java -Xmx3048m -jar StackNet.jar train train_file=train_stacknet.csv test_file=test_stacknet.csv params=params.txt pred_file=sigma_stack_pred.csv test_target=true verbose=true Threads=4 stackdata=false folds=5 seed=1 metric=logloss ================================================ FILE: stack/utils.py ================================================ import pandas as pd import numpy as np def getAvgSub(subs_in): subs = [] for sub in subs_in: sub = sub.sort_values(by=["listing_id"]).reset_index() subs.append(sub) n = len(subs) new_sub = subs[0].copy() for i in range(1, n): sub = subs[i] new_sub["high"] = new_sub["high"] + sub["high"] new_sub["medium"] = new_sub["medium"] + sub["medium"] new_sub["low"] = new_sub["low"] + sub["low"] new_sub["high"] = new_sub["high"] / n new_sub["medium"] = new_sub["medium"] / n new_sub["low"] = new_sub["low"] / n del new_sub["index"] return new_sub def getWeightedAvgSub(subs_in, weights): assert np.sum(weights) == 1, "Sum of weights need to be 1" subs = [] for sub in subs_in: sub = sub.sort_values(by=["listing_id"]).reset_index() subs.append(sub) n = len(subs) new_sub = subs[0].copy() new_sub["high"] = new_sub["high"] * weights[0] new_sub["medium"] = new_sub["medium"] * weights[0] new_sub["low"] = new_sub["low"] * weights[0] for i in range(1, n): sub = subs[i] new_sub["high"] = new_sub["high"] + sub["high"] * weights[i] new_sub["medium"] = new_sub["medium"] + sub["medium"] * weights[i] new_sub["low"] = new_sub["low"] + sub["low"] * weights[i] del new_sub["index"] return new_sub def generateStackSub(test_file_name, sub_file_name): test_array = np.loadtxt(test_file_name, delimiter=",") test = pd.DataFrame(test_array) sub_array = np.loadtxt(sub_file_name, delimiter=",") sub = pd.DataFrame(sub_array) sub.columns = ["high", "medium", "low"] sub["listing_id"] = test.iloc[:, 0].apply(lambda x: int(x)) sub.to_csv("new_sub.csv", index=False) def correct(df): interest_levels = ['low', 'medium', 'high'] tau = { 'low': 0.69195995, 'medium': 0.23108864, 'high': 0.07695141, } y = df[interest_levels].mean() a = [tau[k] / y[k] for k in interest_levels] print(a) def f(p): for k in range(len(interest_levels)): p[k] *= a[k] return p / p.sum() df_correct = df.copy() df_correct[interest_levels] = df_correct[interest_levels].apply(f, axis=1) y = df_correct[interest_levels].mean() a = [tau[k] / y[k] for k in interest_levels] print(a) return df_correct