master 4b177f8dffed cached
10 files
60.3 KB
16.9k tokens
49 symbols
1 requests
Download .txt
Repository: ScarletPan/Kaggle-Rental-Listing-Inquireies
Branch: master
Commit: 4b177f8dffed
Files: 10
Total size: 60.3 KB

Directory structure:
gitextract_aw7n92_y/

├── README.md
├── classifiers.py
├── modelTraining.py
├── ppt/
│   └── AIC-Sharing-11-19.pptx
├── preprocess.py
└── stack/
    ├── StackNet.jar
    ├── params.txt
    ├── parse.py
    ├── start.sh
    └── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
# README

* ```preprocess.py```: data cleaning, feature engineering
* ```modelTraining.py```: cross-validation, submission generation, and preparation of the stacking inputs
* ```classifiers.py```: my encapsulation of xgboost
* stack
  * ```StackNet.jar```: stacking tools shared by KazAnova, repo is [here](https://github.com/kaz-Anova/StackNet)
  * ```parse.py```: tools for evaluating the CV scores during stacking
  * ```utils.py```: generates the submission after StackNet has run
  * ```start.sh```: commands for executing StackNet
  * ```params.txt```: my params for stacking

### links:
  * [Kaggle: Rental Listing Inquiries](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)
  * [Summary of getting a silver medal in kaggle](http://scarletpan.github.io/summary-of-get-a-silver-medal-in-kaggle/)
  * [Kaggle 首战拿银总结 | 入门指导 (长文、干货) -- 知乎专栏](https://zhuanlan.zhihu.com/p/26645088)
  * [AI Challenge 分享会PPT](https://github.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/blob/master/ppt/AIC-Sharing-11-19.pptx)
  


================================================
FILE: classifiers.py
================================================
import xgboost as xgb
import numpy as np
from sklearn.metrics import log_loss


class xgboostClassifier():
    """Thin wrapper around ``xgb.train`` with a fit/predict style interface.

    Every keyword argument is stored unchanged in ``self.params`` and passed
    to ``xgb.train`` as the booster parameter dict. The ``num_rounds`` entry
    is additionally read as the number of boosting rounds, so it must be
    present before ``fit`` or ``fit_CV`` is called.
    """

    def __init__(self, **params):
        self.clf = None       # trained booster; set by fit() / fit_CV()
        self.progress = {}    # evaluation history; filled by fit_CV()
        self.params = params

    def fit(self, X, y):
        """Train a booster on ``X``/``y`` for ``params['num_rounds']`` rounds."""
        xg_train = xgb.DMatrix(X, label=y)
        self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'])

    def fit_CV(self, X_train, X_val, y_train, y_val):
        """Train while monitoring a validation set.

        Both sets are put on the watchlist, early stopping is requested with a
        patience of 200 rounds, and the per-round metrics are recorded in
        ``self.progress``.
        """
        xg_train = xgb.DMatrix(X_train, label=y_train)
        xg_val = xgb.DMatrix(X_val, label=y_val)
        watchlist = [(xg_train, 'train'), (xg_val, 'eval')]
        self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'],
                             watchlist, early_stopping_rounds=200,
                             evals_result=self.progress)

    def get_eval_res(self):
        """Return the evaluation history recorded by ``fit_CV``."""
        return self.progress

    def score(self, X, y):
        """Return the reciprocal of the log loss on ``X``/``y`` (higher is better)."""
        Y = self.predict_proba(X)
        return 1 / log_loss(y, Y)

    def predict_proba(self, X_test):
        """Return the booster's raw predictions for ``X_test`` as float32."""
        res = self.clf.predict(xgb.DMatrix(X_test))
        return res.astype(np.float32)

    def predict(self, X_test):
        """Return, per row, the index of the largest predicted value."""
        res = np.argmax(self.clf.predict(xgb.DMatrix(X_test)), axis=1)
        return res

    def get_params(self, **params):
        """Return the stored parameter dict (keyword arguments are ignored)."""
        return self.params

    def set_params(self, **params):
        """Merge ``params`` into the stored parameter dict."""
        self.params.update(params)

    def getSortedImportance(self, features):
        """Return ``(feature, fscore)`` pairs sorted by ascending fscore.

        Writes a feature map for ``features`` to ``xgb.fmap`` in the current
        working directory and asks the trained booster for its fscores using
        that map.
        """
        # Imported here because this module never imports ``operator`` at the
        # top level; without it the sort below raised NameError.
        import operator

        with open('xgb.fmap', 'w') as f:
            for i, name in enumerate(features):
                f.write('{0}\t{1}\tq\n'.format(i, name))
        importance = self.clf.get_fscore(fmap='xgb.fmap')
        return sorted(importance.items(), key=operator.itemgetter(1))

class BaseClassifier(object):
    """Wrap an estimator class behind a small train/predict interface.

    ``clf`` is a callable that builds the estimator from keyword arguments.
    ``seed`` is passed to it as ``random_state`` together with ``params``.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: this makes the ``None`` default usable and keeps the
        # caller's dict untouched when ``random_state`` is injected.
        self.params = dict(params) if params is not None else {}
        self.params['random_state'] = seed
        self.clf = clf(**self.params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator (same as ``fit`` but returns nothing)."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def predict_proba(self, x):
        """Return the wrapped estimator's class probabilities for ``x``."""
        return self.clf.predict_proba(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def set_params(self, **params):
        """Record ``params`` and forward them to the wrapped estimator.

        The estimator is only updated when it offers a ``set_params`` method.
        """
        self.params.update(params)
        setter = getattr(self.clf, 'set_params', None)
        if callable(setter):
            setter(**params)
    

================================================
FILE: modelTraining.py
================================================
import sys
import time
import random
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss
from preprocess import coreProcess
from classifiers import xgboostClassifier

# Locations of the train/test listing files (JSON).
TRAIN_FILE_NAME = '~/Kaggle/RLI/input/train.json'
TEST_FILE_NAME = '~/Kaggle/RLI/input/test.json'
# Integer label for each interest level. The submission columns written by
# gen_sub / genAvgSub / stacking use the same high, medium, low order.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}
# NOTE: everything below runs at import time, so importing this module reads
# the input files from disk.
train_data = pd.read_json(TRAIN_FILE_NAME).reset_index()
test_data = pd.read_json(TEST_FILE_NAME).reset_index()
# Per-listing image timestamps, joined on the listing id. The inner join drops
# any listing that has no row in listing_image_time.csv.
list_img_time = pd.read_csv("~/Kaggle/RLI/input/listing_image_time.csv")
train_data = train_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner')
test_data = test_data.merge(list_img_time, left_on="listing_id", right_on="Listing_Id", how='inner')
# Seed shared by Python's RNG, NumPy's RNG and the xgboost models that pass
# ``seed = RS``.
RS = 2016
random.seed(RS)
np.random.seed(RS)
# RS = 0

def validation_score(early_stop=False):
    """Cross-validate a fixed xgboost configuration on the training data.

    Uses an unshuffled 3-fold stratified split and re-runs ``coreProcess`` for
    every fold. Returns the list of per-fold log losses; when ``early_stop``
    is truthy only the first fold is evaluated.
    """
    model = xgboostClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        nthread=3,
        eta=0.04,
        max_depth=6,
        subsample=0.7,
        colsample_bytree=1.0,
        colsample_bylevel=0.7,
        min_child_weight=1,
        silent=1,
        num_rounds=700,
        seed=RS,
    )
    print("*** Validation start ***")
    frame = train_data.copy()
    # Split the target off the feature frame and encode it as integers.
    labels = frame.pop("interest_level").apply(lambda level: target_num_map[level])

    folds = StratifiedKFold(n_splits=3, shuffle=False)
    fold_losses = []
    for fold_no, (train_idx, val_idx) in enumerate(folds.split(frame, labels), start=1):
        y_train = labels.iloc[train_idx]
        y_val = labels.iloc[val_idx]
        # coreProcess gets its own copy so each fold starts from the raw frame.
        X_train, X_val, _ = coreProcess(frame.copy(), y_train, train_idx, val_idx)
        model.fit(X_train, y_train)
        fold_loss = log_loss(y_val, model.predict_proba(X_val))
        print("Iteration {}'s loss: {}".format(fold_no, fold_loss))
        fold_losses.append(fold_loss)
        if early_stop:
            break
    print("*** Validation finished ***\n")
    return fold_losses


def validation_avg_score(clfs):
    """Cross-validate every classifier in ``clfs`` plus their average.

    Runs a 3-fold stratified split over the training data. In each fold every
    classifier is fitted and scored with log loss, then the element-wise mean
    of their predictions is scored as well. Returns the per-fold log losses of
    the averaged prediction.
    """
    print("*** Validation start ***")
    frame = train_data.copy()
    labels = frame.pop("interest_level").apply(lambda level: target_num_map[level])

    folds = StratifiedKFold(n_splits=3)
    n_clfs = len(clfs)
    cv_scores = {pos: [] for pos in range(n_clfs)}
    cv_scores["Avg"] = []
    for fold_no, (train_idx, val_idx) in enumerate(folds.split(frame, labels), start=1):
        y_train = labels.iloc[train_idx]
        y_val = labels.iloc[val_idx]
        X_train, X_val, _ = coreProcess(frame.copy(), y_train, train_idx, val_idx)
        fold_preds = []
        for pos, clf in enumerate(clfs):
            clf.fit(X_train, y_train)
            y_val_pred = clf.predict_proba(X_val)
            loss = log_loss(y_val, y_val_pred)
            cv_scores[pos].append(loss)
            fold_preds.append(y_val_pred)
            # Per-fold messages number the classifiers from 1.
            print("clf_{}, Iteration {}'s loss: {}".format(pos + 1, fold_no, loss))
        avg_pred = np.mean(np.array(fold_preds), axis=0)
        avg_loss = log_loss(y_val, avg_pred)
        cv_scores["Avg"].append(avg_loss)
        print("Iteration {}'s Avg loss: {}".format(fold_no, avg_loss))
    # The summary numbers the classifiers from 0.
    for pos in range(n_clfs):
        print("clf_{} validation loss : {}".format(pos, np.mean(cv_scores[pos])))
    print("Average validation loss : {}".format(np.mean(cv_scores["Avg"])))
    print("*** Validation finished ***\n")
    return cv_scores["Avg"]


def paramSearch(clf, param_dict):
    """Exhaustively search ``param_dict`` for the lowest mean CV log loss.

    ``param_dict`` maps a parameter name to the list of values to try. Every
    combination (names taken in sorted order) is applied to ``clf`` through
    ``clf.set_params`` and scored with ``validation_avg_score([clf])``.
    An empty ``param_dict`` scores ``clf`` once as it is configured.

    Returns ``(best_params, best_score)``. ``best_params`` is ``None`` when no
    combination produced a comparable score (for example when one of the value
    lists is empty).
    """
    # Local import so this function does not depend on the module's import block.
    import itertools

    # Cartesian product of all value lists. This replaces a hand-written join
    # that produced scalars instead of value tuples when only one parameter
    # was searched.
    param_keys = sorted(param_dict)
    all_param_lists = [
        dict(zip(param_keys, combo))
        for combo in itertools.product(*(param_dict[key] for key in param_keys))
    ]

    best_score = float('inf')
    best_params = None
    scores = []
    for done, params in enumerate(all_param_lists, start=1):
        print("\n" + "-" * 70)
        for param_name, value in params.items():
            print("{} : {}".format(param_name, value))
        clf.set_params(**params)
        # Score the classifier that was just configured. The previous call,
        # validation_score(clf), bound ``clf`` to that function's
        # ``early_stop`` flag, so the searched parameters were never used and
        # only a single fold was run.
        score = np.mean(validation_avg_score([clf]))
        if score < best_score:
            best_score = score
            best_params = params
        print("{} / {}, Done".format(done, len(all_param_lists)))
        print("Score: ", score)
        scores.append(score)
    print(scores)
    if best_params is None:
        print("No parameter combination was evaluated.")
        return best_params, best_score
    print("Best parameters:")
    for param_name, value in best_params.items():
        print("{} : {}".format(param_name, value))
    print("Score: ", best_score)
    return best_params, best_score


def gen_sub():
    """Train one xgboost model on all training data and write ``submission.csv``.

    Train and test rows are concatenated (train first) and passed through
    ``coreProcess``. The model's predictions for the test rows are written
    with the columns high, medium, low and listing_id.
    """
    train = train_data.copy()
    test = test_data.copy()
    n_train, n_test = train.shape[0], test.shape[0]
    # Positions of the train and test rows inside the concatenated frame.
    train_idx = list(range(n_train))
    test_idx = list(range(n_train, n_train + n_test))
    y = train.pop("interest_level").apply(lambda level: target_num_map[level])
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, _ = coreProcess(data, y, train_idx, test_idx)

    settings = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 3,
        'nthread': 12,
        'eta': 0.02,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 0.8,
        'min_child_weight': 1,
        'silent': 1,
        'num_rounds': 1700,
        'seed': RS,
    }
    xgb_clf = xgboostClassifier(**settings)
    print("Trainning:...")
    xgb_clf.fit(X_train, y)

    sub = pd.DataFrame(xgb_clf.predict_proba(X_test))
    # Column order follows target_num_map: 0 = high, 1 = medium, 2 = low.
    sub.columns = ["high", "medium", "low"]
    sub["listing_id"] = test.listing_id.values
    sub.to_csv("submission.csv", index=False)


def genAvgSub(clfs):
    """Fit every classifier in ``clfs`` and write their averaged predictions.

    Uses the same data preparation as ``gen_sub``. The element-wise mean of
    all classifiers' test predictions is written to ``submission.csv``.
    """
    train = train_data.copy()
    test = test_data.copy()
    n_train, n_test = train.shape[0], test.shape[0]
    train_idx = list(range(n_train))
    test_idx = list(range(n_train, n_train + n_test))
    y = train.pop("interest_level").apply(lambda level: target_num_map[level])
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, _ = coreProcess(data, y, train_idx, test_idx)

    print("Trainning:...")
    preds = []
    for pos, clf in enumerate(clfs):
        print("Clf_{} fiting".format(pos))
        clf.fit(X_train, y)
        print("Clf_{} predicting".format(pos))
        preds.append(clf.predict_proba(X_test))

    sub = pd.DataFrame(np.mean(preds, axis=0))
    # Column order follows target_num_map: 0 = high, 1 = medium, 2 = low.
    sub.columns = ["high", "medium", "low"]
    sub["listing_id"] = test.listing_id.values
    sub.to_csv("submission.csv", index=False)
    print("Train done.")


def validate(clfs):
    """Return the per-fold averaged-ensemble losses from ``validation_avg_score``."""
    return validation_avg_score(clfs)


def search():
    """Run ``paramSearch`` over a fixed grid that varies only ``num_rounds``."""
    base_settings = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 3,
        'nthread': 12,
        'eta': 0.04,
        'max_depth': 6,
        'subsample': 0.7,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 1.0,
        'min_child_weight': 1,
        'silent': 1,
        'num_rounds': 700,
        'seed': RS,
    }
    # Values tried for each parameter; single-element lists pin that parameter.
    grid = dict(
        eta=[0.02],
        max_depth=[6],
        subsample=[0.8],
        colsample_bylevel=[0.7],
        num_rounds=[1400, 1500, 1600, 1650],
    )
    paramSearch(xgboostClassifier(**base_settings), grid)


def write2file(cv_scores, val_desc=None):
    """Print the mean CV loss and append a log entry to ``results.log``.

    The entry holds a timestamp, the optional description ``val_desc``, the
    individual scores, their mean and a separator line.
    """
    separator = "*" * 50
    print(separator)
    print("Cross validation loss: ", np.mean(cv_scores))
    with open("results.log", "a") as log:
        log.write(time.strftime("%m/%d/%Y %H:%M") + '\n')
        if val_desc is not None:
            log.write(val_desc + '\n')
        # Each score is followed by a single space, including the last one.
        log.write("".join(str(score) + " " for score in cv_scores))
        log.write("\nCross Validation: {}\n".format(np.array(cv_scores).mean()))
        log.write(separator + "\n")


def stacking(clfs):
    """Build StackNet inputs from out-of-fold ensemble predictions.

    Steps:
      1. For each of ten shuffle seeds, run 10-fold stratified CV over the
         training data, average the predictions of all ``clfs`` per fold and
         store them as out-of-fold predictions. The ten out-of-fold matrices
         are then averaged.
      2. Refit every classifier on all training data and average their test
         predictions.
      3. Write ``train_stacknet.csv`` (target, features, out-of-fold
         predictions), ``test_stacknet.csv`` (listing id, features, test
         predictions) and a submission file named after the mean averaged CV
         loss of the last seed.

    Each prediction is expected to have three columns in high, medium, low
    order; only the first three columns are used.
    """
    print("Stacking")
    train = train_data.copy()
    test = test_data.copy()
    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]
    n_train = train.shape[0]

    train_stackers = []
    for seed in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]:
        skf = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
        # Out-of-fold averaged class probabilities, one row per training row.
        train_stacker = np.zeros((n_train, 3))
        cv_scores = {i: [] for i in range(len(clfs))}
        cv_scores["Avg"] = []
        print("Begin 10-flod cross validation")
        for cnt, (train_idx, val_idx) in enumerate(skf.split(train, y), start=1):
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            X_train, X_val, feats = coreProcess(train.copy(), y_train, train_idx, val_idx)
            # The dense conversion of X_train that used to happen here was
            # discarded immediately, so it has been dropped.
            preds = []
            for k, clf in enumerate(clfs, start=1):
                clf.fit(X_train, y_train)
                y_val_pred = clf.predict_proba(X_val)
                loss = log_loss(y_val, y_val_pred)
                preds.append(y_val_pred)
                cv_scores[k - 1].append(loss)
                print("Clf_{} iteration {}'s loss: {}".format(k, cnt, loss))
            avg_pred = np.mean(np.array(preds), axis=0)
            avg_loss = log_loss(y_val, avg_pred)
            cv_scores["Avg"].append(avg_loss)
            print("Iteration {}'s Avg loss: {}".format(cnt, avg_loss))
            # Place this fold's predictions at the positions of its rows.
            train_stacker[val_idx] = avg_pred[:, :3]
        for i in range(len(clfs)):
            print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i])))
        print("Average validation loss : {}".format(np.mean(cv_scores["Avg"])))
        train_stackers.append(train_stacker)
    train_stacker = np.mean(train_stackers, axis=0)
    print("*** Validation finished ***\n")

    # Refit on the full training set and predict the test set.
    train_idx = list(range(n_train))
    test_idx = [i + n_train for i in range(test.shape[0])]
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)
    print(X_train.shape, len(train_stacker))
    print("Begin predicting")
    preds = []
    for i, clf in enumerate(clfs):
        print("Clf_{} fiting".format(i))
        clf.fit(X_train, y)
        print("Clf_{} predicting".format(i))
        preds.append(clf.predict_proba(X_test))
    test_stacker = np.mean(preds, axis=0)[:, :3]

    print ("merging columns")
    # Append the ensemble predictions to the feature matrices.
    X_train = np.column_stack((X_train.toarray(), train_stacker))
    X_test = np.column_stack((X_test.toarray(), test_stacker))
    # Train file: target in the first column.
    X = np.column_stack((y, X_train))
    # Test file: listing id in the first column.
    ids = test.listing_id.values
    X_test = np.column_stack((ids, X_test))
    np.savetxt("./train_stacknet.csv", X, delimiter=",", fmt='%.5f')
    np.savetxt("./test_stacknet.csv", X_test, delimiter=",", fmt='%.5f')

    print("Write results...")
    # cv_scores holds the scores of the last seed processed above.
    output_file = "submission_{}.csv".format(np.mean(cv_scores["Avg"]))
    print("Writing submission to %s" % output_file)
    with open(output_file, "w") as f:
        f.write("listing_id,high,medium,low\n")  # the header
        for listing_id, row in zip(ids, test_stacker):
            f.write("%s" % listing_id)
            for prediction in row:
                f.write(",%f" % prediction)
            f.write("\n")
    print("Done.")


if __name__ == "__main__":
    # Command line:
    #   (no flag) | -v [desc]   cross-validate the ensemble and log the result
    #   -v3 [desc]              cross-validate the single model of validation_score
    #   -g                      train one model and write submission.csv
    #   -ga                     train the ensemble and write submission.csv
    #   -s                      run the parameter search
    #   -stack                  build the StackNet inputs
    # ``desc`` is an optional free-text description stored in results.log.

    # Ensemble members share every setting except colsample_bylevel and seed.
    clfs = [
        xgboostClassifier(
            objective='multi:softprob',
            eval_metric='mlogloss',
            num_class=3,
            nthread=9,
            eta=0.02,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=1.0,
            colsample_bylevel=colsample_bylevel,
            min_child_weight=1,
            silent=1,
            num_rounds=1500,
            seed=seed,
        )
        for colsample_bylevel, seed in [
            (0.7, 0),
            (0.8, 128),
            (0.8, 512),
            (0.8, 1024),
            (0.8, 2048),
        ]
    ]

    args = sys.argv[1:]
    mode = args[0] if args else '-v'
    # The description is optional. Reading sys.argv[2] unconditionally made
    # "-v3" without a description fail with IndexError after validation ran.
    val_desc = args[1] if len(args) == 2 else None
    no_desc = len(args) <= 1

    if len(args) > 2:
        mode = None

    if mode == '-v':
        write2file(validate(clfs), val_desc)
    elif mode == '-v3':
        write2file(validation_score(), val_desc)
    elif mode == '-g':
        gen_sub()
    elif mode == '-s' and no_desc:
        search()
    elif mode == '-ga' and no_desc:
        genAvgSub(clfs)
    elif mode == '-stack' and no_desc:
        stacking(clfs)
    else:
        # Unsupported invocations used to exit silently without doing anything.
        sys.stderr.write(
            "usage: modelTraining.py [-v [desc] | -v3 [desc] | -g | -ga | -s | -stack]\n"
        )








================================================
FILE: preprocess.py
================================================
#!/usr/bin/python3
#-*- encoding: utf-8 -*-
import sys
import random
import operator
import datetime
import time
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from nltk.metrics import distance as distance


# Column names collected under FEATURE_NOT_USE; presumably these are excluded
# from the model's feature matrix -- confirm against the code that reads it.
# 'bathrooms', 'bedrooms', 'listing_id' and 'time_stamp' were considered for
# this list but are currently not part of it.
FEATURE_NOT_USE = [
    # raw listing fields
    'created', 'description', 'features', 'photos', 'index',
    'display_address',
    # per-building statistics
    'low_build_frac', 'high_build_frac', 'medium_build_frac', 'build_count',
    # per-manager statistics
    'low_manager_frac', 'high_manager_frac', 'medium_manager_frac', 'manager_count',
    # columns from the image-time table
    'Listing_Id', 'img_created',
]

def bedroomProcess(data, train_idx, test_idx):
    """Add bedroom indicator columns and a bedroom share to ``data``.

    Adds ``no_bedroom``, ``more_than_5_bedroom`` and ``bedroom_per_room``
    (bedrooms divided by bedrooms plus bathrooms). Rows with neither bedrooms
    nor bathrooms would divide by zero, so for the ratio they temporarily get
    0.001 bedrooms and the mean bathroom count of their own split (train or
    test); afterwards both counts are set back to 0. ``data`` is modified in
    place and returned.
    """
    placeholder = 0.001  # temporary bedroom count marking the empty rows

    data["no_bedroom"] = data["bedrooms"].apply(lambda n: int(n == 0))
    data["more_than_5_bedroom"] = data["bedrooms"].apply(lambda n: int(n >= 5))

    no_rooms = (data["bedrooms"] + data["bathrooms"]) == 0
    data.loc[no_rooms, "bedrooms"] = placeholder
    marked = data["bedrooms"] == placeholder

    # Fill the bathroom count of marked rows with the mean of their own split.
    train_part = data.iloc[train_idx, :].copy()
    test_part = data.iloc[test_idx, :].copy()
    train_part.loc[marked, "bathrooms"] = train_part["bathrooms"].mean()
    test_part.loc[marked, "bathrooms"] = test_part["bathrooms"].mean()
    data.iloc[train_idx, :] = train_part
    data.iloc[test_idx, :] = test_part

    total_rooms = data["bedrooms"] + data["bathrooms"]
    data["bedroom_per_room"] = data["bedrooms"] / total_rooms

    # Undo the temporary values now that the ratio has been computed.
    data.loc[marked, "bathrooms"] = 0
    data.loc[marked, "bedrooms"] = 0
    return data


def bathroomProcess(data, train_idx, test_idx):
    """Clean bathroom counts and add bathroom features to ``data``.

    Bathroom counts of 112, 10 and 20 are replaced by 1.5, 1 and 2. Adds
    ``1_to_2_bathrooms`` (1 when the count is non-zero and at most 2) and
    ``bathoom_per_room`` (bathrooms divided by bedrooms plus bathrooms; the
    column name is kept as spelled because it is a feature name). Rows with
    neither bedrooms nor bathrooms would divide by zero, so for the ratio they
    temporarily get 0.001 bathrooms and the mean bedroom count of their own
    split (train or test); afterwards both counts are set back to 0.
    ``data`` is modified in place and returned.
    """
    # Some basic feature from bathrooms
    data.loc[data["bathrooms"] == 112, "bathrooms"] = 1.5
    data.loc[data["bathrooms"] == 10, "bathrooms"] = 1
    data.loc[data["bathrooms"] == 20, "bathrooms"] = 2
    # "1 if" was previously written as "1if", a deprecated literal/keyword
    # fusion that newer Python versions warn about.
    data["1_to_2_bathrooms"] = data["bathrooms"].apply(lambda x: 1 if x != 0 and x <= 2 else 0)
    data.loc[data["bedrooms"] + data["bathrooms"] == 0, "bathrooms"] = 0.001
    train = data.iloc[train_idx, :].copy()
    test = data.iloc[test_idx, :].copy()
    # Give the marked rows the mean bedroom count of their own split.
    train.loc[data["bathrooms"] == 0.001, "bedrooms"] = train["bedrooms"].mean()
    test.loc[data["bathrooms"] == 0.001, "bedrooms"] = test["bedrooms"].mean()
    data.iloc[train_idx, :] = train
    data.iloc[test_idx, :] = test
    data["bathoom_per_room"] = data["bathrooms"] / (data["bedrooms"] + data["bathrooms"])
    # Undo the temporary values now that the ratio has been computed.
    data.loc[data["bathrooms"] == 0.001, "bedrooms"] = 0
    data.loc[data["bathrooms"] == 0.001, "bathrooms"] = 0
    return data


def buildingIdProcess(data, y, train_idx, test_idx):
    """Return ``data`` unchanged.

    Placeholder: the ideas tried for building-id features did not work out,
    so no feature is derived here.
    """
    return data


def createdProcess(data):
    """Add time features derived from ``created`` and ``time_stamp``.

    From ``created``: seconds since the earliest (``latest``) and until the
    latest (``passed``) listing, calendar parts, a weekend flag and
    ``latest_list_rank`` (``latest`` divided by ``listing_id``).
    From ``time_stamp``: the same kind of features with an ``img_`` prefix.
    ``data`` is modified in place and returned.
    """
    # Some basic features from created
    data["created"] = pd.to_datetime(data['created'])
    created = data["created"]
    data["latest"] = (created - created.min()).dt.total_seconds()
    data["passed"] = (created.max() - created).dt.total_seconds()
    # year is weird
    data["year"] = created.dt.year
    data['month'] = created.dt.month
    data['day'] = created.dt.day
    data['hour'] = created.dt.hour
    data['weekday'] = created.dt.weekday
    # ``.dt.week`` no longer exists in current pandas; prefer the ISO calendar
    # week and fall back to ``.dt.week`` on versions without ``isocalendar``.
    try:
        week = created.dt.isocalendar().week
    except AttributeError:
        data['week'] = created.dt.week
    else:
        # Missing timestamps yield missing weeks, which cannot be stored as int.
        data['week'] = week.astype("float64") if week.isna().any() else week.astype("int64")
    data['quarter'] = created.dt.quarter
    # Saturday (5) OR Sunday (6). The previous ``&`` between the two
    # comparisons could never be true, so the flag was always 0.
    data['weekend'] = ((data['weekday'] == 5) | (data['weekday'] == 6)).astype(int)
    data["latest_list_rank"] = data["latest"] / data["listing_id"]

    # image time after leak
    # Timestamps above the cut-off are replaced by a fixed value.
    data.loc[data["time_stamp"] > 1490000000, "time_stamp"] = 1478524550
    # NOTE: fromtimestamp() without a tz argument converts to the machine's
    # local time zone, so the img_* calendar parts depend on where this runs.
    data["img_created"] = data["time_stamp"].apply(lambda x: datetime.datetime.fromtimestamp(x))
    img_created = data["img_created"]
    data["img_latest"] = (img_created - img_created.min()).dt.total_seconds()
    data["img_passed"] = (img_created.max() - img_created).dt.total_seconds()
    data["img_year"] = img_created.dt.year
    data['img_month'] = img_created.dt.month
    data['img_day'] = img_created.dt.day
    data['img_hour'] = img_created.dt.hour
    data["img_latest_list_rank"] = data["img_latest"] / data["listing_id"]

    return data


def descriptionProcess(data, train_idx, test_idx):
    """Derive description features and TF-IDF encode the description text.

    Adds ``description_words_num``, ``description_len`` and two keyword-count
    columns to ``data``, then fits a ``TfidfVectorizer`` on the training rows
    and applies it to the test rows.

    Returns ``(data, tr_sparsed, te_sparsed, feats_names)`` where the two
    matrices are the vectorizer outputs for ``train_idx`` and ``test_idx`` and
    ``feats_names`` are the vocabulary terms prefixed with ``desc_``.
    """
    data["description_words_num"] = data["description"].apply(lambda x: len(x.split(' ')))
    data["description_len"] = data["description"].apply(len)
    # Some info from descriptions: per column, the summed number of
    # occurrences of each listed substring.
    desc_feats = {
                  'bedroom_mentions': ['br ', '---', "<a", "a>", "<p>"],
                  'html_tag_1':["<img ", "</a>", "<li>", "</li>", "<ul>", "</ul>", "-->", "<close","<hr"],
                }
    for name, kwords in desc_feats.items():
        data[name] =  data['description'].apply(lambda x: sum([x.count(w)  for w in kwords]))

    # Descriptions of at most two characters are replaced by a placeholder.
    # NOTE(review): on Python 3 the encode() call stores bytes objects in the
    # column; kept as is because later code may rely on it -- confirm.
    data['description'] =  data['description'].apply(lambda x: str(x).encode('utf-8') if len(x)>2 else "nulldesc")
    # Tf-idf Encode. The token pattern only keeps runs of 16 or more word
    # characters.
    tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, strip_accents='unicode',lowercase =True,
                        analyzer='word', token_pattern=r'\w{16,}', ngram_range=(1, 2), use_idf=False,smooth_idf=False,
    sublinear_tf=True, stop_words = 'english')
    tr_sparsed = tfidfdesc.fit_transform (data.iloc[train_idx, :]["description"])
    te_sparsed = tfidfdesc.transform(data.iloc[test_idx, :]["description"])
    # get_feature_names() is gone from recent scikit-learn releases; use its
    # replacement when it exists and the old method otherwise.
    if hasattr(tfidfdesc, "get_feature_names_out"):
        vocab = tfidfdesc.get_feature_names_out()
    else:
        vocab = tfidfdesc.get_feature_names()
    feats_names = ["desc_" + str(x) for x in vocab]
    return data, tr_sparsed, te_sparsed, feats_names


def displayAddrProcess(data):
    """Return ``data`` unchanged; the display-address features are disabled."""
    # Disabled experiments, kept for reference:
    # disp_price_dict = dict(data.groupby('display_address')['price'].mean())
    # data["mean_disp_price"] = data.apply(lambda row: disp_price_dict[row["display_address"]], axis=1)
    # data["addr_sim"] = data.apply(lambda row: distance.edit_distance(row["display_address"].lower(), row["street_address"].lower()), axis=1)
    return data


# Separator used to glue one listing's feature strings into a single text
# while that text is being normalised.
_FEATURES_MARK = "#+-+#"


def _features_joined(*names):
    """Join feature names with the internal separator."""
    return _FEATURES_MARK.join(names)


def _features_merged(target, variants):
    """Return ``(variant, target)`` replacement pairs in the given order."""
    return [(variant, target) for variant in variants]


# Free-text entries that really list several features.  Each one is replaced
# by its individual features joined with the separator.
_FEATURES_SPLIT_RULES = [
    ("✓ hardwood floor ✓ high ceilings ✓ dishwasher",
     _features_joined("hardwood floor", "high ceilings", "dishwasher")),
    ("• on-site lifestyle concierge by luxury attaché "
     "•24/7 doorman "
     "• state of the art cardiovascular and weight training equipment "
     "• 24-hour valet parking garage "
     "• valet services including dry cleaning",
     _features_joined(
         "on-site lifestyle concierge by luxury attaché",
         "24/7 doorman",
         "state of the art cardiovascular and weight training equipment",
         "24-hour valet parking garage",
         "valet services including dry cleaning")),
    # Every feature of the dict-like entry gets its own separator so that
    # neighbouring features are not fused into one token.
    ('{     0 = "laundry in unit";     '
     '1 = "cats allowed";     '
     '10 = hardwood;     '
     '11 = "high ceilings";     '
     '12 = renovated;     '
     '13 = "marble bath";     '
     '14 = "granite kitchen";     '
     '15 = light;     '
     '16 = "no fee";     '
     '17 = "walk-in closet";     '
     '2 = "dogs allowed";     '
     '3 = elevator;     '
     '4 = exclusive;     '
     '6 = laundry;     '
     '7 = subway;     '
     '8 = dishwasher;     '
     '9 = washer; }',
     _features_joined(
         "laundry in unit", "cats allowed", "hardwood",
         "high ceilings", "renovated", "marble bath",
         "granite kitchen", "light", "no fee",
         "walk-in closet", "dogs allowed", "elevator",
         "exclusive", "laundry", "subway",
         "dishwasher", "washer")),
    ("windowed air-conditioned and monitored laundry room",
     _features_joined("windowed air-conditioned", "monitored laundry room")),
    ("wall of windows. huge bedrooms",
     _features_joined("wall of windows", "huge bedrooms")),
    ("to relax and recharge. this spacious 3 bedroom/2 bath residence also features oak hardwood flooring",
     _features_joined("spacious", "3 bedroom", "2 bath", "residence", "oak hardwood flooring")),
    ("stunning 3 bedroom apartment with a terrace! east harlem! the best deal out now! get it now!!!!",
     _features_joined("stunning", "3 bedroom", "a terrace", "east harlem",
                      "the best deal out now! get it now!!!!")),
    ("ss appliances - d/w -  m/w - recessed lighting - hardwood floors - high ceilings - marble bath",
     _features_joined("ss appliances - d/w -  m/w - ", "recessed lighting", "hardwood floors",
                      "high ceilings", "marble bath")),
    ("spacious living room for any kind of entertainment. prime location in theater distric",
     _features_joined("spacious living room for any kind of entertainment.",
                      "prime location in theater distric")),
    ("spacious living room + home office",
     _features_joined("spacious living room", "home office")),
    # NOTE(review): this pattern contains " * ", which the list-splitting step
    # in _normalize_feature_list has already replaced by the separator, so it
    # presumably never matches -- confirm against the data.
    ("spacious and sunny 1st floor apartment "
     "overlooking the garden  "
     "*great williamsburg location*  "
     "steps from shopping and cafes "
     "and 5 minute walk to graham avenue l train (3rd stop from manhattan)  "
     "*shared back yard * "
     "large box style rooms * "
     "huge living room with high ceilings * "
     "nice bathroom with granite floor & ceramic tile * "
     "beautiful kitchen with granite counter tops  lots of closet spacehardwood floors *"
     " heat included in the rent  "
     "clean quiet building   "
     "cat ok  "
     "great location close to shopping",
     _features_joined(
         "spacious", "sunny 1st floor",
         "overlooking the garden",
         "great williamsburg location",
         "steps from shopping and cafes",
         "5 minute walk to graham avenue", "train (3rd stop from manhattan)",
         "shared back yard",
         "large box style rooms",
         "huge living room ", "high ceilings",
         "nice bathroom", "granite floor", "ceramic tile * ",
         "beautiful kitchen", "granite counter tops", "closet ", "spacehardwood floors",
         "heat included in the rent",
         "clean quiet building",
         "cat ok",
         "close to shopping")),
    ("residents-only "
     "fitness center "
     "and aerobic room "
     "professionally outfitted with a full complement of strength and cardio-training equipment",
     _features_joined("residents-only", "fitness center", "and aerobic room",
                      "cardio-training equipment")),
    ("owner occupied - "
     "3 family townhouse - "
     "no realtor fees -"
     " this beautiful apt is offered below market rate",
     _features_joined("owner occupied", "3 family townhouse", "no realtor fees",
                      "this beautiful apt is offered below market rate")),
    ("newly renovated "
     "w/ oak wood floors   "
     "mid century modern style interior   "
     "large closets in every bedroom "
     "extra storage space in hall. "
     "large living room",
     _features_joined("newly renovated", "oak wood floors",
                      "mid century modern style interior",
                      "large closets in every bedroom",
                      "extra storage space in hall", "large living room")),
    ("live-in super package room "
     "smoke-free "
     "storage available "
     "virtual doorman "
     "guarantors accepted",
     _features_joined("live-in super package room", "smoke-free",
                      "storage available", "virtual doorman",
                      "guarantors accepted")),
]

# Spelling variants folded into one canonical name.  The order matters: the
# rules run top to bottom and later rules see the output of earlier ones.
_FEATURES_MERGE_RULES = (
    _features_merged("washer/dyer", [
        "washer/dyer combo",
        "washer/dryer inside the unit",
        "washer/dryer in-unit",
        "washer/dryer in unit",
        "washer/dryer in building",
        "washer/dryer in bldg",
        "washer/dryer hookup",
        "washer/dryer  stove/oven",
        "washer/drier hookups",
        "washer/ dryer in unit",
        "washer/ dryer hookups",
        "washer-dryer in unit",
        "washer-dryer hookups",
        "washer in unit",
        "washer dryer in unit",
        "washer dryer hookup",
        "washer dryer hook up",
        "washer and dryer in unit",
        "washer and dryer in the unit",
        "washer and dryer",
        "washer / dryer in unit",
        "washer / dryer (hookup only)",
        "washer / dryer",
        "washer & dryer.",
        # NOTE(review): this bare "washer" rule also rewrites the "washer"
        # inside the "washer/dyer" results above and inside "dishwasher";
        # kept as is so the generated tokens stay the same.
        "washer",
        "wash/dryer",
    ])
    + _features_merged("pet-friendly", [
        "pets: cats/small dogs",
        "pets welcome",
        "pets upon approval",
        "pets on approval",
        "pets ok.",
        "pets ok",
        "pets are welcome",
        "pets allowed",
        "pets accepted (on approval)",
        "pets",
        "pet grooming room",
        "pet friendly building",
        "pet friendly ( case by case )",
        "pet friendly",
    ])
    + _features_merged("garden", [
        "garden/patio",
        "patio",
        "residents_garden",
        "common garden",
    ])
    + _features_merged("wifi", [
        "wifi access",
        "wifi included",
        "wifi in resident lounge",
        "wifi + utilities",
        "wi fi work lounge",
        "wi-fi access",
    ])
    + _features_merged("24", [
        "24/7",
        "24-hour",
        "24hr",
    ])
    + _features_merged("doorman", [
        "concierge",
        "ft doorman",
        "24 doorman",
        "24 hr doorman",
        "doorman service",
        "full-time doorman",
    ])
    + _features_merged("fitness", [
        "gym/fitness",
        "fitness room",
    ])
    + _features_merged("laundry", [
        "washer",
        "laundry in bldg",
        "laundry in building",
        "laundry in building/dryer",
        "laundry in building_&_dryer",
        "laundry room",
        "laundry & housekeeping",
        "laundry in unit",
        "laundry in-unit",
        "laundry on every floor",
        "laundry on floor",
        "in-unit laundry/dryer",
        "on-site laundry",
        "laundry/dryer",
    ])
    + _features_merged("high_speed_internet", [
        "high-speed internet",
        "high speed internet available",
    ])
    + _features_merged("parking", [
        "parking available",
        "parking space",
        "on-site garage",
        "on-site parking",
        "on-site parking lot",
        "full service garage",
        "common parking/garage",
        "garage",
    ])
    + _features_merged("private_parking", [
        "assigned-parking-space",
    ])
    + _features_merged("storage", [
        "storage available",
        "storage facilities available",
        "storage space",
        "storage room",
        "common storage",
    ])
    + _features_merged("central_air", [
        "central a/c",
        "central ac",
        "air conditioning",
    ])
    + _features_merged("subway", [
        "close to  subway",
    ])
    + _features_merged("roof-deck", [
        "roofdeck",
        "rooftop terrace",
        "rooftop deck",
        "roof access",
        "common roof deck",
        "roof decks",
        "roof grilling area",
        "roof garden and lounge",
        "roof deck with stunning view",
        "roof deck with real grass",
        "roof deck with grills",
        "roof deck w/ grills",
        "roof deck / sun deck",
        "roof deck",
    ])
    + _features_merged("pool", [
        "swimming pool",
        "indoor pool",
    ])
    + _features_merged("fireplaces", [
        "deco fireplace",
        "decorative fireplace",
    ])
    + _features_merged("yoga", [
        "yoga/pilates studio",
        "yoga studio",
        "yoga room",
        "yoga classes",
        "yoga and spin studios",
        "yoga an pilates class",
        "yoga / dance studio",
    ])
)

_FEATURES_RULES = _FEATURES_SPLIT_RULES + _FEATURES_MERGE_RULES


def _normalize_feature_list(features):
    """Turn one listing's feature list into a space-separated token string.

    The features are joined with the separator and lower-cased, list-like
    entries (" * " and "**" delimiters) are split, every replacement rule is
    applied as a literal substring replacement, and finally each feature has
    its inner spaces turned into underscores so it stays a single token.
    """
    text = _FEATURES_MARK.join(features).lower()
    text = text.replace(" * ", _FEATURES_MARK).replace("**", _FEATURES_MARK)
    # Plain str.replace: patterns containing ".", "(", "+" or "*" must match
    # literally, never as regular expressions.
    for old, new in _FEATURES_RULES:
        text = text.replace(old, new)
    return " ".join(name.replace(" ", "_") for name in text.split(_FEATURES_MARK))


def featuresProcess(data, train_idx, test_idx):
    """Normalise the ``features`` lists and count-encode them.

    Adds ``features_num`` (length of the original list) and overwrites
    ``features`` with the normalised token string produced by
    ``_normalize_feature_list``.  A ``CountVectorizer`` (English stop words,
    at most 200 features) is fitted on the ``train_idx`` rows and then
    applied to the ``test_idx`` rows.

    Returns:
        ``(data, train_matrix, test_matrix, feature_names)``; every feature
        name carries a ``features_`` prefix.
    """
    data["features_num"] = data["features"].apply(len)
    data["features"] = data["features"].apply(_normalize_feature_list)

    vectorizer = CountVectorizer(stop_words="english", max_features=200)
    tr_sparse_feats = vectorizer.fit_transform(data.iloc[train_idx, :]["features"])
    te_sparse_feats = vectorizer.transform(data.iloc[test_idx, :]["features"])
    feats_names = ["features_" + x for x in vectorizer.get_feature_names()]
    return data, tr_sparse_feats, te_sparse_feats, feats_names


def locationProcess(data, train_idx, test_idx):
    """Add a coordinate cluster id and a distance-to-centre feature.

    A 20-cluster ``KMeans`` model is fitted on the latitude/longitude of the
    ``train_idx`` rows.  Every row of ``data`` then gets its cluster id in the
    column ``cenroid`` (legacy spelling, kept because it is a column name) and
    its Manhattan distance to the mean training coordinate in ``distance``.

    ``test_idx`` is not needed any more but stays in the signature so callers
    keep working.

    Returns:
        The same ``data`` frame with the two columns added.
    """
    coord_cols = ['latitude', 'longitude']
    train_x = data.iloc[train_idx, :][coord_cols]

    # NOTE(review): no random_state is passed, so cluster ids presumably vary
    # between runs -- confirm whether that is acceptable.
    kmeans_cluster = KMeans(n_clusters=20)
    kmeans_cluster.fit(train_x)

    # Predict on the rows of `data` in their own order.  Labelling a
    # train-then-test concatenation and pairing it with data['listing_id']
    # only lines up when the two index lists happen to follow the row order.
    data['cenroid'] = kmeans_cluster.predict(data[coord_cols])

    # Manhattan distance to the centre of the training listings.
    center = [train_x['latitude'].mean(), train_x['longitude'].mean()]
    data['distance'] = abs(data['latitude'] - center[0]) + abs(data['longitude'] - center[1])

    return data


def managerIdProcess(data, y, train_idx, test_idx):
    """Add per-manager averages of location and time features.

    For every row the mean longitude, latitude and ``time_stamp`` over all
    listings with the same ``manager_id`` are added, together with the
    per-manager means produced by ``group_with_time_features`` and
    ``group_with_img_time_features``.

    ``y``, ``train_idx`` and ``test_idx`` are accepted for a uniform step
    signature but are not used here.

    Returns:
        The ``data`` frame with the new ``mean_man_*`` / ``mean_manager_id_*``
        columns.
    """
    # Group manager_id with location info.  transform('mean') broadcasts the
    # group mean back to every row without a Python-level lookup per row.
    data["mean_man_longitude"] = data.groupby('manager_id')['longitude'].transform('mean')
    data["mean_man_latitude"] = data.groupby('manager_id')['latitude'].transform('mean')

    # Group manager_id with time info
    data = group_with_time_features(data, "manager_id")
    data = group_with_img_time_features(data, "manager_id")
    data["mean_man_timestamp"] = data.groupby('manager_id')['time_stamp'].transform('mean')
    return data


def photoProcess(data):
    """Add ``photo_num``, the length of each row's ``photos`` entry."""
    photo_counts = data["photos"].map(len)
    data["photo_num"] = photo_counts
    return data


def priceProcess(data):
    """Clamp ``price`` and derive price ratios and group medians.

    Prices above 15000 are set to 15000 and prices below 350 to 350 (in
    place).  The function then adds price divided by room count, bed count,
    shifted latitude and shifted longitude, and, for each of ``bedrooms``,
    ``bathrooms`` and ``building_id``, the log of the group's median price
    (``median_<col>``) and the ratio of the row's price to that median
    (``ratio_<col>``).

    Returns:
        The same ``data`` frame with the columns added.
    """
    price_cap = 15000
    price_floor = 350

    # Clean the outliers: cap the expensive ones first, then lift the cheap ones.
    too_expensive = data["price"] > price_cap
    data.loc[too_expensive, "price"] = price_cap
    too_cheap = data["price"] < price_floor
    data.loc[too_cheap, "price"] = price_floor

    # The +1.0 keeps the denominators away from zero for zero-room listings.
    room_count = data["bedrooms"] + data["bathrooms"] + 1.0
    data["price_per_room"] = data["price"] / room_count
    data["price_per_bed"] = data["price"] / (data["bedrooms"] + 1.0)
    data["price_latitude"] = data["price"] / (data["latitude"] + 1.0)
    data["price_longitude"] = data["price"] / (data["longitude"] + 1.0)

    # Grouping price with size or building.
    for col in ('bedrooms', 'bathrooms', 'building_id'):
        group_medians = data.groupby(col)['price'].median()
        # Look up every row's group median, in row order.
        row_medians = group_medians[data[col]].values.astype(float)
        data['median_' + col] = np.log(row_medians)
        data['ratio_' + col] = data['price'] / row_medians
    return data


def streetAddrProcess(data):
    """Pass-through step for ``street_address``: returns ``data`` unchanged.

    Disabled experiments: a ``new_addr`` column built from the address
    without its first token (then label-encoded), and stripping non-breaking
    spaces / lower-casing the address.
    """
    return data


def listingIdProcess(data):
    """Subtract the constant 68119576.0 from ``listing_id`` and return ``data``.

    NOTE(review): the original author only remarked that this offset is
    "weird"; where the constant comes from is not documented here.
    """
    offset = 68119576.0
    data["listing_id"] = data["listing_id"].sub(offset)
    return data


def coreProcess(data, y_train, train_idx, test_idx):
    """Run every feature step and assemble the train/test feature matrices.

    The per-column ``*Process`` steps are applied in a fixed order, string
    categoricals are replaced by their occurrence counts, the dense columns
    not listed in ``FEATURE_NOT_USE`` are standardised, and finally the
    bag-of-words matrices and the target-averaged versions of the
    ``high_card_feats`` columns are appended.

    Args:
        data: frame holding the rows addressed by ``train_idx`` and ``test_idx``.
        y_train: target values of the training rows.
        train_idx: positional indices of the training rows in ``data``.
        test_idx: positional indices of the test rows in ``data``.

    Returns:
        ``(data_train, data_test, feats_in_use)``: two CSR matrices and the
        list of column names, one name per matrix column.
    """
    data = listingIdProcess(data)
    data = bedroomProcess(data, train_idx, test_idx)
    data = bathroomProcess(data, train_idx, test_idx)
    data["room_diff"] = data["bathrooms"] - data["bedrooms"]
    data["room_num"] = data["bedrooms"] + data["bathrooms"]
    data = createdProcess(data)
    data = buildingIdProcess(data, y_train, train_idx, test_idx)
    data, tr_sparsed, te_sparsed, feats_sparsed = descriptionProcess(data, train_idx, test_idx)
    data = displayAddrProcess(data)
    data, tr_sparse, te_sparse, feats_sparse = featuresProcess(data, train_idx, test_idx)
    data = locationProcess(data, train_idx, test_idx)
    data = managerIdProcess(data, y_train, train_idx, test_idx)
    data = photoProcess(data)
    data = priceProcess(data)
    data = streetAddrProcess(data)

    # Replace every string-valued categorical by how often its value occurs.
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if data[f].dtype == 'object':
            cases = defaultdict(int)
            for k in np.array(data[f]).tolist():
                cases[k] += 1
            data[f] = data[f].apply(lambda x: cases[x])

    feats_in_use = [col for col in data.columns if col not in FEATURE_NOT_USE]

    data_train = np.array(data.iloc[train_idx, :][feats_in_use])
    data_test = np.array(data.iloc[test_idx, :][feats_in_use])
    # Feature Scaling
    # NOTE(review): the scaler is fitted on the test rows and only applied to
    # the training rows -- confirm this is intentional before changing it.
    stda = StandardScaler()
    data_test = stda.fit_transform(data_test)
    data_train = stda.transform(data_train)

    # High cardinality features: append their out-of-fold target averages.
    high_card_feats = ["building_id", "manager_id", "longitude", "room_diff"]
    # One name per appended column, aligned with high_card_feats, so that
    # len(feats_in_use) matches the number of matrix columns.
    high_card_names = ["build_high_card", "manager_high_card",
                       "longitude_high_card", "room_diff_high_card"]
    C0 = [feats_in_use.index(f) for f in high_card_feats]
    W_train, W_cv = convert_to_avg(data_train, y_train, data_test, seed=1, cvals=5, roundings=2, columns=C0)

    # Add sparse features: listing features, description tokens, target averages.
    data_train = sparse.hstack([data_train, tr_sparse, tr_sparsed, W_train[:, C0]]).tocsr()
    data_test = sparse.hstack([data_test, te_sparse, te_sparsed, W_cv[:, C0]]).tocsr()
    feats_in_use.extend(feats_sparse)
    feats_in_use.extend(feats_sparsed)
    feats_in_use.extend(high_card_names)
    return data_train, data_test, feats_in_use


# Copied from KazAnova's starter code
def convert_dataset_to_avg(xc, yc, xt, rounding=2, cols=None):
    """Encode selected columns of ``xt`` by the mean target seen in ``xc``.

    For every selected column the values are rounded to ``rounding`` decimals
    and used as category keys.  A row of ``xt`` receives, per column, the
    mean of ``yc`` over the rows of ``xc`` that share its key.

    Args:
        xc: 2-D array of reference rows (anything with ``tolist()``).
        yc: 1-D array of targets aligned with ``xc``.
        xt: 2-D array of rows to encode.
        rounding: number of decimals used when building the keys.
        cols: column indices to encode (list or array); ``None`` means all
            columns of ``xc``.

    Returns:
        A list with one row per ``xt`` row and one value per entry of
        ``cols`` (in the order of ``cols``).
    """
    xc = xc.tolist()
    xt = xt.tolist()
    yc = yc.tolist()
    # "is None" rather than "== None": the latter is evaluated element-wise
    # (and cannot be used as a condition) when cols is a numpy array.
    if cols is None:
        cols = list(range(len(xc[0])))

    # Per selected column: key -> sum of targets / key -> number of rows.
    target_sums = [defaultdict(int) for _ in cols]
    row_counts = [defaultdict(int) for _ in cols]
    total_count = 0.0
    total_sum = 0.0

    for a, row in enumerate(xc):
        target = yc[a]
        total_sum += target
        total_count += 1.0
        for j, col in enumerate(cols):
            key = round(row[col], rounding)
            target_sums[j][key] += target
            row_counts[j][key] += 1.0

    woe = []
    for row in xt:
        encoded = []
        for j, col in enumerate(cols):
            key = round(row[col], rounding)
            if key in target_sums[j]:
                encoded.append(float(target_sums[j][key]) / float(row_counts[j][key]))
            else:
                # NOTE(review): unseen keys get the overall target mean
                # rounded to the nearest integer; kept from the starter
                # code -- confirm the rounding is intended.
                encoded.append(round(total_sum / total_count))
        woe.append(encoded)
    return woe


def convert_to_avg(X, y, Xt, seed=1, cvals=5, roundings=2, columns=None):
    """Target-average encode selected columns of the train and test matrices.

    Training rows are encoded out of fold: the rows of each stratified fold
    are encoded by ``convert_dataset_to_avg`` using only the remaining folds.
    Test rows are encoded using all training rows.  Columns that are not
    listed in ``columns`` are copied over unchanged.

    Args:
        X: 2-D training array.
        y: training targets, also used to stratify the folds.
        Xt: 2-D test array.
        seed: random state of the fold split.
        cvals: number of folds.
        roundings: decimals used for the category keys.
        columns: column indices to encode (list or array); ``None`` means
            all columns.

    Returns:
        ``(train_array, test_array)`` as numpy arrays.
    """
    # "is None" rather than "== None", which misbehaves for numpy arrays.
    if columns is None:
        columns = list(range(X.shape[1]))
    X = X.tolist()
    Xt = Xt.tolist()
    # Build the arrays once instead of once per fold.
    X_arr = np.array(X)
    y_arr = np.array(y)

    n_cols = len(X[0])
    woetrain = [[0.0] * n_cols for _ in range(len(X))]

    # Legacy call form: the splitter is built from y and iterated directly.
    kfolder = StratifiedKFold(y, n_folds=cvals, shuffle=True, random_state=seed)
    for train_index, test_index in kfolder:
        X_cv = X_arr[test_index]
        woecv = convert_dataset_to_avg(X_arr[train_index], y_arr[train_index], X_cv,
                                       rounding=roundings, cols=columns)
        X_cv = X_cv.tolist()
        for no, real_index in enumerate(test_index):
            # Start from the original row, then overwrite the encoded columns.
            row = list(X_cv[no])
            for j, col in enumerate(columns):
                row[col] = woecv[no][j]
            woetrain[real_index] = row

    woefinal = convert_dataset_to_avg(X_arr, y_arr, np.array(Xt), rounding=roundings, cols=columns)
    woetest = []
    for original_row, encoded in zip(Xt, woefinal):
        row = list(original_row)
        for j, col in enumerate(columns):
            row[col] = encoded[j]
        woetest.append(row)

    return np.array(woetrain), np.array(woetest)


# Grouping (Very important)
def group_with_time_features(data, g_feat):
    """Add the per-group mean of every creation-time feature.

    For each time column a new column ``mean_<g_feat>_<suffix>`` is added
    that holds, for every row, the mean of that column over all rows sharing
    the row's ``g_feat`` value.

    Args:
        data: frame with ``g_feat`` and the columns ``month``, ``day``,
            ``hour``, ``weekday``, ``quarter``, ``week``, ``passed`` and
            ``latest``.
        g_feat: name of the grouping column.

    Returns:
        The same ``data`` frame with eight columns added.
    """
    # (source column, suffix of the generated column).  "quater" is a legacy
    # spelling that is kept so the generated column names do not change.
    time_fields = [
        ("month", "month"),
        ("day", "day"),
        ("hour", "hour"),
        ("weekday", "weekday"),
        ("quarter", "quater"),
        ("week", "week"),
        ("passed", "passed"),
        ("latest", "latest"),
    ]
    for source, suffix in time_fields:
        # transform("mean") broadcasts each group's mean back to its rows.
        data["mean_" + g_feat + "_" + suffix] = data.groupby(g_feat)[source].transform("mean")

    return data


def group_with_img_time_features(data, g_feat):
    """Add the per-group mean of the image-time features.

    For each of ``img_month``, ``img_day``, ``img_hour``, ``img_passed`` and
    ``img_latest`` a column ``mean_<g_feat>_<column>`` is added that holds,
    for every row, the mean of that column over all rows sharing the row's
    ``g_feat`` value.  The weekday, quarter and week variants are not
    generated.

    Args:
        data: frame with ``g_feat`` and the five ``img_*`` columns.
        g_feat: name of the grouping column.

    Returns:
        The same ``data`` frame with five columns added.
    """
    img_fields = ["img_month", "img_day", "img_hour", "img_passed", "img_latest"]
    for field in img_fields:
        # transform("mean") broadcasts each group's mean back to its rows.
        data["mean_" + g_feat + "_" + field] = data.groupby(g_feat)[field].transform("mean")
    return data








================================================
FILE: stack/params.txt
================================================
LogisticRegression Type:Liblinear C:6.1 threads:1 usescale:True maxim_Iteration:200 seed:1 verbose:false
GradientBoostingForestClassifier estimators:300 shrinkage:0.18 threads:1 offset:0.00001 max_depth:3 max_features:0.65 min_leaf:2.0 min_split:7.0 Objective:RMSE row_subsample:1.0 seed:1 verbose:false
LibFmClassifier maxim_Iteration:70 C:0.0041 C2:0.00120 lfeatures:1 seed:1 usescale:True init_values:0.046 learn_rate:0.05 smooth:0.1 threads:1 verbose:false
softmaxnnclassifier usescale:True seed:1 Type:SGD maxim_Iteration:50 C:0.0000008 shuffle:false tolerance:0.01 learn_rate:0.0065 smooth:0.1 h1:40 h2:35 connection_nonlinearity:Relu init_values:0.020 verbose:false
RandomForestClassifier bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.95 seed:1 verbose:false
AdaboostRandomForestClassifier bootsrap:false weight_thresold:0.95 estimators:100 threads:1 max_depth:6 max_features:0.5 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.9 seed:1 verbose:false
GradientBoostingForestRegressor bootsrap:false estimators:300 shrinkage:0.1 threads:1 offset:0.00001 max_depth:3 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.9 seed:1 verbose:false
RandomForestRegressor bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.95 seed:1 verbose:false
LibFmRegressor maxim_Iteration:70 C:0.0001 C2:0.0009 lfeatures:2 seed:1 usescale:True init_values:0.1 learn_rate:0.1 threads:1 verbose:false

RandomForestClassifier bootsrap:false estimators:500 threads:3 offset:0.00001 max_depth:5 max_features:0.3 min_leaf:1.0 min_split:5.0 Objective:ENTROPY row_subsample:0.8 seed:1 verbose:false

================================================
FILE: stack/parse.py
================================================
import re
import numpy as np

def mean_logloss_by_model(raw, n_folds=5):
    """Average the ``logloss : <value>`` entries of a log, per model.

    The grouping assumes the log lists one entry per model for the first
    fold, then one per model for the second fold, and so on, so the entry at
    position ``i`` is attributed to model ``i % n_models`` where
    ``n_models = number_of_entries // n_folds``.

    Args:
        raw: Full text of the log.
        n_folds: Number of folds the entries are spread over.

    Returns:
        Dict mapping the model position to its mean logloss. Empty when the
        log contains no logloss entry.

    Raises:
        ValueError: If the log has some entries but fewer than ``n_folds``.
    """
    # Raw string avoids the invalid "\." escape warning. Any decimal value is
    # accepted: dropping scores >= 1 would shift the modulo grouping below
    # and attribute every later score to the wrong model.
    scores = [float(s) for s in re.findall(r"logloss : ([0-9]+\.[0-9]+)", raw)]
    if not scores:
        return {}
    n_models = len(scores) // n_folds
    if n_models == 0:
        raise ValueError(
            "found %d logloss entries, expected at least %d (one per fold)"
            % (len(scores), n_folds)
        )
    grouped = {i: [] for i in range(n_models)}
    for i, score in enumerate(scores):
        grouped[i % n_models].append(score)
    return {i: np.mean(values) for i, values in grouped.items()}


if __name__ == "__main__":
    with open("result.txt", "r") as f:
        raw = f.read()

    # Print (model position, mean logloss) pairs, lowest logloss first.
    for item in sorted(mean_logloss_by_model(raw).items(), key=lambda x: x[1]):
        print(item)

================================================
FILE: stack/start.sh
================================================
# Run StackNet in train mode with the models listed in params.txt
# (5 folds, logloss metric); pred_file is set to sigma_stack_pred.csv.
java -Xmx3048m -jar StackNet.jar train \
    train_file=train_stacknet.csv \
    test_file=test_stacknet.csv \
    params=params.txt \
    pred_file=sigma_stack_pred.csv \
    test_target=true \
    verbose=true \
    Threads=4 \
    stackdata=false \
    folds=5 \
    seed=1 \
    metric=logloss


================================================
FILE: stack/utils.py
================================================
import pandas as pd
import numpy as np


def getAvgSub(subs_in):
    """Return the plain average of several submission frames.

    Every frame is sorted by ``listing_id`` and re-indexed from zero, so rows
    are matched by their rank in that ordering; the frames are expected to
    cover the same listings.

    Args:
        subs_in: Non-empty iterable of DataFrames with ``listing_id``,
            ``high``, ``medium`` and ``low`` columns. Inputs are not
            modified.

    Returns:
        A copy of the first sorted frame whose ``high``, ``medium`` and
        ``low`` columns hold the mean over all frames.

    Raises:
        IndexError: If ``subs_in`` is empty.
    """
    # drop=True discards the old index instead of turning it into a column
    # that has to be deleted afterwards; that round trip failed whenever the
    # index was named or an "index" column already existed.
    subs = [
        sub.sort_values(by=["listing_id"]).reset_index(drop=True)
        for sub in subs_in
    ]
    n = len(subs)
    new_sub = subs[0].copy()
    for col in ("high", "medium", "low"):
        for sub in subs[1:]:
            new_sub[col] = new_sub[col] + sub[col]
        new_sub[col] = new_sub[col] / n
    return new_sub

def getWeightedAvgSub(subs_in, weights):
    """Return the weighted average of several submission frames.

    Every frame is sorted by ``listing_id`` and re-indexed from zero, so rows
    are matched by their rank in that ordering; the frames are expected to
    cover the same listings.

    Args:
        subs_in: Iterable of DataFrames with ``listing_id``, ``high``,
            ``medium`` and ``low`` columns. Inputs are not modified.
        weights: Sequence with one weight per frame, in the same order,
            summing to 1.

    Returns:
        A copy of the first sorted frame whose ``high``, ``medium`` and
        ``low`` columns hold the weighted sum over all frames.

    Raises:
        AssertionError: If the weights do not sum to 1 or their count does
            not match the number of frames.
    """
    # Compare with a tolerance: an exact == 1 rejects valid weights such as
    # [0.7, 0.2, 0.1] whose floating-point sum is 0.9999999999999999.
    assert np.isclose(np.sum(weights), 1.0), "Sum of weights need to be 1"
    # drop=True discards the old index instead of turning it into a column
    # that has to be deleted afterwards (which broke on named indexes).
    subs = [
        sub.sort_values(by=["listing_id"]).reset_index(drop=True)
        for sub in subs_in
    ]
    # A surplus weight would otherwise be ignored silently and leave the
    # blended probabilities under-weighted.
    assert len(weights) == len(subs), "Need exactly one weight per submission"
    new_sub = subs[0].copy()
    for col in ("high", "medium", "low"):
        new_sub[col] = new_sub[col] * weights[0]
        for i, sub in enumerate(subs[1:], start=1):
            new_sub[col] = new_sub[col] + sub[col] * weights[i]
    return new_sub

def generateStackSub(test_file_name, sub_file_name, out_file_name="new_sub.csv"):
    """Write a submission CSV from a prediction file and the matching ids.

    Args:
        test_file_name: Comma-separated, header-less file whose first column
            holds the numeric listing ids, one row per prediction row.
        sub_file_name: Comma-separated, header-less file with three columns,
            labelled ``high``, ``medium``, ``low`` in that order.
        out_file_name: Destination CSV path. Defaults to ``new_sub.csv`` in
            the current directory.

    Raises:
        ValueError: If the prediction file does not have three columns or
            the two files have a different number of rows.
    """
    # Only the id column is needed from the test file; ndmin=1 keeps a
    # one-row file as a length-1 array instead of a scalar.
    listing_ids = np.loadtxt(
        test_file_name, delimiter=",", usecols=(0,), ndmin=1
    )
    # ndmin=2 keeps a one-row prediction file two-dimensional so the three
    # values become columns rather than rows.
    sub = pd.DataFrame(
        np.loadtxt(sub_file_name, delimiter=",", ndmin=2),
        columns=["high", "medium", "low"],
    )
    if len(listing_ids) != len(sub):
        raise ValueError(
            "row count mismatch: %d ids in %s but %d predictions in %s"
            % (len(listing_ids), test_file_name, len(sub), sub_file_name)
        )
    # loadtxt yields floats; cast back to integer ids for the output file.
    sub["listing_id"] = listing_ids.astype(np.int64)
    sub.to_csv(out_file_name, index=False)




def correct(df, tau=None):
    """Rescale class probabilities towards target class means.

    Each probability column is multiplied by ``tau[k] / mean(column k)`` and
    every row is then renormalised to sum to 1. This is a single pass, so the
    resulting column means only approximate ``tau``; the two printed lists
    are the ``tau / mean`` ratios before and after the correction.

    Args:
        df: DataFrame with ``low``, ``medium`` and ``high`` probability
            columns. It is not modified.
        tau: Optional mapping from class name to target mean. Defaults to
            the values previously hard-coded in this function.

    Returns:
        A copy of ``df`` with the three probability columns corrected.
    """
    interest_levels = ['low', 'medium', 'high']

    if tau is None:
        tau = {
            'low': 0.69195995,
            'medium': 0.23108864,
            'high': 0.07695141,
        }

    y = df[interest_levels].mean()
    a = [tau[k] / y[k] for k in interest_levels]
    print(a)

    # Column-wise scaling followed by a row-wise renormalisation. This
    # replaces a per-row apply() that indexed the row Series by integer
    # position (deprecated for label-indexed Series in pandas 2.x) and
    # mutated the row object it was handed.
    scaled = df[interest_levels] * np.asarray(a, dtype=float)
    df_correct = df.copy()
    df_correct[interest_levels] = scaled.div(scaled.sum(axis=1), axis=0)

    y = df_correct[interest_levels].mean()
    a = [tau[k] / y[k] for k in interest_levels]
    print(a)

    return df_correct
Download .txt
gitextract_aw7n92_y/

├── README.md
├── classifiers.py
├── modelTraining.py
├── ppt/
│   └── AIC-Sharing-11-19.pptx
├── preprocess.py
└── stack/
    ├── StackNet.jar
    ├── params.txt
    ├── parse.py
    ├── start.sh
    └── utils.py
Download .txt
SYMBOL INDEX (49 symbols across 4 files)

FILE: classifiers.py
  class xgboostClassifier (line 6) | class xgboostClassifier():
    method __init__ (line 7) | def __init__(self, **params):
    method fit (line 12) | def fit(self, X, y):
    method fit_CV (line 16) | def fit_CV(self, X_train, X_val, y_train, y_val):
    method get_eval_res (line 23) | def get_eval_res(self):
    method score (line 26) | def score(self, X, y):
    method predict_proba (line 30) | def predict_proba(self, X_test):
    method predict (line 34) | def predict(self, X_test):
    method get_params (line 38) | def get_params(self, **params):
    method set_params (line 41) | def set_params(self, **params):
    method getSortedImportance (line 44) | def getSortedImportance(self, features):
  class BaseClassifier (line 53) | class BaseClassifier(object):
    method __init__ (line 54) | def __init__(self, clf, seed=0, params=None):
    method train (line 58) | def train(self, x_train, y_train):
    method predict (line 61) | def predict(self, x):
    method predict_proba (line 64) | def predict_proba(self, x):
    method fit (line 67) | def fit(self,x,y):
    method set_params (line 70) | def set_params(self, **params):

FILE: modelTraining.py
  function validation_score (line 25) | def validation_score(early_stop=False):
  function validation_avg_score (line 67) | def validation_avg_score(clfs):
  function paramSearch (line 107) | def paramSearch(clf, param_dict):
  function gen_sub (line 162) | def gen_sub():
  function genAvgSub (line 197) | def genAvgSub(clfs):
  function validate (line 222) | def validate(clfs):
  function search (line 227) | def search():
  function write2file (line 253) | def write2file(cv_scores, val_desc=None):
  function stacking (line 266) | def stacking(clfs):

FILE: preprocess.py
  function bedroomProcess (line 29) | def bedroomProcess(data, train_idx, test_idx):
  function bathroomProcess (line 47) | def bathroomProcess(data, train_idx, test_idx):
  function buildingIdProcess (line 67) | def buildingIdProcess(data, y, train_idx, test_idx):
  function createdProcess (line 72) | def createdProcess(data):
  function descriptionProcess (line 116) | def descriptionProcess(data, train_idx, test_idx):
  function displayAddrProcess (line 138) | def displayAddrProcess(data):
  function featuresProcess (line 145) | def featuresProcess(data, train_idx, test_idx):
  function locationProcess (line 442) | def locationProcess(data, train_idx, test_idx):
  function managerIdProcess (line 462) | def managerIdProcess(data, y, train_idx, test_idx):
  function photoProcess (line 480) | def photoProcess(data):
  function priceProcess (line 485) | def priceProcess(data):
  function streetAddrProcess (line 513) | def streetAddrProcess(data):
  function listingIdProcess (line 520) | def listingIdProcess(data):
  function coreProcess (line 526) | def coreProcess(data, y_train, train_idx, test_idx):
  function convert_dataset_to_avg (line 578) | def convert_dataset_to_avg(xc,yc,xt, rounding=2,cols=None):
  function convert_to_avg (line 615) | def convert_to_avg(X,y, Xt, seed=1, cvals=5, roundings=2, columns=None):
  function group_with_time_features (line 659) | def group_with_time_features(data, g_feat):
  function group_with_img_time_features (line 680) | def group_with_img_time_features(data, g_feat):

FILE: stack/utils.py
  function getAvgSub (line 5) | def getAvgSub(subs_in):
  function getWeightedAvgSub (line 23) | def getWeightedAvgSub(subs_in, weights):
  function generateStackSub (line 42) | def generateStackSub(test_file_name, sub_file_name):
  function correct (line 54) | def correct(df):
Condensed preview — 10 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (65K chars).
[
  {
    "path": "README.md",
    "chars": 1005,
    "preview": "# README\n\n* ```preprocess.py```: data cleaning, feature engineering\n* ```modelTraining.py```: cross validation, submissi"
  },
  {
    "path": "classifiers.py",
    "chars": 2180,
    "preview": "import xgboost as xgb\nimport numpy as np\nfrom sklearn.metrics import log_loss\n\n\nclass xgboostClassifier():\n    def __ini"
  },
  {
    "path": "modelTraining.py",
    "chars": 16310,
    "preview": "import sys\nimport time\nimport random\nfrom collections import defaultdict\nimport numpy as np\nimport pandas as pd\nfrom skl"
  },
  {
    "path": "preprocess.py",
    "chars": 37348,
    "preview": "#!/usr/bin/python3\n#-*- encoding: utf-8 -*-\nimport sys\nimport random\nimport operator\nimport datetime\nimport time\nfrom co"
  },
  {
    "path": "stack/params.txt",
    "chars": 1801,
    "preview": "LogisticRegression Type:Liblinear C:6.1 threads:1 usescale:True maxim_Iteration:200 seed:1 verbose:false\nGradientBoostin"
  },
  {
    "path": "stack/parse.py",
    "chars": 459,
    "preview": "import re\nimport numpy as np\n\nwith open(\"result.txt\", \"r\") as f:\n    raw = \"\".join(f.readlines())\n\nstr_res = re.findall("
  },
  {
    "path": "stack/start.sh",
    "chars": 232,
    "preview": "java -Xmx3048m -jar StackNet.jar train train_file=train_stacknet.csv test_file=test_stacknet.csv params=params.txt pred_"
  },
  {
    "path": "stack/utils.py",
    "chars": 2385,
    "preview": "import pandas as pd\nimport numpy as np\n\n\ndef getAvgSub(subs_in):\n    subs = []\n    for sub in subs_in:\n        sub = sub"
  }
]

// ... and 2 more files (download for full content)

About this extraction

This page contains the full source code of the ScarletPan/Kaggle-Rental-Listing-Inquireies GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 10 files (60.3 KB), approximately 16.9k tokens, and a symbol index with 49 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!