[
  {
    "path": "README.md",
    "content": "# README\n\n* ```preprocess.py```: data cleaning, feature engineering\n* ```modelTraining.py```: cross-validation, submission generation, stacking preparation\n* ```classifiers.py```: my encapsulation of XGBoost\n* stack\n  * ```StackNet.jar```: stacking tools shared by KazAnova, repo is [here](https://github.com/kaz-Anova/StackNet)\n  * ```parse.py```: tools for evaluating the CV scores during stacking.\n  * ```utils.py```: generates the submission after StackNet\n  * ```start.sh```: commands for executing StackNet\n  * ```params.txt```: my params for stacking\n\n### links:\n  * [Kaggle: Rental Listing Inquiries](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries)\n  * [Summary of getting a silver medal in Kaggle](http://scarletpan.github.io/summary-of-get-a-silver-medal-in-kaggle/)\n  * [Kaggle 首战拿银总结 | 入门指导 (长文、干货） -- 知乎专栏](https://zhuanlan.zhihu.com/p/26645088)\n  * [AI Challenge 分享会PPT](https://github.com/ScarletPan/Kaggle-Rental-Listing-Inquireies/blob/master/ppt/AIC-Sharing-11-19.pptx)\n  \n"
  },
  {
    "path": "classifiers.py",
    "content": "import xgboost as xgb\nimport numpy as np\nfrom sklearn.metrics import log_loss\n\n\nclass xgboostClassifier():\n    def __init__(self, **params):\n        self.clf = None\n        self.progress = {}\n        self.params = params\n\n    def fit(self, X, y):\n        xg_train = xgb.DMatrix(X, label=y)\n        self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'])\n\n    def fit_CV(self, X_train, X_val, y_train, y_val):\n        xg_train = xgb.DMatrix(X_train, label=y_train)\n        xg_val = xgb.DMatrix(X_val, label=y_val)\n        watchlist = [(xg_train, 'train'), (xg_val, 'eval')]\n        self.clf = xgb.train(self.params, xg_train, self.params['num_rounds'],\n                         watchlist, early_stopping_rounds=200, evals_result=self.progress)\n\n    def get_eval_res(self):\n        return self.progress\n\n    def score(self, X, y):\n        Y = self.predict_proba(X)\n        return 1 / log_loss(y, Y)\n\n    def predict_proba(self, X_test):\n        res = self.clf.predict(xgb.DMatrix(X_test))\n        return res.astype(np.float32)\n\n    def predict(self, X_test):\n        res = np.argmax(self.clf.predict(xgb.DMatrix(X_test)), axis=1)\n        return res \n\n    def get_params(self, **params):\n        return self.params\n\n    def set_params(self, **params):\n        self.params.update(params)\n\n    def getSortedImportance(self, features):\n        with open('xgb.fmap', 'w') as f:\n            for i in range(len(features)):\n                f.write('{0}\\t{1}\\tq\\n'.format(i, features[i]))\n        importance = self.clf.get_fscore(fmap='xgb.fmap')\n        importance = sorted(importance.items(), key=operator.itemgetter(1))\n        #print(importance)\n        return importance\n\nclass BaseClassifier(object):\n    def __init__(self, clf, seed=0, params=None):\n        params['random_state'] = seed\n        self.clf = clf(**params)\n\n    def train(self, x_train, y_train):\n        self.clf.fit(x_train, y_train)\n\n    def 
predict(self, x):\n        return self.clf.predict(x)\n\n    def predict_proba(self, x):\n        return self.clf.predict_proba(x)\n\n    def fit(self,x,y):\n        return self.clf.fit(x,y)\n\n    def set_params(self, **params):\n        self.params.update(params)\n    "
  },
  {
    "path": "modelTraining.py",
    "content": "import sys\nimport time\nimport random\nfrom collections import defaultdict\nimport numpy as np\nimport pandas as pd\nfrom sklearn.model_selection import KFold, StratifiedKFold, train_test_split\nfrom sklearn.metrics import log_loss\nfrom preprocess import coreProcess\nfrom classifiers import xgboostClassifier\n\nTRAIN_FILE_NAME = '~/Kaggle/RLI/input/train.json'\nTEST_FILE_NAME = '~/Kaggle/RLI/input/test.json'\ntarget_num_map = {'high': 0, 'medium': 1, 'low': 2}\ntrain_data = pd.read_json(TRAIN_FILE_NAME).reset_index()\ntest_data = pd.read_json(TEST_FILE_NAME).reset_index()\nlist_img_time = pd.read_csv(\"~/Kaggle/RLI/input/listing_image_time.csv\")\ntrain_data = train_data.merge(list_img_time, left_on=\"listing_id\", right_on=\"Listing_Id\", how='inner')\ntest_data = test_data.merge(list_img_time, left_on=\"listing_id\", right_on=\"Listing_Id\", how='inner')\nRS = 2016\nrandom.seed(RS)\nnp.random.seed(RS)\n# RS = 0\n\ndef validation_score(early_stop=False):\n    clf = xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 3,\n        eta = 0.04,\n        max_depth = 6,\n        subsample = 0.7,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.7,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 700,\n        seed = RS,\n    )\n    print(\"*** Validation start ***\")\n    data = train_data.copy()\n    y = data[\"interest_level\"].apply(lambda x: target_num_map[x])\n    del data[\"interest_level\"]\n\n    # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True)\n    skf = StratifiedKFold(n_splits=3, shuffle=False)\n    cv_scores = []\n    i = 0\n    for train_idx, val_idx in skf.split(data, y):\n        i += 1\n        X = data.copy()\n        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]\n        X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)\n        clf.fit(X_train, y_train)\n        # 
clf.fit_CV(X_train, X_val, y_train, y_val)\n        y_val_pred = clf.predict_proba(X_val)\n        loss = log_loss(y_val, y_val_pred)\n        print(\"Iteration {}'s loss: {}\".format(i, loss))\n        cv_scores.append(loss)\n        if early_stop:\n            break\n    print(\"*** Validation finished ***\\n\")\n    return cv_scores\n\n\ndef validation_avg_score(clfs):\n    print(\"*** Validation start ***\")\n    data = train_data.copy()\n    y = data[\"interest_level\"].apply(lambda x: target_num_map[x])\n    del data[\"interest_level\"]\n\n    # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True)\n    skf = StratifiedKFold(n_splits=3)\n    cv_scores = {i:[] for i in range(len(clfs))}\n    cv_scores[\"Avg\"] = []\n    i = 0\n    for train_idx, val_idx in skf.split(data, y):\n        i += 1\n        X = data.copy()\n        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]\n        X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)\n        tmp = []\n        preds = []\n        j = 0\n        for clf in clfs:\n            clf.fit(X_train, y_train)\n            y_val_pred = clf.predict_proba(X_val)\n            tmp.append(y_val_pred)\n            loss = log_loss(y_val, y_val_pred)\n            cv_scores[j].append(loss)\n            preds.append(y_val_pred)\n            j += 1\n            print(\"clf_{}, Iteration {}'s loss: {}\".format(j, i, loss))\n        preds = np.array(preds)\n        avg_pred = np.mean(preds, axis=0)\n        loss = log_loss(y_val, avg_pred)\n        cv_scores[\"Avg\"].append(loss)\n        print(\"Iteration {}'s Avg loss: {}\".format(i, loss))\n    for i in range(len(clfs)):\n        print(\"clf_{} validation loss : {}\".format(i, np.mean(cv_scores[i])))\n    print(\"Average validation loss : {}\".format(np.mean(cv_scores[\"Avg\"])))\n    print(\"*** Validation finished ***\\n\")\n    return cv_scores[\"Avg\"]\n\n\ndef paramSearch(clf, param_dict):\n\n    def outer_join(left, right):\n        if left == 
[]:\n            return right\n        if right == []:\n            return left\n        res = []\n        for i in left:\n            for j in right:\n                if isinstance(i, list):\n                    tmp = i[:]\n                    tmp.append(j)\n                    res.append(tmp)\n                else:\n                    res.append([i, j])\n        return res\n    # Creating list of param_dict\n    param_list = sorted(param_dict.items(), key=lambda x: x[0])\n    param_keys = [ item[0] for item in param_list ]\n    param_vals = [ item[1] for item in param_list ]\n    all_vals = []\n    for val in param_vals:\n        all_vals = outer_join(all_vals, val)\n    all_param_lists = []\n    for vals in all_vals:\n        all_param_lists.append(dict(zip(param_keys, vals)))\n    # for item in all_param_lists:\n    #     print(item)\n\n    # Searching\n    best_score = float('inf')\n    best_params = None\n    scores = []\n    i = 0\n    for params in all_param_lists:\n        print(\"\\n\" + \"-\" * 70)\n        for param_name in params.keys():\n            print(\"{} : {}\".format(param_name, params[param_name]))\n        clf.set_params(**params)\n        score = np.mean(validation_score(clf))\n        if score < best_score:\n            best_score = score\n            best_params = params\n        i += 1\n        print(\"{} / {}, Done\".format(i, len(all_param_lists)))\n        print(\"Score: \", score)\n        scores.append(score)\n    print(scores)\n    print(\"Best parameters:\")\n    for param_name in best_params.keys():\n        print(\"{} : {}\".format(param_name, best_params[param_name]))\n    print(\"Score: \", best_score)\n\n\ndef gen_sub():\n    train = train_data.copy()\n    train_idx = [i for i in range(train.shape[0])]\n    test = test_data.copy()\n    test_idx = [i + train.shape[0] for i in range(test.shape[0])]\n    y = train[\"interest_level\"].apply(lambda x: target_num_map[x])\n    del train[\"interest_level\"]\n    data = 
pd.concat([train, test]).reset_index()\n    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)\n    xgb_clf = xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 12,\n        eta = 0.02,\n        max_depth = 6,\n        subsample = 0.8,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.8,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 1700,\n        seed = RS,\n    )\n    print(\"Trainning:...\")\n    xgb_clf.fit(X_train, y)\n\n    preds = xgb_clf.predict_proba(X_test)\n    sub = pd.DataFrame(preds)\n    # sub.columns = [\"high\", \"medium\", \"low\"]\n    sub.columns = [ \"high\", \"medium\", \"low\"]\n    sub[\"listing_id\"] = test.listing_id.values\n    sub.to_csv(\"submission.csv\", index=False)\n\n\ndef genAvgSub(clfs):\n    train = train_data.copy()\n    train_idx = [i for i in range(train.shape[0])]\n    test = test_data.copy()\n    test_idx = [i + train.shape[0] for i in range(test.shape[0])]\n    y = train[\"interest_level\"].apply(lambda x: target_num_map[x])\n    del train[\"interest_level\"]\n    data = pd.concat([train, test]).reset_index()\n    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)\n    print(\"Trainning:...\")\n    preds = []\n    for i in range(len(clfs)):\n        print(\"Clf_{} fiting\".format(i))\n        clfs[i].fit(X_train, y)\n        print(\"Clf_{} predicting\".format(i))\n        pred = clfs[i].predict_proba(X_test)\n        preds.append(pred)\n    sub = pd.DataFrame(np.mean(preds, axis=0))\n    # sub.columns = [\"high\", \"medium\", \"low\"]\n    sub.columns = [ \"high\", \"medium\", \"low\"]\n    sub[\"listing_id\"] = test.listing_id.values\n    sub.to_csv(\"submission.csv\", index=False)\n    print(\"Train done.\")\n\n\ndef validate(clfs):\n    cv_scores = validation_avg_score(clfs)\n    return cv_scores\n\n\ndef search():\n    param_dict = {\n        'eta' : [0.02],\n      
  'max_depth' : [6],\n        'subsample' : [0.8],\n        'colsample_bylevel' : [0.7],\n        'num_rounds' : [1400, 1500, 1600, 1650],\n    }\n    clf = xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 12,\n        eta = 0.04,\n        max_depth = 6,\n        subsample = 0.7,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 1.0,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 700,\n        seed = RS,\n    )\n    paramSearch(clf, param_dict)\n\n\ndef write2file(cv_scores, val_desc=None):\n    print(\"*\" * 50)\n    print(\"Cross validation loss: \", np.mean(cv_scores))\n    with open(\"results.log\", \"a\") as fp:\n        fp.write(time.strftime(\"%m/%d/%Y %H:%M\") + '\\n')\n        if(val_desc is not None):\n            fp.write(val_desc + '\\n')\n        for score in cv_scores:\n            fp.write(str(score) + \" \")\n        fp.write(\"\\nCross Validation: {}\\n\".format(np.array(cv_scores).mean()))\n        fp.write(\"*\" * 50 + \"\\n\")\n\n\ndef stacking(clfs):\n    print(\"Stacking\")\n    train = train_data.copy()\n    test = test_data.copy()\n    y = train[\"interest_level\"].apply(lambda x: target_num_map[x])\n    del train[\"interest_level\"]\n    train_stackers = []\n    for RS in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]:\n        skf = StratifiedKFold(n_splits=10, random_state=RS, shuffle=True)\n        #Create Arrays for meta\n        train_stacker = [[0.0 for s in range(3)]  for k in range (0,(train.shape[0]))]\n        cv_scores = {i:[] for i in range(len(clfs))}\n        cv_scores[\"Avg\"] = []\n        print(\"Begin 10-flod cross validation\")\n        cnt = 0\n        for train_idx, val_idx in skf.split(train, y):\n            cnt += 1\n            X = train.copy()\n            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]\n            X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)\n      
      X_train.toarray()\n            preds = []\n            k = 0\n            for clf in clfs:\n                clf.fit(X_train, y_train)\n                y_val_pred = clf.predict_proba(X_val)\n                loss = log_loss(y_val, y_val_pred)\n                preds.append(y_val_pred)\n                cv_scores[k].append(loss)\n                k += 1\n                print(\"Clf_{} iteration {}'s loss: {}\".format(k, cnt, loss))\n            preds = np.array(preds)\n            avg_pred = np.mean(preds, axis=0)\n            avg_loss = log_loss(y_val, avg_pred)\n            cv_scores[\"Avg\"].append(avg_loss)\n            print(\"Iteration {}'s Avg loss: {}\".format(cnt, avg_loss))\n            no = 0\n            for real_idx in val_idx:\n                for i in range(3):\n                    train_stacker[real_idx][i] = avg_pred[no][i]\n                no += 1\n        for i in range(len(clfs)):\n            print(\"clf_{} validation loss : {}\".format(i, np.mean(cv_scores[i])))\n        print(\"Average validation loss : {}\".format(np.mean(cv_scores[\"Avg\"])))\n        train_stackers.append(train_stacker)\n    train_stacker = np.mean(train_stackers, axis=0)\n    print(\"*** Validation finished ***\\n\")\n\n    test_stacker = [[0.0 for s in range(3)]   for k in range (0,(test.shape[0]))]\n    train_idx = [i for i in range(train.shape[0])]\n    test_idx = [i + train.shape[0] for i in range(test.shape[0])]\n    data = pd.concat([train, test]).reset_index()\n    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)\n    print(X_train.shape, len(train_stacker))\n    print(\"Begin predicting\")\n    preds = []\n    for i in range(len(clfs)):\n        print(\"Clf_{} fiting\".format(i))\n        clfs[i].fit(X_train, y)\n        print(\"Clf_{} predicting\".format(i))\n        pred = clfs[i].predict_proba(X_test)\n        preds.append(pred)\n    preds = np.mean(preds, axis=0)\n    for pr in range (0, len(preds)):  \n            for d in range (0,3):       
     \n                test_stacker[pr][d]=(preds[pr][d])   \n    print (\"merging columns\")   \n    #stack xgboost predictions\n    X_train = np.column_stack((X_train.toarray(),train_stacker))\n    # stack id to test\n    X_test = np.column_stack((X_test.toarray(),test_stacker))         \n    # stack target to train\n    X = np.column_stack((y,X_train))\n    ids = test.listing_id.values\n    X_test = np.column_stack((ids, X_test))\n    np.savetxt(\"./train_stacknet.csv\", X, delimiter=\",\", fmt='%.5f')\n    np.savetxt(\"./test_stacknet.csv\", X_test, delimiter=\",\", fmt='%.5f') \n    print(\"Write results...\")\n    output_file = \"submission_{}.csv\".format(np.mean(cv_scores[\"Avg\"]))\n    print(\"Writing submission to %s\" % output_file)\n    f = open(output_file, \"w\")   \n    f.write(\"listing_id,high,medium,low\\n\")# the header   \n    for g in range(0, len(test_stacker))  :\n      f.write(\"%s\" % (ids[g]))\n      for prediction in test_stacker[g]:\n         f.write(\",%f\" % (prediction))    \n      f.write(\"\\n\")\n    f.close()\n    print(\"Done.\")\n\n\nif __name__ == \"__main__\":\n    clfs = []\n    # clfs.append(xgboostClassifier(\n    #     objective = 'multi:softprob',\n    #     eval_metric = 'mlogloss',\n    #     num_class = 3,\n    #     nthread = 6,\n    #     eta = 0.04,\n    #     max_depth = 6,\n    #     subsample = 0.7,\n    #     colsample_bytree = 1.0,\n    #     colsample_bylevel = 0.7,\n    #     min_child_weight=1,\n    #     silent = 1,\n    #     num_rounds = 700,\n    #     seed = 0,\n    # ))\n    # clfs.append(xgboostClassifier(\n    #     objective = 'multi:softprob',\n    #     eval_metric = 'mlogloss',\n    #     num_class = 3,\n    #     nthread = 6,\n    #     eta = 0.02,\n    #     max_depth = 6,\n    #     subsample = 0.8,\n    #     colsample_bytree = 1.0,\n    #     colsample_bylevel = 0.8,\n    #     min_child_weight=1,\n    #     silent = 1,\n    #     num_rounds = 1700,\n    #     seed = 0,\n    # ))\n    
clfs.append(xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 9,\n        eta = 0.02,\n        max_depth = 6,\n        subsample = 0.8,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.7,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 1500,\n        seed = 0,\n    ))\n    clfs.append(xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 9,\n        eta = 0.02,\n        max_depth = 6,\n        subsample = 0.8,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.8,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 1500,\n        seed = 128,\n    ))\n    clfs.append(xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 9,\n        eta = 0.02,\n        max_depth = 6,\n        subsample = 0.8,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.8,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 1500,\n        seed = 512,\n    )) \n    clfs.append(xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 9,\n        eta = 0.02,\n        max_depth = 6,\n        subsample = 0.8,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.8,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 1500,\n        seed = 1024,\n    ))   \n    clfs.append(xgboostClassifier(\n        objective = 'multi:softprob',\n        eval_metric = 'mlogloss',\n        num_class = 3,\n        nthread = 9,\n        eta = 0.02,\n        max_depth = 6,\n        subsample = 0.8,\n        colsample_bytree = 1.0,\n        colsample_bylevel = 0.8,\n        min_child_weight=1,\n        silent = 1,\n        num_rounds = 1500,\n        seed = 2048,\n   
 ))    \n    if len(sys.argv) == 1:\n        cv_scores = validate(clfs)\n        write2file(cv_scores)\n    elif len(sys.argv) == 2:\n        if sys.argv[1] == '-v':\n            cv_scores = validate(clfs)\n            write2file(cv_scores)\n        elif sys.argv[1] == '-g':\n            gen_sub()\n        elif sys.argv[1] == '-s':\n            search()\n        elif sys.argv[1] == '-ga':\n            genAvgSub(clfs)\n        elif sys.argv[1] == '-stack':\n            stacking(clfs)\n        elif sys.argv[1] == '-v3':\n            cv_scores = validate(clfs)\n            val_desc = sys.argv[2]\n            write2file(cv_scores, val_desc)\n    elif len(sys.argv) == 3:\n        if sys.argv[1] == '-v':\n            cv_scores = validate(clfs)\n            val_desc = sys.argv[2]\n            write2file(cv_scores, val_desc)\n        elif sys.argv[1] == '-g':\n            gen_sub()\n        elif sys.argv[1] == '-v3':\n            cv_scores = validation_score()\n            val_desc = sys.argv[2]\n            write2file(cv_scores, val_desc)\n\n\n\n\n\n\n"
  },
  {
    "path": "preprocess.py",
    "content": "#!/usr/bin/python3\n#-*- encoding: utf-8 -*-\nimport sys\nimport random\nimport operator\nimport datetime\nimport time\nfrom collections import defaultdict, Counter\nimport pandas as pd\nimport numpy as np\nfrom scipy import sparse\nimport xgboost as xgb\nfrom sklearn import preprocessing\nfrom sklearn.model_selection import train_test_split, GridSearchCV, KFold\nfrom sklearn.metrics import log_loss\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.cluster import KMeans\nfrom sklearn.cross_validation import StratifiedKFold\nfrom sklearn.preprocessing import StandardScaler\nfrom nltk.metrics import distance as distance\n\n\nFEATURE_NOT_USE = ['created','description','features','photos', 'index']# ,'bathrooms', 'bedrooms''listing_id',\nFEATURE_NOT_USE.append('display_address')\nFEATURE_NOT_USE.extend(['low_build_frac', 'high_build_frac', 'medium_build_frac', 'build_count'])# \nFEATURE_NOT_USE.extend(['low_manager_frac', 'high_manager_frac', 'medium_manager_frac','manager_count'])#\nFEATURE_NOT_USE.extend(['Listing_Id', 'img_created']) # , 'time_stamp'\n\ndef bedroomProcess(data, train_idx, test_idx):\n    # Some basic feature from bedrooms\n    data[\"no_bedroom\"] = data[\"bedrooms\"].apply(lambda x: 1 if x == 0 else 0)\n    data[\"more_than_5_bedroom\"] = data[\"bedrooms\"].apply(lambda x: 1 if x >= 5 else 0)\n    data.loc[data[\"bedrooms\"] + data[\"bathrooms\"] == 0, \"bedrooms\"] = 0.001\n    train = data.iloc[train_idx, :].copy()\n    test = data.iloc[test_idx, :].copy()\n    # remove null value (ugly code)\n    train.loc[data[\"bedrooms\"] == 0.001, \"bathrooms\"] = train[\"bathrooms\"].mean()\n    test.loc[data[\"bedrooms\"] == 0.001, \"bathrooms\"] = test[\"bathrooms\"].mean()\n    data.iloc[train_idx, :] = train\n    data.iloc[test_idx, :] = test\n    data[\"bedroom_per_room\"] = data[\"bedrooms\"] / (data[\"bedrooms\"] + data[\"bathrooms\"])\n    data.loc[data[\"bedrooms\"] == 0.001, \"bathrooms\"] 
= 0\n    data.loc[data[\"bedrooms\"] == 0.001, \"bedrooms\"] = 0\n    return data\n\n\ndef bathroomProcess(data, train_idx, test_idx):\n    # Some basic feature from bathrooms\n    data.loc[data[\"bathrooms\"] == 112, \"bathrooms\"] = 1.5\n    data.loc[data[\"bathrooms\"] == 10, \"bathrooms\"] = 1\n    data.loc[data[\"bathrooms\"] == 20, \"bathrooms\"] = 2\n    data[\"1_to_2_bathrooms\"] = data[\"bathrooms\"].apply(lambda x : 1if x != 0 and x <= 2 else 0)\n    data.loc[data[\"bedrooms\"] + data[\"bathrooms\"] == 0, \"bathrooms\"] = 0.001\n    train = data.iloc[train_idx, :].copy()\n    test = data.iloc[test_idx, :].copy()\n    # remove null value (ugly code)\n    train.loc[data[\"bathrooms\"] == 0.001, \"bedrooms\"] = train[\"bedrooms\"].mean()\n    test.loc[data[\"bathrooms\"] == 0.001, \"bedrooms\"] = test[\"bedrooms\"].mean()\n    data.iloc[train_idx, :] = train\n    data.iloc[test_idx, :] = test\n    data[\"bathoom_per_room\"] = data[\"bathrooms\"] / (data[\"bedrooms\"] + data[\"bathrooms\"])\n    data.loc[data[\"bathrooms\"] == 0.001, \"bedrooms\"] = 0\n    data.loc[data[\"bathrooms\"] == 0.001, \"bathrooms\"] = 0\n    return data\n\n\ndef buildingIdProcess(data, y, train_idx, test_idx):\n    # Have tried some ideas but failed\n    return data\n\n\ndef createdProcess(data):\n    # Some basic features from created\n    data[\"created\"] = pd.to_datetime(data['created'])\n    data[\"latest\"] = (data[\"created\"]- data[\"created\"].min())\n    data[\"latest\"] = data[\"latest\"].apply(lambda x: x.total_seconds())\n    data[\"passed\"] = (data[\"created\"].max()- data[\"created\"])\n    data[\"passed\"] = data[\"passed\"].apply(lambda x: x.total_seconds())\n    # year is weird\n    data[\"year\"] = data[\"created\"].dt.year\n    data['month'] = data['created'].dt.month\n    data['day'] = data['created'].dt.day\n    data['hour'] = data['created'].dt.hour\n    data['weekday'] = data['created'].dt.weekday\n    data['week'] = data['created'].dt.week\n    
data['quarter'] = data['created'].dt.quarter\n    data['weekend'] = ((data['weekday'] == 5) & (data['weekday'] == 6))\n    data['weekend'] = data['weekend'].apply(int)\n    # data[\"created_stamp\"] = data[\"created\"].apply(lambda x: time.mktime(x.timetuple()))\n    #*\n    data[\"latest_list_rank\"] = data[\"latest\"] / data[\"listing_id\"]   \n    # data[\"diff_rank_2\"] = data[\"passed\"] / data[\"listing_id\"]\n    #*\n\n    # image time after leak\n    data.loc[data[\"time_stamp\"] > 1490000000, \"time_stamp\"] = 1478524550\n    data[\"img_created\"] = data[\"time_stamp\"].apply(lambda x: datetime.datetime.fromtimestamp(x))\n    data[\"img_latest\"] = (data[\"img_created\"]- data[\"img_created\"].min())\n    data[\"img_latest\"] = data[\"img_latest\"].apply(lambda x: x.total_seconds())\n    data[\"img_passed\"] = (data[\"img_created\"].max()- data[\"img_created\"])\n    data[\"img_passed\"] = data[\"img_passed\"].apply(lambda x: x.total_seconds())\n    data[\"img_year\"] = data[\"img_created\"].dt.year\n    data['img_month'] = data['img_created'].dt.month\n    data['img_day'] = data['img_created'].dt.day\n    data['img_hour'] = data['img_created'].dt.hour\n    # data['img_weekday'] = data['img_created'].dt.weekday\n    # data['img_week'] = data['img_created'].dt.week\n    # data['img_quarter'] = data['img_created'].dt.quarter\n    # data['img_weekend'] = ((data['img_weekday'] == 5) & (data['img_weekday'] == 6))\n    # data['img_weekend'] = data['img_weekend'].apply(int)\n    data[\"img_latest_list_rank\"] = data[\"img_latest\"] / data[\"listing_id\"] \n\n    return data\n\n\ndef descriptionProcess(data, train_idx, test_idx):\n    data[\"description_words_num\"] = data[\"description\"].apply(lambda x: len(x.split(' ')))\n    data[\"description_len\"] = data[\"description\"].apply(len)\n    # Some info from descriptions\n    desc_feats = {\n                  'bedroom_mentions': ['br ', '---', \"<a\", \"a>\", \"<p>\"],\n                  'html_tag_1':[\"<img \", 
\"</a>\", \"<li>\", \"</li>\", \"<ul>\", \"</ul>\", \"-->\", \"<close\",\"<hr\"],\n                }\n    for name, kwords in desc_feats.items():\n        data[name] =  data['description'].apply(lambda x: sum([x.count(w)  for w in kwords]))\n\n    data['description'] =  data['description'].apply(lambda x: str(x).encode('utf-8') if len(x)>2 else \"nulldesc\") \n    # Tf-idf Encode\n    tfidfdesc=TfidfVectorizer(min_df=20, max_features=50, strip_accents='unicode',lowercase =True,\n                        analyzer='word', token_pattern=r'\\w{16,}', ngram_range=(1, 2), use_idf=False,smooth_idf=False, \n    sublinear_tf=True, stop_words = 'english')  \n    tr_sparsed = tfidfdesc.fit_transform (data.iloc[train_idx, :][\"description\"])  \n    te_sparsed = tfidfdesc.transform(data.iloc[test_idx, :][\"description\"])\n    feats_names = [\"desc_\" + x for x in tfidfdesc.get_feature_names()]\n    return data, tr_sparsed, te_sparsed, feats_names\n\n\ndef displayAddrProcess(data):\n    # disp_price_dict = dict(data.groupby('display_address')['price'].mean())\n    # data[\"mean_disp_price\"] = data.apply(lambda row: disp_price_dict[row[\"display_address\"]], axis=1)\n    # data[\"addr_sim\"] = data.apply(lambda row: distance.edit_distance(row[\"display_address\"].lower(), row[\"street_address\"].lower()), axis=1)\n    return data\n\n\ndef featuresProcess(data, train_idx, test_idx):\n    def afterRemoveStr(l, s):\n        while s in l:\n            l.remove(s)\n        return l\n\n    def afterRemoveFirstSpace(l):\n        res = []\n        for s in l:\n            res.append(s.strip())\n        return res\n\n    data[\"features_num\"] = data[\"features\"].apply(len)\n    mark = \"#+-+#\"\n    data[\"features\"] = data[\"features\"].apply(lambda x: mark.join([i for i in x]))\n    data[\"features\"] = data[\"features\"].apply(lambda x: x.lower())\n\n    # Deal with list like data\n    data[\"features\"] = data[\"features\"].apply(lambda x: mark.join([i for i in x.split(\" * 
\")]))\n    data[\"features\"] = data[\"features\"].apply(lambda x: mark.join([i for i in x.split(\"**\")]))\n    data['features']=data['features'].str.replace(\"✓ hardwood floor ✓ high ceilings ✓ dishwasher\",\n        \"hardwood floor\" + mark + \"high ceilings\" + mark + \"dishwasher\")\n    data['features']=data['features'].str.replace(\n        \"• on-site lifestyle concierge by luxury attaché \" + \n        \"•24/7 doorman \" + \n        \"• state of the art cardiovascular and weight training equipment \" +\n        \"• 24-hour valet parking garage \" +\n        \"• valet services including dry cleaning\",\n        \"on-site lifestyle concierge by luxury attaché\" + mark + \n        \"24/7 doorman\" + mark + \n        \"state of the art cardiovascular and weight training equipment\" + mark + \n        \"24-hour valet parking garage\" + mark + \n        \"valet services including dry cleaning\")\n    data['features']=data['features'].str.replace(\n        '{     0 = \"laundry in unit\";     ' + \n        '1 = \"cats allowed\";     '+\n        '10 = hardwood;     '+\n        '11 = \"high ceilings\";     '+\n        '12 = renovated;     '+\n        '13 = \"marble bath\";     '+\n        '14 = \"granite kitchen\";     '+\n        '15 = light;     '+\n        '16 = \"no fee\";     '+\n        '17 = \"walk-in closet\";     '+\n        '2 = \"dogs allowed\";     '+\n        '3 = elevator;     '+\n        '4 = exclusive;     '+\n        '6 = laundry;     '+\n        '7 = subway;     '+\n        '8 = dishwasher;     '+\n        '9 = washer; }',\n        \"laundry in unit\" + mark + \"cats allowed\" + mark + \"hardwood\" + \n        \"high ceilings\" + mark + \"renovated\" + mark + \"marble bath\" + \n        \"granite kitchen\" + mark + \"light\" + mark + \"no fee\" +\n        \"walk-in closet\" + mark + \"dogs allowed\" + mark + \"elevator\" +\n        \"exclusive\" + mark + \"laundry\" + mark + \"subway\"+\n        \"dishwasher\" + mark + \"washer\")\n    
data['features']=data['features'].str.replace(\"windowed air-conditioned and monitored laundry room\",\n        \"windowed air-conditioned\" + mark + \"monitored laundry room\")\n    data['features']=data['features'].str.replace(\"wall of windows. huge bedrooms\",\n        \"wall of windows\" + mark + \"huge bedrooms\")\n    data['features']=data['features'].str.replace(\"to relax and recharge. this spacious 3 bedroom/2 bath residence also features oak hardwood flooring\",\n        \"spacious\" + mark + \"3 bedroom\" + mark + \"2 bath\" + mark + \"residence\" + mark + \"oak hardwood flooring\")\n    data['features']=data['features'].str.replace(\"stunning 3 bedroom apartment with a terrace! east harlem! the best deal out now! get it now!!!!\",\n        \"stunning\" + mark + \"3 bedroom\" + mark + \"a terrace\" + mark + \"east harlem\" + mark + \"the best deal out now! get it now!!!!\")\n    data['features']=data['features'].str.replace(\"ss appliances - d/w -  m/w - recessed lighting - hardwood floors - high ceilings - marble bath\",\n        \"ss appliances - d/w -  m/w - \" + mark + \"recessed lighting\" + mark + \"hardwood floors\" + mark + \"high ceilings\" + mark + \"marble bath\")\n    data['features']=data['features'].str.replace(\"spacious living room for any kind of entertainment. 
prime location in theater distric\",\n        \"spacious living room for any kind of entertainment.\" + mark + \"prime location in theater distric\")\n    data['features']=data['features'].str.replace(\"spacious living room + home office\",\n        \"spacious living room\" + mark + \"home office\")\n    data['features']=data['features'].str.replace(\"spacious and sunny 1st floor apartment \"+\n        \"overlooking the garden  \" + \n        \"*great williamsburg location*  \"+\n        \"steps from shopping and cafes \"+\n        \"and 5 minute walk to graham avenue l train (3rd stop from manhattan)  \"+\n        \"*shared back yard * \"+\n        \"large box style rooms * \"+\n        \"huge living room with high ceilings * \"+\n        \"nice bathroom with granite floor & ceramic tile * \"+\n        \"beautiful kitchen with granite counter tops  lots of closet spacehardwood floors *\"+\n        \" heat included in the rent  \"+\n        \"clean quiet building   \"+\n        \"cat ok  \"+\n        \"great location close to shopping\",\n        \"spacious\"+ mark +\"sunny 1st floor\"+ mark+ \n        \"overlooking the garden\" + mark+ \n        \"great williamsburg location\"+ mark+ \n        \"steps from shopping and cafes\"+ mark+ \n        \"5 minute walk to graham avenue\"+ mark +\"train (3rd stop from manhattan)\"+ mark+ \n        \"shared back yard\"+mark+ \n        \"large box style rooms\"+mark+ \n        \"huge living room \" + mark + \"high ceilings\"+ mark+ \n        \"nice bathroom\" + mark +\"granite floor\" + mark +\"ceramic tile * \"+mark+ \n        \"beautiful kitchen\" + mark +\"granite counter tops\" + mark +\"closet \" + mark +\"spacehardwood floors\"+mark+ \n        \"heat included in the rent\"+mark+ \n        \"clean quiet building\"+mark+ \n        \"cat ok\"+mark+ \n        \"close to shopping\")\n    data['features']=data['features'].str.replace(\"residents-only \" + \n        \"fitness center \" + \n        \"and aerobic room \" + \n     
   \"professionally outfitted with a full complement of strength and cardio-training equipment\",\n        \"residents-only\"+ mark +\"itness center\"+ mark+ \n        \"and aerobic room\" + mark+ \n        \"cardio-training equipment\")\n    data['features']=data['features'].str.replace(\"owner occupied - \" + \n        \"3 family townhouse - \" + \n        \"no realtor fees -\"+\n        \" this beautiful apt is offered below market rate\",\n        \"owner occupied\"+ mark +\"3 family townhouse\"+ mark+ \n        \"no realtor fees\" + mark+ \n        \"this beautiful apt is offered below market rate\")\n    data['features']=data['features'].str.replace(\"newly renovated \"+\n        \"w/ oak wood floors   \"+\n        \"mid century modern style interior   \"+\n        \"large closets in every bedroom \"+\n        \"extra storage space in hall. \"+\n        \"large living room\",\n        \"newly renovated\"+ mark +\"oak wood floors\"+ mark+ \n        \"mid century modern style interior\" + mark+ \n        \"large closets in every bedroom\" + mark+ \n        \"extra storage space in hall\"+ mark +\"large living room\")\n    data['features']=data['features'].str.replace(\"live-in super package room \"+\n        \"smoke-free \"+\n        \"storage available \"+\n        \"virtual doorman \"+\n        \"guarantors accepted\",\n\n        \"live-in super package room\"+ mark +\"smoke-free\"+ mark+ \n        \"storage available\" + mark+ \n        \"virtual doorman\" + mark+ \n        \"guarantors accepted\")\n    data['features']=data['features'].str.replace(\"live-in super package room \"+\n        \"smoke-free \"+\n        \"storage available \"+\n        \"virtual doorman \"+\n        \"guarantors accepted\",\n\n        \"live-in super package room\"+ mark +\"smoke-free\"+ mark+ \n        \"storage available\" + mark+ \n        \"virtual doorman\" + mark+ \n        \"guarantors accepted\")\n\n    # Merging some features\n    
data['features']=data['features'].str.replace(\"washer/dyer combo\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer inside the unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer in-unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer in unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer in building\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer in bldg\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer hookup\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/dryer  stove/oven\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/drier hookups\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/ dryer in unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer/ dryer hookups\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer-dryer in unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer-dryer hookups\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer in unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer dryer in unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer dryer hookup\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer dryer hook up\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer and dryer in unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer and dryer in the unit\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer and dryer\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer / dryer in unit\",\"washer/dyer\")\n    
data['features']=data['features'].str.replace(\"washer / dryer (hookup only)\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer / dryer\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer & dryer.\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"washer\",\"washer/dyer\")\n    data['features']=data['features'].str.replace(\"wash/dryer\",\"washer/dyer\")\n\n\n    data['features']=data['features'].str.replace(\"pets: cats/small dogs\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets welcome\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets upon approval\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets on approval\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets ok.\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets ok\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets are welcome\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets allowed\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets accepted (on approval)\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pets\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pet grooming room\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pet friendly building\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pet friendly ( case by case )\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pet friendly\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pet friendly building\",\"pet-friendly\")\n    data['features']=data['features'].str.replace(\"pet friendly building\",\"pet-friendly\")\n\n    data['features']=data['features'].str.replace(\"garden/patio\",\"garden\")\n    
data['features']=data['features'].str.replace(\"patio\",\"garden\")\n    data['features']=data['features'].str.replace(\"residents_garden\",\"garden\")\n    data['features']=data['features'].str.replace(\"common garden\",\"garden\")\n\n    data['features']=data['features'].str.replace(\"wifi access\",\"wifi\")\n    data['features']=data['features'].str.replace(\"wifi included\",\"wifi\")\n    data['features']=data['features'].str.replace(\"wifi in resident lounge\",\"wifi\")\n    data['features']=data['features'].str.replace(\"wifi + utilities\",\"wifi\")\n    data['features']=data['features'].str.replace(\"wi fi work lounge\",\"wifi\")\n    data['features']=data['features'].str.replace(\"wi-fi access\",\"wifi\")\n\n    data['features']=data['features'].str.replace(\"24/7\",\"24\")\n    data['features']=data['features'].str.replace(\"24-hour\",\"24\")\n    data['features']=data['features'].str.replace(\"24hr\",\"24\")\n    data['features']=data['features'].str.replace(\"concierge\",\"doorman\")\n    data['features']=data['features'].str.replace(\"ft doorman\",\"doorman\")\n    data['features']=data['features'].str.replace(\"24 doorman\",\"doorman\")\n    data['features']=data['features'].str.replace(\"24 hr doorman\",\"doorman\")\n    data['features']=data['features'].str.replace(\"doorman service\",\"doorman\")\n    data['features']=data['features'].str.replace(\"full-time doorman\",\"doorman\")\n\n    data['features']=data['features'].str.replace(\"gym/fitness\",\"fitness\")\n    data['features']=data['features'].str.replace(\"fitness room\",\"fitness\")\n\n    data['features']=data['features'].str.replace(\"washer\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry in bldg\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry in building\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry in building/dryer\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry in 
building_&_dryer\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry room\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry & housekeeping\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry in unit\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry in-unit\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry on every floor\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry on floor\",\"laundry\")\n    data['features']=data['features'].str.replace(\"in-unit laundry/dryer\",\"laundry\")\n    data['features']=data['features'].str.replace(\"on-site laundry\",\"laundry\")\n    data['features']=data['features'].str.replace(\"laundry/dryer\",\"laundry\")\n\n    data['features']=data['features'].str.replace(\"high-speed internet\",\"high_speed_internet\")\n    data['features']=data['features'].str.replace(\"high speed internet available\",\"high_speed_internet\")\n\n    data['features']=data['features'].str.replace(\"parking available\",\"parking\")\n    data['features']=data['features'].str.replace(\"parking space\",\"parking\")\n    data['features']=data['features'].str.replace(\"on-site garage\",\"parking\")\n    data['features']=data['features'].str.replace(\"on-site parking\",\"parking\")\n    data['features']=data['features'].str.replace(\"on-site parking lot\",\"parking\")\n    data['features']=data['features'].str.replace(\"full service garage\",\"parking\")\n    data['features']=data['features'].str.replace(\"common parking/garage\",\"parking\")\n    data['features']=data['features'].str.replace(\"garage\",\"parking\")\n    data['features']=data['features'].str.replace(\"assigned-parking-space\",\"private_parking\")\n\n    data['features']=data['features'].str.replace(\"storage available\",\"storage\")\n    data['features']=data['features'].str.replace(\"storage facilities available\",\"storage\")\n    
data['features']=data['features'].str.replace(\"storage space\",\"storage\")\n    data['features']=data['features'].str.replace(\"storage room\",\"storage\")\n    data['features']=data['features'].str.replace(\"common storage\",\"storage\")\n\n    data['features']=data['features'].str.replace(\"central a/c\",\"central_air\")\n    data['features']=data['features'].str.replace(\"central ac\",\"central_air\")\n    data['features']=data['features'].str.replace(\"air conditioning\",\"central_air\")\n\n    data['features']=data['features'].str.replace(\"close to  subway\",\"subway\")\n\n    data['features']=data['features'].str.replace(\"roofdeck\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof-deck\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"rooftop terrace\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"rooftop deck\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof access\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"common roof deck\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof decks\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof grilling area\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof garden and lounge\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof deck with stunning view\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof deck with real grass\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof deck with grills\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof deck w/ grills\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof deck / sun deck\",\"roof-deck\")\n    data['features']=data['features'].str.replace(\"roof deck\",\"roof-deck\")\n\n    data['features']=data['features'].str.replace(\"swimming pool\",\"pool\")\n    
data['features']=data['features'].str.replace(\"indoor pool\",\"pool\")\n\n    data['features']=data['features'].str.replace(\"deco fireplace\",\"fireplaces\")\n    data['features']=data['features'].str.replace(\"decorative fireplace\",\"fireplaces\")\n\n    data['features']=data['features'].str.replace(\"yoga/pilates studio\",\"yoga\")\n    data['features']=data['features'].str.replace(\"yoga studio\",\"yoga\")\n    data['features']=data['features'].str.replace(\"yoga room\",\"yoga\")\n    data['features']=data['features'].str.replace(\"yoga classes\",\"yoga\")\n    data['features']=data['features'].str.replace(\"yoga and spin studios\",\"yoga\")\n    data['features']=data['features'].str.replace(\"yoga an pilates class\",\"yoga\")\n    data['features']=data['features'].str.replace(\"yoga / dance studio\",\"yoga\")\n\n\n    # data[\"features\"] = data[\"features\"].apply(lambda x: afterRemoveStr(x, ''))\n    # data[\"features\"] = data[\"features\"].apply(lambda x: afterRemoveFirstSpace(x))\n    data[\"features\"] = data[\"features\"].apply(lambda x: x.split(mark))\n    data[\"features\"] = data[\"features\"].apply(lambda x: \" \".join([\"_\".join(i.split(\" \")) for i in x]))\n    tfidf = CountVectorizer(stop_words=\"english\", max_features=200)\n    tr_sparse_feats = tfidf.fit_transform(data.iloc[train_idx, :][\"features\"])\n    te_sparse_feats = tfidf.transform(data.iloc[test_idx, :][\"features\"])\n    feats_names = [\"features_\" + x for x in tfidf.get_feature_names()]\n    return data, tr_sparse_feats, te_sparse_feats, feats_names\n\n\ndef locationProcess(data, train_idx, test_idx):\n    # Clustering\n\n    # train_x = data.iloc[train_idx,:][['new_latitude', 'new_longitude']]\n    # stest_x = data.iloc[test_idx,:][['new_latitude', 'new_longitude']]\n    train_x = data.iloc[train_idx, :][['latitude', 'longitude']]\n    test_x = data.iloc[test_idx, :][['latitude', 'longitude']]\n    kmeans_cluster = KMeans(n_clusters=20)\n    res = 
kmeans_cluster.fit(train_x)\n    res = kmeans_cluster.predict(pd.concat([train_x, test_x]))\n    d = dict(zip(data['listing_id'], res))\n    data['cenroid'] = data['listing_id'].apply(lambda x: d[x])\n    # Manhattan distance\n    center = [data.iloc[train_idx, :]['latitude'].mean(), data.iloc[train_idx, :]['longitude'].mean()]\n    data['distance'] = abs(data['latitude'] - center[0]) + abs(data['longitude'] - center[1])\n    # data['distance_2'] = np.sqrt((data['latitude'] - center[0]) ** 2 + (data['longitude'] - center[1]) ** 2)\n\n    return data\n\n\ndef managerIdProcess(data, y, train_idx, test_idx):\n    manager_lgt_dict = dict(data.groupby('manager_id')['longitude'].mean())\n    manager_ltt_dict =  dict(data.groupby('manager_id')['latitude'].mean())\n\n    # Group manager_id with location info\n    data[\"mean_man_longitude\"] = data.apply(lambda row: manager_lgt_dict[row[\"manager_id\"]], axis=1)\n    data[\"mean_man_latitude\"] = data.apply(lambda row: manager_ltt_dict[row[\"manager_id\"]], axis=1)\n\n    # Group manager_id with time info\n    data = group_with_time_features(data, \"manager_id\")\n    data = group_with_img_time_features(data, \"manager_id\")\n    manager_stamp_dict = dict(data.groupby('manager_id')['time_stamp'].mean())\n    data[\"mean_man_timestamp\"] = data.apply(lambda row: manager_stamp_dict[row[\"manager_id\"]], axis=1)\n    # manager_stamp_dict = dict(data.groupby('manager_id')['created_stamp'].mean())\n    # data[\"mean_man_createdstamp\"] = data.apply(lambda row: manager_stamp_dict[row[\"manager_id\"]], axis=1)  \n    return data\n\n\ndef photoProcess(data):\n    data[\"photo_num\"] = data[\"photos\"].apply(len)\n    return data\n\n\ndef priceProcess(data):\n    #data[\"out_price\"] = data[\"price\"].apply(lambda x: 1 if x < 700 or x > 15000 else 0)\n    # Clean the outlier\n    ulimit = 15000#np.percentile(data.price.values, 99)\n    data.loc[data[\"price\"] > ulimit, \"price\"] = ulimit\n    dlimit = 350\n    
data.loc[data[\"price\"] < dlimit, \"price\"] = dlimit\n    data[\"price_per_room\"] = data[\"price\"] / (data[\"bedrooms\"] + data[\"bathrooms\"] + 1.0)\n    data[\"price_per_bed\"] = data[\"price\"] / (data[\"bedrooms\"] + 1.0)\n    #*\n    # data.loc[~np.isfinite(data[\"price_per_room\"]), \"price_per_room\"] = 0\n    # data.loc[~np.isfinite(data[\"price_per_bed\"]), \"price_per_bed\"] = 0\n    data[\"price_latitude\"] = data[\"price\"] / (data[\"latitude\"] + 1.0)\n    data[\"price_longitude\"] = data[\"price\"] / (data[\"longitude\"] + 1.0)\n\n    # Grouping price with size or build\n    median_list = ['bedrooms', 'bathrooms', 'building_id']\n    # median_list = ['month', 'day', 'hour', 'weekday', 'quarter', 'week', 'passed', 'latest']\n    for col in median_list:\n        median_price = data[[col, 'price']].groupby(col)['price'].median()\n        median_price = median_price[data[col]].values.astype(float)\n        data['median_' + col] = median_price\n        data['ratio_' + col] = data['price'] / median_price\n        data['median_' + col] = data['median_' + col].apply(lambda x: np.log(x))\n    # data[\"price\"] = data[\"price\"].apply(lambda x: np.log(x))\n    return data\n\n\ndef streetAddrProcess(data):\n    #data[\"new_addr\"] = data[\"street_address\"].apply(lambda x: ' '.join([x.split()[i] for i in range(1, len(x.split()))]))\n    #data[\"new_addr\"] = preprocessing.LabelEncoder().fit_transform(data[\"new_addr\"])\n    # data[\"street_address\"] = data[\"street_address\"].apply(lambda x: x.replace('\\u00a0', '').strip().lower)\n    return data\n\n\ndef listingIdProcess(data):\n    # It's weird。\n    data[\"listing_id\"] = data[\"listing_id\"] - 68119576.0\n    return data\n\n\ndef coreProcess(data, y_train, train_idx, test_idx):\n    data = listingIdProcess(data)\n    data = bedroomProcess(data, train_idx, test_idx)\n    data = bathroomProcess(data, train_idx, test_idx)\n    data[\"room_diff\"] = data[\"bathrooms\"] - data[\"bedrooms\"]\n    
data[\"room_num\"] = data[\"bedrooms\"] + data[\"bathrooms\"]\n    data = createdProcess(data)\n    data = buildingIdProcess(data, y_train, train_idx, test_idx)\n    data, tr_sparsed, te_sparsed, feats_sparsed = descriptionProcess(data, train_idx, test_idx)\n    data = displayAddrProcess(data)\n    data, tr_sparse, te_sparse, feats_sparse = featuresProcess(data, train_idx, test_idx)\n    data = locationProcess(data, train_idx, test_idx)\n    data = managerIdProcess(data, y_train, train_idx, test_idx)\n    data = photoProcess(data)\n    data = priceProcess(data)\n    data = streetAddrProcess(data)\n    \n    categorical = [\"display_address\", \"manager_id\", \"building_id\", \"street_address\"]\n    for f in categorical:\n        if data[f].dtype=='object':\n            cases=defaultdict(int)\n            temp=np.array(data[f]).tolist()\n            for k in temp:\n                cases[k]+=1\n            # print(f, len(cases))\n            data[f] = data[f].apply(lambda x: cases[x])\n            \n    feats_in_use = [col for col in data.columns if col not in FEATURE_NOT_USE]\n\n    data_train = np.array(data.iloc[train_idx, :][feats_in_use])\n    data_test  = np.array(data.iloc[test_idx, :][feats_in_use])\n    # Feature Scaling\n    stda = StandardScaler()  \n    data_test = stda.fit_transform(data_test)          \n    data_train = stda.transform(data_train)\n    #  High cardinality feature\n    high_card_feats = [\"building_id\", \"manager_id\", \"longitude\", \"room_diff\"] # \"building_id\", \"manager_id\", \n    # C0 = [3, 12, 0, 4]\n    C0 = [feats_in_use.index(f) for f in high_card_feats]\n    W_train, W_cv = convert_to_avg(data_train, y_train, data_test, seed=1, cvals=5, roundings=2, columns=C0)\n    #  Add Sparse feature\n    data_train = sparse.hstack([data_train, tr_sparse, tr_sparsed, W_train[:, C0]]).tocsr()\n    data_test = sparse.hstack([data_test, te_sparse, te_sparsed, W_cv[:, C0]]).tocsr()\n    feats_in_use.extend(feats_sparse)\n    
feats_in_use.extend(feats_sparsed)\n    feats_in_use.extend([\"build_high_card\", \"manager_high_card\"])\n    # print(len(feats_in_use))\n    # print(tr_sparse.toarray().shape, tr_sparsed.toarray().shape, len(feats_in_use), data_train.shape)\n    return data_train, data_test, feats_in_use\n\n\n# Copy from KazAnova's starter code\ndef convert_dataset_to_avg(xc,yc,xt, rounding=2,cols=None):\n    xc = xc.tolist()\n    xt = xt.tolist()\n    yc = yc.tolist()\n    if cols == None:\n        cols =[k for k in range(0,len(xc[0]))]\n    woe=[ [0.0 for k in range(0,len(cols))] for g in range(0,len(xt))]\n    good=[]\n    bads=[]\n    for col in cols:\n        dictsgoouds=defaultdict(int)        \n        dictsbads=defaultdict(int)\n        good.append(dictsgoouds)\n        bads.append(dictsbads)        \n    total_count=0.0\n    total_sum =0.0\n\n    for a in range (0,len(xc)):\n        target=yc[a]\n        total_sum+=target\n        total_count+=1.0\n        for j in range(0,len(cols)):\n            col=cols[j]\n            good[j][round(xc[a][col],rounding)]+=target\n            bads[j][round(xc[a][col],rounding)]+=1.0  \n    #print(total_goods,total_bads)            \n    \n    for a in range (0,len(xt)):    \n        for j in range(0,len(cols)):\n            col=cols[j]\n            if round(xt[a][col],rounding) in good[j]:\n                 woe[a][j]=float(good[j][round(xt[a][col],rounding)])/float(bads[j][round(xt[a][col],rounding)])  \n            else :\n                 woe[a][j]=round(total_sum/total_count)\n    return woe            \n\n\ndef convert_to_avg(X,y, Xt, seed=1, cvals=5, roundings=2, columns=None):\n    \n    if columns==None:\n        columns=[k for k in range(0,(X.shape[1]))]    \n    #print(\"it is not!!\")        \n    X=X.tolist()\n    Xt=Xt.tolist() \n    woetrain=[ [0.0 for k in range(0,len(X[0]))] for g in range(0,len(X))]\n    woetest=[ [0.0 for k in range(0,len(X[0]))] for g in range(0,len(Xt))]    \n    \n    kfolder=StratifiedKFold(y, 
n_folds=cvals,shuffle=True, random_state=seed)\n    for train_index, test_index in kfolder:\n        # creaning and validation sets\n        X_train, X_cv = np.array(X)[train_index], np.array(X)[test_index]\n        y_train = np.array(y)[train_index]\n\n        woecv=convert_dataset_to_avg(X_train,y_train,X_cv, rounding=roundings,cols=columns)\n        X_cv=X_cv.tolist()\n        no=0\n        for real_index in test_index:\n            for j in range(0,len(X_cv[0])):\n                woetrain[real_index][j]=X_cv[no][j]\n            no+=1\n        no=0\n        for real_index in test_index:\n            for j in range(0,len(columns)):\n                col=columns[j]\n                woetrain[real_index][col]=woecv[no][j]\n            no+=1      \n    woefinal=convert_dataset_to_avg(np.array(X),np.array(y),np.array(Xt), rounding=roundings,cols=columns) \n\n    for real_index in range(0,len(Xt)):\n        for j in range(0,len(Xt[0])):           \n            woetest[real_index][j]=Xt[real_index][j]\n            \n    for real_index in range(0,len(Xt)):\n        for j in range(0,len(columns)):\n            col=columns[j]\n            woetest[real_index][col]=woefinal[real_index][j]\n            \n    return np.array(woetrain), np.array(woetest)\n\n\n# Grouping (Very important)\ndef group_with_time_features(data, g_feat):\n    mean_month_dict = dict(data.groupby(g_feat)['month'].mean())\n    data[\"mean_\" + g_feat + \"_month\"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1)\n    mean_day_dict = dict(data.groupby(g_feat)['day'].mean())\n    data[\"mean_\" + g_feat + \"_day\"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1)\n    mean_hour_dict = dict(data.groupby(g_feat)['hour'].mean())\n    data[\"mean_\" + g_feat + \"_hour\"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1)\n    mean_weekday_dict = dict(data.groupby(g_feat)['weekday'].mean())\n    data[\"mean_\" + g_feat + \"_weekday\"] = data.apply(lambda row: 
mean_weekday_dict[row[g_feat]], axis=1)\n    mean_quarter_dict = dict(data.groupby(g_feat)['quarter'].mean())\n    data[\"mean_\" + g_feat + \"_quater\"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1)\n    mean_week_dict = dict(data.groupby(g_feat)['week'].mean())\n    data[\"mean_\" + g_feat + \"_week\"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1)\n    mean_passed_dict = dict(data.groupby(g_feat)['passed'].mean())\n    data[\"mean_\" + g_feat + \"_passed\"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1)\n    mean_latest_dict = dict(data.groupby(g_feat)['latest'].mean())\n    data[\"mean_\" + g_feat + \"_latest\"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1)\n\n    return data\n\n\ndef group_with_img_time_features(data, g_feat):\n    mean_month_dict = dict(data.groupby(g_feat)['img_month'].mean())\n    data[\"mean_\" + g_feat + \"_img_month\"] = data.apply(lambda row: mean_month_dict[row[g_feat]], axis=1)\n    mean_day_dict = dict(data.groupby(g_feat)['img_day'].mean())\n    data[\"mean_\" + g_feat + \"_img_day\"] = data.apply(lambda row: mean_day_dict[row[g_feat]], axis=1)\n    mean_hour_dict = dict(data.groupby(g_feat)['img_hour'].mean())\n    data[\"mean_\" + g_feat + \"_img_hour\"] = data.apply(lambda row: mean_hour_dict[row[g_feat]], axis=1)\n    # mean_weekday_dict = dict(data.groupby(g_feat)['img_weekday'].mean())\n    # data[\"mean_\" + g_feat + \"_img_weekday\"] = data.apply(lambda row: mean_weekday_dict[row[g_feat]], axis=1)\n    # mean_quarter_dict = dict(data.groupby(g_feat)['img_quarter'].mean())\n    # data[\"mean_\" + g_feat + \"_img_quater\"] = data.apply(lambda row: mean_quarter_dict[row[g_feat]], axis=1)\n    # mean_week_dict = dict(data.groupby(g_feat)['img_week'].mean())\n    # data[\"mean_\" + g_feat + \"_img_week\"] = data.apply(lambda row: mean_week_dict[row[g_feat]], axis=1)\n    mean_passed_dict = dict(data.groupby(g_feat)['img_passed'].mean())\n    data[\"mean_\" + 
g_feat + \"_img_passed\"] = data.apply(lambda row: mean_passed_dict[row[g_feat]], axis=1)\n    mean_latest_dict = dict(data.groupby(g_feat)['img_latest'].mean())\n    data[\"mean_\" + g_feat + \"_img_latest\"] = data.apply(lambda row: mean_latest_dict[row[g_feat]], axis=1)\n    return data\n\n\n\n\n\n\n"
  },
  {
    "path": "stack/params.txt",
    "content": "LogisticRegression Type:Liblinear C:6.1 threads:1 usescale:True maxim_Iteration:200 seed:1 verbose:false\nGradientBoostingForestClassifier estimators:300 shrinkage:0.18 threads:1 offset:0.00001 max_depth:3 max_features:0.65 min_leaf:2.0 min_split:7.0 Objective:RMSE row_subsample:1.0 seed:1 verbose:false\nLibFmClassifier maxim_Iteration:70 C:0.0041 C2:0.00120 lfeatures:1 seed:1 usescale:True init_values:0.046 learn_rate:0.05 smooth:0.1 threads:1 verbose:false\nsoftmaxnnclassifier usescale:True seed:1 Type:SGD maxim_Iteration:50 C:0.0000008 shuffle:false tolerance:0.01 learn_rate:0.0065 smooth:0.1 h1:40 h2:35 connection_nonlinearity:Relu init_values:0.020 verbose:false\nRandomForestClassifier bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.95 seed:1 verbose:false\nAdaboostRandomForestClassifier bootsrap:false weight_thresold:0.95 estimators:100 threads:1 max_depth:6 max_features:0.5 min_leaf:2.0 min_split:5.0 Objective:ENTROPY row_subsample:0.9 seed:1 verbose:false\nGradientBoostingForestRegressor bootsrap:false estimators:300 shrinkage:0.1 threads:1 offset:0.00001 max_depth:3 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.9 seed:1 verbose:false\nRandomForestRegressor bootsrap:false estimators:100 threads:1 offset:0.00001 max_depth:6 max_features:0.4 min_leaf:2.0 min_split:5.0 Objective:RMSE row_subsample:0.95 seed:1 verbose:false\nLibFmRegressor maxim_Iteration:70 C:0.0001 C2:0.0009 lfeatures:2 seed:1 usescale:True init_values:0.1 learn_rate:0.1 threads:1 verbose:false\n\nRandomForestClassifier bootsrap:false estimators:500 threads:3 offset:0.00001 max_depth:5 max_features:0.3 min_leaf:1.0 min_split:5.0 Objective:ENTROPY row_subsample:0.8 seed:1 verbose:false"
  },
  {
    "path": "stack/parse.py",
    "content": "import re\nimport numpy as np\n\n# Number of cross-validation folds StackNet was run with (start.sh passes folds=5).\nN_FOLDS = 5\n\nwith open(\"result.txt\", \"r\") as f:\n    raw = f.read()\n\n# Raw string so the backslash stays a regex escape, not an invalid Python string escape.\nstr_res = re.findall(pattern=r\"logloss : 0\\.[0-9]+\", string=raw)\nres = [float(x.split(\" : \")[1]) for x in str_res]\n# Entry i is attributed to model i % n_models and averaged over the folds.\nn_models = len(res) // N_FOLDS\nif n_models == 0:\n    # Avoid a modulo-by-zero below when the log is empty or truncated.\n    raise SystemExit(\"result.txt has fewer than %d logloss entries\" % N_FOLDS)\nresults = {i: [] for i in range(n_models)}\nfor i in range(len(res)):\n    results[i % n_models].append(res[i])\nresults = {i: np.mean(results[i]) for i in results}\n# Print (model index, mean logloss) pairs sorted ascending by mean logloss.\nfor item in sorted(results.items(), key=lambda x: x[1]):\n    print(item)"
  },
  {
    "path": "stack/start.sh",
    "content": "java -Xmx3048m -jar StackNet.jar train train_file=train_stacknet.csv test_file=test_stacknet.csv params=params.txt pred_file=sigma_stack_pred.csv test_target=true verbose=true Threads=4 stackdata=false folds=5 seed=1 metric=logloss\n"
  },
  {
    "path": "stack/utils.py",
    "content": "import pandas as pd\nimport numpy as np\n\n\ndef getAvgSub(subs_in):\n    \"\"\"Return the unweighted mean of several submission DataFrames.\n\n    Every frame is sorted by listing_id and re-indexed, then the high, medium\n    and low columns are averaged row by row. Remaining columns come from the\n    first frame.\n    \"\"\"\n    subs = []\n    for sub in subs_in:\n        sub = sub.sort_values(by=[\"listing_id\"]).reset_index()\n        subs.append(sub)\n    n = len(subs)\n    new_sub = subs[0].copy()\n    for i in range(1, n):\n        sub = subs[i]\n        new_sub[\"high\"] = new_sub[\"high\"] + sub[\"high\"]\n        new_sub[\"medium\"] = new_sub[\"medium\"] + sub[\"medium\"]\n        new_sub[\"low\"] = new_sub[\"low\"] + sub[\"low\"]\n    new_sub[\"high\"] = new_sub[\"high\"] / n\n    new_sub[\"medium\"] = new_sub[\"medium\"] / n\n    new_sub[\"low\"] = new_sub[\"low\"] / n\n    del new_sub[\"index\"]\n    return new_sub\n\n\ndef getWeightedAvgSub(subs_in, weights):\n    \"\"\"Return the weighted mean of several submission DataFrames.\n\n    weights[i] is applied to subs_in[i]; the weights must sum to 1.\n    \"\"\"\n    # Compare with a tolerance: exact equality rejects valid weights whose\n    # floating-point sum is off by rounding.\n    assert np.isclose(np.sum(weights), 1.0), \"Sum of weights need to be 1\"\n    subs = []\n    for sub in subs_in:\n        sub = sub.sort_values(by=[\"listing_id\"]).reset_index()\n        subs.append(sub)\n    n = len(subs)\n    new_sub = subs[0].copy()\n    new_sub[\"high\"] = new_sub[\"high\"] * weights[0]\n    new_sub[\"medium\"] = new_sub[\"medium\"] * weights[0]\n    new_sub[\"low\"] = new_sub[\"low\"] * weights[0]\n    for i in range(1, n):\n        sub = subs[i]\n        new_sub[\"high\"] = new_sub[\"high\"] + sub[\"high\"] * weights[i]\n        new_sub[\"medium\"] = new_sub[\"medium\"] + sub[\"medium\"] * weights[i]\n        new_sub[\"low\"] = new_sub[\"low\"] + sub[\"low\"] * weights[i]\n    del new_sub[\"index\"]\n    return new_sub\n\n\ndef generateStackSub(test_file_name, sub_file_name):\n    \"\"\"Write new_sub.csv from a StackNet prediction file.\n\n    Both inputs are read as comma-separated numeric arrays. The prediction\n    columns are labelled high, medium, low and listing_id is taken from the\n    first column of the test file.\n    \"\"\"\n    test_array = np.loadtxt(test_file_name, delimiter=\",\")\n    test = pd.DataFrame(test_array)\n    sub_array = np.loadtxt(sub_file_name, delimiter=\",\")\n    sub = pd.DataFrame(sub_array)\n    sub.columns = [\"high\", \"medium\", \"low\"]\n    sub[\"listing_id\"] = test.iloc[:, 0].apply(lambda x: int(x))\n    sub.to_csv(\"new_sub.csv\", index=False)\n\n\ndef correct(df):\n    \"\"\"Return a copy of df with low/medium/high rescaled towards tau.\n\n    Each column is multiplied by tau[k] / (column mean), then every row is\n    renormalised to sum to 1. The ratios are printed before and after.\n    \"\"\"\n    interest_levels = ['low', 'medium', 'high']\n\n    # Target column means (presumably the training-set class priors; confirm).\n    tau = {\n        'low': 0.69195995,\n        'medium': 0.23108864,\n        'high': 0.07695141,\n    }\n\n    y = df[interest_levels].mean()\n    a = [tau[k] / y[k] for k in interest_levels]\n    print(a)\n\n    def f(p):\n        # a is ordered like interest_levels, so scale by position. Building a\n        # new Series avoids integer keys on a label-indexed row, which pandas\n        # has deprecated as positional access.\n        scaled = p * np.asarray(a)\n        return scaled / scaled.sum()\n\n    df_correct = df.copy()\n    df_correct[interest_levels] = df_correct[interest_levels].apply(f, axis=1)\n\n    y = df_correct[interest_levels].mean()\n    a = [tau[k] / y[k] for k in interest_levels]\n    print(a)\n\n    return df_correct"
  }
]