[
  {
    "path": "LICENSE",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2015 \n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n"
  },
  {
    "path": "README.md",
    "content": "# kaggle-avito\nWinning solution to the Avito CTR competition\n\nSome high level description of the solution can be found in the .pdf file.\n\nHow to run:\n* change the folder at the top of _fast_10pct_run.R and _full_100pct_run.R to point to where the data files are stored\n* (recommended) run _fast_10pct_run.R and verify that it produces expected results -- this will take a few hours\n* run _full_100pct_run.R to produce a submission file.\n\nSummary of other files:\navito_data1.R -- Load datasets\navito_phone.R -- feature extraction from phone dataset\navito_search.R -- feature extraction from search dataset\navito_visit.R -- feature extration from visit dataset\navito_cat_cat.R -- more features + data merge\navito_data_merge.R -- rest of the features + data merge\navito_train_xgb.R -- fit xgboost models\navito_utils.R -- some utility functions\n\nHardware requirement:\nTo run the full data solution (_full_100pct_run.R) I recomment a machine with 256GB of ram + (at least) 200GB of swap space. This is due to the combination of my own inefficient code and inefficiency in R's memory handling. \n"
  },
  {
    "path": "_fast_10pct_run.R",
    "content": "# run entire script with 10% data -- highly recommended in case there are bugs\n\nsetwd('/home/fast/2015_avito') # modify this line to point the right folder\n\nrequire(inline)\nrequire(sqldf)\nrequire(data.table)\nrequire(xgboost)\nrequire(tau)\n\nsource('kaggle-avito/avito_utils.R')\n\nfull_data <- F\nsource('kaggle-avito/avito_data1.R')\n\nsource('kaggle-avito/avito_phone.R')\n\nsource('kaggle-avito/avito_search.R')\n\nsource('kaggle-avito/avito_visit.R')\n\nsource('kaggle-avito/avito_cat_cnt.R')\n\nsource('kaggle-avito/avito_data_merge.R')\n\nn_repeat <- 8\nfeature_list <- c(feature_list1, feature_list2)\nsource('kaggle-avito/avito_train_xgb.R')\npredv_list2 <- predv / ctr\n\nfeature_list <- feature_list1\nsource('kaggle-avito/avito_train_xgb.R')\npredv_list1 <- predv / ctr\n\npredv_avg12 <- predv_list1 * .5 + predv_list2 * .5\nprint(paste(\"logloss of avg of two xgbs (should be ~.04176): \", logloss(yv, predv_avg12)))\n\n"
  },
  {
    "path": "_full_100pct_run.R",
    "content": "# run entire script with 100% data \n\nsetwd('/home/fast/2015_avito') # modify this line to point the right folder\n\nrequire(inline)\nrequire(sqldf)\nrequire(data.table)\nrequire(xgboost)\nrequire(tau)\n\nsource('kaggle-avito/avito_utils.R')\n\nfull_data <- T\nsource('kaggle-avito/avito_data1.R')\n\nsource('kaggle-avito/avito_phone.R')\n\nsource('kaggle-avito/avito_search.R')\n\nsource('kaggle-avito/avito_visit.R')\n\nsource('kaggle-avito/avito_cat_cnt.R')\n\nsource('kaggle-avito/avito_data_merge.R')\n\nn_repeat <- 8\n\n#one model with shorter feature list and all the data points\nrseq_limit <- 1e6\nxt <- 0\ngc()\nrseed_offset <- 200\nfeature_list <- feature_list1\nsource('kaggle-avito/avito_train_xgb.R')\npredv_list1 <- predv / ctr\npredh_list1 <- predh / ctr\n\n#another model with longer feature_list but only last 200 impressions\nrseq_limit <- 200\nxt <- 0\ngc()\nrseed_offset <- 300\nfeature_list <- c(feature_list1, feature_list2)\nsource('kaggle-avito/avito_train_xgb.R')\npredv_list2_r200 <- predv / ctr\npredh_list2_r200 <- predh / ctr\n\n#test on 2% validation data\npredv_avg2 <- (predv_list1 + predv_list2_r200) / 2\nprint(paste(\"logloss of avg of two xgbs (should be ~.04252): \", logloss(yv, predv_avg2)))\n\n#create final submission\npredh_avg2 <- (predh_list1 + predh_list2_r200) / 2\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh_avg2,6))\nmy_write_csv(predh.dt, file='avito_sub_finalx8_final', row.names=F) #private LB ~.04029\n"
  },
  {
    "path": "avito_cat_cnt.R",
    "content": "\nleft_merge_inplace(s01_ths, s01_ads, by='AdID', verbose=T)\nleft_merge_inplace(s01_ths, cat, by='AdCategoryID', verbose=T)\nleft_merge_inplace(s01_ths, s01_search, by='SearchID', verbose=T)\n\nif(conserve_ram) {\n  rm(s01_search)\n  gc()\n}\n\ns01_ths1 <- s01_ths[, c('UserID', 'AdID', 'search_time3', 'SearchQuery', 'IsClick'), with=F]\ns01_ths1[, one:=1]\ns01_ths1[, org_idx:=c(1:.N)]\ns01_ths1[, y1:=pmax(IsClick, 0)]\n\ns01_ths1 <- s01_ths1[order(UserID, AdID, search_time3), ]\ntmp <- s01_ths1[, list(cumsum(one), cumsum(y1)), by=list(UserID, AdID)]\ntmp[, org_idx:=s01_ths1$org_idx]\nsystem.time(tmp <- tmp[order(org_idx), ])\ns01_ths[, user_ad_seq := tmp$V1]\ns01_ths[, user_ad_prior_clicks := tmp$V2 - pmax(IsClick, 0)]\n\ns01_ths1 <- s01_ths1[order(UserID, SearchQuery, search_time3), ]\ntmp <- s01_ths1[, cumsum(one), by=list(UserID, SearchQuery)]\ntmp[, org_idx:=s01_ths1$org_idx]\ntmp <- tmp[order(org_idx), ]\ns01_ths[, user_searchquery_seq := tmp$V1]\ns01_ths[SearchQuery=='', user_searchquery_seq := -user_searchquery_seq]\n\nif(conserve_ram) {\n  rm(s01_ths1); gc()\n}\n\nif(full_data) {\n  s01_ths1x <- s01_ths[ObjectType==3 & search_time_rseq<= 1000, ]\n} else {\n  s01_ths1x <- s01_ths[ObjectType==3 & search_time_rseq<= 2000 & search_time_rseq >= 2, ]\n}\n\nif(conserve_ram) {\n  rm(s01_ths); gc()\n}\n\n\nad_cnt <- s01_ths1x[, .N, by=list(AdID)]\nsetnames(ad_cnt, 'N', 'ad_cnt')\n\ns01_ths1x[, is_test := as.integer(IsClick <0)]\n\nsearch_sum1 <- s01_ths1x[, list(.N, sum(IsClick)), by=list(SearchID, UserID, is_test, search_time3)]\nsetnames(search_sum1, c('N', 'V2'), c('cnt', 'sumy'))\nsearch_sum1 <- search_sum1[order(UserID, is_test, search_time3), ]\ntmp <- search_sum1[, list(cumsum(cnt), cumsum(sumy)), by='UserID']\nsearch_sum1[, user_csum_cads_cnt := tmp$V1 - cnt]\nsearch_sum1[, user_csum_cads_y := tmp$V2 - sumy]\n\n\ntmp <- search_sum1[, list(cumsum(cnt), cumsum(sumy)), by='UserID']\nsearch_sum1[, user_csum_cads_cnt := tmp$V1 - cnt]\nsearch_sum1[, user_csum_cads_y := tmp$V2 - sumy]\n\nad_info <- s01_ths1x[, c('one', 'IsClick', 'AdID', 'search_time3'), with=F]\nad_info[, org_idx := c(1:dim(ad_info)[1])]\nad_info <- ad_info[order(AdID, search_time3), ]\ntmp <- ad_info[, list(cumsum(one), cumsum(IsClick)), by='AdID']\ntmp[, org_idx := ad_info$org_idx]\ntmp <- tmp[order(org_idx), ]\ns01_ths1x[, ad_csum_cads_cnt := tmp$V1 - one]\ns01_ths1x[, ad_csum_cads_y := tmp$V2 - IsClick]\n\n\nleft_merge_inplace(s01_ths1x, search_sum1[, c('SearchID', 'user_csum_cads_cnt', 'user_csum_cads_y'), with=F], by='SearchID', verbose=T)\nif(conserve_ram) {\n  rm(search_sum1); gc()\n}\nleft_merge_inplace(s01_ths1x, ad_param_v_cnt, by='Params', verbose=T)\nleft_merge_inplace(s01_ths1x, user_v_cnt, by=c('UserID'), verbose=T)\nleft_merge_inplace(s01_ths1x, user_pcat_cnt, by=c('UserID', 'ParentCategoryID'), verbose=T)\nleft_merge_inplace(s01_ths1x, user_visit_cat_entropy, by=c('UserID'), verbose=T)\nleft_merge_inplace(s01_ths1x, search_ad_cnt, by='SearchID', verbose=T)\nleft_merge_inplace(s01_ths1x, ad_cnt, by='AdID', verbose=T)\n#setnames(user_cat_cnt, 'CategoryID', 'AdCategoryID')\nleft_merge_inplace(s01_ths1x, user_cat_cnt, by=c('UserID', 'AdCategoryID'), verbose=T)\n\n\nleft_merge_inplace(s01_ths1x, user_search_loc_entropy, by='UserID', verbose=T)\nleft_merge_inplace(s01_ths1x, user_search_dow_entropy, by='UserID', verbose=T)\nleft_merge_inplace(s01_ths1x, user_search_hour_entropy, by='UserID', verbose=T)\nleft_merge_inplace(s01_ths1x, user_visit_loc_entropy, by='UserID', verbose=T)\nleft_merge_inplace(s01_ths1x, user_visit_dow_entropy, by='UserID', verbose=T)\nleft_merge_inplace(s01_ths1x, user_visit_hour_entropy, by='UserID', verbose=T)\n\n\nleft_merge_inplace(s01_ths1x, user_sparam_entropy, by='UserID', verbose=T)\nleft_merge_inplace(s01_ths1x, user_visit_param_entropy, by='UserID', verbose=T)\n\nleft_merge_inplace(s01_ths1x, s01_ths1x[, c('ID', 'ad_csum_cads_cnt', 'ad_csum_cads_y'), with=F], by='ID', verbose=T)\nleft_merge_inplace(s01_ths1x, s01_user, by='UserID', verbose=T)\n\nleft_merge_inplace(s01_ths1x, user_phone_cnt, by='UserID', verbose=T)\ns01_ths1x[is.na(user_phone_cnt), user_phone_cnt := 0]\n\n\ns01_ths2 <- s01_ths1x\n\ns01_ths2[is.na(user_v_cnt), user_v_cnt:=0]\ns01_ths2[is.na(user_cat_cnt), user_cat_cnt:=0]\ns01_ths2[is.na(user_pcat_cnt), user_pcat_cnt:=0]\ns01_ths2[is.na(user_visit_cat_entropy), user_visit_cat_entropy:=-1]\n\ns01_ths2[, search_cnt := search_time_seq + search_time_rseq - 1]\n\ns01_ths2[, cat_price_diff := Price - user_v_cat_avg_price]\ns01_ths2[, ad_params_nchar := nchar(Params)]\ns01_ths2[, params_nchar_diff := ad_params_nchar - param_nchar]\ns01_ths2[, ad_position2 := search_ad3_cnt * 7 + Position]\ns01_ths2[, cat_equal := 0]\ns01_ths2[CategoryID==AdCategoryID, cat_equal := 1]\ns01_ths2[is.na(CategoryID) | is.na(AdCategoryID), cat_equal := -1]\n\ngc()\n"
  },
  {
    "path": "avito_data1.R",
    "content": "#load data \nsub0 <- fread('input/sampleSubmission_HistCTR.csv')\n\nhs0 <- fread('input/testSearchStream.tsv')\nhs0[, IsClick := -1]\nts0 <- fread('input/trainSearchStream.tsv')\nts0[, ID:=-c(1:dim(ts0)[1])]\n\ncat <- fread('input/Category.tsv')\ncat$SubcategoryID <- NULL\n\nloc <- fread('input/Location.tsv')\nuser <- fread('input/UserInfo.tsv')\n\nad0 <- fread('input/AdsInfo.tsv')\nv0 <- fread('input/VisitsStream.tsv')\n\ns0 <- fread('input/SearchInfo.tsv')\n\npr0 <- fread('input/PhoneRequestsStream.tsv')\n\nconserve_ram <- F\nif(full_data) conserve_ram <- T\n\nif(full_data) {\n  s01_user <- user\n  s01_search <- s0\n  s01_visit <- v0\n  s01_phone <- pr0\n  s01_ths <- rbind(ts0, hs0)\n  s01_ads <- ad0\n  rm(user, s0, v0, pr0, ts0, hs0, ad0)\n  gc()\n} else {\n  sample_k <- 10\n  s01_user <- user[UserID %% sample_k == 1, ]\n  s01_search <- s0[UserID %% sample_k == 1, ]\n  s01_visit <- v0[UserID %% sample_k == 1, ]\n  s01_phone <- pr0[UserID %% sample_k == 1, ]\n  system.time(s01_ts <- ts0[SearchID %in% s01_search$SearchID, ])\n  system.time(s01_hs <- hs0[SearchID %in% s01_search$SearchID, ])\n  s01_ths <- rbind(s01_ts, s01_hs)\n  s01_ads <- ad0[AdID %in% unique(c(s01_ts$AdID, s01_hs$AdID)), ]\n  rm(user, s0, v0, pr0, ts0, hs0, ad0)\n  gc()\n}\n\n"
  },
  {
    "path": "avito_data_merge.R",
    "content": "if(full_data) {\n  dt1 <- s01_ths2\n  base_ft <- dt1$search_time_rseq > 1 & !(dt1$UserID %% 50 == 1) & dt1$IsClick >=0\n  print(sum(base_ft))\n} else {\n  dt1 <- s01_ths2[IsClick >=0, ]\n  base_ft <- dt1$search_time_rseq > 2\n}\n\n#reponse encoding with leave-one-out and credibility adjustment\ncalc_exp2(dt1, base_ft, 'IsClick', c('AdID'), 'exp2_ad', 100, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery'), 'exp2_query', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'AdID'), 'exp2_query_ad', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'Title'), 'exp2_query_title', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('Params'), 'exp2_aparam', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('LocationID', 'Params'), 'exp2_loc_aparam', 100, mean_y0=NULL, verbose=T)\n\n\ndt1[, ad_title_nchar := nchar(Title)]\n\nad_sparam_query_cnt <- dt1[, list(ad_sparam_query_cnt=.N), by=.(AdID, SearchParams, SearchQuery)]\n\nuser_ad_sum3 <- dt1[, list(user_ad_cnt=.N), by=.(AdID, UserID)]\n\nad_sparam_entropy <- calc_entropy(dt1, 'AdID', c('SearchParams'), 'adid_sparam') \nad_query_entropy <- calc_entropy(dt1, 'AdID', c('SearchQuery'), 'adid_query') \nleft_merge_inplace(ad_sparam_entropy, ad_query_entropy, 'AdID', verbose=T)\n\nleft_merge_inplace(dt1, ad_sparam_query_cnt, by=c('AdID', 'SearchParams', 'SearchQuery'), verbose=T)\ndt1[, ad_sparam_query_cnt_ratio := ad_sparam_query_cnt * 1.0 / ad_cnt]\nleft_merge_inplace(dt1, user_ad_sum3, by=c('AdID', 'UserID'), verbose=T)\nleft_merge_inplace(dt1, ad_sparam_entropy, by=c('AdID'), verbose=T)\n\n\nuse_exp <- T\nfeature_list1 <- c('Position', 'HistCTR', 'IsUserLoggedOn', 'LocationID', 'search_time3', 'search_time_seq',\n                   'user_cat_cnt', 'user_v_cnt', 'user_pcat_cnt', 'user_visit_cat_entropy', 'search_cnt',\n                   'hour', 'dow', 'CategoryID', 'ParentCategoryID', 'query_nchar', 'param_nchar', 'Price',\n                   'ad_title_nchar', 'user_v_cat_avg_price', 'cat_price_diff', 'ad_params_nchar', 'params_nchar_diff',\n                   'ad_param_v_cnt', 'search_ad_cnt', 'search_ad1_cnt', 'search_ad2_cnt', 'search_ad3_cnt',\n                   'ad_position2', 'time_gap_prev_search', 'user_ad_seq', 'user_searchquery_seq', 'ad_cnt',\n                   'user_csum_cads_cnt', 'user_csum_cads_y', \n                   'cat_equal', \n                   'total_other_ads_visit_cnt',\n                   'user_session_seq', 'user_session_no', 'user_search_no', 'user_same_search_seq',\n                   \"UserAgentID\",       \"UserAgentOSID\",    \"UserDeviceID\",     \"UserAgentFamilyID\",\n                   'user_phone_cnt',\n                   'user_search_loc_entropy', 'user_search_dow_entropy', 'user_search_hour_entropy',\n                   'user_visit_loc_entropy', 'user_visit_dow_entropy', 'user_visit_hour_entropy',\n                   'user_search_sparam_entropy', 'user_visit_param_entropy',\n                   'user_ad_prior_clicks' \n)\n\nfeature_list2 <- c('user_ad_cnt', 'ad_sparam_query_cnt_ratio', \n                   'adid_sparam_entropy', 'adid_query_entropy')\n\nfeature_list_exp <- c('exp2_ad', 'exp2_query', 'exp2_query_ad', 'exp2_query_title', 'exp2_loc_aparam', 'exp2_aparam')\n"
  },
  {
    "path": "avito_phone.R",
    "content": "user_phone_cnt <- s01_phone[, list(user_phone_cnt=.N), by=list(UserID)]\n\n"
  },
  {
    "path": "avito_search.R",
    "content": "#convert time to an integer (seconds from min(time))\ntime2 <- strptime(s01_search$SearchDate, '%Y-%m-%d %H:%M:%S')\n\nmin_time2 <- min(time2)\nmin_time3 <- as.Date(min_time2)\ntime3 <- as.integer(difftime(time2, min_time3, unit='sec'))\nif(conserve_ram) {\n  rm(time2)\n}\ngc()\n\ns01_search[, search_time3 := time3]\ns01_search[, hour := as.integer(round(as.integer(search_time3)  %/% 3600)) %% 24]\ns01_search[, dow := as.integer(round(as.integer(search_time3)  %/% 86400)) %% 7]\ns01_search[, query_nchar := nchar(SearchQuery)]\ns01_search[, param_nchar := nchar(SearchParams)]\n\nuser_search_type_sum2 <- s01_search[, .(user_query_ratio=mean(SearchQuery!='')), by=UserID]\nuser_search_query_sum <- s01_search[, .(user_search_nchar_mean=mean(query_nchar), user_search_param_nchar_mean=mean(param_nchar)), by=UserID]\n\nuser_search_loc_entropy <- calc_entropy(s01_search, 'UserID', 'LocationID', 'user_search_loc') \nuser_search_dow_entropy <- calc_entropy(s01_search, 'UserID', 'dow', 'user_search_dow') \nuser_search_hour_entropy <- calc_entropy(s01_search, 'UserID', 'hour', 'user_search_hour') \nuser_sparam_entropy <- calc_entropy(s01_search, 'UserID', 'SearchParams', 'user_search_sparam') \n\ns01_search[, one := 1]\n\ns01_search <- s01_search[order(UserID, -search_time3), ]\na <- s01_search[, cumsum(one), by=list(UserID)]\ns01_search[, search_time_rseq := a$V1]\n\ns01_search <- s01_search[order(UserID, search_time3), ]\na <- s01_search[, cumsum(one), by=list(UserID)]\ns01_search[, search_time_seq := a$V1]\n\n#s01_search <- s01_search[order(UserID, search_time3), ]\ntmp <- c(NA, head(s01_search$search_time3, -1))\ns01_search[, prev_search_time3:=tmp]\ntmp <- c(NA, head(s01_search$UserID, -1))\ns01_search[, prev_user_id:=tmp]\ns01_search[, prev_search_query:=c(NA, head(SearchQuery, -1))]\ns01_search[, prev_search_param:=c(NA, head(SearchParams, -1))]\n\ns01_search[, time_gap_prev_search := search_time3 - prev_search_time3]\ns01_search[is.na(prev_user_id) | (UserID != prev_user_id), time_gap_prev_search := (-1)]\n\ns01_search[, is_gap := as.numeric(time_gap_prev_search < 0 | time_gap_prev_search > 900)]\ntmp <- s01_search[, cumsum(is_gap), by=list(UserID)]\ns01_search[, user_session_no := tmp$V1]\ntmp <- s01_search[, cumsum(one), by=list(UserID, user_session_no)]\ns01_search[, user_session_seq := tmp$V1]\n\ns01_search[, new_search := as.numeric(is.na(prev_user_id) | UserID != prev_user_id | \n                                           SearchQuery != prev_search_query | \n                                           SearchParams != prev_search_param)]\ntmp <- s01_search[, cumsum(new_search), by=list(UserID)]\ns01_search[, user_search_no := tmp$V1]\ntmp <- s01_search[, cumsum(one), by=list(UserID, user_search_no)]\ns01_search[, user_same_search_seq := tmp$V1]\n\nuser_search_nzquery_cnt <- s01_search[, sum(SearchQuery != ''), by=list(UserID)]\nsetnames(user_search_nzquery_cnt, 'V1', 'user_search_nzquery_cnt')\nuser_search_cnt <- s01_search[, .N, by=list(UserID)]\nsetnames(user_search_cnt, 'N', 'user_search_cnt')\nleft_merge_inplace(user_search_cnt, user_search_nzquery_cnt, by='UserID')\n\nad_visit_cnt <- s01_visit[, .N, by=list(AdID)]\nsetnames(ad_visit_cnt, 'N', 'ad_visit_cnt')\nleft_merge_inplace(s01_ths, ad_visit_cnt, by='AdID', verbose=T)\ns01_ths[is.na(ad_visit_cnt), ad_visit_cnt := 0]\n\nsearch_ad_cnt <- s01_ths[, list(.N, sum(ad_visit_cnt)), by=list(SearchID)]\nsetnames(search_ad_cnt, c('N', 'V2'), c('search_ad_cnt', 'total_other_ads_visit_cnt'))\n\nsearch_ad1_cnt <- s01_ths[ObjectType==1, .N, by=list(SearchID)]\nsetnames(search_ad1_cnt, 'N', 'search_ad1_cnt')\nsearch_ad2_cnt <- s01_ths[ObjectType==2, .N, by=list(SearchID)]\nsetnames(search_ad2_cnt, 'N', 'search_ad2_cnt')\nleft_merge_inplace(search_ad_cnt, search_ad1_cnt, by='SearchID')\nleft_merge_inplace(search_ad_cnt, search_ad2_cnt, by='SearchID')\ndt_fill_na(search_ad_cnt)\nsearch_ad_cnt[, search_ad3_cnt := search_ad_cnt - search_ad1_cnt - search_ad2_cnt]\n\ngc()\n"
  },
  {
    "path": "avito_train_xgb.R",
    "content": "#fit xgboost model with repetitively smart sampled data (keeping all events, randomly sample non-events)\n\nif(use_exp) feature_list <- c(feature_list, feature_list_exp)\n\nMSV <- -999\nfor(vn in feature_list) {\n  if(sum(is.na(dt1[[vn]]))>0) {\n    print(vn)\n    dt1[is.na(dt1[[vn]]), paste(vn, sep='') := MSV]\n  }\n}\n\n\ny <- dt1$IsClick\ntesting <- F\n\nif (full_data) {\n  if(testing) {\n    fv1 <- dt1$search_time_rseq == 2\n    ft <- y >= 0 & (!fv1) & dt1$UserID %% 10 == 1\n    fv <- y >= 0 & fv1\n  } else {\n    fv1 <- dt1$UserID %% 50 == 1 & dt1$search_time_rseq == 2\n    ft <- y >= 0 & (!fv1) & dt1$search_time_rseq < rseq_limit #& dt1$UserID %% 10 == 1\n    fv <- y >= 0 & fv1\n  }\n  fh <- y < 0\n} else {\n  ft <- (dt1$search_time_rseq > 2 & dt1$search_time_rseq < 200)\n  fv <- dt1$search_time_rseq <= 2  \n}\n\n\nxt <- as.matrix(dt1[ft, feature_list, with=F])\nyt <- y[ft]\nxv <- as.matrix(dt1[fv, feature_list, with=F])\nyv <- y[fv]\nif (full_data) {\n  xh <- as.matrix(dt1[fh, feature_list, with=F])\n  yh <- y[fh]\n}\n\n\n\nprint(dim(xt))\nprint(dim(xv))\nif(full_data) {\n  print(dim(xh))\n  if(dim(xh)[1] != dim(sub0)[1]) stop('xh has wrong shape!!!')\n}\n\nMSV2 <- 9999999\nif(full_data & !testing) dtest <- xgb.DMatrix(xh, missing = MSV2)\ndvalid <- xgb.DMatrix(xv, label = yv, missing = MSV2)\n\nds_k <- 50\nds_kt <- 1\nif(full_data) {\n  ds_k <- 50\n  ds_kt <- 1\n}\n\nxgbs <- list()\npredv <- 0\npredh <- 0\npredt <- 0\nctr <- 0\n\nidx_exp <- which(feature_list %in% feature_list_exp)\n\nfor(i in c(1:n_repeat)) {\n  ctr <- ctr + 1\n  \n  set.seed(rseed_offset + i)\n  selt <- get_ds_filter(yt, ds_k, ds_kt)\n  selv <- get_ds_filter(yv, ds_k, ds_kt)\n  \n  t_shuffle_idx <- sample(sum(selt))  \n  xt1 <- xt[selt, ][t_shuffle_idx, ]\n  print(dim(xt1))\n  if(use_exp) xt1[, idx_exp] <- xt1[, idx_exp] * matrix(exp((runif(dim(xt1)[1]*length(feature_list_exp))-.5)*.04), nrow=dim(xt1)[1], ncol=length(feature_list_exp))\n  dtrain1 <- xgb.DMatrix(xt1, label = yt[selt][t_shuffle_idx], missing = MSV2)\n  dvalid1 <- xgb.DMatrix(xv[selv, ], label = yv[selv], missing = MSV2)\n  watchlist <- list(train = dtrain1, valid = dvalid1)\n  \n  tree_depth <- 11\n  if(full_data & !testing) tree_depth <- 14\n  print(paste(\"tree depth\", tree_depth))\n  ts1 <- proc.time()\n  bst <- xgboost2(dtrain1, max.depth = tree_depth, watchlist = watchlist, colsample_bytree = .4, verbose=1, print.every.n=10, eval_metric='logloss',\n                  eta = 0.15, nthread = 16, nround = 150, objective = \"binary:logistic\", set.seed=i, min_child_weight=1, num_parallel_tree=1)\n  print(proc.time() - ts1)\n  xgbs[[i]] <- bst\n  if(full_data & !testing) {\n    predh0 <- calibrate_ds(predict(bst, dtest), ds_k, ds_kt)\n    predh <- predh + predh0   \n  }\n  \n  pred <-  calibrate_ds(predict(bst, dvalid), ds_k, ds_kt)\n  predv <- predv + pred\n  print(paste(i, '-----------------', logloss(yv, predv / ctr)))\n  gc()  \n}\n"
  },
  {
    "path": "avito_utils.R",
    "content": "#hack for xgboost wrapper to get watchlist into the call\nxgboost2 <- function(dtrain, params = list(), nrounds, \n                     verbose = 1, print.every.n = 1L, early.stop.round = NULL, watchlist = NULL,\n                     maximize = NULL, num_parallel_tree=1, ...) {\n  \n  params <- append(params, list(...))\n  \n  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,\n                   early.stop.round = early.stop.round, num_parallel_tree=num_parallel_tree)\n  \n  return(bst)\n}\n\n#compute entropy by group, over subgrp\ncalc_entropy <- function(df, group, subgrp, tgt_vn_prefix) {\n  sum1 <- df[, .N, by=list(df[[group]], df[[subgrp]])]\n  setnames(sum1, c(group, subgrp, 'subgrpcnt'))\n  sum2 <- df[, .N, by=list(df[[group]])]\n  setnames(sum2, c(group, 'cnt'))\n  #sum2[, dummy:=1]\n  #dowdf <- data.table(x=unique(df[[subgrp]]), dummy=1)\n  #setnames(dowdf, c(subgrp, 'dummy'))\n  #sum2a <- merge(sum2, dowdf, by='dummy', allow.cartesian=T)\n  sum3 <- merge(sum2, sum1, by=c(group))\n  #sum3[is.na(subgrpcnt), subgrpcnt:=0]\n  sum3[, entropy := - log(subgrpcnt * 1.0 / cnt) * subgrpcnt * 1.0 / cnt]\n  sum3[is.na(entropy), entropy := 0]\n  sum4 <- sum3[, sum(entropy), by=list(sum3[[group]])]\n  setnames(sum4, c(group, paste(tgt_vn_prefix, 'entropy', sep='_')))\n  return(sum4)\n}\n\n#generate smart downsample index\nget_ds_filter <- function(y, ds_k, ds_kt=1) {\n  r1 <- runif(length(y))\n  sel <- r1 < 1.0 / ds_k\n  sel[y==1] <- r1[y==1] < 1.0 / ds_kt\n  return(sel)\n}\n\n#calibrate the prediction to original scale (before smart downsample)\ncalibrate_ds <- function(p, ds_k, ds_kt=1) {\n  return(p * ds_kt / (p * ds_kt + (1 - p) * ds_k))\n}\n\n#response encoder for categorical features, with credibility adjustment and leave-one-out\ncalc_exp2 <- function(dt, ft, vn_y, by, tgt_vn, k, mean_y0=NULL, verbose=F) {\n  dt[, tmp_y := dt[[vn_y]]]\n  tmp <- dt[ft, list(.N, sum(tmp_y), mean(tmp_y)), by=by]\n  if(verbose) print(paste(\"dim of summary :\", dim(tmp)))\n  setnames(tmp, c(by, 'tmp_cnt', 'tmp_sumy', 'tmp_mean_y'))\n  if(is.null(mean_y0)) mean_y0 <- mean(tmp$tmp_mean_y)\n  if(verbose) print(paste(\"mean_y0 = \", mean_y0))\n  tmp[, tmp_mean_y := NULL]\n  left_merge_inplace(dt, tmp, by=by, verbose=verbose)\n  dt[is.na(tmp_cnt), tmp_cnt := 0]\n  dt[is.na(tmp_sumy), tmp_sumy := 0]\n  dt[ft, tmp_cnt := tmp_cnt - 1L]\n  dt[ft, tmp_sumy := tmp_sumy - tmp_y]\n  dt[, paste(tgt_vn, sep='') := (tmp_sumy + mean_y0 * k) / (tmp_cnt + k)]\n  dt[, tmp_y := NULL]\n  dt[, tmp_sumy := NULL]\n  dt[, tmp_cnt := NULL]\n  return(0)\n}\n\n#simple logistic loss function\nlogloss <- function(act, pred)\n{\n  eps = 1e-15;\n  nr <- nrow(pred)\n  pred = pmin(1-eps, pred, pmax(eps, pred))\n  ll = sum(act*log(pred) + (1-act)*log(1-pred))\n  #print(ll)\n  ll = ll * -1/length(act)      \n  return(ll)\n}\n\n#shift column by n rows\nadd_shift1 <- function(dt, key, n, nv, force=F) {\n  tgt_vn <- paste(key, '_shift', nv, sep='')\n  if(!force) {\n    if(tgt_vn %in% names(dt)) stop(paste('variable ', tgt_vn, 'already exists in the data'))\n  }\n  if(n >0) a <- c(rep(NA, n), head(dt[[key]], -n))\n  else a <- c(tail(dt[[key]], n), rep(NA, -n))\n  dt[, tgt_vn := a, with=F]\n}\n\n#shift multiple columns by n rows\nadd_shift <- function(dt, keys, values, ns, force=F) {\n  for(n in ns) {\n    nv <- n\n    if(n < 0) nv <- paste('N', -n, sep='')\n    if(!is.null(keys)) {\n      for(key in keys) {\n        add_shift1(dt, key, n, nv, force)\n      }\n    }\n    for(value in values) {\n      add_shift1(dt, value, n, nv, force)\n      if(!is.null(keys)) {\n        for(key in keys) {\n          dt[dt[[key]] != dt[[paste(key, '_shift', nv, sep='')]], paste(value, '_shift', nv, sep='') := NA, with=F]\n        }\n      }\n    }\n  }\n}\n\n#compute first n component of svd \nget_svd_u <- function(df1, u, v, x, n, tgt_vn_prefix) {\n  df1[, u_idx:=as.integer(factor(df1[[u]]))]\n  df1[, v_idx:=as.integer(factor(df1[[v]]))]\n  if(is.null(x)) {\n    m1 <- sparseMatrix(i=df1[['u_idx']], j=df1[['v_idx']])\n  } else {\n    m1 <- sparseMatrix(i=df1[['u_idx']], j=df1[['v_idx']], x=df1[[x]])\n  }\n  \n  r <- irlba(m1, nu=n, nv=n)\n  \n  dtu <- data.frame(u_idx=c(1:dim(m1)[1]))\n  for(i in c(1:n)) {\n    dtu[[paste(tgt_vn_prefix, i, sep='_')]] <- r$u[, i]\n  }  \n  \n  #print(names(dtu))\n  #print(names(df1))\n  rmap <- data.table(unique(df1[, c(u, 'u_idx'), with=F]))\n  #print(rmap)\n  dtu1 <- merge(dtu, rmap, by='u_idx')\n  dtu1$u_idx <- NULL\n  return(dtu1)\n}\n\n#wrapper to make calling get_svd_u easier\nget_svd <- function(df, vn_u, vn_v, n, tgt_vn) {\n  tmp1 <- df[, length(time2), by=list(df[[vn_u]], df[[vn_v]])]\n  setnames(tmp1, c(vn_u, vn_v, 'cnt'))\n  tmp1 <- tmp1[!is.na(tmp1[[vn_v]]), ]\n  tmp1[, x1:=log(cnt + 1)]\n  \n  svd1 <- get_svd_u(tmp1, vn_u, vn_v, 'x1', n, tgt_vn)\n  return(svd1)\n}\n\n#compute ngram/tfidf/svd\ncalc_ngram_svd <- function(df, vn_id, vn_text, NN, tf_min, df_max, ngram_n, svd_n, tgt_vn_prefix) {\n  \n  #df <- em_ev2_seq\n  #vn_id <- \"enrollment_id\"\n  #vn_text <- 'em_ev2_seq'\n  #NN <- -1\n  #tf_min <- 10\n  #df_max <- 10000\n  #ngram_n <- c(1:2)\n  #svn_n <- 20\n  #tgt_vn_prefix <- 'em_ev3gram_svd_em'\n  ptm <- proc.time()\n  \n  if(NN<0) NN <- dim(df)[1]\n  \n  texts <- as.list(df[[vn_text]][1:NN])\n  org_ids <- df[[vn_id]][1:NN]\n  \n  a <- list()\n  for (n1 in ngram_n) {\n    a1 <- textcnt(texts, split='[[:space:]]', n=n1, method='string', recursive = T, verbose = F)\n    a <- c(a, a1)\n    print(paste(\"done counting words by document for ngram \", n1, length(a)))\n    print(proc.time() - ptm)\n  }\n  a2 <- unlist(a)\n  an <- lapply(a, function(x) length(x))\n  df1 <- data.frame(id=org_ids, cnt=unlist(an))\n  ids <- unlist(apply(df1[, c(\"id\", \"cnt\")], 1, function(x) rep(x[1], x[2])))\n  dt2 <- data.table(id=ids, ngram=names(a2), cnt=a2)\n  \n  dt3 <- dt2[, length(id), by=list(ngram)]\n  setnames(dt3, c('ngram', 'cnt_total'))\n  dt3 <- dt3[cnt_total >= tf_min, ]\n  \n  dt4 <- merge(dt2, dt3, by='ngram')\n  \n  dt4[, tfidf := log(cnt + 1) * log(NN / cnt_total)]\n  \n  print(paste(dim(dt2), dim(dt3), dim(dt4), sep=\"|\"))\n  setnames(dt4, \"id\", vn_id)\n  svd1 <- get_svd(dt4, vn_id, 'ngram', svd_n,  tgt_vn_prefix)\n  \n  print(proc.time() - ptm)\n  return(svd1)\n}\n\n#write compressed csv.gz\nmy_write_csv <- function(obj, file, row.names=F, timing=T) {\n  st <- proc.time()\n  con <- pipe(paste(\"pigz -p30 > \", file, \".csv.gz\", sep=''), \"wb\")\n  write.csv(obj, file = con, row.names=row.names)\n  close(con)\n  if(timing) {\n    print(proc.time() - st)\n  }\n}\n\n#fill all missings in DT with value\ndt_fill_na = function(DT, value=0) {\n  # either of the following for loops\n  \n  # or by number (slightly faster than by name) :\n  for (j in seq_len(ncol(DT)))\n    set(DT,which(is.na(DT[[j]])),j,value)\n}\n\n#merge a small dataset to a large data.table without copying the large data.table\nleft_merge_inplace <- function(dt1, dt2, by, verbose=F, fill.na=NA) {\n  st <- proc.time()\n  dt1a <- copy(dt1[, by, with=F])\n  dt2a <- copy(dt2[, by, with=F])\n  if (verbose) {\n    print('small datasets created')\n    print(proc.time() - st)\n  }\n  dt1a[, tmp_idx1 := c(1:dim(dt1a)[1])]\n  dt2a[, tmp_idx2 := c(1:dim(dt2a)[1])]\n  if (verbose) {\n    print('row index created')\n    print(proc.time() - st)\n  }\n  dt3 <- merge(dt1a, dt2a, by=by, all.x=T)\n  if (verbose) {\n    print('small datasets merged')\n    print(proc.time() - st)\n  }\n  dt3 <- dt3[order(tmp_idx1), ]\n  if (verbose) {\n    print('merged dataset reordered')\n    print(proc.time() - st)\n  }\n  dt2_idx_map <- dt3$tmp_idx2\n  if (verbose) {\n    print('row index generated')\n    print(proc.time() - st)\n  }\n  \n  for(vn in names(dt2)) {\n    if(!(vn %in% by)) {\n      dt1[, paste(vn, sep='') := dt2[[vn]][dt2_idx_map]]\n      if(!is.na(fill.na)) dt1[is.na(dt1[[vn]]), paste(vn, sep=''):=fill.na]\n      if (verbose) {\n        print(paste('assigned variable ', vn))\n        print(proc.time() - st)\n      }\n    }\n  }\n}\n\n#utility function for ngram based text similarity\nget_ngram_cnts <- function(texts, ngram_n) {\n  ptm <- proc.time()\n  a <- list()\n  for (n1 in ngram_n) {\n    a1 <- textcnt(texts, split='[[:space:]|[:punct:]]', n=n1, method='string', recursive = T, verbose = F)\n    a <- c(a, a1)\n    print(paste(\"done counting words by document for ngram \", n1, length(a)))\n    print(proc.time() - ptm)\n  }\n  return(a)\n}\n\n\n#utility function for ngram based text similarity\nget_rep_id_dt <- function(a, ids0) {\n  an <- lapply(a, function(x) length(x))\n  df1 <- data.frame(id=ids0, cnt=unlist(an))\n  ids <- unlist(apply(df1[, c(\"id\", \"cnt\")], 1, function(x) rep(x[1], x[2])))\n  #print(length(ids))\n  dt1 <- data.table(id=ids, ngram=names(unlist(a)), cnt=unlist(a))\n  return(dt1)\n}\n\n#text similarity based on cosine distance of ngram/tfidf\ncalc_ngram_dist <- function(dt0, vn_id, vn_text1, vn_text2, ngram_n, tgt_vn) {\n  if(F) {\n    dt0 <- lt0[, ]\n    \n    vn_id <- \"id\"\n    vn_text1 <- 'user_skill_str'\n    vn_text2 <- 'req_skill_str'\n    NN <- -1\n    ngram_n <- c(1:2)\n  }\n  ptm <- proc.time()\n  \n  dt0a <- dt0[, c(vn_id, vn_text1, vn_text2), with=F]\n  dt0a[, org_ord:= c(1:dim(dt0a)[1])]\n  \n  dt1 <- data.table(unique(data.frame(dt0a[, c(vn_id, vn_text1, vn_text2), with=F])))\n  setnames(dt1, c('id', vn_text1, vn_text2))\n  NN <- dim(dt1)[1]\n  \n  texts1 <- (dt1[[vn_text1]])[1:NN]\n  texts2 <- (dt1[[vn_text2]])[1:NN]\n  org_ids <- dt1[['id']][1:NN]\n  \n  a1 <- get_ngram_cnts(texts1, ngram_n)\n  a2 <- get_ngram_cnts(texts2, ngram_n)\n  \n  dt1_wids <- get_rep_id_dt(a1, org_ids)\n  dt2_wids <- get_rep_id_dt(a2, org_ids)\n  \n  dt12_wids <- rbind(dt1_wids, dt2_wids)\n  dt12_sum <- dt12_wids[, list(total_cnt=sum(cnt)), by=list(ngram)]\n  \n  setnames(dt1_wids, 'cnt', 'cnt1')\n  setnames(dt2_wids, 'cnt', 'cnt2')\n  \n  dt3 <- merge(merge(dt1_wids, dt2_wids, by=c('id', 'ngram'), all.x=T, all.y=T), dt12_sum, by='ngram')\n  \n  dt3[is.na(cnt1), cnt1:=0]\n  dt3[is.na(cnt2), cnt2:=0]\n  \n  dt3_sum <- dt3[, list(doc_cnt=.N), by=list(ngram)]\n  dt4 <- merge(dt3, dt3_sum, by='ngram')\n  setnames(dt4, 'id', vn_id)\n  \n  dt4[, tfidf1 := log(cnt1 + 1) * log(NN / doc_cnt)]\n  dt4[, tfidf2 := log(cnt2 + 1) * log(NN / doc_cnt)]\n  \n  dt5 <- dt4[, list(cos_sim=sum(tfidf1 * tfidf2) / sqrt(sum(tfidf1*tfidf1)*sum(tfidf2*tfidf2))), by=vn_id]\n  setnames(dt5, 'cos_sim', tgt_vn)\n  return(dt5)\n}\n\n"
  },
  {
    "path": "avito_visit.R",
    "content": "time2 <- strptime(s01_visit$ViewDate, '%Y-%m-%d %H:%M:%S')\n\nmin_time2 <- min(time2)\nmin_time3 <- as.Date(min_time2)\ntime3 <- as.integer(difftime(time2, min_time3, unit='sec'))\nif(conserve_ram) {\n  rm(time2)\n}\ngc()\n\ns01_visit[, visit_time3 := time3]\ns01_visit[, hour := as.integer(round(as.integer(visit_time3)  %/% 3600)) %% 24]\ns01_visit[, dow := as.integer(round(as.integer(visit_time3)  %/% 86400)) %% 7]\n\nuser_visit_dow_entropy <- calc_entropy(s01_visit, 'UserID', 'dow', 'user_visit_dow') \nuser_visit_hour_entropy <- calc_entropy(s01_visit, 'UserID', 'hour', 'user_visit_hour') \n\ns01_visit <- s01_visit[order(UserID, visit_time3), ]\nadd_shift(s01_visit, c('UserID'), c('visit_time3'), 1, force=F)\nuser_visit_time_interval_mean <- s01_visit[, .(user_visit_time_interval_mean=mean(visit_time3 - visit_time3_shift1, na.rm=T)), by=UserID]\n\n\n# count of visits in subcategory and category\n# entropy of subcat and cat\nsetnames(s01_ads, 'LocationID', 'AdLocationID')\nsetnames(s01_ads, 'CategoryID', 'AdCategoryID')\n\nleft_merge_inplace(s01_visit, s01_ads, 'AdID', verbose=T)\nsetnames(cat, 'CategoryID', 'AdCategoryID')\nleft_merge_inplace(s01_visit, cat, 'AdCategoryID', verbose=T)\n\nuser_visit_loc_entropy <- calc_entropy(s01_visit, 'UserID', 'AdLocationID', 'user_visit_loc') \nuser_visit_param_entropy <- calc_entropy(s01_visit, 'UserID', 'Params', 'user_visit_param') \n\n\nad_param_v_cnt <- s01_visit[, .N, by=list(Params)]\nsetnames(ad_param_v_cnt, 'N', 'ad_param_v_cnt')\n\nad_param_avg_price <- s01_visit[, mean(Price), by=list(Params)]\nsetnames(ad_param_avg_price, 'V1', 'ad_param_avg_price')\n\nleft_merge_inplace(ad_param_v_cnt, ad_param_avg_price, by='Params')\n\nuser_v_avg_price <- s01_visit[, mean(Price, na.rm=T), by=list(UserID)]\nsetnames(user_v_avg_price, 'V1', 'user_v_avg_price')\nuser_v_cat_avg_price <- s01_visit[, mean(Price), by=list(UserID, AdCategoryID)]\nsetnames(user_v_cat_avg_price, 'V1', 'user_v_cat_avg_price')\n\nuser_v_price_sum2 <- s01_visit[, .(user_v_min_price=min(Price, na.rm=T), user_v_max_price=max(Price, na.rm=T), user_v_var_price=var(Price, na.rm=T)), by=UserID]\nuser_v_price_sum2[is.infinite(user_v_min_price), user_v_min_price := NA]\nuser_v_price_sum2[is.infinite(user_v_max_price), user_v_max_price := NA]\n\nuser_cat_cnt <- s01_visit[, .N, by=list(UserID, AdCategoryID)]\nsetnames(user_cat_cnt, \"N\", \"user_cat_cnt\")\nuser_pcat_cnt <- s01_visit[, .N, by=list(UserID, ParentCategoryID)]\nsetnames(user_pcat_cnt, \"N\", \"user_pcat_cnt\")\nuser_v_cnt <- s01_visit[, .N, by=list(UserID)]\nsetnames(user_v_cnt, \"N\", \"user_v_cnt\")\n\nleft_merge_inplace(user_v_cnt, user_v_avg_price, by='UserID')\nleft_merge_inplace(user_cat_cnt, user_v_cat_avg_price, by=c('UserID', 'AdCategoryID'))\n\nuser_visit_cat_entropy <- calc_entropy(s01_visit, 'UserID', 'AdCategoryID', 'user_visit_cat') \n\nif(conserve_ram) {\n  rm(s01_visit)\n}\ngc()\n\n"
  },
  {
    "path": "avito_xgb1.R",
    "content": "if(full_data) {\n  dt1 <- s01_ths2\n  base_ft <- dt1$search_time_rseq > 1 & !(dt1$UserID %% 50 == 1) & dt1$IsClick >=0\n  print(sum(base_ft))\n} else {\n  dt1 <- s01_ths2[IsClick >=0, ]\n  base_ft <- dt1$search_time_rseq > 2\n}\nk <- 100\n\ncalc_exp2(dt1, base_ft, 'IsClick', c('AdID'), 'exp2_ad', 100, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery'), 'exp2_query', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'AdID'), 'exp2_query_ad', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'Title'), 'exp2_query_title', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('Params'), 'exp2_aparam', 100, mean_y0=NULL, verbose=T)\ncalc_exp2(dt1, base_ft, 'IsClick', c('LocationID', 'Params'), 'exp2_loc_aparam', 100, mean_y0=NULL, verbose=T)\n\n\ndt1[, ad_title_nchar := nchar(Title)]\n\nad_sparam_query_cnt <- dt1[, list(ad_sparam_query_cnt=.N), by=.(AdID, SearchParams, SearchQuery)]\n\nuser_ad_sum3 <- dt1[, list(user_ad_cnt=.N), by=.(AdID, UserID)]\n\nad_sparam_entropy <- calc_entropy(dt1, 'AdID', c('SearchParams'), 'adid_sparam') \nad_query_entropy <- calc_entropy(dt1, 'AdID', c('SearchQuery'), 'adid_query') \nleft_merge_inplace(ad_sparam_entropy, ad_query_entropy, 'AdID', verbose=T)\n\nleft_merge_inplace(dt1, ad_sparam_query_cnt, by=c('AdID', 'SearchParams', 'SearchQuery'), verbose=T)\ndt1[, ad_sparam_query_cnt_ratio := ad_sparam_query_cnt * 1.0 / ad_cnt]\nleft_merge_inplace(dt1, user_ad_sum3, by=c('AdID', 'UserID'), verbose=T)\nleft_merge_inplace(dt1, ad_sparam_entropy, by=c('AdID'), verbose=T)\n\n\nuse_exp <- F\nfeature_list1 <- c('Position', 'HistCTR', 'IsUserLoggedOn', 'LocationID', 'search_time3', 'search_time_seq',\n                  'user_cat_cnt', 'user_v_cnt', 'user_pcat_cnt', 'user_visit_cat_entropy', 'search_cnt',\n                  'hour', 'dow', 'CategoryID', 'ParentCategoryID', 'query_nchar', 'param_nchar', 'Price',\n                  'ad_title_nchar', 'user_v_cat_avg_price', 'cat_price_diff', 'ad_params_nchar', 'params_nchar_diff',\n                  'ad_param_v_cnt', 'search_ad_cnt', 'search_ad1_cnt', 'search_ad2_cnt', 'search_ad3_cnt',\n                  'ad_position2', 'time_gap_prev_search', 'user_ad_seq', 'user_searchquery_seq', 'ad_cnt',\n                  'user_csum_cads_cnt', 'user_csum_cads_y', \n                  'cat_equal', \n                  'total_other_ads_visit_cnt',\n                  'user_session_seq', 'user_session_no', 'user_search_no', 'user_same_search_seq',\n                  \"UserAgentID\",       \"UserAgentOSID\",    \"UserDeviceID\",     \"UserAgentFamilyID\",\n                  'user_phone_cnt',\n                  'user_search_loc_entropy', 'user_search_dow_entropy', 'user_search_hour_entropy',\n                  'user_visit_loc_entropy', 'user_visit_dow_entropy', 'user_visit_hour_entropy',\n                  'user_search_sparam_entropy', 'user_visit_param_entropy',\n                  'user_ad_prior_clicks' \n                  )\n\nfeature_list2 <- c('user_ad_cnt', 'ad_sparam_query_cnt_ratio', \n                   'adid_sparam_entropy', 'adid_query_entropy')\n\nfeature_list_exp <- c('exp2_ad', 'exp2_query', 'exp2_query_ad', 'exp2_query_title', 'exp2_loc_aparam', 'exp2_aparam')\n\nfeature_list <- c(feature_list1, feature_list2)\nif(use_exp) feature_list <- c(feature_list, feature_list_exp)\n  \nMSV <- -999\nfor(vn in feature_list) {\n  if(sum(is.na(dt1[[vn]]))>0) {\n    print(vn)\n    dt1[is.na(dt1[[vn]]), paste(vn, sep='') := MSV]\n  }\n}\n\n\ny <- dt1$IsClick\ntesting <- T\n\nif (full_data) {\n  if(testing) {\n    fv1 <- dt1$search_time_rseq == 2\n    ft <- y >= 0 & (!fv1) & dt1$UserID %% 10 == 1\n    fv <- y >= 0 & fv1\n  } else {\n    fv1 <- dt1$UserID %% 50 == 1 & dt1$search_time_rseq == 2\n    ft <- y >= 0 & (!fv1) #& dt1$search_time_rseq < 200 #& dt1$UserID %% 10 == 1\n    fv <- y >= 0 & fv1\n  }\n  fh <- y < 0\n} else {\n  ft <- (dt1$search_time_rseq > 2 & dt1$search_time_rseq < 200)\n  fv <- dt1$search_time_rseq <= 2  \n}\n\n\nxt <- as.matrix(dt1[ft, feature_list, with=F])\nyt <- y[ft]\nxv <- as.matrix(dt1[fv, feature_list, with=F])\nyv <- y[fv]\nif (full_data) {\n  xh <- as.matrix(dt1[fh, feature_list, with=F])\n  yh <- y[fh]\n}\n\n\n\nprint(dim(xt))\nprint(dim(xv))\nif(full_data) {\n  print(dim(xh))\n  if(dim(xh)[1] != dim(sub0)[1]) stop('xh has wrong shape!!!')\n}\n\nMSV2 <- 9999999\nif(full_data & !testing) dtest <- xgb.DMatrix(xh, missing = MSV2)\ndvalid <- xgb.DMatrix(xv, label = yv, missing = MSV2)\n\nds_k <- 50\nds_kt <- 1\nif(full_data) {\n  ds_k <- 50\n  ds_kt <- 1\n}\n\nif(F) {\nseltv <- get_ds_filter(y, ds_k, ds_kt)\ndt1sel <- dt1[seltv, c('IsClick', feature_list1, feature_list2), with=F]\ndt1sel[, partition := 0]\ndt1sel[dt1$search_time_rseq[seltv]==2, partition := 1]\n\nwrite.csv(dt1sel, file='output/dt1sel.csv', row.names=F)\n}\n\nxgbs <- list()\npredv <- 0\npredh <- 0\npredt <- 0\nctr <- 0\n\nidx_exp <- which(feature_list %in% feature_list_exp)\n\nfor(i in c(1:4)) {\n  ctr <- ctr + 1\n  \n  set.seed(i)\n  selt <- get_ds_filter(yt, ds_k, ds_kt)\n  selv <- get_ds_filter(yv, ds_k, ds_kt)\n  \n  t_shuffle_idx <- sample(sum(selt))  \n  xt1 <- xt[selt, ][t_shuffle_idx, ]\n  print(dim(xt1))\n  if(use_exp) xt1[, idx_exp] <- xt1[, idx_exp] * matrix(exp((runif(dim(xt1)[1]*length(feature_list_exp))-.5)*.04), nrow=dim(xt1)[1], ncol=length(feature_list_exp))\n  dtrain1 <- xgb.DMatrix(xt1, label = yt[selt][t_shuffle_idx], missing = MSV2)\n  dvalid1 <- xgb.DMatrix(xv[selv, ], label = yv[selv], missing = MSV2)\n  watchlist <- list(train = dtrain1, valid = dvalid1)\n\n  tree_depth <- 11\n  if(full_data & !testing) tree_detph <- 14\n  ts1 <- proc.time()\n  bst <- xgboost2(dtrain1, max.depth = tree_depth, watchlist = watchlist, colsample_bytree = .4, verbose=1, print.every.n=10, eval_metric='logloss',\n                  eta = 0.15, nthread = 4, nround = 150, objective = \"binary:logistic\", set.seed=i, min_child_weight=1, num_parallel_tree=1)\n  print(proc.time() - ts1)\n  xgbs[[i]] <- bst\n  if(full_data & !testing) {\n    predh0 <- calibrate_ds(predict(bst, dtest), ds_k, ds_kt)\n    predh <- predh + predh0   \n  }\n  \n  pred <-  calibrate_ds(predict(bst, dvalid), ds_k, ds_kt)\n  predv <- predv + pred\n  print(paste(i, '-----------------', logloss(yv, predv / ctr)))\n  gc()  \n}\n\ndt1v <- dt1[fv, ]\nf1 <- dt1v$user_ad_prior_clicks > 0 | T\nf1 <- !is.na(dt1v$exp2v_title_query)\ndt1v[f1, my_lift((adid2_csum_y + .01*k)/(adid2_csum_cnt + k), predv[f1]/ctr, yv[f1], NULL, 20, print=T)]\ndt1v[f1, my_lift(predv, predv[f1]/ctr, yv[f1], NULL, 500, print=T)]\ndt1v[, pred1 := predv/ctr]\n\ndt1v<-dt1v[order(SearchID, pred1), ]\ntmp_pred1_rank <- dt1v[, .(pred_rank = cumsum(one), pred_mean=mean(pred1)), by=list(SearchID)]\ndt1v[, pred1_rank:=tmp_pred1_rank$pred_rank]\ndt1v[, pred1_mean:=tmp_pred1_rank$pred_mean]\n\nfv1 <- runif(dim(dt1v)[1]) >= .5\nfv2 <- !fv1\n\nxvnew <- cbind(xv, predv/ctr)\nxv1 <- xvnew[fv1, ]\nxv2 <- xvnew[fv2, ]\nyv1 <- yv[fv1]\nyv2 <- yv[fv2]\n\ndv1 <- xgb.DMatrix(xv1, label = yv1, missing = MSV2)\ndv2 <- xgb.DMatrix(xv2, label = yv2, missing = MSV2)\n\nwatchlist2 <- list(train = dv1, valid = dv2)\nbst2 <- xgboost2(dv2, max.depth = 3, watchlist = watchlist2, colsample_bytree = .3, verbose=1, print.every.n=10, eval_metric='logloss',\n                eta = .1, nthread = 16, nround = 150, objective = \"binary:logistic\", set.seed=i, min_child_weight=1)\nprednew <- predict(bst2, xgb.DMatrix(xvnew, missing = MSV2))\nmy_lift(dt1v$cat_price_diff[fv2], predv[fv2]/ctr, yv[fv2], NULL, 10)\n\ndt1t <- dt1[ft, ]\ndt1t[, pred := predt / ctr]\ndt1t1 <- dt1t[UserID %% 100 == 11, ]\ndt1t[search_time_rseq==3, my_lift(ad_rank_in_search_by_price, pred, IsClick, NULL, 10, print=T)]\nk <- 100\ndt1t1[, my_lift((ad_csum_cads_y + .01*k)/(ad_csum_cads_cnt + k), pred, IsClick, NULL, 10, print=T)]\n\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,5))\nmy_write_csv(predh.dt, file='output/sub0', row.names=F)\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,5))\nmy_write_csv(predh.dt, file='output/sub1', row.names=F)\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6))\nmy_write_csv(predh.dt, file='output/sub2_0401x', row.names=F)\n\nmy_lift(dt1$HistCTR[fh], NULL, predh.dt$IsClick, NULL, 10)\n\nlogloss( yv, pmin(predv/ctr, .2))\nlogloss( yv, dt1$HistCTR[fv])\nlogloss( yv, pmax(pmin(dt1$HistCTR[fv], 0.016), 0.005))\n\nk <- 50\nmy_lift((dt1$user_time_search_cnt_3sec[fv]), pmin(predv/ctr, .3), yv, NULL, 20, print=T)\n\nmy_lift(((dt1$CategoryID[!ft] == dt1$AdCategoryID[!ft])), predv/ctr, yv, NULL, 10, print=T)\nmy_lift(((dt1$cat_equal[!ft])), pred, IsClick, NULL, 10)\n\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6))\nmy_write_csv(predh.dt, file='output/sub3_0400x', row.names=F)\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6))\nmy_write_csv(predh.dt, file='output/sub3x16_0400x', row.names=F)\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6))\nmy_write_csv(predh.dt, file='output/sub3_rseq200_0400x', row.names=F)\n\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6))\nmy_write_csv(predh.dt, file='output/sub3x16_run2_0400x', row.names=F)\n\npredh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6))\nmy_write_csv(predh.dt, file='output/sub4x16_0400x', row.names=F)\n"
  }
]