Repository: owenzhang/kaggle-avito Branch: master Commit: a7a2cc853b0c Files: 13 Total size: 41.6 KB Directory structure: gitextract_spgrnvmp/ ├── LICENSE ├── README.md ├── _fast_10pct_run.R ├── _full_100pct_run.R ├── avito_cat_cnt.R ├── avito_data1.R ├── avito_data_merge.R ├── avito_phone.R ├── avito_search.R ├── avito_train_xgb.R ├── avito_utils.R ├── avito_visit.R └── avito_xgb1.R ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2015 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # kaggle-avito Winning solution to the Avito CTR competition Some high level description of the solution can be found in the .pdf file. How to run: * change the folder at the top of _fast_10pct_run.R and _full_100pct_run.R to point to where the data files are stored * (recommended) run _fast_10pct_run.R and verify that it produces expected results -- this will take a few hours * run _full_100pct_run.R to produce a submission file. Summary of other files: avito_data1.R -- Load datasets avito_phone.R -- feature extraction from phone dataset avito_search.R -- feature extraction from search dataset avito_visit.R -- feature extration from visit dataset avito_cat_cat.R -- more features + data merge avito_data_merge.R -- rest of the features + data merge avito_train_xgb.R -- fit xgboost models avito_utils.R -- some utility functions Hardware requirement: To run the full data solution (_full_100pct_run.R) I recomment a machine with 256GB of ram + (at least) 200GB of swap space. This is due to the combination of my own inefficient code and inefficiency in R's memory handling. ================================================ FILE: _fast_10pct_run.R ================================================ # run entire script with 10% data -- highly recommended in case there are bugs setwd('/home/fast/2015_avito') # modify this line to point the right folder require(inline) require(sqldf) require(data.table) require(xgboost) require(tau) source('kaggle-avito/avito_utils.R') full_data <- F source('kaggle-avito/avito_data1.R') source('kaggle-avito/avito_phone.R') source('kaggle-avito/avito_search.R') source('kaggle-avito/avito_visit.R') source('kaggle-avito/avito_cat_cnt.R') source('kaggle-avito/avito_data_merge.R') n_repeat <- 8 feature_list <- c(feature_list1, feature_list2) source('kaggle-avito/avito_train_xgb.R') predv_list2 <- predv / ctr feature_list <- feature_list1 source('kaggle-avito/avito_train_xgb.R') predv_list1 <- predv / ctr predv_avg12 <- predv_list1 * .5 + predv_list2 * .5 print(paste("logloss of avg of two xgbs (should be ~.04176): ", logloss(yv, predv_avg12))) ================================================ FILE: _full_100pct_run.R ================================================ # run entire script with 100% data setwd('/home/fast/2015_avito') # modify this line to point the right folder require(inline) require(sqldf) require(data.table) require(xgboost) require(tau) source('kaggle-avito/avito_utils.R') full_data <- T source('kaggle-avito/avito_data1.R') source('kaggle-avito/avito_phone.R') source('kaggle-avito/avito_search.R') source('kaggle-avito/avito_visit.R') source('kaggle-avito/avito_cat_cnt.R') source('kaggle-avito/avito_data_merge.R') n_repeat <- 8 #one model with shorter feature list and all the data points rseq_limit <- 1e6 xt <- 0 gc() rseed_offset <- 200 feature_list <- feature_list1 source('kaggle-avito/avito_train_xgb.R') predv_list1 <- predv / ctr predh_list1 <- predh / ctr #another model with longer feature_list but only last 200 impressions rseq_limit <- 200 xt <- 0 gc() rseed_offset <- 300 feature_list <- c(feature_list1, feature_list2) source('kaggle-avito/avito_train_xgb.R') predv_list2_r200 <- predv / ctr predh_list2_r200 <- predh / ctr #test on 2% validation data predv_avg2 <- (predv_list1 + predv_list2_r200) / 2 print(paste("logloss of avg of two xgbs (should be ~.04252): ", logloss(yv, predv_avg2))) #create final submission predh_avg2 <- (predh_list1 + predh_list2_r200) / 2 predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh_avg2,6)) my_write_csv(predh.dt, file='avito_sub_finalx8_final', row.names=F) #private LB ~.04029 ================================================ FILE: avito_cat_cnt.R ================================================ left_merge_inplace(s01_ths, s01_ads, by='AdID', verbose=T) left_merge_inplace(s01_ths, cat, by='AdCategoryID', verbose=T) left_merge_inplace(s01_ths, s01_search, by='SearchID', verbose=T) if(conserve_ram) { rm(s01_search) gc() } s01_ths1 <- s01_ths[, c('UserID', 'AdID', 'search_time3', 'SearchQuery', 'IsClick'), with=F] s01_ths1[, one:=1] s01_ths1[, org_idx:=c(1:.N)] s01_ths1[, y1:=pmax(IsClick, 0)] s01_ths1 <- s01_ths1[order(UserID, AdID, search_time3), ] tmp <- s01_ths1[, list(cumsum(one), cumsum(y1)), by=list(UserID, AdID)] tmp[, org_idx:=s01_ths1$org_idx] system.time(tmp <- tmp[order(org_idx), ]) s01_ths[, user_ad_seq := tmp$V1] s01_ths[, user_ad_prior_clicks := tmp$V2 - pmax(IsClick, 0)] s01_ths1 <- s01_ths1[order(UserID, SearchQuery, search_time3), ] tmp <- s01_ths1[, cumsum(one), by=list(UserID, SearchQuery)] tmp[, org_idx:=s01_ths1$org_idx] tmp <- tmp[order(org_idx), ] s01_ths[, user_searchquery_seq := tmp$V1] s01_ths[SearchQuery=='', user_searchquery_seq := -user_searchquery_seq] if(conserve_ram) { rm(s01_ths1); gc() } if(full_data) { s01_ths1x <- s01_ths[ObjectType==3 & search_time_rseq<= 1000, ] } else { s01_ths1x <- s01_ths[ObjectType==3 & search_time_rseq<= 2000 & search_time_rseq >= 2, ] } if(conserve_ram) { rm(s01_ths); gc() } ad_cnt <- s01_ths1x[, .N, by=list(AdID)] setnames(ad_cnt, 'N', 'ad_cnt') s01_ths1x[, is_test := as.integer(IsClick <0)] search_sum1 <- s01_ths1x[, list(.N, sum(IsClick)), by=list(SearchID, UserID, is_test, search_time3)] setnames(search_sum1, c('N', 'V2'), c('cnt', 'sumy')) search_sum1 <- search_sum1[order(UserID, is_test, search_time3), ] tmp <- search_sum1[, list(cumsum(cnt), cumsum(sumy)), by='UserID'] search_sum1[, user_csum_cads_cnt := tmp$V1 - cnt] search_sum1[, user_csum_cads_y := tmp$V2 - sumy] tmp <- search_sum1[, list(cumsum(cnt), cumsum(sumy)), by='UserID'] search_sum1[, user_csum_cads_cnt := tmp$V1 - cnt] search_sum1[, user_csum_cads_y := tmp$V2 - sumy] ad_info <- s01_ths1x[, c('one', 'IsClick', 'AdID', 'search_time3'), with=F] ad_info[, org_idx := c(1:dim(ad_info)[1])] ad_info <- ad_info[order(AdID, search_time3), ] tmp <- ad_info[, list(cumsum(one), cumsum(IsClick)), by='AdID'] tmp[, org_idx := ad_info$org_idx] tmp <- tmp[order(org_idx), ] s01_ths1x[, ad_csum_cads_cnt := tmp$V1 - one] s01_ths1x[, ad_csum_cads_y := tmp$V2 - IsClick] left_merge_inplace(s01_ths1x, search_sum1[, c('SearchID', 'user_csum_cads_cnt', 'user_csum_cads_y'), with=F], by='SearchID', verbose=T) if(conserve_ram) { rm(search_sum1); gc() } left_merge_inplace(s01_ths1x, ad_param_v_cnt, by='Params', verbose=T) left_merge_inplace(s01_ths1x, user_v_cnt, by=c('UserID'), verbose=T) left_merge_inplace(s01_ths1x, user_pcat_cnt, by=c('UserID', 'ParentCategoryID'), verbose=T) left_merge_inplace(s01_ths1x, user_visit_cat_entropy, by=c('UserID'), verbose=T) left_merge_inplace(s01_ths1x, search_ad_cnt, by='SearchID', verbose=T) left_merge_inplace(s01_ths1x, ad_cnt, by='AdID', verbose=T) #setnames(user_cat_cnt, 'CategoryID', 'AdCategoryID') left_merge_inplace(s01_ths1x, user_cat_cnt, by=c('UserID', 'AdCategoryID'), verbose=T) left_merge_inplace(s01_ths1x, user_search_loc_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_search_dow_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_search_hour_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_visit_loc_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_visit_dow_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_visit_hour_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_sparam_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_visit_param_entropy, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, s01_ths1x[, c('ID', 'ad_csum_cads_cnt', 'ad_csum_cads_y'), with=F], by='ID', verbose=T) left_merge_inplace(s01_ths1x, s01_user, by='UserID', verbose=T) left_merge_inplace(s01_ths1x, user_phone_cnt, by='UserID', verbose=T) s01_ths1x[is.na(user_phone_cnt), user_phone_cnt := 0] s01_ths2 <- s01_ths1x s01_ths2[is.na(user_v_cnt), user_v_cnt:=0] s01_ths2[is.na(user_cat_cnt), user_cat_cnt:=0] s01_ths2[is.na(user_pcat_cnt), user_pcat_cnt:=0] s01_ths2[is.na(user_visit_cat_entropy), user_visit_cat_entropy:=-1] s01_ths2[, search_cnt := search_time_seq + search_time_rseq - 1] s01_ths2[, cat_price_diff := Price - user_v_cat_avg_price] s01_ths2[, ad_params_nchar := nchar(Params)] s01_ths2[, params_nchar_diff := ad_params_nchar - param_nchar] s01_ths2[, ad_position2 := search_ad3_cnt * 7 + Position] s01_ths2[, cat_equal := 0] s01_ths2[CategoryID==AdCategoryID, cat_equal := 1] s01_ths2[is.na(CategoryID) | is.na(AdCategoryID), cat_equal := -1] gc() ================================================ FILE: avito_data1.R ================================================ #load data sub0 <- fread('input/sampleSubmission_HistCTR.csv') hs0 <- fread('input/testSearchStream.tsv') hs0[, IsClick := -1] ts0 <- fread('input/trainSearchStream.tsv') ts0[, ID:=-c(1:dim(ts0)[1])] cat <- fread('input/Category.tsv') cat$SubcategoryID <- NULL loc <- fread('input/Location.tsv') user <- fread('input/UserInfo.tsv') ad0 <- fread('input/AdsInfo.tsv') v0 <- fread('input/VisitsStream.tsv') s0 <- fread('input/SearchInfo.tsv') pr0 <- fread('input/PhoneRequestsStream.tsv') conserve_ram <- F if(full_data) conserve_ram <- T if(full_data) { s01_user <- user s01_search <- s0 s01_visit <- v0 s01_phone <- pr0 s01_ths <- rbind(ts0, hs0) s01_ads <- ad0 rm(user, s0, v0, pr0, ts0, hs0, ad0) gc() } else { sample_k <- 10 s01_user <- user[UserID %% sample_k == 1, ] s01_search <- s0[UserID %% sample_k == 1, ] s01_visit <- v0[UserID %% sample_k == 1, ] s01_phone <- pr0[UserID %% sample_k == 1, ] system.time(s01_ts <- ts0[SearchID %in% s01_search$SearchID, ]) system.time(s01_hs <- hs0[SearchID %in% s01_search$SearchID, ]) s01_ths <- rbind(s01_ts, s01_hs) s01_ads <- ad0[AdID %in% unique(c(s01_ts$AdID, s01_hs$AdID)), ] rm(user, s0, v0, pr0, ts0, hs0, ad0) gc() } ================================================ FILE: avito_data_merge.R ================================================ if(full_data) { dt1 <- s01_ths2 base_ft <- dt1$search_time_rseq > 1 & !(dt1$UserID %% 50 == 1) & dt1$IsClick >=0 print(sum(base_ft)) } else { dt1 <- s01_ths2[IsClick >=0, ] base_ft <- dt1$search_time_rseq > 2 } #reponse encoding with leave-one-out and credibility adjustment calc_exp2(dt1, base_ft, 'IsClick', c('AdID'), 'exp2_ad', 100, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery'), 'exp2_query', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'AdID'), 'exp2_query_ad', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'Title'), 'exp2_query_title', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('Params'), 'exp2_aparam', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('LocationID', 'Params'), 'exp2_loc_aparam', 100, mean_y0=NULL, verbose=T) dt1[, ad_title_nchar := nchar(Title)] ad_sparam_query_cnt <- dt1[, list(ad_sparam_query_cnt=.N), by=.(AdID, SearchParams, SearchQuery)] user_ad_sum3 <- dt1[, list(user_ad_cnt=.N), by=.(AdID, UserID)] ad_sparam_entropy <- calc_entropy(dt1, 'AdID', c('SearchParams'), 'adid_sparam') ad_query_entropy <- calc_entropy(dt1, 'AdID', c('SearchQuery'), 'adid_query') left_merge_inplace(ad_sparam_entropy, ad_query_entropy, 'AdID', verbose=T) left_merge_inplace(dt1, ad_sparam_query_cnt, by=c('AdID', 'SearchParams', 'SearchQuery'), verbose=T) dt1[, ad_sparam_query_cnt_ratio := ad_sparam_query_cnt * 1.0 / ad_cnt] left_merge_inplace(dt1, user_ad_sum3, by=c('AdID', 'UserID'), verbose=T) left_merge_inplace(dt1, ad_sparam_entropy, by=c('AdID'), verbose=T) use_exp <- T feature_list1 <- c('Position', 'HistCTR', 'IsUserLoggedOn', 'LocationID', 'search_time3', 'search_time_seq', 'user_cat_cnt', 'user_v_cnt', 'user_pcat_cnt', 'user_visit_cat_entropy', 'search_cnt', 'hour', 'dow', 'CategoryID', 'ParentCategoryID', 'query_nchar', 'param_nchar', 'Price', 'ad_title_nchar', 'user_v_cat_avg_price', 'cat_price_diff', 'ad_params_nchar', 'params_nchar_diff', 'ad_param_v_cnt', 'search_ad_cnt', 'search_ad1_cnt', 'search_ad2_cnt', 'search_ad3_cnt', 'ad_position2', 'time_gap_prev_search', 'user_ad_seq', 'user_searchquery_seq', 'ad_cnt', 'user_csum_cads_cnt', 'user_csum_cads_y', 'cat_equal', 'total_other_ads_visit_cnt', 'user_session_seq', 'user_session_no', 'user_search_no', 'user_same_search_seq', "UserAgentID", "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 'user_phone_cnt', 'user_search_loc_entropy', 'user_search_dow_entropy', 'user_search_hour_entropy', 'user_visit_loc_entropy', 'user_visit_dow_entropy', 'user_visit_hour_entropy', 'user_search_sparam_entropy', 'user_visit_param_entropy', 'user_ad_prior_clicks' ) feature_list2 <- c('user_ad_cnt', 'ad_sparam_query_cnt_ratio', 'adid_sparam_entropy', 'adid_query_entropy') feature_list_exp <- c('exp2_ad', 'exp2_query', 'exp2_query_ad', 'exp2_query_title', 'exp2_loc_aparam', 'exp2_aparam') ================================================ FILE: avito_phone.R ================================================ user_phone_cnt <- s01_phone[, list(user_phone_cnt=.N), by=list(UserID)] ================================================ FILE: avito_search.R ================================================ #convert time to an integer (seconds from min(time)) time2 <- strptime(s01_search$SearchDate, '%Y-%m-%d %H:%M:%S') min_time2 <- min(time2) min_time3 <- as.Date(min_time2) time3 <- as.integer(difftime(time2, min_time3, unit='sec')) if(conserve_ram) { rm(time2) } gc() s01_search[, search_time3 := time3] s01_search[, hour := as.integer(round(as.integer(search_time3) %/% 3600)) %% 24] s01_search[, dow := as.integer(round(as.integer(search_time3) %/% 86400)) %% 7] s01_search[, query_nchar := nchar(SearchQuery)] s01_search[, param_nchar := nchar(SearchParams)] user_search_type_sum2 <- s01_search[, .(user_query_ratio=mean(SearchQuery!='')), by=UserID] user_search_query_sum <- s01_search[, .(user_search_nchar_mean=mean(query_nchar), user_search_param_nchar_mean=mean(param_nchar)), by=UserID] user_search_loc_entropy <- calc_entropy(s01_search, 'UserID', 'LocationID', 'user_search_loc') user_search_dow_entropy <- calc_entropy(s01_search, 'UserID', 'dow', 'user_search_dow') user_search_hour_entropy <- calc_entropy(s01_search, 'UserID', 'hour', 'user_search_hour') user_sparam_entropy <- calc_entropy(s01_search, 'UserID', 'SearchParams', 'user_search_sparam') s01_search[, one := 1] s01_search <- s01_search[order(UserID, -search_time3), ] a <- s01_search[, cumsum(one), by=list(UserID)] s01_search[, search_time_rseq := a$V1] s01_search <- s01_search[order(UserID, search_time3), ] a <- s01_search[, cumsum(one), by=list(UserID)] s01_search[, search_time_seq := a$V1] #s01_search <- s01_search[order(UserID, search_time3), ] tmp <- c(NA, head(s01_search$search_time3, -1)) s01_search[, prev_search_time3:=tmp] tmp <- c(NA, head(s01_search$UserID, -1)) s01_search[, prev_user_id:=tmp] s01_search[, prev_search_query:=c(NA, head(SearchQuery, -1))] s01_search[, prev_search_param:=c(NA, head(SearchParams, -1))] s01_search[, time_gap_prev_search := search_time3 - prev_search_time3] s01_search[is.na(prev_user_id) | (UserID != prev_user_id), time_gap_prev_search := (-1)] s01_search[, is_gap := as.numeric(time_gap_prev_search < 0 | time_gap_prev_search > 900)] tmp <- s01_search[, cumsum(is_gap), by=list(UserID)] s01_search[, user_session_no := tmp$V1] tmp <- s01_search[, cumsum(one), by=list(UserID, user_session_no)] s01_search[, user_session_seq := tmp$V1] s01_search[, new_search := as.numeric(is.na(prev_user_id) | UserID != prev_user_id | SearchQuery != prev_search_query | SearchParams != prev_search_param)] tmp <- s01_search[, cumsum(new_search), by=list(UserID)] s01_search[, user_search_no := tmp$V1] tmp <- s01_search[, cumsum(one), by=list(UserID, user_search_no)] s01_search[, user_same_search_seq := tmp$V1] user_search_nzquery_cnt <- s01_search[, sum(SearchQuery != ''), by=list(UserID)] setnames(user_search_nzquery_cnt, 'V1', 'user_search_nzquery_cnt') user_search_cnt <- s01_search[, .N, by=list(UserID)] setnames(user_search_cnt, 'N', 'user_search_cnt') left_merge_inplace(user_search_cnt, user_search_nzquery_cnt, by='UserID') ad_visit_cnt <- s01_visit[, .N, by=list(AdID)] setnames(ad_visit_cnt, 'N', 'ad_visit_cnt') left_merge_inplace(s01_ths, ad_visit_cnt, by='AdID', verbose=T) s01_ths[is.na(ad_visit_cnt), ad_visit_cnt := 0] search_ad_cnt <- s01_ths[, list(.N, sum(ad_visit_cnt)), by=list(SearchID)] setnames(search_ad_cnt, c('N', 'V2'), c('search_ad_cnt', 'total_other_ads_visit_cnt')) search_ad1_cnt <- s01_ths[ObjectType==1, .N, by=list(SearchID)] setnames(search_ad1_cnt, 'N', 'search_ad1_cnt') search_ad2_cnt <- s01_ths[ObjectType==2, .N, by=list(SearchID)] setnames(search_ad2_cnt, 'N', 'search_ad2_cnt') left_merge_inplace(search_ad_cnt, search_ad1_cnt, by='SearchID') left_merge_inplace(search_ad_cnt, search_ad2_cnt, by='SearchID') dt_fill_na(search_ad_cnt) search_ad_cnt[, search_ad3_cnt := search_ad_cnt - search_ad1_cnt - search_ad2_cnt] gc() ================================================ FILE: avito_train_xgb.R ================================================ #fit xgboost model with repetitively smart sampled data (keeping all events, randomly sample non-events) if(use_exp) feature_list <- c(feature_list, feature_list_exp) MSV <- -999 for(vn in feature_list) { if(sum(is.na(dt1[[vn]]))>0) { print(vn) dt1[is.na(dt1[[vn]]), paste(vn, sep='') := MSV] } } y <- dt1$IsClick testing <- F if (full_data) { if(testing) { fv1 <- dt1$search_time_rseq == 2 ft <- y >= 0 & (!fv1) & dt1$UserID %% 10 == 1 fv <- y >= 0 & fv1 } else { fv1 <- dt1$UserID %% 50 == 1 & dt1$search_time_rseq == 2 ft <- y >= 0 & (!fv1) & dt1$search_time_rseq < rseq_limit #& dt1$UserID %% 10 == 1 fv <- y >= 0 & fv1 } fh <- y < 0 } else { ft <- (dt1$search_time_rseq > 2 & dt1$search_time_rseq < 200) fv <- dt1$search_time_rseq <= 2 } xt <- as.matrix(dt1[ft, feature_list, with=F]) yt <- y[ft] xv <- as.matrix(dt1[fv, feature_list, with=F]) yv <- y[fv] if (full_data) { xh <- as.matrix(dt1[fh, feature_list, with=F]) yh <- y[fh] } print(dim(xt)) print(dim(xv)) if(full_data) { print(dim(xh)) if(dim(xh)[1] != dim(sub0)[1]) stop('xh has wrong shape!!!') } MSV2 <- 9999999 if(full_data & !testing) dtest <- xgb.DMatrix(xh, missing = MSV2) dvalid <- xgb.DMatrix(xv, label = yv, missing = MSV2) ds_k <- 50 ds_kt <- 1 if(full_data) { ds_k <- 50 ds_kt <- 1 } xgbs <- list() predv <- 0 predh <- 0 predt <- 0 ctr <- 0 idx_exp <- which(feature_list %in% feature_list_exp) for(i in c(1:n_repeat)) { ctr <- ctr + 1 set.seed(rseed_offset + i) selt <- get_ds_filter(yt, ds_k, ds_kt) selv <- get_ds_filter(yv, ds_k, ds_kt) t_shuffle_idx <- sample(sum(selt)) xt1 <- xt[selt, ][t_shuffle_idx, ] print(dim(xt1)) if(use_exp) xt1[, idx_exp] <- xt1[, idx_exp] * matrix(exp((runif(dim(xt1)[1]*length(feature_list_exp))-.5)*.04), nrow=dim(xt1)[1], ncol=length(feature_list_exp)) dtrain1 <- xgb.DMatrix(xt1, label = yt[selt][t_shuffle_idx], missing = MSV2) dvalid1 <- xgb.DMatrix(xv[selv, ], label = yv[selv], missing = MSV2) watchlist <- list(train = dtrain1, valid = dvalid1) tree_depth <- 11 if(full_data & !testing) tree_depth <- 14 print(paste("tree depth", tree_depth)) ts1 <- proc.time() bst <- xgboost2(dtrain1, max.depth = tree_depth, watchlist = watchlist, colsample_bytree = .4, verbose=1, print.every.n=10, eval_metric='logloss', eta = 0.15, nthread = 16, nround = 150, objective = "binary:logistic", set.seed=i, min_child_weight=1, num_parallel_tree=1) print(proc.time() - ts1) xgbs[[i]] <- bst if(full_data & !testing) { predh0 <- calibrate_ds(predict(bst, dtest), ds_k, ds_kt) predh <- predh + predh0 } pred <- calibrate_ds(predict(bst, dvalid), ds_k, ds_kt) predv <- predv + pred print(paste(i, '-----------------', logloss(yv, predv / ctr))) gc() } ================================================ FILE: avito_utils.R ================================================ #hack for xgboost wrapper to get watchlist into the call xgboost2 <- function(dtrain, params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, watchlist = NULL, maximize = NULL, num_parallel_tree=1, ...) { params <- append(params, list(...)) bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n, early.stop.round = early.stop.round, num_parallel_tree=num_parallel_tree) return(bst) } #compute entropy by group, over subgrp calc_entropy <- function(df, group, subgrp, tgt_vn_prefix) { sum1 <- df[, .N, by=list(df[[group]], df[[subgrp]])] setnames(sum1, c(group, subgrp, 'subgrpcnt')) sum2 <- df[, .N, by=list(df[[group]])] setnames(sum2, c(group, 'cnt')) #sum2[, dummy:=1] #dowdf <- data.table(x=unique(df[[subgrp]]), dummy=1) #setnames(dowdf, c(subgrp, 'dummy')) #sum2a <- merge(sum2, dowdf, by='dummy', allow.cartesian=T) sum3 <- merge(sum2, sum1, by=c(group)) #sum3[is.na(subgrpcnt), subgrpcnt:=0] sum3[, entropy := - log(subgrpcnt * 1.0 / cnt) * subgrpcnt * 1.0 / cnt] sum3[is.na(entropy), entropy := 0] sum4 <- sum3[, sum(entropy), by=list(sum3[[group]])] setnames(sum4, c(group, paste(tgt_vn_prefix, 'entropy', sep='_'))) return(sum4) } #generate smart downsample index get_ds_filter <- function(y, ds_k, ds_kt=1) { r1 <- runif(length(y)) sel <- r1 < 1.0 / ds_k sel[y==1] <- r1[y==1] < 1.0 / ds_kt return(sel) } #calibrate the prediction to original scale (before smart downsample) calibrate_ds <- function(p, ds_k, ds_kt=1) { return(p * ds_kt / (p * ds_kt + (1 - p) * ds_k)) } #response encoder for categorical features, with credibility adjustment and leave-one-out calc_exp2 <- function(dt, ft, vn_y, by, tgt_vn, k, mean_y0=NULL, verbose=F) { dt[, tmp_y := dt[[vn_y]]] tmp <- dt[ft, list(.N, sum(tmp_y), mean(tmp_y)), by=by] if(verbose) print(paste("dim of summary :", dim(tmp))) setnames(tmp, c(by, 'tmp_cnt', 'tmp_sumy', 'tmp_mean_y')) if(is.null(mean_y0)) mean_y0 <- mean(tmp$tmp_mean_y) if(verbose) print(paste("mean_y0 = ", mean_y0)) tmp[, tmp_mean_y := NULL] left_merge_inplace(dt, tmp, by=by, verbose=verbose) dt[is.na(tmp_cnt), tmp_cnt := 0] dt[is.na(tmp_sumy), tmp_sumy := 0] dt[ft, tmp_cnt := tmp_cnt - 1L] dt[ft, tmp_sumy := tmp_sumy - tmp_y] dt[, paste(tgt_vn, sep='') := (tmp_sumy + mean_y0 * k) / (tmp_cnt + k)] dt[, tmp_y := NULL] dt[, tmp_sumy := NULL] dt[, tmp_cnt := NULL] return(0) } #simple logistic loss function logloss <- function(act, pred) { eps = 1e-15; nr <- nrow(pred) pred = pmin(1-eps, pred, pmax(eps, pred)) ll = sum(act*log(pred) + (1-act)*log(1-pred)) #print(ll) ll = ll * -1/length(act) return(ll) } #shift column by n rows add_shift1 <- function(dt, key, n, nv, force=F) { tgt_vn <- paste(key, '_shift', nv, sep='') if(!force) { if(tgt_vn %in% names(dt)) stop(paste('variable ', tgt_vn, 'already exists in the data')) } if(n >0) a <- c(rep(NA, n), head(dt[[key]], -n)) else a <- c(tail(dt[[key]], n), rep(NA, -n)) dt[, tgt_vn := a, with=F] } #shift multiple columns by n rows add_shift <- function(dt, keys, values, ns, force=F) { for(n in ns) { nv <- n if(n < 0) nv <- paste('N', -n, sep='') if(!is.null(keys)) { for(key in keys) { add_shift1(dt, key, n, nv, force) } } for(value in values) { add_shift1(dt, value, n, nv, force) if(!is.null(keys)) { for(key in keys) { dt[dt[[key]] != dt[[paste(key, '_shift', nv, sep='')]], paste(value, '_shift', nv, sep='') := NA, with=F] } } } } } #compute first n component of svd get_svd_u <- function(df1, u, v, x, n, tgt_vn_prefix) { df1[, u_idx:=as.integer(factor(df1[[u]]))] df1[, v_idx:=as.integer(factor(df1[[v]]))] if(is.null(x)) { m1 <- sparseMatrix(i=df1[['u_idx']], j=df1[['v_idx']]) } else { m1 <- sparseMatrix(i=df1[['u_idx']], j=df1[['v_idx']], x=df1[[x]]) } r <- irlba(m1, nu=n, nv=n) dtu <- data.frame(u_idx=c(1:dim(m1)[1])) for(i in c(1:n)) { dtu[[paste(tgt_vn_prefix, i, sep='_')]] <- r$u[, i] } #print(names(dtu)) #print(names(df1)) rmap <- data.table(unique(df1[, c(u, 'u_idx'), with=F])) #print(rmap) dtu1 <- merge(dtu, rmap, by='u_idx') dtu1$u_idx <- NULL return(dtu1) } #wrapper to make calling get_svd_u easier get_svd <- function(df, vn_u, vn_v, n, tgt_vn) { tmp1 <- df[, length(time2), by=list(df[[vn_u]], df[[vn_v]])] setnames(tmp1, c(vn_u, vn_v, 'cnt')) tmp1 <- tmp1[!is.na(tmp1[[vn_v]]), ] tmp1[, x1:=log(cnt + 1)] svd1 <- get_svd_u(tmp1, vn_u, vn_v, 'x1', n, tgt_vn) return(svd1) } #compute ngram/tfidf/svd calc_ngram_svd <- function(df, vn_id, vn_text, NN, tf_min, df_max, ngram_n, svd_n, tgt_vn_prefix) { #df <- em_ev2_seq #vn_id <- "enrollment_id" #vn_text <- 'em_ev2_seq' #NN <- -1 #tf_min <- 10 #df_max <- 10000 #ngram_n <- c(1:2) #svn_n <- 20 #tgt_vn_prefix <- 'em_ev3gram_svd_em' ptm <- proc.time() if(NN<0) NN <- dim(df)[1] texts <- as.list(df[[vn_text]][1:NN]) org_ids <- df[[vn_id]][1:NN] a <- list() for (n1 in ngram_n) { a1 <- textcnt(texts, split='[[:space:]]', n=n1, method='string', recursive = T, verbose = F) a <- c(a, a1) print(paste("done counting words by document for ngram ", n1, length(a))) print(proc.time() - ptm) } a2 <- unlist(a) an <- lapply(a, function(x) length(x)) df1 <- data.frame(id=org_ids, cnt=unlist(an)) ids <- unlist(apply(df1[, c("id", "cnt")], 1, function(x) rep(x[1], x[2]))) dt2 <- data.table(id=ids, ngram=names(a2), cnt=a2) dt3 <- dt2[, length(id), by=list(ngram)] setnames(dt3, c('ngram', 'cnt_total')) dt3 <- dt3[cnt_total >= tf_min, ] dt4 <- merge(dt2, dt3, by='ngram') dt4[, tfidf := log(cnt + 1) * log(NN / cnt_total)] print(paste(dim(dt2), dim(dt3), dim(dt4), sep="|")) setnames(dt4, "id", vn_id) svd1 <- get_svd(dt4, vn_id, 'ngram', svd_n, tgt_vn_prefix) print(proc.time() - ptm) return(svd1) } #write compressed csv.gz my_write_csv <- function(obj, file, row.names=F, timing=T) { st <- proc.time() con <- pipe(paste("pigz -p30 > ", file, ".csv.gz", sep=''), "wb") write.csv(obj, file = con, row.names=row.names) close(con) if(timing) { print(proc.time() - st) } } #fill all missings in DT with value dt_fill_na = function(DT, value=0) { # either of the following for loops # or by number (slightly faster than by name) : for (j in seq_len(ncol(DT))) set(DT,which(is.na(DT[[j]])),j,value) } #merge a small dataset to a large data.table without copying the large data.table left_merge_inplace <- function(dt1, dt2, by, verbose=F, fill.na=NA) { st <- proc.time() dt1a <- copy(dt1[, by, with=F]) dt2a <- copy(dt2[, by, with=F]) if (verbose) { print('small datasets created') print(proc.time() - st) } dt1a[, tmp_idx1 := c(1:dim(dt1a)[1])] dt2a[, tmp_idx2 := c(1:dim(dt2a)[1])] if (verbose) { print('row index created') print(proc.time() - st) } dt3 <- merge(dt1a, dt2a, by=by, all.x=T) if (verbose) { print('small datasets merged') print(proc.time() - st) } dt3 <- dt3[order(tmp_idx1), ] if (verbose) { print('merged dataset reordered') print(proc.time() - st) } dt2_idx_map <- dt3$tmp_idx2 if (verbose) { print('row index generated') print(proc.time() - st) } for(vn in names(dt2)) { if(!(vn %in% by)) { dt1[, paste(vn, sep='') := dt2[[vn]][dt2_idx_map]] if(!is.na(fill.na)) dt1[is.na(dt1[[vn]]), paste(vn, sep=''):=fill.na] if (verbose) { print(paste('assigned variable ', vn)) print(proc.time() - st) } } } } #utility function for ngram based text similarity get_ngram_cnts <- function(texts, ngram_n) { ptm <- proc.time() a <- list() for (n1 in ngram_n) { a1 <- textcnt(texts, split='[[:space:]|[:punct:]]', n=n1, method='string', recursive = T, verbose = F) a <- c(a, a1) print(paste("done counting words by document for ngram ", n1, length(a))) print(proc.time() - ptm) } return(a) } #utility function for ngram based text similarity get_rep_id_dt <- function(a, ids0) { an <- lapply(a, function(x) length(x)) df1 <- data.frame(id=ids0, cnt=unlist(an)) ids <- unlist(apply(df1[, c("id", "cnt")], 1, function(x) rep(x[1], x[2]))) #print(length(ids)) dt1 <- data.table(id=ids, ngram=names(unlist(a)), cnt=unlist(a)) return(dt1) } #text similarity based on cosine distance of ngram/tfidf calc_ngram_dist <- function(dt0, vn_id, vn_text1, vn_text2, ngram_n, tgt_vn) { if(F) { dt0 <- lt0[, ] vn_id <- "id" vn_text1 <- 'user_skill_str' vn_text2 <- 'req_skill_str' NN <- -1 ngram_n <- c(1:2) } ptm <- proc.time() dt0a <- dt0[, c(vn_id, vn_text1, vn_text2), with=F] dt0a[, org_ord:= c(1:dim(dt0a)[1])] dt1 <- data.table(unique(data.frame(dt0a[, c(vn_id, vn_text1, vn_text2), with=F]))) setnames(dt1, c('id', vn_text1, vn_text2)) NN <- dim(dt1)[1] texts1 <- (dt1[[vn_text1]])[1:NN] texts2 <- (dt1[[vn_text2]])[1:NN] org_ids <- dt1[['id']][1:NN] a1 <- get_ngram_cnts(texts1, ngram_n) a2 <- get_ngram_cnts(texts2, ngram_n) dt1_wids <- get_rep_id_dt(a1, org_ids) dt2_wids <- get_rep_id_dt(a2, org_ids) dt12_wids <- rbind(dt1_wids, dt2_wids) dt12_sum <- dt12_wids[, list(total_cnt=sum(cnt)), by=list(ngram)] setnames(dt1_wids, 'cnt', 'cnt1') setnames(dt2_wids, 'cnt', 'cnt2') dt3 <- merge(merge(dt1_wids, dt2_wids, by=c('id', 'ngram'), all.x=T, all.y=T), dt12_sum, by='ngram') dt3[is.na(cnt1), cnt1:=0] dt3[is.na(cnt2), cnt2:=0] dt3_sum <- dt3[, list(doc_cnt=.N), by=list(ngram)] dt4 <- merge(dt3, dt3_sum, by='ngram') setnames(dt4, 'id', vn_id) dt4[, tfidf1 := log(cnt1 + 1) * log(NN / doc_cnt)] dt4[, tfidf2 := log(cnt2 + 1) * log(NN / doc_cnt)] dt5 <- dt4[, list(cos_sim=sum(tfidf1 * tfidf2) / sqrt(sum(tfidf1*tfidf1)*sum(tfidf2*tfidf2))), by=vn_id] setnames(dt5, 'cos_sim', tgt_vn) return(dt5) } ================================================ FILE: avito_visit.R ================================================ time2 <- strptime(s01_visit$ViewDate, '%Y-%m-%d %H:%M:%S') min_time2 <- min(time2) min_time3 <- as.Date(min_time2) time3 <- as.integer(difftime(time2, min_time3, unit='sec')) if(conserve_ram) { rm(time2) } gc() s01_visit[, visit_time3 := time3] s01_visit[, hour := as.integer(round(as.integer(visit_time3) %/% 3600)) %% 24] s01_visit[, dow := as.integer(round(as.integer(visit_time3) %/% 86400)) %% 7] user_visit_dow_entropy <- calc_entropy(s01_visit, 'UserID', 'dow', 'user_visit_dow') user_visit_hour_entropy <- calc_entropy(s01_visit, 'UserID', 'hour', 'user_visit_hour') s01_visit <- s01_visit[order(UserID, visit_time3), ] add_shift(s01_visit, c('UserID'), c('visit_time3'), 1, force=F) user_visit_time_interval_mean <- s01_visit[, .(user_visit_time_interval_mean=mean(visit_time3 - visit_time3_shift1, na.rm=T)), by=UserID] # count of visits in subcategory and category # entropy of subcat and cat setnames(s01_ads, 'LocationID', 'AdLocationID') setnames(s01_ads, 'CategoryID', 'AdCategoryID') left_merge_inplace(s01_visit, s01_ads, 'AdID', verbose=T) setnames(cat, 'CategoryID', 'AdCategoryID') left_merge_inplace(s01_visit, cat, 'AdCategoryID', verbose=T) user_visit_loc_entropy <- calc_entropy(s01_visit, 'UserID', 'AdLocationID', 'user_visit_loc') user_visit_param_entropy <- calc_entropy(s01_visit, 'UserID', 'Params', 'user_visit_param') ad_param_v_cnt <- s01_visit[, .N, by=list(Params)] setnames(ad_param_v_cnt, 'N', 'ad_param_v_cnt') ad_param_avg_price <- s01_visit[, mean(Price), by=list(Params)] setnames(ad_param_avg_price, 'V1', 'ad_param_avg_price') left_merge_inplace(ad_param_v_cnt, ad_param_avg_price, by='Params') user_v_avg_price <- s01_visit[, mean(Price, na.rm=T), by=list(UserID)] setnames(user_v_avg_price, 'V1', 'user_v_avg_price') user_v_cat_avg_price <- s01_visit[, mean(Price), by=list(UserID, AdCategoryID)] setnames(user_v_cat_avg_price, 'V1', 'user_v_cat_avg_price') user_v_price_sum2 <- s01_visit[, .(user_v_min_price=min(Price, na.rm=T), user_v_max_price=max(Price, na.rm=T), user_v_var_price=var(Price, na.rm=T)), by=UserID] user_v_price_sum2[is.infinite(user_v_min_price), user_v_min_price := NA] user_v_price_sum2[is.infinite(user_v_max_price), user_v_max_price := NA] user_cat_cnt <- s01_visit[, .N, by=list(UserID, AdCategoryID)] setnames(user_cat_cnt, "N", "user_cat_cnt") user_pcat_cnt <- s01_visit[, .N, by=list(UserID, ParentCategoryID)] setnames(user_pcat_cnt, "N", "user_pcat_cnt") user_v_cnt <- s01_visit[, .N, by=list(UserID)] setnames(user_v_cnt, "N", "user_v_cnt") left_merge_inplace(user_v_cnt, user_v_avg_price, by='UserID') left_merge_inplace(user_cat_cnt, user_v_cat_avg_price, by=c('UserID', 'AdCategoryID')) user_visit_cat_entropy <- calc_entropy(s01_visit, 'UserID', 'AdCategoryID', 'user_visit_cat') if(conserve_ram) { rm(s01_visit) } gc() ================================================ FILE: avito_xgb1.R ================================================ if(full_data) { dt1 <- s01_ths2 base_ft <- dt1$search_time_rseq > 1 & !(dt1$UserID %% 50 == 1) & dt1$IsClick >=0 print(sum(base_ft)) } else { dt1 <- s01_ths2[IsClick >=0, ] base_ft <- dt1$search_time_rseq > 2 } k <- 100 calc_exp2(dt1, base_ft, 'IsClick', c('AdID'), 'exp2_ad', 100, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery'), 'exp2_query', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'AdID'), 'exp2_query_ad', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('SearchQuery', 'Title'), 'exp2_query_title', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('Params'), 'exp2_aparam', 100, mean_y0=NULL, verbose=T) calc_exp2(dt1, base_ft, 'IsClick', c('LocationID', 'Params'), 'exp2_loc_aparam', 100, mean_y0=NULL, verbose=T) dt1[, ad_title_nchar := nchar(Title)] ad_sparam_query_cnt <- dt1[, list(ad_sparam_query_cnt=.N), by=.(AdID, SearchParams, SearchQuery)] user_ad_sum3 <- dt1[, list(user_ad_cnt=.N), by=.(AdID, UserID)] ad_sparam_entropy <- calc_entropy(dt1, 'AdID', c('SearchParams'), 'adid_sparam') ad_query_entropy <- calc_entropy(dt1, 'AdID', c('SearchQuery'), 'adid_query') left_merge_inplace(ad_sparam_entropy, ad_query_entropy, 'AdID', verbose=T) left_merge_inplace(dt1, ad_sparam_query_cnt, by=c('AdID', 'SearchParams', 'SearchQuery'), verbose=T) dt1[, ad_sparam_query_cnt_ratio := ad_sparam_query_cnt * 1.0 / ad_cnt] left_merge_inplace(dt1, user_ad_sum3, by=c('AdID', 'UserID'), verbose=T) left_merge_inplace(dt1, ad_sparam_entropy, by=c('AdID'), verbose=T) use_exp <- F feature_list1 <- c('Position', 'HistCTR', 'IsUserLoggedOn', 'LocationID', 'search_time3', 'search_time_seq', 'user_cat_cnt', 'user_v_cnt', 'user_pcat_cnt', 'user_visit_cat_entropy', 'search_cnt', 'hour', 'dow', 'CategoryID', 'ParentCategoryID', 'query_nchar', 'param_nchar', 'Price', 'ad_title_nchar', 'user_v_cat_avg_price', 'cat_price_diff', 'ad_params_nchar', 'params_nchar_diff', 'ad_param_v_cnt', 'search_ad_cnt', 'search_ad1_cnt', 'search_ad2_cnt', 'search_ad3_cnt', 'ad_position2', 'time_gap_prev_search', 'user_ad_seq', 'user_searchquery_seq', 'ad_cnt', 'user_csum_cads_cnt', 'user_csum_cads_y', 'cat_equal', 'total_other_ads_visit_cnt', 'user_session_seq', 'user_session_no', 'user_search_no', 'user_same_search_seq', "UserAgentID", "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 'user_phone_cnt', 'user_search_loc_entropy', 'user_search_dow_entropy', 'user_search_hour_entropy', 'user_visit_loc_entropy', 'user_visit_dow_entropy', 'user_visit_hour_entropy', 'user_search_sparam_entropy', 'user_visit_param_entropy', 'user_ad_prior_clicks' ) feature_list2 <- c('user_ad_cnt', 'ad_sparam_query_cnt_ratio', 'adid_sparam_entropy', 'adid_query_entropy') feature_list_exp <- c('exp2_ad', 'exp2_query', 'exp2_query_ad', 'exp2_query_title', 'exp2_loc_aparam', 'exp2_aparam') feature_list <- c(feature_list1, feature_list2) if(use_exp) feature_list <- c(feature_list, feature_list_exp) MSV <- -999 for(vn in feature_list) { if(sum(is.na(dt1[[vn]]))>0) { print(vn) dt1[is.na(dt1[[vn]]), paste(vn, sep='') := MSV] } } y <- dt1$IsClick testing <- T if (full_data) { if(testing) { fv1 <- dt1$search_time_rseq == 2 ft <- y >= 0 & (!fv1) & dt1$UserID %% 10 == 1 fv <- y >= 0 & fv1 } else { fv1 <- dt1$UserID %% 50 == 1 & dt1$search_time_rseq == 2 ft <- y >= 0 & (!fv1) #& dt1$search_time_rseq < 200 #& dt1$UserID %% 10 == 1 fv <- y >= 0 & fv1 } fh <- y < 0 } else { ft <- (dt1$search_time_rseq > 2 & dt1$search_time_rseq < 200) fv <- dt1$search_time_rseq <= 2 } xt <- as.matrix(dt1[ft, feature_list, with=F]) yt <- y[ft] xv <- as.matrix(dt1[fv, feature_list, with=F]) yv <- y[fv] if (full_data) { xh <- as.matrix(dt1[fh, feature_list, with=F]) yh <- y[fh] } print(dim(xt)) print(dim(xv)) if(full_data) { print(dim(xh)) if(dim(xh)[1] != dim(sub0)[1]) stop('xh has wrong shape!!!') } MSV2 <- 9999999 if(full_data & !testing) dtest <- xgb.DMatrix(xh, missing = MSV2) dvalid <- xgb.DMatrix(xv, label = yv, missing = MSV2) ds_k <- 50 ds_kt <- 1 if(full_data) { ds_k <- 50 ds_kt <- 1 } if(F) { seltv <- get_ds_filter(y, ds_k, ds_kt) dt1sel <- dt1[seltv, c('IsClick', feature_list1, feature_list2), with=F] dt1sel[, partition := 0] dt1sel[dt1$search_time_rseq[seltv]==2, partition := 1] write.csv(dt1sel, file='output/dt1sel.csv', row.names=F) } xgbs <- list() predv <- 0 predh <- 0 predt <- 0 ctr <- 0 idx_exp <- which(feature_list %in% feature_list_exp) for(i in c(1:4)) { ctr <- ctr + 1 set.seed(i) selt <- get_ds_filter(yt, ds_k, ds_kt) selv <- get_ds_filter(yv, ds_k, ds_kt) t_shuffle_idx <- sample(sum(selt)) xt1 <- xt[selt, ][t_shuffle_idx, ] print(dim(xt1)) if(use_exp) xt1[, idx_exp] <- xt1[, idx_exp] * matrix(exp((runif(dim(xt1)[1]*length(feature_list_exp))-.5)*.04), nrow=dim(xt1)[1], ncol=length(feature_list_exp)) dtrain1 <- xgb.DMatrix(xt1, label = yt[selt][t_shuffle_idx], missing = MSV2) dvalid1 <- xgb.DMatrix(xv[selv, ], label = yv[selv], missing = MSV2) watchlist <- list(train = dtrain1, valid = dvalid1) tree_depth <- 11 if(full_data & !testing) tree_detph <- 14 ts1 <- proc.time() bst <- xgboost2(dtrain1, max.depth = tree_depth, watchlist = watchlist, colsample_bytree = .4, verbose=1, print.every.n=10, eval_metric='logloss', eta = 0.15, nthread = 4, nround = 150, objective = "binary:logistic", set.seed=i, min_child_weight=1, num_parallel_tree=1) print(proc.time() - ts1) xgbs[[i]] <- bst if(full_data & !testing) { predh0 <- calibrate_ds(predict(bst, dtest), ds_k, ds_kt) predh <- predh + predh0 } pred <- calibrate_ds(predict(bst, dvalid), ds_k, ds_kt) predv <- predv + pred print(paste(i, '-----------------', logloss(yv, predv / ctr))) gc() } dt1v <- dt1[fv, ] f1 <- dt1v$user_ad_prior_clicks > 0 | T f1 <- !is.na(dt1v$exp2v_title_query) dt1v[f1, my_lift((adid2_csum_y + .01*k)/(adid2_csum_cnt + k), predv[f1]/ctr, yv[f1], NULL, 20, print=T)] dt1v[f1, my_lift(predv, predv[f1]/ctr, yv[f1], NULL, 500, print=T)] dt1v[, pred1 := predv/ctr] dt1v<-dt1v[order(SearchID, pred1), ] tmp_pred1_rank <- dt1v[, .(pred_rank = cumsum(one), pred_mean=mean(pred1)), by=list(SearchID)] dt1v[, pred1_rank:=tmp_pred1_rank$pred_rank] dt1v[, pred1_mean:=tmp_pred1_rank$pred_mean] fv1 <- runif(dim(dt1v)[1]) >= .5 fv2 <- !fv1 xvnew <- cbind(xv, predv/ctr) xv1 <- xvnew[fv1, ] xv2 <- xvnew[fv2, ] yv1 <- yv[fv1] yv2 <- yv[fv2] dv1 <- xgb.DMatrix(xv1, label = yv1, missing = MSV2) dv2 <- xgb.DMatrix(xv2, label = yv2, missing = MSV2) watchlist2 <- list(train = dv1, valid = dv2) bst2 <- xgboost2(dv2, max.depth = 3, watchlist = watchlist2, colsample_bytree = .3, verbose=1, print.every.n=10, eval_metric='logloss', eta = .1, nthread = 16, nround = 150, objective = "binary:logistic", set.seed=i, min_child_weight=1) prednew <- predict(bst2, xgb.DMatrix(xvnew, missing = MSV2)) my_lift(dt1v$cat_price_diff[fv2], predv[fv2]/ctr, yv[fv2], NULL, 10) dt1t <- dt1[ft, ] dt1t[, pred := predt / ctr] dt1t1 <- dt1t[UserID %% 100 == 11, ] dt1t[search_time_rseq==3, my_lift(ad_rank_in_search_by_price, pred, IsClick, NULL, 10, print=T)] k <- 100 dt1t1[, my_lift((ad_csum_cads_y + .01*k)/(ad_csum_cads_cnt + k), pred, IsClick, NULL, 10, print=T)] predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,5)) my_write_csv(predh.dt, file='output/sub0', row.names=F) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,5)) my_write_csv(predh.dt, file='output/sub1', row.names=F) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6)) my_write_csv(predh.dt, file='output/sub2_0401x', row.names=F) my_lift(dt1$HistCTR[fh], NULL, predh.dt$IsClick, NULL, 10) logloss( yv, pmin(predv/ctr, .2)) logloss( yv, dt1$HistCTR[fv]) logloss( yv, pmax(pmin(dt1$HistCTR[fv], 0.016), 0.005)) k <- 50 my_lift((dt1$user_time_search_cnt_3sec[fv]), pmin(predv/ctr, .3), yv, NULL, 20, print=T) my_lift(((dt1$CategoryID[!ft] == dt1$AdCategoryID[!ft])), predv/ctr, yv, NULL, 10, print=T) my_lift(((dt1$cat_equal[!ft])), pred, IsClick, NULL, 10) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6)) my_write_csv(predh.dt, file='output/sub3_0400x', row.names=F) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6)) my_write_csv(predh.dt, file='output/sub3x16_0400x', row.names=F) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6)) my_write_csv(predh.dt, file='output/sub3_rseq200_0400x', row.names=F) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6)) my_write_csv(predh.dt, file='output/sub3x16_run2_0400x', row.names=F) predh.dt <- data.frame(ID=dt1$ID[fh], IsClick=round(predh/ctr,6)) my_write_csv(predh.dt, file='output/sub4x16_0400x', row.names=F)