Copy disabled (too large)
Download .txt
Showing preview only (12,010K chars total). Download the full file to get everything.
Repository: DUanalytics/rAnalytics
Branch: master
Commit: ba122862509a
Files: 1497
Total size: 11.2 MB
Directory structure:
gitextract_w2ho1432/
├── .gitignore
├── 0-Practise/
│ ├── day1.R
│ ├── day2.R
│ ├── day3.R
│ ├── first.R
│ ├── htmlimport.R
│ ├── iims2.R
│ ├── import2.R
│ ├── kt1.R
│ ├── lm-sim-test1.R
│ ├── practise.R
│ ├── practise2.R
│ ├── rough.R
│ └── vector.R
├── 0-Rdata/
│ ├── madata.Rdata
│ ├── student.rds
│ ├── student1.rds
│ ├── twitter authentication.Rdata
│ └── twitterauthentication.Rdata
├── 0-class/
│ ├── AR-groceries.R
│ ├── CLUST-customer.R
│ ├── DT-CART-sales.R
│ ├── NAvalues.R
│ ├── autoML1.R
│ ├── bigQuery.R
│ ├── hhe_d01.R
│ ├── hhe_d02.R
│ ├── hhe_d03.R
│ ├── hhe_d04.R
│ ├── hhe_d04b.R
│ ├── lm_AIC.R
│ ├── missingValues.R
│ ├── munaz.csv
│ └── purchaseProb.R
├── 01-IIM/
│ ├── 10a-daily.R
│ ├── 11-analyticLevels.R
│ ├── 11a1-start.R
│ ├── 11a2-packages1.R
│ ├── 11a3-packages2.R
│ ├── 11a4-packages3.R
│ ├── 11a5-packages4.R
│ ├── 11b2-DS1.R
│ ├── 11b3-DS2-factor.R
│ ├── 11b4-DS4-reproduce.R
│ ├── 11b5-DS3.R
│ ├── 11e2_vectors1.R
│ ├── 11e3_matrices1.R
│ ├── 11e4_dataframe1.R
│ ├── 12a3-impexp1.R
│ ├── 12a4-datasets.R
│ ├── 12a4-impexp-xls.R
│ ├── 12e4-impexp-gs.R
│ ├── 13a2-NAvalues.R
│ ├── 13b2-outliers.R
│ ├── 15a2-GPH-basic.R
│ ├── 15a3-GPH-graphs.R
│ ├── 15a4-GPH-advgraphs.R
│ ├── 16b1-GPH-wordcloud.R
│ ├── 16e0-GPH-wordcloud.R
│ ├── 16e3-GPH-wordcloud-text1.R
│ ├── 16e4-GPH-wordcloud-text2.R
│ ├── 17a2-STATS-freqtable.R
│ ├── 17c1-STATS-datapartition.R
│ ├── 17d2-STATS-basicstats.R
│ ├── 18d1-DPLYR-mtcars1.R
│ ├── 18d3-DPLYR-mtcars2.R
│ ├── 18d4-DPLYR-dplyr.R
│ ├── 21a1-SLM-women.R
│ ├── 21a2-SLM-women.R
│ ├── 21a4-SLM-women.R
│ ├── 21a5-SLM-women-A.R
│ ├── 21b1-SLM-sales.R
│ ├── 21b3-SLM-areasales.R
│ ├── 21b4-SLM-salesarea.R
│ ├── 21d2-MLM-mtcars1.R
│ ├── 22a4-MLM-allmodels.R
│ ├── 22c2-MLM-mtcars-olsrr.R
│ ├── 22c3-MLM-cars.R
│ ├── 22c3-MLM-salespromotion.R
│ ├── 22d3-MLM-omni.R
│ ├── 22d4-MLM-sales-TV.R
│ ├── 23c1-LOGR-logR.R
│ ├── 23d1-LGR-gre.R
│ ├── 24b1-LOGR-purchase.R
│ ├── 24c2-LOGR-adult.R
│ ├── 24d2-LOGR-gre.R
│ ├── 24e2-LOGR-general.R
│ ├── 24g1-LOGR-cancer.R
│ ├── 24g2-LOGR-sample1.R
│ ├── 31a1-DT-cart-split.R
│ ├── 31a2-DT-outlook.R
│ ├── 31a3-DT-general.R
│ ├── 31a3-DT-practiseCase.R
│ ├── 31b1-DT-CART-carseats.R
│ ├── 31b2-DT-CART-sales.R
│ ├── 31b3-DT-CART-titanic.R
│ ├── 31b4-DT-CART-loan.R
│ ├── 31b5-DT-CART-gre.R
│ ├── 31b5-DT-loanapproved1.R
│ ├── 31b5-DT-rpart-iris.R
│ ├── 31b7-DT-party.R
│ ├── 31f1-DT-cart2.R
│ ├── 32a4-DT-cart1.R
│ ├── 32a4-DT-rf-4.R
│ ├── 32c2-DT-chaid-usvote.R
│ ├── 32d1-DT-CHAID-usvote.R
│ ├── 32d5-DT-CART-RF.R
│ ├── 32k1-CLS-svm1.R
│ ├── 33g3-CLS-randomForest1.R
│ ├── 34a1-CLUST-clustering.R
│ ├── 34a1-CLUST-samplecase.R
│ ├── 34b2-CLUST-customer.R
│ ├── 34d1-CLUST-creditData.R
│ ├── 34e4-clust-NOC-iris.R
│ ├── 34g1-CLUST-segmentation.R
│ ├── 34h1-CLUST-clustering.R
│ ├── 34h1-CLUST-clustering2.R
│ ├── 34j2-CLUST-NOC.R
│ ├── 35d1-CLUST-hclust1.R
│ ├── 35d2-CLUST-hclust2.R
│ ├── 37a2-AR-eg.R
│ ├── 37b1-AR-groceries.R
│ ├── 37b2-AR-samplecase.R
│ ├── 37b3-AR-groceries-subset.R
│ ├── 37b5-AR-finproducts.R
│ ├── 37b5-AR-groceries.R
│ ├── 37c3-AR-redundant.R
│ ├── 44b1-TS-data.R
│ ├── 44b1-TS-dates.R
│ ├── 44c2-TS-dates-lubridate.R
│ ├── 44c4-TS-xts-data.R
│ ├── 45d2-TS-airpassengers.R
│ ├── 45d3-TS-components-airp.R
│ ├── 45d4-TS-johnson.R
│ ├── 45g3-TS-TTR-ma.R
│ ├── 46e3-TS-auto-arima-johnson.R
│ ├── 55c1-TM-twitter.R
│ ├── 55e1-TM-twitter1.R
│ ├── 61b1-LP-marketingspend.R
│ ├── 61c2-LP-marketingspend-case.R
│ ├── 61d2-LP-tpt.R
│ ├── 61e2-LP-machassign.R
│ ├── 61e5-LP-farmer1.R
│ ├── 77a1-FA-quandl.R
│ ├── 77a2-FA-quandl2.R
│ ├── 77a5-FA-quantmod.R
│ ├── 77a6-FA-quantmod-I-stocks.R
│ ├── 77f2-FA-quantmod1.R
│ ├── 77f3-FA-indianstocks.R
│ ├── 91ab3-Case-student1.R
│ ├── 91b4-Case-dencoCase.R
│ ├── 91b5-Case-denco.R
│ ├── 91g4-CASE-dencoCase.R
│ ├── revision1.R
│ ├── test.R
│ ├── x55a1-TM-tweets.R
│ ├── x55a2-TM-twitter.R
│ └── x55a3-TM-twitter2.R
├── 02-IIMcases/
│ ├── DT_diabetis.R
│ ├── case-denco.R
│ ├── case-dplyr-mtcars.R
│ ├── rev_iima20_1.R
│ ├── revision1.R
│ ├── senitmentTwitter.R
│ └── textMiningSentence.R
├── 03-setup/
│ ├── 11a-start.R
│ ├── 11b-gettingstarted.R
│ ├── 12-packageInstall.R
│ ├── 15a-envrm.R
│ ├── 15b-renv.R
│ ├── 15e-rjava.R
│ ├── 16a-pathconfig.R
│ ├── 17a-rstudio.R
│ ├── 18a-processtime.R
│ ├── 21a-floorceiling1.R
│ ├── 21b-options.R
│ ├── 24a-github.R
│ ├── 25a-help.R
│ ├── 51c-deletefiles.R
│ ├── envVar.R
│ ├── help.R
│ └── pathconfig.R
├── 04-lib/
│ ├── 10a-fBasics.R
│ ├── 11a-pysch.R
│ ├── 21b-installpackages.R
│ ├── 21e-installFmGit.R
│ ├── 21g-packages1.R
│ ├── 31b-datasets.R
│ ├── 31c-datasets.R
│ ├── 41-purrr1.R
│ ├── 42-purr2.R
│ ├── 43-purrr3.R
│ ├── 44-purrr4.R
│ ├── 45-purrr5.R
│ ├── 51-plyr1.R
│ ├── 61-splitapplycombine1.R
│ ├── 62-splitapplycombine2.R
│ ├── 71-broom1.R
│ ├── packages1.R
│ ├── switchr.R
│ ├── useful.R
│ └── useful2.R
├── 05-dataIE/
│ ├── 14a-readcsv.R
│ ├── 14b-readcsv.R
│ ├── 14c-importweb.R
│ ├── 14d-importweb.R
│ ├── 14e-readothers.R
│ ├── 15b-datawrangling.R
│ ├── 20a-importgg.R
│ ├── 21b-googlesheet1.R
│ ├── 22b-ggsheets2.R
│ ├── 31a-export.R
│ ├── 32c-writecsv.R
│ ├── datasets.R
│ └── importExcel.R
├── 06-DS/
│ ├── 0FileList.R
│ ├── 10a-TOC
│ ├── 13b-ds-blank.R
│ ├── 14b-Basic_R_v01.R
│ ├── 14b-objectsmethods.R
│ ├── 14c-ds1.R
│ ├── 15a-objects.R
│ ├── 16b-datatypes.R
│ ├── 16c-basicDT.R
│ ├── 16d-ds1.R
│ ├── 20a-vectors1.R
│ ├── 20b-vectors2.R
│ ├── 20c-vectors3.R
│ ├── 20d-vectors4.R
│ ├── 20f-vectors5.R
│ ├── 20g-vectors6.R
│ ├── 25a-matrices.R
│ ├── 25c-matrices.R
│ ├── 25d-matrices.R
│ ├── 25e-matrices.R
│ ├── 27a-arrays.R
│ ├── 27b-arrays.R
│ ├── 27d-arrays.R
│ ├── 30c-basicdatatypes.R
│ ├── 30d-ds1.R
│ ├── 30e-datatypes.R
│ ├── 33b-df.R
│ ├── 33c-df.R
│ ├── 35a-lists.R
│ ├── 35b-lists.R
│ ├── 35e-lists.R
│ ├── 38a-factors.R
│ ├── 38b-factors.R
│ ├── 38c-factors.R
│ ├── 38e-factors.R
│ └── factors_iims.R
├── 07-excel/
│ ├── excelData.xlsx
│ ├── impExpExcel.R
│ ├── importFmExcel.R
│ ├── importFmGS.R
│ ├── mtcars.xlsx
│ ├── student1.R
│ └── student2.xlsx
├── 08-MBB/
│ ├── 01-MBB.R
│ ├── 02-MBB.R
│ ├── 03-MBB.R
│ ├── 04-MBB.R
│ ├── 05-MBB.R
│ └── 06-MBB.R
├── 10-packages/
│ ├── analyzer.R
│ ├── autoreg.R
│ ├── dttr2.R
│ ├── ggpmisc.R
│ ├── packages.txt
│ ├── timeplyr.R
│ └── tsutils.R
├── 11-stats/
│ ├── 10-statslinks.R
│ ├── 10a-distributions.R
│ ├── 10c-allstats.R
│ ├── 11a-normal.R
│ ├── 11b-normalq.R
│ ├── 11c-normald.R
│ ├── 12a-binomial.R
│ ├── 13a-mean.R
│ ├── 14a-median.R
│ ├── 15a-mode.R
│ ├── 15b-mode.R
│ ├── 16a-range.R
│ ├── 17a-sd.R
│ ├── 18a-covariance.R
│ ├── 19a-correlation.R
│ ├── 20a-coev.R
│ ├── 37a-sample1.R
│ ├── 40a-missing1.R
│ ├── 40b-missing1.R
│ ├── 40c-missing2.R
│ ├── 40d-missing3.R
│ ├── 42a-outlier1.R
│ ├── 43a-outliers1.R
│ ├── 45a-sampling.R
│ ├── 55a-traintest1.R
│ ├── 60a-kurtosis.R
│ ├── 60b-kurtosis.R
│ ├── 64a-skewness.R
│ ├── 64b-skewness.R
│ ├── 71a-xtabs.R
│ ├── ave1.R
│ ├── interactions.R
│ ├── mean1.R
│ ├── mean2.R
│ ├── mean3.R
│ ├── meandev.R
│ ├── meanwt1.R
│ ├── median1.R
│ ├── mode1.R
│ ├── normal_height.R
│ ├── normality.R
│ ├── normality2.R
│ ├── outlier2.R
│ ├── outliers1.R
│ ├── poiss1.R
│ └── quantile1.R
├── 13-statsH/
│ ├── 20b-distributions.R
│ ├── 23b-ztest-bsda.R
│ ├── 25c-tdistribution.R
│ ├── 26b-ttestindep.R
│ ├── 26c-ttestpaired.R
│ ├── 27b-TTS1-case1.R
│ ├── 28b-TTS2-case1.R
│ ├── 28c-TTS1-case3.R
│ ├── 28e-TT-sample1i.R
│ ├── 31b-chisqdistr.R
│ ├── 32b-HT-chisq1.R
│ ├── 32c-HT-chisq2.R
│ ├── 33b-HT-chisq.R
│ ├── 33d-chisqtest1.R
│ ├── 33e-chisqtest2.R
│ ├── 34b-goodnessfit.R
│ ├── datadistr.R
│ ├── htestnd1.R
│ ├── randomdistr.R
│ ├── shadeareainplot.R
│ └── tests1.R
├── 15-sampling/
│ ├── 12b-samples.R
│ ├── 12e-sampleint.R
│ ├── 14b-stratified.R
│ ├── 15b-samplesplit.R
│ ├── 15d-datapartition.R
│ └── 15f-partitionfolds.R
├── 22-summary/
│ ├── 22b-aggregate.R
│ ├── 22c-aggregate2.R
│ ├── 23b-freqdistr1.R
│ ├── 23c-freqdistr2.R
│ ├── 23d-freqdistr3.R
│ ├── 23f-FD.R
│ ├── 23f-freqdistr.R
│ ├── 24b-freqdistr4.R
│ ├── 24f-freqdistr5.R
│ ├── 25g-freqdistr6.R
│ ├── 31c-rowsums1.R
│ ├── 32b-addmargin1.R
│ ├── 32c-margintable1.R
│ ├── 32d-proptable.R
│ ├── 32d-tableprop2.R
│ ├── 35b-crosstab.R
│ ├── 99a-Pskim.R
│ ├── 99a-studentdata1.R
│ └── descriptive.R
├── 23-functions/
│ ├── 21b-rep.R
│ ├── 21c-seq.R
│ ├── 21g-replicate.R
│ ├── 21g-seqdates.R
│ ├── 22b-letters.R
│ ├── 25b-interval.R
│ ├── 25c-midpoint.R
│ ├── 27b-recode-car.R
│ ├── 29b-subset.R
│ ├── 29c-split1.R
│ ├── 29d-splitdata.R
│ ├── 29e-partitiondata.R
│ ├── 31b-rowcol1.R
│ ├── 33b-sortorder.R
│ ├── 33c-order.R
│ ├── 33c-sortorderrank.R
│ ├── 33d-rank.R
│ ├── 34b-castmelt1.R
│ ├── 34c-castmelt2.R
│ ├── 37a-mtcars-subset.R
│ ├── 37b-duplicates1.R
│ ├── 37c-unique.R
│ ├── 38b-scale1.R
│ ├── 41b-randnos1.R
│ ├── 41c-randnos.R
│ ├── 42b-normdist.R
│ ├── 45b-forloop1.R
│ ├── 45e-ifelse2.R
│ ├── 45v-switch1.R
│ ├── 46b-withoutapply.R
│ ├── 46c-applyForCompare.R
│ ├── 46d-applyfamily.R
│ ├── 46e-applytype.R
│ ├── 46f-while1.R
│ ├── 47b-apply1.R
│ ├── 47c-apply.R
│ ├── 47d-apply1.R
│ ├── 47h-tapply1.R
│ ├── 47j-lapply1.R
│ ├── 47m-mapply1.R
│ ├── 47n-mapply2.R
│ ├── 47o-rapply.R
│ ├── 47on-eapply.R
│ ├── 47p-sapply1.R
│ ├── 47q-sapply2.R
│ ├── 47s-tapply2.R
│ ├── 47t-vapply1.R
│ ├── 49b-replicate1.R
│ ├── 49c-replicate.R
│ ├── 49e-by.R
│ ├── 49f-by.R
│ ├── 49g-bywith.R
│ ├── 51b-myfunc.R
│ ├── 51c-functions1.R
│ ├── 53b-cbindrbind1.R
│ ├── 53c-joinDFs.R
│ ├── 53c-merge1.R
│ ├── 54b-combination.R
│ ├── 54d-expandgrid.R
│ ├── 55b-sweep1.R
│ ├── 55d-sweep2.R
│ ├── 56b-outer1.R
│ ├── 56c-outer2.R
│ ├── 57b-stack1.R
│ ├── 58-DF-common.R
│ ├── 58-df-matching1.R
│ ├── 58-df2.R
│ ├── 58-hmisc.R
│ ├── 58-pmatchchar.R
│ ├── 61c-missing1.R
│ ├── 61c-missing2.R
│ ├── 61c-missing3.R
│ ├── 62b-outlier.R
│ └── 62c-outlier2.R
├── 24-Strings/
│ ├── abvn.R
│ ├── latex.R
│ ├── output.txt
│ ├── paste1.R
│ ├── setop1.R
│ ├── strcmpt1.R
│ ├── string1.R
│ ├── strjoin.R
│ ├── strlength.R
│ ├── strman1.R
│ ├── strman2.R
│ ├── strman3.R
│ ├── strman4.R
│ ├── strman5.R
│ ├── strman6.R
│ ├── strman7.R
│ ├── strman9.R
│ ├── strprint1.R
│ ├── strreplace1.R
│ ├── strsearch.R
│ ├── strsplit1.R
│ ├── strsplit2.R
│ ├── strsplit3.R
│ ├── strsplit4.R
│ └── tidyr-strseperate.R
├── 31-graphs/
│ ├── 1bubblechart.R
│ ├── 1bubblechart2.R
│ ├── multipleplots1.R
│ └── tableGrob.R
├── 32-basicGraphs/
│ ├── 10a-graphs.R
│ ├── 10b-graphs.R
│ ├── 12b-graphs2.R
│ ├── 12d-title1.R
│ ├── 12e-text.R
│ ├── 12f-abline.R
│ ├── 12g-legend.R
│ ├── 12k-tick.R
│ ├── 12m-axis1.R
│ ├── 13e-multipleplots1.R
│ ├── 13f-multipleplots.R
│ ├── 13g-subplot.R
│ ├── 15a-graphdata1.R
│ ├── 15b-graph1.R
│ ├── 21b-plot-hist1.R
│ ├── 21c-plot.R
│ ├── 23b-line.R
│ ├── 23c-lines2.R
│ ├── 24b-histogram.R
│ ├── 24c-histogram2.R
│ ├── 25b-barplot.R
│ ├── 25c-barplot2.R
│ ├── 26b-boxplot.R
│ ├── 26c-boxplot2.R
│ ├── 26d-boxplot2.R
│ ├── 27b-pie.R
│ ├── 27c-pie2.R
│ ├── 29b-corrgram1.R
│ ├── 32b-freqdistr.R
│ ├── 33b-dotplot.R
│ ├── 33b-matrixplots.R
│ ├── 37b-scatter.R
│ ├── 42b-intplots1.R
│ ├── 43b-mosaic.R
│ ├── 43c-corrplot.R
│ ├── 43c-ggally.R
│ ├── 44b-textplots.R
│ ├── 45b-violinplot.R
│ ├── ria2g1.R
│ ├── ria2g2.R
│ ├── ria2g3.R
│ └── ria3g3.R
├── 33-AdvGraphs/
│ ├── cowplot1.R
│ ├── donut.R
│ ├── donut2.R
│ ├── esquisse.R
│ ├── lattice.R
│ ├── lattice1.R
│ ├── plotsToWord.R
│ ├── survey.R
│ ├── symbols.R
│ ├── vtree1.R
│ ├── waffle.R
│ └── xxggsubplot.R
├── 33-DT/
│ ├── 0-DTsummary.R
│ ├── 1-dt1.R
│ ├── 2-DT.R
│ └── 3-DT.R
├── 34-ggplots/
│ ├── circbarplot.R
│ ├── gg-apexcharts.R
│ ├── gg-bar1.R
│ ├── gg-bar2.R
│ ├── gg-box2.R
│ ├── gg-boxhist.R
│ ├── gg-boxplot1.R
│ ├── gg-halves.R
│ ├── gg-heatmap.R
│ ├── gg-hist1.R
│ ├── gg-hline.R
│ ├── gg-hvlines.R
│ ├── gg-labelend.R
│ ├── gg-legend1.R
│ ├── gg-line.R
│ ├── gg-pie1.R
│ ├── gg-slope.R
│ ├── gg-slope2.R
│ ├── ggbarplot.R
│ ├── ggbarplots.R
│ ├── ggboxhist.R
│ ├── ggboxplot2.R
│ ├── gghistogram.R
│ ├── ggp2.R
│ ├── ggplot-DU1.R
│ ├── ggplot-legend1.R
│ ├── ggplot3.R
│ ├── ggplot5.R
│ ├── ggplot6.R
│ ├── ggplot7.R
│ └── twoaxis-gg.R
├── 35-tidyverse/
│ ├── 20a-dplyr.R
│ ├── 21a-dplyr-select.R
│ ├── 21b-dplyr-slice1.R
│ ├── 21c-dplyr-mutate1.R
│ ├── 21d-dplyr-summarise.R
│ ├── 21e-dplyr-filter1.R
│ ├── 21f-dplyr-str.R
│ ├── 21g-dplyr-arrange.R
│ ├── 22b-dplyr-seperate1.R
│ ├── 22b-group.R
│ ├── 22c-summarise.R
│ ├── 22g-tibble-rownames.R
│ ├── 25b-magrittr.R
│ ├── 26c-tidyr-DSR1.R
│ ├── 26d-tidyr-DSR-who.R
│ ├── 31b-plyr1.R
│ ├── 32b-plyr-mutate.R
│ ├── 33d-dplyr-joins.R
│ ├── 33f-dplyr-split.R
│ ├── plyr-ddply-gpsum.R
│ ├── tidyr1.R
│ ├── zz-dplyr1.R
│ └── zz-tidy-dataformating.R
├── 41-LM/
│ ├── 10a-lm-women2.R
│ ├── 10b-lm-salesarea2.R
│ ├── 10c-MLR-omni.R
│ ├── 10e-lm-errorplot.R
│ ├── 13b-lm-commands.R
│ ├── 16b-SLM-women2.R
│ ├── 16c-SLM-women1.R
│ ├── 16e-SLM-women-A.R
│ ├── 16f-SLM-women-V.R
│ ├── 16f-SLM-women.R
│ ├── 16m-SLM-women2.R
│ ├── 17a-LM-case1.R
│ ├── 17b-LM-stock1.R
│ ├── 18a-SLM-salesarea.R
│ ├── 18b-SLM-salesarea.R
│ ├── 18c-SLM-salesarea.R
│ ├── 23a-MLM-omni.R
│ ├── 23c-MLM-omni.R
│ ├── 24a-MLM-pcsales.R
│ ├── 25a-MLM-mtcars.R
│ ├── 25c-MLM-mtcars.R
│ ├── 25c-MLM-mtcars1.R
│ ├── 25d-MLM-mtcars-A.R
│ ├── 26a-MLM-airquality.R
│ ├── 27a-MLM-marketing.R
│ ├── 35a-MLM-case1.R
│ ├── 37a-LM-dummy-fireplace.R
│ ├── 37b-dummy1.R
│ ├── 38c-LM-dummy1.R
│ ├── 41c-LM-assumptions.R
│ ├── 42b-LM-linearity.R
│ ├── 42c-LM-normality.R
│ ├── 42d-LM-variance.R
│ ├── 42e-LM-outliers.R
│ ├── 42f-LM-autocorr.R
│ ├── 42g-LM-influentialvariables.R
│ ├── 42h-LM-multicollinearity.R
│ ├── 42j-gvlma.R
│ ├── 43a-LM-graphs.R
│ ├── LM-all-mtcars1.R
│ ├── ProbDist.R
│ ├── Simulation.R
│ ├── confusionmatrix.R
│ ├── contrasts1.R
│ ├── dummies.R
│ ├── homosecadicity.R
│ ├── lm-broom.R
│ ├── lm-dummy1.R
│ ├── lm-housing.R
│ ├── lm-mtcars1.R
│ ├── lm-mtcars2.R
│ ├── lm-plot1.R
│ ├── lm-segments1.R
│ ├── mlm-state77.R
│ ├── multvariate1.R
│ ├── plotcoef1.R
│ └── regrplot1.R
├── 44-LogR/
│ ├── 24c-LR-default.R
│ ├── 24d-LR-default.R
│ ├── 24e-LR-default.R
│ ├── 24g-LR-default-accuracy.R
│ ├── 26b-LR-germancredit.R
│ ├── 27b-LR-gre.R
│ ├── 28b-LR-subscribe.R
│ ├── 28c-LR-subscribe.R
│ ├── 29b-LR-ads.R
│ ├── 31b-LR-income.R
│ ├── 31c-income.R
│ ├── 33b-LR-purchase.R
│ ├── 45b-compareAUC.R
│ ├── 45c-roc-default.R
│ ├── 45e-roc-general.R
│ ├── 45f-roc1.R
│ ├── 45h-roc2.R
│ ├── 46c-accuracy.R
│ ├── 48b-auc1.R
│ ├── 48c-auc1.R
│ ├── 48d-auc.R
│ ├── 48e-auc.R
│ ├── 49c-thresholdvalue.R
│ ├── pdpu.R
│ └── zz--logR.R
├── 51-DT/
│ ├── cls-gen
│ ├── cls1M-cancer.R
│ ├── dt-multiplemodels.R
│ ├── giniIndex.R
│ └── rattle.R
├── 52-CART/
│ ├── 10-CART-gen.R
│ ├── 11-cart-understandsplit.R
│ ├── 12-DT-outlook.R
│ ├── 12-IIMBG-wksp.R
│ ├── 12-IIMJ-wksp.R
│ ├── 12-IIMS-wksp.R
│ ├── 12-IITB-wksp.R
│ ├── CARTR_sales.R
│ ├── CART_Regression Tree v01.R
│ ├── DT-germanCredit.R
│ ├── DT-rpart-claims.R
│ ├── c-dt-rpart-Case-DU1.R
│ ├── c-dt-rpart-iris.R
│ ├── c-dt-rpart-sales1.R
│ ├── cls-cart-churn2.R
│ ├── cls-rpart-plot2.R
│ ├── dt-car.R
│ ├── dt-general.R
│ ├── dt-glaucoma.R
│ ├── dt-ionos1.R
│ ├── dt-iris1.R
│ ├── dt-kyphosis.R
│ ├── dt-loanapproved1.R
│ ├── dt-rpart-du.R
│ ├── dt-rpart-du1.R
│ ├── dt-rpart-du2.R
│ ├── dt-rpart-du3.R
│ ├── dt-rpart-metal.R
│ ├── dt-rpart-student1.R
│ ├── dt-rpart-text1.R
│ ├── dt-rpart-varimp1.R
│ ├── dt-rpart-varimp2.R
│ ├── dt-sleep.R
│ ├── dt-tree-car1.R
│ ├── dt3-eyes.R
│ ├── entropy.R
│ ├── multimodel.R
│ ├── tree-houseprices.R
│ └── zz-test.R
├── 53-splitcriteria/
│ ├── cls-entropy.R
│ ├── dt-rpart-criteria.R
│ ├── splitcriteria1.R
│ ├── splitcriteria2.R
│ └── splitcriteria3.R
├── 54-KNN/
│ ├── knn1_cancer.R
│ ├── knn2.R
│ ├── knn3.R
│ ├── knn4.R
│ ├── knn_diamonds.R
│ └── knn_iris.R
├── 54-NLM/
│ ├── nlm1.R
│ └── nlm2-mtcars.R
├── 55-CHAID/
│ ├── CHAID-nps2.R
│ ├── CHAID-xsell1.R
│ ├── c-dt-chaid-nps.R
│ ├── c-dt-chaid-usvote1.R
│ ├── chaid-attrition.R
│ ├── chaid-cancer.R
│ ├── chaid-usvote.R
│ ├── chaid2.R
│ ├── chaid4.R
│ ├── chisq.R
│ └── chisqtest2.R
├── 56-ctree/
│ ├── CTREE NPS R code v01.R
│ ├── ctree-KyCU.R
│ ├── ctree-airquality.R
│ ├── ctree-churn2.R
│ ├── ctree-clsregr-party.R
│ ├── ctree-clsregr.R
│ ├── ctree-readingskills.R
│ ├── ctree2-iris.R
│ ├── ctreee-iris.R
│ └── dt-ctree-playYes.R
├── 57-GLM/
│ ├── Logr-party.R
│ ├── crossfold.R
│ ├── crossval1.R
│ ├── cv-houseprices.R
│ ├── cv-women1.R
│ ├── cv3.R
│ ├── cvlm2.R
│ ├── glm-affairs1.R
│ ├── glm-affairs2.R
│ ├── glm-cars.R
│ ├── glm-titanic1.R
│ ├── logR1.R
│ ├── logpos1.R
│ ├── logr-mtcars.R
│ ├── logr-mtcars1.R
│ ├── logrMaths.R
│ ├── logreg-iris1.R
│ ├── multinominal.R
│ ├── multinominal2.R
│ ├── multinominal3.R
│ ├── multinominal4.R
│ ├── multinominal5.R
│ ├── multinominal6.R
│ └── nls1.R
├── 57-RF/
│ ├── dt-caret-xxx.R
│ ├── dt-rf-DU3.R
│ ├── dt-rf-eg2.R
│ ├── dt-rf-eg3.R
│ └── dt-rf-kyphosis1.R
├── 57-naive/
│ ├── naivbayes1.R
│ └── naivbayes2.R
├── 61-clust/
│ ├── 10-clust-packages.R
│ ├── 16b-km-withinss.R
│ ├── 17b-clust-noclusters1.R
│ ├── 17c-clust-numbers-iris.R
│ ├── 17d-noc-mclust.R
│ ├── 19b-clust-distances.R
│ ├── 19c-clust-distances.R
│ ├── 19d-clust-scaling.R
│ ├── 20b-clust-plots.R
│ ├── 20c-clust-plots2.R
│ ├── 23b-km-marks1.R
│ ├── 23c-km-marks2.R
│ ├── 23d-km-amap-marks3.R
│ ├── 23e-km-student2.R
│ ├── 24b-clust-women.R
│ ├── 25b-km-iris.R
│ ├── 25c-km-iris2.R
│ ├── 25f-km-iris2.R
│ ├── 26h-km-attitude.R
│ ├── 27c-clust-som1.R
│ ├── 33c-hc-nutrients1.R
│ ├── 33c-hc-vegan-dune1.R
│ ├── 33d-hc-protein.R
│ ├── 33f-hc-marks.R
│ ├── 33g-hc-sample.R
│ ├── 35d-pam-iris.R
│ ├── 35e-pam-nutrient.R
│ ├── 40b-mixedclust1.R
│ ├── 40c-clust-dendgm.R
│ ├── 43b-clust-mixedDataTypes1.R
│ ├── 45c-clustering-exist1.R
│ ├── 45e-clustering-animation1.R
│ ├── 50b-clust-ma1.R
│ ├── 50c-clust-ma2.R
│ ├── 50d-clust-ma3.R
│ ├── 61b-clust-custsegm.R
│ ├── animation2.R
│ ├── clust-allcustering.R
│ ├── clust-case-liberty.R
│ ├── clust-class-differences.R
│ ├── clust-compare.R
│ ├── clust-distance-calc.R
│ ├── clust-distance2.R
│ ├── clust-entropy.R
│ ├── clust-iterations.R
│ ├── clust-kselect.R
│ ├── clustering-seeds-dunn.R
│ ├── clustering-women.R
│ ├── clusters3.R
│ ├── hclust-USarrests.R
│ ├── hier-simplecase.R
│ ├── hier-usarrests.R
│ ├── iris.R
│ ├── kmeans-bankdata.R
│ ├── kmeans-pcalike.R
│ ├── kmeans-plots.R
│ ├── kmeans-randomness.R
│ └── pam1.R
├── 65-AR/
│ ├── 11a-measures1.R
│ ├── 12a-ar-samplecase.R
│ ├── 12b-ar-samplecase2.R
│ ├── 14a-ar-datastr.R
│ ├── 15-ar-groceries.R
│ ├── 15a-ar-Groceries1.R
│ ├── 15b-ar-Groceries.R
│ ├── 16b-groceries-summary.R
│ ├── 16d-ar-groceries-subset.R
│ ├── 16f-ar-groceries-vis.R
│ ├── 16f-ar-groceries-vis2.R
│ ├── 16f-ar-groceries-vis3.R
│ ├── 16f-ar-groceries-vis4.R
│ ├── 16f-ar-groceries-vis5.R
│ ├── 16k-ar-grocery-DT.R
│ ├── 17a-ar-transactionformat.R
│ ├── 17d-ar-matrix-transactions.R
│ ├── 17e-ar-df-transcations.R
│ ├── 17f-ar-csv-transactions.R
│ ├── 17f-ar-csv2-transactions.R
│ ├── 17g-ar-list-transcations.R
│ ├── 17h-ar-dataformats.R
│ ├── 18a-arules1.R
│ ├── 20a-ar-DU1.R
│ ├── 20b-ar-DU2.R
│ ├── 20c-ar-DU3.R
│ ├── 22a-ar-edn.R
│ ├── 22b-ar-elective.R
│ ├── 22d-ar-subjects.R
│ ├── 22e-ar-placement.R
│ ├── 22f-myAR1.R
│ ├── 25a-ar-income.R
│ ├── 25b-ar-medical.R
│ ├── 25c-ar-titanic.R
│ ├── 29a-ar-Adult.R
│ ├── 29b-ar-Adult-NW.R
│ ├── 29c-ar-Adult-Draft.R
│ ├── 30a-ar-Finance1.R
│ ├── 30b-ar-Finance.R
│ ├── 32a-ar-visual.R
│ ├── 33a-ar-redundant.R
│ ├── 33b-redundantrules.R
│ ├── 40a-ar-multilevel-Groceries.R
│ ├── 43a-ar-patterns.R
│ ├── 45a-ar-rulesextract.R
│ ├── 99-ar-NW.R
│ ├── 99-ar-OnlineSales.R
│ ├── 99-ar-basketanalysis2.R
│ ├── 99-ar-policechecks.R
│ ├── AR-Weka
│ ├── EDA-placement1.R
│ ├── ar-case-liberty.R
│ ├── ar-groceries2.R
│ ├── ar-practise.R
│ └── my_basket1.txt
├── 75-OR-LP/
│ ├── 15b-lpsolveAPI.R
│ ├── 15c-lpassign.R
│ ├── 21b-LP-mach-prod.R
│ ├── 21c-LP-mach-prod.R
│ ├── 22b-LP-case1.R
│ ├── 22c-LP-assign-case3.R
│ ├── 22d-LP-Case-carmanufacturing.R
│ ├── 25b-LPassign-job.R
│ ├── 30a-LP-tpt-function.R
│ ├── 31b-LP-tpt1.R
│ ├── 31c-LP-tpt2.R
│ ├── 31d-LP-tpt3.R
│ ├── 33d-proptable.R
│ ├── 41b-pricing.R
│ ├── 51b-LP-marketing.R
│ ├── lp-ss.R
│ ├── model.lp
│ ├── zz-LP-clplite.R
│ └── zz-LP-general.R
├── 77-TS/
│ ├── 11-tsdata.R
│ ├── 12b-TS-add-mult.R
│ ├── 14-ts-zoo.R
│ ├── 16-ts-xts.R
│ ├── 16c-dates-split1.R
│ ├── 16d-dates1.R
│ ├── 16d-ts-xts.R
│ ├── 23b-TS-Case-sales.R
│ ├── 23b-lubridate1.R
│ ├── 24b-Data-DFtoTS.R
│ ├── 24b-timeseries1.R
│ ├── 24c-timeseries2.R
│ ├── 24f-ts-data.R
│ ├── 26b-ts-components-airp.R
│ ├── 26c-ts-components.R
│ ├── 27b-ts-johnson.R
│ ├── 27c-ts-lm-uscons.R
│ ├── 28c-ts-lubridate1.R
│ ├── 31c-TS-airp.R
│ ├── 33b-zoo-ts.R
│ ├── 35b-LSM-beer1.R
│ ├── 38b-tsplots2a.R
│ ├── 38c-tsplots3.R
│ ├── 38f-plot-zz.R
│ ├── 41b-arima1.R
│ ├── 41c-arima2.R
│ ├── 41d-arima-airp.R
│ ├── 41d-arima-jj-nile.R
│ ├── 45b-TS-arima.R
│ ├── 52c-Case1-complete.R
│ ├── 53b-sales-ts.R
│ ├── 55b-ts-case-xxx2.R
│ ├── 55c-ts-case-xxxx.R
│ ├── SMA-nile.R
│ ├── TS-P-fpp.R
│ ├── TS-c02.R
│ ├── TS-data-DU1.R
│ ├── TS-fpp-seasonplot.R
│ ├── TS-kings.R
│ ├── TS-links
│ ├── TS-movag1.R
│ ├── TS-nile.R
│ ├── TS-xts.R
│ ├── TS-zoo.R
│ ├── UDFdates.R
│ ├── blank.R
│ ├── ts-P-highfreq.R
│ ├── ts-P-openair.R
│ ├── ts-P-padr.R
│ ├── ts-beer2.R
│ ├── ts-case1.R
│ ├── ts-case2.R
│ ├── ts-lubridate2.R
│ ├── ts-rollingvalues.R
│ ├── ts-rollingvalues2.R
│ ├── ts-splitdate.R
│ ├── ts-timestamp.R
│ ├── ts-yoy.R
│ └── tsforecast-exp.R
├── 78-nlp/
│ ├── SM-rtexttools1.R
│ ├── TM-zz.R
│ ├── downloadfile.R
│ ├── facebook1.R
│ ├── facebook2.R
│ ├── fms.txt
│ ├── linkedin1.R
│ ├── linkedin3.R
│ ├── pagerank.R
│ ├── readpdf.R
│ ├── rowling.txt
│ ├── rquery_wordcloud.R
│ ├── sentiment-tidyr1.R
│ ├── sentiment2.R
│ ├── textmining-DU1.R
│ ├── tm-worldcloud4.R
│ ├── twitter-hotel.R
│ ├── twitter-keys.R
│ ├── twitter-sentiment2.R
│ ├── twitter1-DU1.R
│ ├── twitter1-DU2.R
│ ├── twitter1-authen.R
│ ├── twitter1.R
│ ├── twitter2.R
│ ├── twitteracct
│ ├── wordcloud1.R
│ ├── wordcloud2.R
│ ├── wordcloud3.R
│ └── worldcloud2.R
├── 78-textdocs/
│ └── vit.txt
├── 80-SIM/
│ ├── montecarlo1.R
│ ├── montecarlo2.R
│ ├── mvsim1.R
│ ├── randomNos.R
│ ├── simLinks
│ └── simple1.R
├── 81-case-sum/
│ ├── 31b-DA-dencoCase.R
│ ├── 31c-DA-dencoCase2.R
│ ├── 31d-dsum-denco.R
│ ├── 31e-dencoCase2.R
│ ├── 31f-dencoCase.R
│ ├── 33c-basicDM-mtcars.R
│ ├── 33c-dplyr-mtcars.R
│ ├── 33f-DA-bakerydata1.R
│ ├── 34b-sales1.R
│ ├── 34c-sales2.R
│ ├── 35b-DA-student1.R
│ ├── 35c-dm-student1.R
│ ├── 36b-dsum-Case1.R
│ ├── 36c-dsum-Case2.R
│ ├── 36f-DSA-case2.R
│ ├── 37b-dsum-iris1.R
│ ├── 38b-dsum-haireyecolor1.R
│ ├── 42b-case-sum-graphs.R
│ └── dataexplore.R
├── 83-MA/
│ ├── CA.R
│ ├── campaign.csv
│ ├── caseStudy_juice.R
│ ├── data1.R
│ ├── graph1.R
│ ├── maregression1.R
│ ├── pricing1.R
│ └── tree1.R
├── 84-HR/
│ └── hr_churn.R
├── 85-RFM/
│ ├── rfm1.R
│ └── rfm3.R
├── 87-FA/
│ ├── 10-FAlinks.R
│ ├── Insurance Loss v01.R
│ ├── InsuranceLosses.csv
│ ├── Packages Pre-requisites_v03.R
│ ├── aapl.csv
│ ├── fa-iitg-dataanalysis.R
│ ├── finTS1.R
│ ├── findata1.R
│ ├── finstmts1.R
│ ├── finstmts2.R
│ ├── finstmts3.R
│ ├── gtrends1.R
│ ├── intrino1.R
│ ├── intrino2.R
│ ├── lag1.R
│ ├── logistic_regression.R
│ ├── qf1.R
│ ├── sentianalysistrading1.R
│ ├── shares1.R
│ ├── shares2.R
│ ├── stock3.R
│ ├── stockanalysis1.R
│ ├── stockanalysis2.RData
│ ├── stocks5.R
│ ├── stocksanalysis3.R
│ ├── stocksanalysis4.R
│ └── volatity1.R
├── 88-Network/
│ ├── NetSciX 2016 Workshop.R
│ ├── network1.R
│ ├── network2.R
│ └── traveltime1.R
├── 89-rVideos/
│ └── clustering-rV.R
├── 92-wksp2/
│ ├── 1a1-start.R
│ ├── 1a3-packages1.R
│ ├── 1b2-ds.R
│ ├── 1b3-factor.R
│ ├── 1d2-basicstats.R
│ ├── 1d2-dm-student1.R
│ ├── 1d3-dencoCase.R
│ ├── 1d4-DA-dencoCase.R
│ ├── 1e-graphs-basic.R
│ ├── 1e2-graphs.R
│ ├── 1e3-advgraphs.R
│ ├── 1f-SLR-women.R
│ ├── 1h1-dplyr.R
│ ├── 1h2-freqtable.R
│ ├── 2a-importExport.R
│ ├── 2b-SLR-salesarea.R
│ ├── 2b-allmodels.R
│ ├── 2b2-SLM-women.R
│ ├── 2b3-SLM-women-A.R
│ ├── 2b4-LM-cars.R
│ ├── 2b4-SLR-women.R
│ ├── 2c3-MLM-salespromotion.R
│ ├── 2c4-MLM-mtcars1.R
│ ├── 2d1-missingvalues.R
│ ├── 2d3-datapartition.R
│ ├── 2e1-logR-purchase.R
│ ├── 2e2-LOGR-adult.R
│ ├── 2e3-LOGR-gre.R
│ ├── 3b1-DT-CART-carseats.R
│ ├── 3b2-DT-CART-R-sales.R
│ ├── 3b3-DT-CART-titanic.R
│ ├── 3b4-DT-CART-R-loan.R
│ ├── 3b5-DT-loanapproved1.R
│ ├── 3b5-DT-rpart-iris.R
│ ├── 3d1-DT-CHAID-usvote.R
│ ├── 3e1-clust-customer.R
│ ├── 3e1-clustering.R
│ ├── 3e2-clust-samplecase.R
│ ├── 3e3-clust-segmentation.R
│ ├── 3e4-clust-noOfclusters.R
│ ├── 4b1-AR-groceries.R
│ ├── 4b2-AR-samplecase.R
│ ├── 4b3-AR-groceries-subset.R
│ ├── 4b5-AR-finproducts.R
│ ├── 4e1-twitter1.R
│ ├── 4e2-wordcloud.R
│ ├── 4e3-worldcloud2.R
│ ├── 4e5-wordcloud3.R
│ ├── 4f2-quantmod1.R
│ ├── 4f3-indianstocks.R
│ ├── 5-wordcloud2-New.R
│ ├── 5b-LP-marketingspend.R
│ ├── 5c2-LP-marketingspend-case.R
│ ├── 5d-wordcloud2.R
│ ├── 5d2-LP-tpt.R
│ ├── 5e2-LP-machassign.R
│ ├── 5e5-LP-farmer1.R
│ ├── 6b1-dates.R
│ ├── 6b1-ts-data.R
│ ├── 6c2-dates-lubridate.R
│ ├── 6d-TS-airpassengers.R
│ ├── 6d-ts-components-airp.R
│ ├── 6d-ts-johnson.R
│ ├── 6d-ts-xts-data.R
│ ├── 6e-TS-auto-arima-johnson.R
│ ├── 6g-ts-TTR-ma.R
│ ├── 8-fa-quandl.R
│ ├── 8-fa-quandl2.R
│ ├── 8-fa-quantmod.R
│ ├── 8-quantmod-I-stocks.R
│ └── zz-practise.R
├── 93-wksp3/
│ ├── Graph-matrixplots.R
│ ├── LMtrainTest.R
│ ├── Links_DAR
│ ├── Links_DAR.R
│ ├── TS-arima-johnson.R
│ ├── TS-components-airpassengers.R
│ ├── TS-data.R
│ ├── TS-dates.R
│ ├── TS-lubridate.R
│ ├── TS-movavg-Nile.R
│ ├── TS-movavg.R
│ ├── assocrule1.R
│ ├── assocrule2.R
│ ├── assocrule3.R
│ ├── decisiontree1.R
│ ├── decisiontree2.R
│ ├── decisiontree3.R
│ ├── decisiontree4.R
│ ├── decisiontree5.R
│ ├── decisiontree5CHAID.R
│ ├── df.R
│ ├── environ.R
│ ├── freqtable.R
│ ├── lm-salesarea.R
│ ├── lm-salesqty.R
│ ├── lm-women-simple.R
│ ├── lm.R
│ ├── logR.R
│ ├── logr-gre.R
│ ├── matrix.R
│ ├── missingvalues.R
│ ├── packages1.R
│ ├── packages2.R
│ ├── stats2.R
│ ├── twitter.R
│ ├── vectors.R
│ ├── wordcloud1.R
│ └── wordcloud2.R
├── 95-studqueries/
│ ├── Sapient_Big Data.R
│ ├── achal1.R
│ ├── achal1.csv
│ ├── achal2.R
│ ├── achal2.csv
│ ├── deepak.R
│ ├── hitesh-dec18.R
│ ├── hitesh1.R
│ ├── hitesh2.R
│ ├── hitesh3.R
│ ├── hitesh4.R
│ ├── hiteshJul18.R
│ ├── lalit1.R
│ ├── meena1
│ ├── meena2.R
│ ├── meena3.R
│ ├── missingValue.R
│ ├── sidana2.R
│ ├── tanviTS1.R
│ └── vivekIIMLN.R
├── 96-cancer/
│ ├── data-cancer.R
│ ├── rf-cancer.R
│ ├── svm-cancer1.R
│ └── svm-examples.R
├── 96-iris/
│ └── sumgraph1.R
├── 96-mtcars/
│ ├── 10b-datastructures.R
│ ├── 11b-mtcars.R
│ ├── 11c-mtcars-filter.R
│ ├── 11d-mtcars-descp.R
│ ├── 11f-mtcars-loops.R
│ ├── 11g-mtcars-sort.R
│ ├── 11h-mtcars-dplyr.R
│ ├── 12d-mtcars-graph1.R
│ ├── 12e-mtcars-graph2.R
│ ├── 12e-mtcars-summarise-dplyr.R
│ ├── 12f-diag-ggplot2-mtcars.R
│ ├── 12f-ggplot2-mtcars.R
│ ├── 13b-mtcars-lm1.R
│ ├── 13c-mtcars-lm2.R
│ ├── 13e-mtcars-lm3.R
│ ├── 14b-mtcars-logr.R
│ ├── 15b-mtcars-DT-class.R
│ ├── 15c-mtcars-DT-anova.R
│ ├── 16b-mtcars-cluster1.R
│ ├── 16c-mtcars-cluster2.R
│ ├── 22f-tidyr-mtcars.R
│ ├── mtcars-clust1.R
│ ├── mtcars-hclust.R
│ └── s1.R
├── 97-artwork/
│ ├── AuctionsData - artwork.csv
│ ├── AuctionsData - set1.csv
│ ├── artwork-cls1.R
│ ├── artwork-descp.R
│ ├── artwork-eda1.R
│ ├── artwork-eda2.R
│ ├── artwork-rought.R
│ ├── artwork1.R
│ ├── artwork2.R
│ ├── artwork4.R
│ ├── awdata1.R
│ ├── density.R
│ └── file2.R
├── 99-GD/
│ ├── aboutSL
│ ├── gradientdescent1.R
│ ├── gradientdescent2.R
│ ├── gradientdescent3.R
│ ├── gradientdescent4.R
│ └── regr1.R
├── 99-HTML/
│ └── aboutUSL
├── 99-Misc/
│ ├── dhmethods.R
│ ├── funcpgm1.R
│ ├── h2o.R
│ └── skimr-package.R
├── 99-json/
│ ├── 21b-json-format.R
│ ├── 23b-xml-import.R
│ ├── 25a-httr1.R
│ ├── 25c-httr2.R
│ ├── json-1.R
│ └── json2.R
├── 99-phd/
│ ├── attendance2.R
│ └── grades.R
├── 99-sports/
│ ├── cricket1.R
│ └── cricket2-york.R
├── 99-weka/
│ ├── cls-ID3.R
│ ├── cls-c45weka.R
│ ├── clsW-iris.R
│ └── clsW-iris2.R
├── MBArules.csv
├── README.md
├── Unsorted/
│ ├── CLT.R
│ ├── R-Exercise.R
│ ├── RCommander.R.R
│ ├── basiclm1.R
│ ├── binomial.R
│ ├── boxplot.R
│ ├── c.R
│ ├── central1.R
│ ├── colstats1.R
│ ├── complextables.R
│ ├── cor1.R
│ ├── crossfold1.R
│ ├── cut1.R
│ ├── cutprety1.R
│ ├── datalevels.R
│ ├── dbconnection.R
│ ├── dec17.R
│ ├── dec17b.R
│ ├── demo1.R
│ ├── density2.R
│ ├── descstatsgraphs1.R
│ ├── dplyr1.R
│ ├── ds1.R
│ ├── env1.R
│ ├── examB.R
│ ├── extra.R
│ ├── fd1.R
│ ├── fd2.R
│ ├── fd3.R
│ ├── fd4.R
│ ├── fd5-means.R
│ ├── googleS.R
│ ├── knitr.R
│ ├── kurtosis.R
│ ├── lm-sales.R
│ ├── lm1-sales.R
│ ├── lm1.R
│ ├── miscscripts.R
│ ├── nd1.R
│ ├── normal.R
│ ├── paneldata1.R
│ ├── plot1.R
│ ├── practise-dec17c.R
│ ├── practise.R
│ ├── rattle1.R
│ ├── rattle2.R
│ ├── rcdr1.R
│ ├── rjava.R
│ ├── rle1.R
│ ├── sample1.R
│ ├── sample2.R
│ ├── scripting1.R
│ ├── skewness1.R
│ ├── skewness2.R
│ ├── skewness3.R
│ ├── smpdist1.R
│ ├── summary1.R
│ ├── sumstats1.R
│ ├── ttest1.R
│ └── vaibhavi.R
├── cacert.pem
├── caseStudies/
│ └── allCases.R
├── data/
│ ├── AuctionsData - set1.csv
│ ├── Churn.csv
│ ├── Computers.csv
│ ├── Dataset1-Media-Example-EDGES.csv
│ ├── Dataset1-Media-Example-NODES.csv
│ ├── Dataset2-Media-User-Example-EDGES.csv
│ ├── Dataset2-Media-User-Example-NODES.csv
│ ├── ItemList.csv
│ ├── MA.RData
│ ├── MBA.csv
│ ├── MBArules.csv
│ ├── MMM_raw_data_v02.csv
│ ├── NPS Data Food Order v01.csv
│ ├── Predict Merchant_Sales v01.csv
│ ├── Prostate_Cancer.csv
│ ├── Rules_20.csv
│ ├── Sales.csv
│ ├── Sales_files/
│ │ ├── 6006907
│ │ ├── frameworks-95aff0b550d3fe338b645a4deebdcb1b.css
│ │ ├── frameworks-b3cd8fa1481bc34c4b18cf307ca75438.js.download
│ │ ├── github-542f291c828bb453339765ba3a54c144.js.download
│ │ └── github-cdaf214b636e7d0581fce94eda9de4bd.css
│ ├── Segmentation_Data v01.csv
│ ├── Social_Network_Ads.csv
│ ├── StudentPassFail.csv
│ ├── StudentTid1.csv
│ ├── StudentTid2.csv
│ ├── airpsng.csv
│ ├── ar14.csv
│ ├── ar14b.csv
│ ├── artwork.rds
│ ├── arulesfin.csv
│ ├── attendance1.csv
│ ├── attendance2.csv
│ ├── attrition.csv
│ ├── badata.Rdata
│ ├── bakery.csv
│ ├── bank.csv
│ ├── binary.csv
│ ├── bitsgoa.csv
│ ├── cclogr.csv
│ ├── clscredit.csv
│ ├── clsplay.csv
│ ├── clust_custseg.csv
│ ├── dar1.csv
│ ├── dar1w.csv
│ ├── dar1w.csv.arff
│ ├── dar2.csv
│ ├── dar3.csv
│ ├── dar3a.csv
│ ├── dar3b.csv
│ ├── data1.R
│ ├── data4cluster2.csv
│ ├── data_clus_2.csv
│ ├── dataiitb.csv
│ ├── dateformat1.R
│ ├── denco.csv
│ ├── dhiraj.csv
│ ├── dtdata.csv
│ ├── fintransactions.csv
│ ├── grades.csv
│ ├── groceries.csv
│ ├── heart_tidy.csv
│ ├── hhe.txt
│ ├── iimS.xlsx
│ ├── iima.csv
│ ├── iimc1.csv
│ ├── iimtrichy.csv
│ ├── iitgfa.csv
│ ├── iitgfa.xlsx
│ ├── iitgfa2.xlsx
│ ├── iris.csv
│ ├── iris.xlsx
│ ├── irisF.csv
│ ├── irisT.csv
│ ├── itemlist1
│ ├── km5_c2.csv
│ ├── logr2.csv
│ ├── msales.csv
│ ├── mtcars.csv
│ ├── mtcars.sas7bdat
│ ├── mtcars.xlsx
│ ├── mtcars1.csv
│ ├── mtcarsF.csv
│ ├── mtcarsT.csv
│ ├── mushrooms.csv
│ ├── my_basket
│ ├── myexcel.xlsx
│ ├── myitems1.csv
│ ├── myrules1.csv
│ ├── mytextcars.txt
│ ├── myworkbook.xlsx
│ ├── node1.csv
│ ├── onsen.csv
│ ├── pumba.csv
│ ├── rep2.csv
│ ├── rep4.csv
│ ├── rep5.csv
│ ├── revision1.csv
│ ├── rules.csv
│ ├── rulesR.csv
│ ├── s1.csv
│ ├── salesdata.csv
│ ├── salesdatamonth.csv
│ ├── salesslr.csv
│ ├── slr1.csv
│ ├── splitData1.R
│ ├── stock.csv
│ ├── stock1.csv
│ ├── stock1.txt
│ ├── student.csv
│ ├── student1.csv
│ ├── student1.xlsx
│ ├── student2.xlsx
│ ├── student3.xlsx
│ ├── student3a.xlsx
│ ├── studentdata.R
│ ├── studentdata.csv
│ ├── studentdata2.csv
│ ├── studentdata3.txt
│ ├── studentdata4.csv
│ ├── students.csv
│ ├── students3.csv
│ ├── talltransactions.csv
│ ├── tendulkar.csv
│ ├── titanic.csv
│ ├── titanic.raw.rdata
│ ├── women.sav
│ └── ximb.csv
├── dates/
│ ├── 11-date1.R
│ ├── 12-dates-seq.R
│ ├── 12c-dates-seq2.R
│ ├── 13-dates-format.R
│ ├── 14-dates-format2.R
│ ├── 15-date-subset-arithmetic.R
│ ├── 17-time-chron.R
│ ├── 17c-time-posixt.R
│ ├── 19-datetime-lubridate.R
│ ├── 19c-datetime-lubridate.R
│ ├── 19d-datetime-lubridate.R
│ ├── 19e-datetime-lubridate.R
│ ├── 19f-datetime-lubridate.R
│ └── 30-datetime-zzz.R
├── download/
│ ├── fms.txt
│ ├── iris.csv
│ ├── iris.xlsx
│ ├── rowling.txt
│ └── vector.R
├── iim.txt
├── misc/
│ ├── 1-ds.R
│ ├── cswr.R
│ ├── fms.txt
│ ├── importcsv.R
│ ├── mysqlR.R
│ ├── nest.R
│ └── timeszones.R
├── munaz.csv
├── mycars.csv
├── myexcelcars.xlsx
├── myrules1.csv
├── practise1.R
├── rAnalytics.Rproj
├── report/
│ ├── knit2.R
│ ├── knitr-minimal.R
│ ├── sample1.R
│ └── sample2.R
├── trg/
│ ├── bennett1.R
│ ├── bennett3.R
│ ├── d1-fmssrcc.R
│ ├── d2a-fmssrcc.R
│ ├── d2b-fmssrcc.R
│ ├── iima-d1.R
│ ├── iima-d3.R
│ ├── iima-d4.R
│ ├── iimkpg-d6.R
│ ├── iimkpv-d1.R
│ ├── iimkpv-d4.R
│ └── ximb-r.R
└── twitter authentication.Rdata
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.Rproj.user
.Rhistory
.RData
.Ruserdata
*.json
.httr-oauth
================================================
FILE: 0-Practise/day1.R
================================================
# Day 1
# Logistic regression on the credit-card 'Default' data set from ISLR.
library(ISLR)
data('Default')
str(Default)
# Full model: default ~ all remaining columns (student, balance, income)
LR1 = glm(default ~ ., family='binomial', data=Default)
summary(LR1) #leave income
# Reduced model: income dropped (not significant in LR1's summary)
LR2 = glm(default ~ student + balance, family='binomial', data=Default)
summary(LR2)
#
range(Default$balance)
# Score three sample rows: type='response' returns default probabilities
ndata3 = Default[c(1,60,700),]
predict(LR2,newdata=ndata3, type='response' )
#mtcars
str(mtcars)
#vectors, arrays, matrix, list, factor, dataframe
x = 1:5
x1 = c('a','b')
m1 = matrix(1:24, nrow=6)
m1
list1 = list(x, x1, m1)
list1
class(women)
women
str(women)
?women
women
head(women)
tail(women,n=3)
head(women, n=3)
names(women)
summary(women)
dim(women)
data()
library(MASS)
x = women$height
x
plot(x)
mean(x)
sd(x) ; var(x)
max(x)
median(x)
x
sort(x, decreasing = T)
table(x)
quantile(x)
x
seq(0,1,.1)
quantile(x, c(.1, .5, .8))
quantile(x,seq(0,1,.1) )
summary(x)
min(x); max(x)
boxplot(x)
abline(h= c(min(x), max(x),mean(x)+1, median(x)), col=1:5, lwd=4)
#
# LM
head(women)
names(women)
model1 = lm(weight ~ height, data=women)
plot(women)
?lm
#options(scipen=999)
summary(model1)
model1
#y = mx + c
y = 3.45 * x + - 87
women$height
fitted(model1)
cbind(women, fitted(model1))
residuals(model1)
cbind(women, fitted(model1), residuals(model1), diff= fitted(model1) - women$weight)
sqrt(sum(residuals(model1)^2)/nrow(women))
cbind(women, fitted(model1))
range(women$height)
new1= data.frame(height=c(57, 60.5,70))
p1=predict(model1, newdata = new1)
cbind(new1, p1)
#mtcars----
names(mtcars)
?mtcars
mtmodel_1 = lm(mpg ~ wt, data=mtcars )
mtmodel_2 = lm(mpg ~ wt + disp, data=mtcars )
mtmodel_3 = lm(mpg ~ wt + disp + cyl, data=mtcars )
mtmodel_4 = lm(mpg ~ ., data=mtcars )
summary(mtmodel_1) #.745
summary(mtmodel_2) #.766
summary(mtmodel_3) #.766
summary(mtmodel_4) #.807
AIC(mtmodel_1, mtmodel_2,mtmodel_3,mtmodel_4)
summary(mtmodel_4) #.807
step(lm(mpg ~ ., data=mtcars ))
mtmodel_5= lm(mpg ~ wt + qsec + am, data=mtcars)
summary(mtmodel_5) #Adjusted R-squared: 0.834
#
attendance = 1:20
marks = 1:20
summary(lm(marks ~ attendance))
cbind(attendance, marks)
cor(attendance, marks)
#
#y = mx + c
x
y = 3.45 * x + - 87
x
head(women)
(y = 4.45 * 58 - 87)
plot(women)
abline(model1, col='red', lwd=4)
abline(v=64) ; abline(h=150)
x2 = floor(runif(1000, 50, 100))
x2
x2a= sort(x2)
x2a[1000/2]
median(x2)
sort(x)
t1= table(x2)
sort(t1, decreasing = T)
x1 = rep(10,10)
x1
sd(x1)
dim(mtcars)
mtlogmodel = glm(am ~ hp + wt, family='binomial', data=mtcars)
summary(mtlogmodel)
p1=predict(mtlogmodel, newdata=mtcars, type='response')
p2= round(p1, 3)
p3 = ifelse(p2<0.5,0,1)
cbind(mtcars$am, mtcars$hp, mtcars$wt, p2,p3, truefalse= mtcars$am == p3)
================================================
FILE: 0-Practise/day2.R
================================================
# Day 3 - Online batch of MA
# Demo of attach(): making data-frame columns visible by their bare names
women
names(women)
height   # not yet visible -- this line errors before attach() is called
attach(women)
height   # now resolves to women$height
weight
women$height
#List
# Build a heterogeneous list out of a scalar, two vectors, a matrix
# and a whole data frame
list_title = "My First List" #scalar
ages_vec = c(25, 26, 18, 39) # numeric vector
num_matrix = matrix(1:10, nrow=5) #matrix
word_vec = c("one", "two", "three") # character vector
mylist = list(title=list_title, ages=ages_vec, num_matrix, word_vec, women)
mylist
mylist[[2]]   # second component: the 'ages' vector
mylist[[5]]   # fifth component: the women data frame
# plot
# Height vs weight: points joined by lines, custom symbols/colours/axis text
plot(x=height, y=weight, type='b', lty=5, pch=11, fg='red', bg='green', col.axis='purple', cex=1.5, cex.axis=2)
title(main='Henry Harvin', sub=' MA Course')
================================================
FILE: 0-Practise/day3.R
================================================
# Scatter plot of mtcars mpg vs wt with a fitted regression line
attach(mtcars)
plot(wt, mpg)
abline(lm(mpg~wt))
title("Regression of MPG on Weight")
detach(mtcars)
# Dose-response demo data for two drugs
dose = c(20, 30, 40, 45, 60)
drugA = c(16, 20, 27, 40, 60)
drugB = c(15, 18, 25, 31, 40)
plot(dose, drugA, type="l")   # line only
plot(dose, drugA, type="b")   # both points and lines
par(no.readonly = T)            # list the current (writable) graphics settings
old_par = par(no.readonly=TRUE) # snapshot so we can restore later
plot(dose, drugA, type="b")
par(lty=2, pch=17)              # dashed lines, solid-triangle symbols
plot(dose, drugA, type="b")
plot(dose, drugB, type="b")
par(old_par)                    # restore the snapshot
plot(dose, drugA, type="b")
plot(dose, drugA, type="b",fg='red', col='purple', col.axis='green')
# Colour palettes: ColorBrewer, default integers, rainbow, and grays
library(RColorBrewer)
n = 7
palette_cols = brewer.pal(n, "Set1")
barplot(rep(1,n), col=palette_cols)
barplot(rep(1,n), col=1:7)
n = 10
palette_cols = rainbow(n)
pie(rep(1, n), labels=palette_cols, col=palette_cols)
gray_shades = gray(0:n/n)
pie(rep(1, n), labels=gray_shades, col=gray_shades)
================================================
FILE: 0-Practise/first.R
================================================
# First File in R
# Assignment operators, random number generation, and rounding helpers
x1 = c(1, 5, 4, 9, 0)   # both = and <- assign; run a line with Ctrl+Enter
#control + enter
x2 <- c(1, 5, 4, 9, 0)
x1
x2
x = c(1,2,3,4,5,6,7,8,9,10)
x
x = 1:100   # colon operator: integer sequence
x
x = runif(100, 50, 200)   # 100 uniform draws between 50 and 200
?runif
x
x = rnorm(100, mean=50, sd=10)   # 100 normal draws, mean 50, sd 10
x
trunc(x)      # drop the fractional part
round(x,1)    # one decimal place
floor(x)      # round down
ceiling(x)    # round up
hist(x)
as.integer(x)
plot(density(x))   # smooth density curve of the sample
abline(v=50)       # vertical line at the theoretical mean
?runif
head(x)
class(x);mode(x)
typeof(x)
summary(x)
# Types of Data Structures in R
================================================
FILE: 0-Practise/htmlimport.R
================================================
#Web scraping practice with the rvest package
#install.packages("rvest")
library(rvest)
#Specifying the url for desired website to be scrapped
url <- 'http://pgdbablog.wordpress.com/2015/12/10/pre-semester-at-iim-calcutta/'
#Reading the HTML code from the website
webpage <- read_html(url)
#Know about the selector gadget
vignette("selectorgadget")
#Using CSS selectors to scrap the post date
post_date_html <- html_nodes(webpage,'.entry-date')
post_date_html <- html_nodes(webpage,'.published , .entry-title')
#Converting the post date to text
post_date <- html_text(post_date_html)
#Verify the date captured
post_date
url="www.imdb.com"
#NOTE: 'webpage' still holds the WordPress page here, so this selector
#finds nothing; kept only to illustrate CSS-selector syntax.
rating_html=html_nodes(webpage,'.imdb-rating') #'.imdb-rating' is taken from CSS selector
#Converting the rating data to text
rating <- html_text(rating_html)
#Check the rating captured
rating
html <- read_html("http://www.imdb.com/title/tt1490017/")
cast <- html_nodes(html, "#titleCast .itemprop")
length(cast)
#> [1] 30
cast[1:2]
html <- read_html("http://www.imdb.com/title/tt1490017/")
cast <- html_nodes(html, ".quicklink")
length(cast)
#> [1] 15
html_text(cast)
url="https://www.timeshighereducation.com/world-university-rankings/2019/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats"
typeof(url)
length(url)
links= ".stats_female_male_ratio , .stats_pc_intl_students , .stats_student_staff_ratio , .stats_number_students , .ranking-institution-title"
html <- read_html("https://www.timeshighereducation.com/world-university-rankings/2019/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats")
cast <- html_nodes(html, ".stats_female_male_ratio")
length(cast)
#> [1] 30
cast[1:2]
cast
#FIX: html_nodes() needs a parsed document, not a URL string -- use the
#'html' object parsed just above instead of the 'url' character vector.
ranks <- html_nodes(html, ".ranking-institution-title")
library(rvest)
URL <- "https://www.soccerstats.com/latest.asp?league=netherlands" #Feed page
WS <- read_html (URL) #reads webpage into WS variable
URLs <- WS %>% html_nodes ("a:nth-child(1)") %>% html_attr("href") %>% as.character() # Get the CSS nodes & extract the URLs
URLs <- paste0("http://www.soccerstats.com/",URLs)
oversdf <- data.frame(URLs=URLs)
rownames(oversdf) #returns a vector of row names in the overs data.frame:
URLs <-subset(oversdf, grepl("pmatch", oversdf$URLs),stringsAsFactors = FALSE)
write.csv(URLs,file=paste(getwd(),"/sportURLs.csv",sep=""),row.names=FALSE)
Catcher1 <- data.frame(FMatch=character(),TotalGoals=character (),stringsAsFactors = FALSE)
##################################
#start of workaround: flatten the one-column data frame into a character vector
n<-nrow(URLs)
URLs2<-character()
for (i in 1:n) {
URLs2[i]<-as.character(URLs[i,1])
}
library(dplyr)
library(rvest)
web = read_html("https://news.google.com/?hl=en-IN&gl=IN&ceid=IN:en")
web %>% html_nodes(".VDXfz") %>% html_text()
library(rvest)
#FIX: the package name is 'purrr' (three r's), not 'purr'
library(purrr)
#FIX: rvest's html() is defunct; keep the URL as a plain string and parse
#it once with read_html() below.
url_base = "https://www.cochranelibrary.com/cdsr/table-of-contents/2018/11"
#map_df(1:4)
page = read_html(url_base)
#FIX: build the data frame from nodes of 'page'; the original piped the
#result into an undefined object 'df4'.
papers = data.frame(paper = html_text(html_nodes(page, ".search-result-doi")))
papers
url2 = "http://www.espncricinfo.com/india/content/player/28081.html"
library(rvest)
library(curl)
msd = read_html(url2)
msd
msd2 <- msd %>% html_nodes("table") %>% .[1] %>% html_table(fill=T)
msd2
str(msd2)
#-----
url3 = "https://www.timeshighereducation.com/world-university-rankings/2019/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats"
the = read_html(url3)
the
the3 <- the %>% html_nodes("table") %>% .[1] %>% html_table(fill=T)
the3
str(msd2)
#FIX: removed a dangling 'html_' token that errored when the file was sourced
================================================
FILE: 0-Practise/iims2.R
================================================
# data structures in R
# A data structure is the way a language stores and manipulates data
#vector----
x = c(1,2,3,6)   # run with Ctrl+Enter or the Run button
# a vector holds values of a single type
x                # auto-print
class(x)
# mixing types in c() coerces everything to character
x2 = c('a',"b", 'Dhiraj',3) #character vector
x2
class(x2)
?c   # help page for c()
mean(x)   # average of the numeric vector
sum(x)
(x3 = 1:100)   # wrapping in () assigns and prints in one step
x3
class(x3)
x4 = seq(1,100,3)   # arithmetic sequence with step 3
x4
?seq
# a list may mix types without coercion
x5 = list(1,3,5,'a')
x5
class(x5)
#data types - numeric, integer, character, logical
x6 = c(T, T, F, F,F)   # logical vector
length(x6)
x6
class(x6)
#matrix-----
#dataframe-----
================================================
FILE: 0-Practise/import2.R
================================================
#web scrapping
# Scrape rank, title and description of 100 feature films (2016) from IMDB
# using rvest CSS selectors, then clean the extracted text.
#https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/
#Loading the rvest package
library('rvest')
#Specifying the url for desired website to be scrapped
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'
#Reading the HTML code from the website
webpage <- read_html(url)
webpage
#Using CSS selectors to scrap the rankings section
rank_data_html <- html_nodes(webpage,'.text-primary')
#Converting the ranking data to text
rank_data <- html_text(rank_data_html)
#Let's have a look at the rankings
head(rank_data)
#Data-Preprocessing: Converting rankings to numerical
rank_data<-as.numeric(rank_data)
#Let's have another look at the rankings
head(rank_data)
#[1] 1 2 3 4 5 6
#Using CSS selectors to scrap the title section
title_data_html <- html_nodes(webpage,'.lister-item-header a')
#Converting the title data to text
title_data <- html_text(title_data_html)
#Let's have a look at the title
head(title_data)
#[1] "Sing" "Moana" "Moonlight" "Hacksaw Ridge"
#[5] "Passengers" "Trolls"
#Using CSS selectors to scrap the description section
description_data_html <- html_nodes(webpage,'.ratings-bar+ .text-muted')
#Converting the description data to text
description_data <- html_text(description_data_html)
#Let's have a look at the description data
head(description_data)
#Data-Preprocessing: removing '\n' (strips embedded newlines from each description)
description_data<-gsub("\n","",description_data)
#Let's have another look at the description data
head(description_data)
================================================
FILE: 0-Practise/kt1.R
================================================
# Practice with which() and grep(): positional lookup in vectors and data frames
?which
LETTERS
which(LETTERS == "K") #11th alphabet
marks = c(10,30,40, 60)
which(marks > 30) #positions of the matching elements
marks[which(marks > 30)] #the matching values themselves
# NOTE: this vector named 'names' does not break calls like names(ll) below --
# when resolving a function call R skips over non-function objects.
names = c('kanika','dhiraj', 'tanvi','poonam','dhananjay', 'upen')
#which names have dh
# bracket class [dh] matches names containing a 'd' OR an 'h' anywhere
grep('[dh]', names, ignore.case=T)
names[grep('[dh]', names, ignore.case=T)]
mtcars
which(mtcars$mpg > 25)
mtcars[which(mtcars$mpg > 25),] #rows of cars with mpg above 25
j = c(2,3,4,5,6,7,8,9,23,24,22,1,10)
j > 6
which(j > 6)
j[ which(j > 6) ]
?grep
# Examples adapted from the ?which help page
which(ll <- c(TRUE, FALSE, TRUE, NA, FALSE, FALSE, TRUE)) #> 1 3 7
names(ll) <- letters[seq(ll)]
which(ll)
which((1:12)%%2 == 0) # which are even?
which(1:10 > 3, arr.ind = TRUE)
================================================
FILE: 0-Practise/lm-sim-test1.R
================================================
#LM Simulation: simulate predictors (some deliberately collinear), fit a
#linear model, and walk through the standard regression diagnostics.
library(car)
library(PerformanceAnalytics)
library(corrgram)
library(corrplot)
#Data
set.seed(1234); x1 <- rnorm(100, 70, 10) + rnorm(100)
set.seed(1234); x2 <- runif(100, 60, 90)
set.seed(1234); x3 <- .75 * x1 + rnorm(10,7,2) #x3 built from x1: collinear by design
set.seed(1234); x4 = factor(sample(c('M','F'), size=100, replace=T, prob=c(.7,.3)))
set.seed(1234); x5 <- runif(100, 30,50) - rnorm(20,5,2)
set.seed(1234); error <- rnorm(100,15,3)
#FIX: removed a premature alias(fit) call here -- 'fit' is only created
#after the data frame is built below.
y = 5 + 1.2 * x1 + 0.005 * x2 - x3 - .6 * x5 + error
df = data.frame(y, x1, x2, x3, x4, x5)
head(df)
str(df)
#LM
fit = lm(y ~ x1 + x2 + x3 + x4 + x5, data= df)
alias(fit) #any built in correlation
x6 = .1*x1
alias(lm(y ~ x1 + x2 + x6)) #x6 related to x1
attributes(fit)
attributes(summary(fit))
summary(fit)
summary(fit)$r.squared
summary(fit)$adj.r.squared
summary(fit)$sigma # SEE (standard error of estimate)
#Model worthiness
anova(fit)
#Interpret
names(fit)
fit$residuals #resid(fit)
fitted(fit) # predict(fit, df)
#assumptions
#A1:Linearity IV-DV----
#FIX: 'formula' was undefined; plot component+residual plots of the fitted model
car::crPlots(fit)
corrgram::corrgram(df)
corfit = cor(df[,-5]) #drop the factor column x4 before correlating
corrplot::corrplot(cor(df[,-5]), method="pie")
corrplot::corrplot(cor(df[,-5]), method="number")
PerformanceAnalytics::chart.Correlation(df[,-5], histogram=TRUE, pch=19)
#A : Normality of Residues---
car::qqPlot(fit)
#A2:Autocollinearity
car::durbinWatsonTest(fit)
#if pvalues < 0.05, autocorr exists
#A4:Homoscedascity of Residuals
car::ncvTest(fit)
plot(fit, which=1)
#MultiCollinearity----
car::vif(fit) # variance inflation factors
sqrt(vif(fit)) > 2 # problem?
#remove variable one by one start from highest vif value
# Assessing Outliers
car::outlierTest(fit) # Bonferonni p-value for most extreme obs
car::qqPlot(fit, main="QQ Plot") #qq plot for studentized resid
#FIX: this call was trapped inside the comment above and never ran
car::leveragePlots(fit) # leverage plots
#Influencing Values
plot(fit,which=4)
car::influencePlot(fit)
avPlots(fit)
#Model2
df
summary(fit)
fit2 = lm(y ~ x1 + x3 + x5, data= df)
summary(fit2)
#Model Compare
anova(fit2,fit) #do not reject Ho ie. Model1 is better
AIC(fit2, fit) # whichever lower AIC is better
================================================
FILE: 0-Practise/practise.R
================================================
#List
#NOTE: the sections of this practice file are out of execution order -- the
#"Factors" section below uses df1, which is only created in the
#"Data Frame" section further down. Run that section first.
#x; m1; a1; df1   # (commented out: these objects are created later in the file)
g ="My First List"
h = c(25, 26,18,39)
j = matrix(1:10,nrow=2)
k = c('one','two','three')
mylist = list(title=g, ages=h, j, h)
mylist2 = list(k, mylist) #a list nested inside another list
mylist2
mylist
mylist[1]
mylist[2]
mylist[[2]]
mylist[['ages']]
mylist$ages
#List end
#Factors
df1
# category type - ordered or unordered
#gender, course, color - unordered
#grades, division, position,likertscale, ratings
summary(df1)
(grades = sample(c('A', 'B', 'C'),size=10, replace=T, prob=c(.4,.3,.3)))
df1$grades = grades
df1
summary(df1)
df1$gender = factor(df1$gender)
summary(df1)
df1$grades =factor(df1$grades, ordered=T)
df1$grades
aggregate(df1$age, by=list(df1$grades), mean)
aggregate(df1$age, by=list(df1$gender), mean)
aggregate(df1$age, by=list(df1$course), mean)
(df1$grades =factor(df1$grades, ordered=T, levels=c('C','B','A')))
(division = sample(c('Excellent', 'Very Good', 'Sat'),size=10, replace=T, prob=c(.4,.3,.3)))
division
summary(division)
Fdivision = factor(division)
summary(Fdivision)
Fdivision2 = factor(division, ordered=T, levels=c('Sat', 'Very Good', 'Excellent'))
summary(Fdivision2)
Fdivision2
Fdivision3 = factor(division, ordered=T) #default ordering is alphabetical
summary(Fdivision3)
Fdivision3
#factors end
#Data Frame
(rollno = 1:10)
(sname = paste('Student',1:10,sep='-'))
(age = floor(runif(10, 20, 30)))
(gender = c(rep('M',5),rep('F',5)))
(course = sample(c('Engg','Medical','MBA'), 10, replace=T, prob=c(.3, .4, .3)))
table(course)
(married = sample(c(TRUE, FALSE), 10, replace=T))
table(married)
rollno; sname; age ; gender; course; married
(df1 = data.frame(rollno, sname, age , gender, course, married))
df1[1:2,3:4]
df1$sname
df1[df1$married==T, ]
df1[df1$course=='Engg' & df1$age > 25, ]
df1[df1$married==T & df1$course=='Engg' & df1$age > 25, c('sname') ]
?aggregate
aggregate(df1$age, by=list(df1$gender), FUN=mean)
aggregate(df1$age, by=list(df1$course), FUN=mean)
aggregate(df1$age, by=list(df1$course, df1$gender), FUN=mean)
df1
summary(df1)
#DF end
#Array
?array
#Signature copied from the help page (commented out -- not meant to be run):
#array(data = NA, dim = length(data), dimnames = NULL)
#Coys - 5, Products-3, Locations-4
ceiling(3.2);
(salesfig = floor(runif(60, 70, 100)))
(a1 = array(data = salesfig, dim = c(4,3,5), dimnames = list(paste('Loc',1:4),paste('Prod',1:3),paste('Coy',1:5))))
apply(a1,1, sum)# sum locationwise
apply(a1,2, sum)
apply(a1,3, sum)
(ma1 = apply(a1,c(1,3), sum))
colSums(ma1)
rowSums(ma1)
apply(a1,c(2,3), sum)
#arrayend
#Matrix
#row x columns
?rnorm
set.seed(1234)
(x = trunc(runif(24,100,500)))
(m1 = matrix(data=x, nrow=4,dimnames = list(c('delhi','mumbai','noida','chennai'),paste('Prod',1:6,sep="-"))))
colMeans(m1);rowMeans(m1)
colSums(m1); rowSums(m1)
pie(x=rowMeans(m1))
barplot(rowMeans(m1)) # barplot for locations
barplot(colMeans(m1)) #barplot for products
barplot(colMeans(m1), horiz = T)
barplot(colMeans(m1), horiz = T, col=1:6)
m1
#Subset a Matrix
m1[ , 1:2]
m1[ ,c(1,4)]
m1[c(1,3) ,c(1,4)]
m1[c('delhi','mumbai'),c('Prod-3')]
m1[m1 > 300]
m1
m1[c('delhi'),]
sd(m1[c('delhi'),])
sum(m1[c('delhi','mumbai'),c('Prod-3','Prod-4')])
#m end
(m2 = matrix(data=x, nrow=4, byrow = T))
(m3= matrix(x, ncol=4 ))
(m4 = matrix(c(1,2,3,4), nrow=2, ncol=4,byrow=T))
m1
# Vectors
#FIX: the next four lines were rough notes, not valid R, and made the whole
#file unparseable; kept here as comments.
#snames = string (single char, multiple char)
#marks1 = numeric(integer, decimal)
#married - true or F
#gender = categories
snames = c('student1', "student2", 'student3')
snames
class(snames)
marks = c(10, 20 ,30)
marks
class(marks)
married = c(TRUE, FALSE, TRUE)
married
class(married)
snames; marks; married
(age = c(30,35,26))
age = c(30,35,26)
age
?class
#subsetting a Vector
(x1 = 1:100)
(x2 = seq(50,100,3))
?seq
x1
x1[10]
x1[20:30]
x2[2:5]
x1[x1 > 50]
x2
x2 > 70
x2[(x2 > 70) | (x2 < 60)]
x2[c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)]
x2
#operations on the vector
x3 = c('A','B','C')
x3[c(1,3)]
x3[c(TRUE, FALSE, TRUE)]
x3[(x3=='A') | (x3=='C')]
x2[(x2 > 60) & (x2 < 70)]
x2[x2==69]
x2==69
x2
length(x2)
length(x1)
mean(x2)
sum(x2)/length(x2)
sum(x2)
median(x2)
x2
set.seed(1)
(x4 = trunc(runif(20,5,100)))
median(x4)
(shirtcolors = c("red",'blue','green','blue','green', 'blue'))
mean(shirtcolors) #warns and returns NA: mean of a character vector
table(shirtcolors)
sort(x4)
mtcars$mpg
mean(mtcars$mpg)
women
data()
?mtcars
snames
x = rnorm(1000000, 50,10)
head(x)
mean(x)
hist(x)
FILE: 0-Practise/practise2.R
================================================
#misc practise
# data.entry() opens an interactive spreadsheet editor for x; it needs a
# GUI session and is not available in headless R.
x = 1:5
data.entry(x)
================================================
FILE: 0-Practise/rough.R
================================================
# Rough Work
# Pie/bar charts of how data scientists spend their time, plus assorted
# help-page lookups.
?cat
?dput
?dget
?dump
?write
?write.table
?save
?detach
?attach
?dir
?ls
?rm
?attr
?attributes
data1 = c(3, 60, 19, 9, 4 , 5)
labels1 = c('Building training sets', 'Cleaning and Organising Data', 'Collecting Data sets', 'Mining data for patterns', 'Refining Algorithms', 'Others')
#FIX: 'pielabels' was used below before it was defined (it appeared only
#near the end of the file); build it first from labels1 + the values.
browsers<-data1
pielabels <- sprintf("%s = %3.1f%s", labels1, browsers, "%")
pielabels
?sprintf
pie(data1, labels= pielabels)
barplot(data1)
pie(data1,labels=NA, clockwise=TRUE,
col=rainbow(6), border="white", radius=1.2,
cex=0.8, main="Average Time Spent by Data Scientists")
legend("bottomright",legend=pielabels,bty="n", # horiz = T,
fill=rainbow(6))
barplot(data1, col=rainbow(6), names.arg=pielabels,
cex.names = 1, horiz=T, angle=90,
main="Average Time Spent by Data Scientists"
)
text(1:6, data1, labels= pielabels)
?barplot
#FIX: commented out -- 'browsers.txt' is not in the repo (the read errored)
#and its result was immediately overwritten with data1 anyway.
#browsers<-read.table("browsers.txt",header=TRUE)
browsers
library(RColorBrewer)
pie(browsers,
labels=NA,
clockwise=TRUE,
col=brewer.pal(6,"Set1"),
border="white",
radius=1,
cex=0.8,
main="Percentage Share of Internet Browser usage")
legend("bottomleft",legend=labels1,bty="n",
fill=brewer.pal(6,"Set1"))
#FIX: '?strde' is not a documented topic; presumably ?strsplit was intended
?strsplit
#FIX: 'states' was undefined; base R ships the state.name vector
state.name
================================================
FILE: 0-Practise/vector.R
================================================
#Data Structure - Vectors
# Numeric and character vectors, sequences, and a density plot of
# simulated marks
x <- c(1,5,7,8,4)
x2 = c(2,5,7,8,4)
x
x2
x4 <- c('M','F','M','F','M')   # character vector
x4
(x5 <- 1:100)                  # assign and print in one step
(x6 <- seq(from=1, to=100, by=3))
marks <- rnorm(n=60, mean=60, sd=10)   # 60 simulated exam marks
marks
plot(density(marks))           # distribution of the simulated marks
matrix(marks, ncol=6)          # reshape the 60 values into 6 columns
================================================
FILE: 0-class/AR-groceries.R
================================================
# Association Rules - Groceries data set ####
# Market-basket analysis: frequent itemsets (eclat), rule mining (apriori),
# rule sorting/subsetting, and rule visualisation with arulesViz.
library(arules) #install first
library(arulesViz) #install first
library(datasets) # no need to install, just load it reqd for Groceries
data('Groceries') #different format - transaction format
Groceries
#Structure of Groceries
str(Groceries)
Groceries
arules::LIST(Groceries[1:3]) #another view
arules::inspect(Groceries[1:3])
arules::inspect(Groceries[c(1000,5000,9000)])
arules::inspect(Groceries[sample(1:nrow(Groceries), size=3)]) #3 random baskets
nrow(Groceries)
#Find Frequent Itemset
#.01 * 9835; A + B + C = 3 items, A + B + C + D : 4 items
# eclat: itemsets of exactly 3 items occurring in at least 1% of baskets
frequentItems = eclat(Groceries, parameter = list(supp = 0.01, minlen= 3, maxlen = 3))
#frequentItems = eclat (Groceries, parameter = list(minlen= 3))
inspect(frequentItems[1:3]) #correct lib to used
frequentItems
inspect(frequentItems[]) #inspect for printing the itemssets
options(digits=3)
#inspect(frequentItems[100:122])
#Descending Sort frequent items by count : 1 to 25 itemsets
inspect(sort (frequentItems, by="count", decreasing=TRUE)[1:25])
inspect(sort (frequentItems, by="count", decreasing=F)[1:25])
#Support is : support(A&B) = n(A&B)/ N
#Plot the Frequency Plot
itemFrequencyPlot(Groceries, topN = 15,type="absolute")
itemFrequencyPlot(Groceries, topN = 10, type='relative')
abline(h=0.15)
# Create rules and the relationship between items
#parameters are min filter conditions
options(digits=2)#output decimal places to 2 digits
# apriori: rules with support >= 0.5%, confidence >= 50%, at least 2 items
rules = apriori(Groceries, parameter = list(supp = 0.005, conf = 0.5, minlen=2))
rules
inspect (rules[1:5])
#Sort Rules by confidence, lift and see the data
rulesc <- sort (rules, by="confidence", decreasing=TRUE)
inspect(rulesc[1:5])
rulesl <- sort (rules, by="lift", decreasing=T)
inspect (rulesl[1:5])
#which items have strong confidence and lift
#How To Control The Number Of Rules in Output ?
#maxlen, minlen, supp, conf
rules2 = apriori (Groceries, parameter = list (supp = 0.001, conf = 0.5, minlen=2, maxlen=3))
rules2 #no of rules
rules #earlier rules
inspect(rules2[1:15])
#Find out what events were influenced by a given event - from already created rules
subset1 = subset(rules2, subset=rhs %in% "whole milk")
inspect(subset1[1:5]) # rhs has milk
subset1 = subset(rules2, subset=rhs %in% 'bottled beer' )
inspect(subset1) #rhs has beer
#inspect(rules2)
subset2 = subset(rules2, subset=lhs %ain% c('baking powder','soda') )
inspect(subset2) # all items %ain%
subset2a = subset(rules2, subset=lhs %in% c('baking powder','soda') )
inspect(subset2a) # any of the items %in%
#RHS, Confidence, sort by Lift
subset3 = subset(rules2, subset=rhs %in% 'bottled beer' & confidence > .5, by = 'lift', decreasing = T)
inspect(subset3) #sometimes there may be no rules, change few parameters
subset4 = subset(rules2, subset=lhs %in% 'bottled beer' & rhs %in% 'whole milk' )
inspect(subset4)
subset4b = subset(rules2, subset=rhs %in% 'bottled beer' )
inspect(subset4b) #no such rules
library(arulesViz) #install first
#https://cran.r-project.org/web/packages/arulesViz/vignettes/arulesViz.pdf
#Visualizing The Rules -----
subset1
rules2
inspect(subset2)
plot(subset1[1:2])
plot(subset1[1:2], measure=c("support", "lift"), shading="confidence")
#change the axis
plot(rules2[1:100], measure=c("support", "lift"), jitter=0,shading="confidence")
#
#Find what factors influenced an event 'X' - create fresh Rules
# rhs fixed to 'whole milk'; mine what lhs combinations lead to it
rules3 = apriori (data=Groceries, parameter=list (supp=0.002,conf = 0.7), appearance = list (default="lhs",rhs="whole milk"), control = list (verbose=F))
inspect(rules3[1:5])
inspect(rules3)
#rhs as it is, lhs to have tropical fruit or herbs
rules4 = apriori (data=Groceries, parameter=list (supp=0.001,conf = 0.4), appearance = list (default="rhs",lhs=c('tropical fruit','herbs')), control = list (verbose=F))
inspect(rules4[1:5])
inspect(rules4)
plot(subset4)
#legend to filter
#legend to condition commands
# lhs - means left hand side, or antecendent
# rhs - mean right hand side, or consequent
# items - items, that make up itemsets
# %in% - matches any
# %ain% - matches all
# %pin% - matches partially
# default - no restrictions applied
# & - additional restrictions on lift, confidence etc.
#summarise Association Rules / Market Basket analysis
#load libraries - arules, arulesViz
#load dataset in Transaction Format eg Groceries
#Find frequentitems set
#Find rules as per parameters
#Parameters - min support, min confidence, minlen, maxlen
#sort - confidence, lift, count, support
#subset of rules - lhs, rhs, confidence, all, any item
#plot of the rules
#find interesting rules - high lift, high confidence
#put strategy in place - location, bundle, discounts, advertisement
================================================
FILE: 0-class/CLUST-customer.R
================================================
# HH MA example - customer
# K-means clustering of customer data pulled from a Google Sheet:
# fit 3 clusters with amap::Kmeans, then inspect sizes, centers and members.
#install.packages("amap")
library(amap)
##Read the data in the file
url = 'https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=2073914016'
library(gsheet)
data = as.data.frame(gsheet2tbl(url))
str(data)
head(data)
names(data)
summary(data)
###Verify the data
colnames(data)
class(data$Age)
apply(data, 2, FUN= class) #are all numeric
dim(data)
head(data)
summary(data)
###Run the kmeans algorithm to generate the clusters
#?amap::Kmeans
names(data)
# Cluster on all columns except the first (customer id), 3 clusters,
# euclidean distance
k1 <- amap::Kmeans(data[,-1],centers=3, iter.max = 200, nstart = 1, method = c("euclidean"))
k1$centers # group means
###Fetch size/n of obs for the groups
k1$size
###Fetch sum of squared for the groups
k1$withinss
###Fetch the cluster for each obs
#k1$cluster
k1$cluster
k1$centers
k1$cluster[9000:9800]
table(k1$cluster) #cluster sizes (same as k1$size)
k1$size
# Rows assigned to cluster 2
data_clus_2 <- data[ k1$cluster == 2,]
(data_clus_2)
mean(data_clus_2$Age)
data_clus_2$Cust_id
# Write CSV
#write.csv(data_clus_2[,1], file = "./data/data_clus_2.csv")
================================================
FILE: 0-class/DT-CART-sales.R
================================================
# CART Models - HH Case Study - Regression
# Regression tree on sales data from a Google Sheet: 70/30 train/test split,
# rpart fit, pruning by cp, and accuracy (RMSE/MAPE) on the test set.
library(rpart)
library(rpart.plot)
library(forecast)
library(gsheet)
url='https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=1941519952'
data = as.data.frame(gsheet2tbl(url))
str(data)
# Summarize the dataset
summary(data)
names(data)
# Random Sampling
set.seed(777) # To ensure reproducibility
Index = sample(x = 1:nrow(data), size = 0.7*nrow(data))
Index
# Create Train dataset
train= data[Index, ]
nrow(train)
# Create Test dataset (rows not sampled into train)
test = data[-Index, ]
nrow(test)
nrow(test) + nrow(train)
########################### Modeling #################################
# method="anova" -> regression tree; first column (id) excluded
trainModel = rpart(Annual_Sales ~ . , data = train[,-1], method = "anova")
trainModel
mean(train$Annual_Sales)
# Plot the Regression Tree
rpart.plot(trainModel, type = 4,fallen.leaves = T, cex = 1.0, nn=T)
#cp selection
printcp(trainModel)
trainModel_prune = prune(trainModel, cp=0.01)
rpart.plot(trainModel_prune)
#Predict and check accuracy
predictSales_test = predict(trainModel_prune, newdata = test, type = "vector")
predictSales_test #vector to print values of sales predicted
library(forecast)
# Validate RMSE and MAPE calculation with a function in R
ModelAccuarcy = accuracy(predictSales_test, test$Annual_Sales)
ModelAccuarcy
#RMSE should be as less as possible
================================================
FILE: 0-class/NAvalues.R
================================================
# Missing values
# Detecting NA values, mean-imputation on vectors and data frames, and the
# VIM sleep dataset for complete-case analysis.
#missing values are indicated by the NA keyword
#FIX: commented out -- c(1, ,3) is a parse error and stopped this whole file
#from being sourced; it was kept to show the *wrong* way:
#x = c(1, ,3) #wrong way to create missing values
x = c(NA, 1, NA, 2,3, NA) #introducing missing values
x
x1=x #make a copy
x1
is.na(x)
sum(is.na(x)) #count of missing values (TRUE counts as 1)
sum(c(T,F,T,F,F))
x
mean(x) #this will not work if NA values are present
?mean
mean(x, na.rm=T)
(1+2+3)/3
sum(x,na.rm=T)/3
x
x[is.na(x)] #list out missing values
mean(x, na.rm=T)
x[is.na(x)] = mean(x, na.rm=T) #replace these missing values with mean of other values
x
cbind(x,x1) #see where values have been filled
x1 = c(4,6,8,9)
length(x1[x1 >= 6])
sum(x1 >= 6)
x1 >= 6
x2 = rnorm(100000, mean=50, sd=5) #normal distributed values
x2
length(x2)
posn=sample(100000, size=30) #30 random positions to blank out
posn
x2[posn] = NA
summary(x2)
is.na(x2)
sum(is.na(x2))
mean(x2)
mean(x2, na.rm=T)
x2[is.na(x2)] = mean(x2, na.rm=T) #mean-impute the blanks
sum(is.na(x2))
#install this library
library(VIM)
?sleep
data(sleep, package='VIM')
head(sleep) #first few rows of sleep
dim(sleep) #dimensions of sleep data
complete.cases(sleep) # which row have complete data in T/ F
sum(complete.cases(sleep)) # no of rows have which no missing data
sum(!complete.cases(sleep)) # no of rows which have missing data
sleep[complete.cases(sleep),] #rows which are complete 42
sleep[!complete.cases(sleep),] #rows which have missing values 20
summary(sleep)
colSums(is.na(sleep)) #which column how many data missing
rowSums(is.na(sleep)) #which row how many data missing
head(sleep)
df= sleep #make a copy of sleep data
complete.cases(df) #complete cases
mean(df$Dream, na.rm=T) #mean of Dream Col
sum(is.na(df$Dream))
sum(!is.na(df$Dream))
df$Dream
summary(df)
df[ , 4:5]; df[ ,c(1,3,5)]
df[is.na(df$Dream), 'Dream'] #missing values in Dream Column
df[is.na(df$Dream), "Dream"] = mean(df$Dream, na.rm=T) #find and replace
df$Dream
#NOTE: exact equality on a floating-point mean is fragile; this only counts
#the imputed rows because the mean happens to print as 1.972. Prefer
#round(df$Dream, 3) == 1.972 or all.equal().
sum(df$Dream == 1.972)
#use mice package
library(mice)
#https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
#MICE, Amelia, missForest, Hmisc, mi
================================================
FILE: 0-class/autoML1.R
================================================
# Auto ML in R
# Three quick AutoML-style experiments: caret random forest on simulated
# data, lares::h2o_automl on the Titanic data, and h2o.automl on mtcars.
pacman::p_load(caret, randomForest)
set.seed(123)
n = 100
# Simulated binary-classification data: two noise predictors, random target
sData <- data.frame(X1=rnorm(n), X2= rnorm(n), Y = rbinom(n, 1, 0.5))
sData
target <- 'Y'
#hyper parameters
ctrl <- trainControl(method='cv', number=5) #5-fold cross-validation
# NOTE(review): mtry is a count of predictors sampled per split; 2.5 is
# fractional -- confirm randomForest truncates it as intended.
tune_grid <- expand.grid(.mtry = 2.5)
aModel1 <- train(sData[, setdiff(names(sData), target)], sData[, target], method='rf', trControl = ctrl, tuneGrid = tune_grid)
aModel1
#predictions
new_data = data.frame(X1=0.1, X2 = -0.2)
predictions = predict(aModel1 , newdata = new_data)
print(aModel1)
#---------
library(lares)
# The data we'll use is the Titanic dataset
data(dft)
df <- subset(dft, select = -c(Ticket, PassengerId, Cabin))
df
# NOTE(review): lares examples typically quote the response (y = "Survived");
# verify this bare-name form works with the installed lares version.
r <- h2o_automl(df, y = Survived, max_models = 1, impute = FALSE, target = "TRUE")
#H2o----
library(h2o)
library(caTools)
mt = mtcars
# 70/30 split via caTools, balanced on the mpg outcome
index_mt = sample.split(Y = mt$mpg, SplitRatio = .7)
index_mt
train_mt = mt[index_mt, ]
test_mt = mt[!index_mt, ]
sapply(list(train_mt, test_mt), dim)
y <- 'mpg'
x <- setdiff(names(train_mt), y) #all remaining columns as predictors
x; y
# NOTE(review): h2o.init() is never called here and training_frame is a
# plain data.frame; h2o.automl normally needs a running H2O cluster and
# as.h2o(train_mt) -- confirm before relying on this line.
aModel2 <- h2o.automl(x=x, y = y, training_frame = train_mt, max_models = 5, seed=1)
================================================
FILE: 0-class/bigQuery.R
================================================
# big query
# Query the public BigQuery natality sample via bigrquery, first with the
# low-level bq_* API and then through the DBI interface.
#https://bigrquery.r-dbi.org/dev/
#install.packages("bigrquery")
library(bigrquery)
library(DBI)
# NOTE(review): bq_test_project() only works with bigrquery's own test
# setup; substitute your billing project ID here.
billing <- bq_test_project() # replace this with your project ID
sql <- "SELECT year, month, day, weight_pounds FROM `publicdata.samples.natality`"
tb <- bq_project_query(billing, sql) #run the query server-side
bq_table_download(tb, n_max = 10) #fetch the first 10 rows
# Same query through a DBI connection
con <- dbConnect(
bigrquery::bigquery(),
project = "publicdata",
dataset = "samples",
billing = billing
)
con
dbListTables(con)
dbGetQuery(con, sql, n = 10)
================================================
FILE: 0-class/hhe_d01.R
================================================
#HHE Batch : Nov 2023
#05Nov2023
# Day 1: vectors, matrices, data frames, lists, CSV round-trip, base plots,
# and dplyr grouping/summarising on mtcars.
mtcars
#control+enter to run a line and move ahead
#customise shortcuts
#vector----
x1 = c(1,4,6) #control + enter -> modify shortcuts
x1
print(x1)
x1[2]
x1[c(1,3)]
x2 <- c(4,35,34,77)
x2
print(x2)
class(x2)
#vector, matrics, dataframes - data structure
mean(x1)
1:10
#FIX: was 'X2' -- R is case-sensitive and X2 is undefined; the vector is x2
x2
(x3 = 100:200) #assign and print
min(x3)
max(x3)
sd(x3)
x5 = c('A', "Chaman", "Divya", 'mUNAZ')
class(x5)
x5
x5[3]
toupper(x5)
tolower(x5)
#matrix----
50:60
length(50:61)
?matrix
m1 = matrix(data=50:61,ncol=3, byrow=T, dimnames = list(c('R1','R2','R3','R4'), c('C1','C2','C3')))
m1
colSums(m1)
colMeans(m1)
rowMeans(m1)
#array----
#will do later
?array
array(data=1:3, dim=c(2,4,2,2)) #4-dimensional array, values recycled
#dataframe----
rollno = c('S01','S02')
name = c('Dhiraj','Munaz')
age = c(55, 42)
sapply(list(rollno, name,age), class)
df = data.frame(rollno, name, age)
df
#write then re-read the data frame as CSV (path relative to project root)
write.csv(df, file='./0-class/munaz.csv', row.names=F, na='' )
?read.csv
df1 = read.csv(file='./0-class/munaz.csv')
df1
#list----
ls()
list1 = list(df, x1,m1, age) #mixed container: data frame, vectors, matrix
list1
list1[1]
list1[4]
#factor----
mtcars
?mtcars
?factor
class(mtcars)
str(mtcars)
mt = mtcars
str(mt)
dim(mt)
head(mt)
tail(mt)
summary(mt)
boxplot(mt$mpg)
hist(mt$mpg)
lines(mt$mpg)
barplot(table(mt$cyl), col=1:3)
barplot(table(mt$cyl), col=c('red','green','yellow'))
pie(table(mt$cyl), col=c('red','green','yellow'))
?pie
table(mt$cyl)
mt$cyl
table(mt$cyl, mt$am) #cylinders vs transmission cross-tab
#dplyr-----
library(dplyr)
?mtcars
names(mt)
#three equivalent ways to count rows per gear
mt %>% group_by(gear) %>% count()
mt %>% group_by(gear) %>% tally()
mt %>% group_by(gear) %>% summarise(n=n())
mt %>% group_by(gear) %>% summarise(n=n(), meanMPG = mean(mpg, na.rm=T), minMPG = min(mpg, na.rm=T), maxMPG = max(mpg, na.rm=T))
mt %>% group_by(gear,am) %>% summarise(n=n(), meanMPG = mean(mpg, na.rm=T), minMPG = min(mpg, na.rm=T), maxMPG = max(mpg, na.rm=T))
mt %>% arrange(mpg)
mt %>% arrange(gear, -mpg) %>% select(gear, mpg)
mt %>% group_by(gear) %>% slice_max(mpg, n=1) #best-mpg car per gear
mt %>% group_by(gear) %>% slice_min(mpg, n=1)
mt %>% group_by(gear) %>% slice_max(hp, n=2)
mt %>% group_by(gear) %>% summarise(n=n(), meanMPG = mean(mpg, na.rm=T), meanWT = mean(wt, na.rm=T))
mt %>% group_by(gear, am) %>% tally()
================================================
FILE: 0-class/hhe_d02.R
================================================
# HHE Batch : MA with R ; Munaz
#day2------
# Day 2: simulate a student dataset, build a data frame, summarise with
# dplyr across(), and draw first ggplot2 charts.
#vectors-----
(rollNo = 1:100)
paste('student',1:100)
(name = paste('student', sprintf('%003d',1:100))) # zero-padded labels: 001..100
set.seed(1234) # fix RNG so the simulated data are reproducible
(gender = sample(x=c('M','F'), size=100, replace=T, prob=c(.6, .4)))
table(gender)
prop.table(table(gender))
courses = c('BBA', 'MBA', 'PHD')
course = sample(x=courses, size=100, replace=T, prob = c(.5, .3, .2))
course
table(course)
prop.table(table(course))
#marketing, operations
(marketing = round(rnorm(n=100, mean=65, sd=7))) # normally distributed marks
mean(marketing); sd(marketing)
?distributions
hist(marketing)
(operations = round(runif(n=100, min = 55, max=90))) # uniformly distributed marks
range(operations)
hist(operations)
grades = c('A','B','C')
(project = sample(x=grades, size=100, replace=T))
(project = factor(project, ordered=T, levels = c('C', 'B','A'))) # ordered C < B < A
varNames = list(rollNo, name, gender, course, marketing, operations, project)
sapply(varNames, length) # all columns must be the same length before data.frame()
?sapply(varNames, IQR)
sapply(varNames, length)
?apply
#matrix------
(m1 = matrix(data= 1:24, nrow=4, ncol=6))
#array -----
(a1 = array(data = 1:24, dim = c(4,6)))
(a2 = array(data = 1:24, dim = c(3,4,2)))
#factor-----
(courses = factor(x=c('BBA', 'MBA', 'PHD')))
(likertScale = factor(x=c('Excellent','Good','Fair','Poor')))
(likertScale2 = factor(x=c('Excellent','Good','Fair','Poor'), ordered = T))
(likertScale2 = factor(x=c('Excellent','Good','Fair','Poor'), ordered = T, levels = c('Poor', 'Fair', 'Good','Excellent')))
#data frame-------
sapply(list(rollNo, name, gender, course), length)
df = data.frame(rollNo, name, gender, course, marketing, operations, project)
head(df)
str(df)
summary(df)
df$name
df$gender = factor(gender) # convert character columns to factors for summary()
df$course = factor(course)
df$project
summary(df)
tail(df)
library(dplyr)
#mean marks of subjects - gender wise
df %>% group_by(gender) %>% summarise(meanMarketing = mean(marketing, na.rm=T), meanOperations = mean(operations, na.rm=T), count = n())
df %>% group_by(gender, course) %>% summarise(meanM = mean(marketing, na.rm=T), maxM = max(marketing), minM = min(marketing))
names(df)
# across(): apply the same function(s) to several columns at once
df %>% group_by(gender) %>% summarise(across(.cols=marketing:operations, .fns=mean, .names='mean_{col}'))
df %>% group_by(gender) %>% summarise(across(.cols=c(marketing, operations), .fns = list(mean=mean, min=min, max=max), na.rm=T, .names="{fn}_{col}"))
df %>% group_by(gender) %>% summarise(across(where(is.numeric), .fns = list(mean=mean, max=max), na.rm=T, .names="{col}_{fn}"))
df %>% group_by(gender) %>% summarise(across(where(is.numeric), .fns = list(mean=mean, max=max), .unpack=T))
cols = c('marketing', 'operations')
df
df %>% mutate(across(all_of(cols), round,-1)) # round to nearest 10
df %>% mutate(across(where(is.double) & !c('marketing'), round,-1))
x=1234566677.999
round(x,0)
trunc(x,2)
floor(x)
ceiling(x)
?grepl
gsub('4','99999999',x) # x is coerced to a string for the substitution
#automatically unpack
x='munaz'
gsub('z','j',x)
#list --
#ggplot-------
library(ggplot2)
head(mpg)
?mpg
?ggplot
ggplot(mpg, aes(x=displ, y=hwy, colour = class)) + geom_point()
head(mtcars)
ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point(color='red')
# map gear to colour, cyl to shape, hp to point size
ggplot(mtcars, aes(x=wt, y=mpg, color=factor(gear), shape=factor(cyl), size=hp)) + geom_point()
ggplot(mtcars, aes(x=wt, y=mpg, color=factor(gear), shape=factor(cyl), size=hp)) + geom_point() + geom_text(aes(label=paste0('(',wt,',',mpg,')')), hjust=1.5, size=3) + facet_wrap(am ~ vs, labeller = label_both)
?facet_wrap
#end of day2---------
================================================
FILE: 0-class/hhe_d03.R
================================================
#HHE MAR - Day3
# Day 3: with/within, mutate/across, filtering helpers, missing values,
# and descriptive statistics on mtcars.
library(dplyr) #load the library
#with and within-----
df = mtcars
#with(DF, expr) : new Col
#within(DF, newCol <- expr) : new DF
names(df)
?mtcars
head(df)
?with
(newMPG <- with(df, mpg * 1.5)) # with() returns just the computed vector
21 * 1.5
class(newMPG)
newMPG
(DF2 <- within(df, newMPG2 <- mpg * 1.5)) # within() returns a modified copy of df
head(DF2)
class(DF2)
#table-----
df$gear
table(df$gear)
names(df)
table(df$carb)
#mutate------
df$mpg
range(df$mpg)
mean(df$mpg)
median(df$mpg)
sort(df$mpg)[8] # [32/4]
quantile(df$mpg,c(0, .25, .5, .75,1))
df %>% summary()
df %>% mutate(gear = factor(gear)) %>% summary()
df %>% mutate_at(c('cyl','vs', 'am','carb', 'gear'), as.factor) %>% summary()
df %>% mutate(across(c(cyl, vs, am, carb), .fns= as.factor)) %>% str()
df %>% mutate(across(c(2,8:11), .fns= as.factor)) %>% summary()
names(df)
head(df)
df %>% mutate(across(c(mpg, wt, hp, qsec), .fns= round)) %>% head()
cols = c('mpg', 'hp') #list of cols
df %>% mutate(across(all_of(cols), .fns= round)) %>% head()
#only numeric & mpg & wt
df %>% mutate(across(where(is.double) & c(mpg, wt), .fns= round)) %>% head()
#summarise----purr style
df %>% group_by(gear) %>% summarise(across(all_of(cols), ~mean(.x, na.rm=T)))
df %>% group_by(gear) %>% summarise(across(all_of(cols), list(mean=mean, sd=sd)))
#names of output-----
df %>% group_by(gear) %>% summarise(across(all_of(cols), list(mean=mean, sd=sd), .names = '{.col}.{.fn}'))
df %>% group_by(gear) %>% summarise(across(all_of(cols), list(mean=mean, sd=sd), .names = '{.fn}.{.col}'))
# if_any / if_all: keep rows where any/all of the selected columns satisfy the predicate
df %>% filter(if_any(.cols = c(1:11), .fns= is.character))
df %>% filter(if_any(.cols = c(1:11), .fns= is.integer))
df %>% filter(if_any(.cols = c(1:11), .fns= is.numeric))
df %>% filter(if_all(.cols = c(1,3,4,5), .fns= is.numeric))
df %>% select_if(is.numeric) %>% names()
df %>% select_if(where(is.numeric)) %>% names()
# purrr::keep / discard select columns by predicate
df %>% purrr::discard(~is.numeric(.)) %>% names()
df %>% purrr::discard(~ !is.numeric(.)) %>% names()
df %>% purrr::keep(~ is.numeric(.)) %>% names()
df %>% purrr::keep(~ is.factor(.)) %>% names()
df %>% mutate_at(c('cyl','vs', 'am','carb'), as.factor) %>% purrr::keep(~ is.factor(.)) %>% names()
df %>% select_if(is.integer)
#groupby-------
names(df)
head(df)
sum1 <- df %>% group_by(gear) %>% summarise(count =n(), meanMPG = mean(mpg, na.rm=T), minMPG = min(mpg), maxMPG = max(mpg), meanWT = mean(wt, na.rm=T))
sum1
library(ggplot2)
# reshape the summary to long format and facet one bar chart per statistic
sum1 %>% tidyr::pivot_longer(cols=count:meanWT, names_to = 'stats') %>% mutate(gear = factor(gear)) %>% ggplot(., aes(x = gear, y=value, fill=gear)) + geom_bar(stat='identity') + geom_text(aes(label=round(value,1))) + facet_wrap(stats ~., scales='free')
#1st car in each gear group, highest mpg
df %>% group_by(gear) %>% arrange(gear, -mpg) %>% select(gear, mpg,wt, hp) %>% top_n(n=3, wt=mpg)
df %>% filter((mpg > 30 & am == 1) | wt < 2)
df %>% slice(5:7)
# NOTE(review): sample() on a data.frame samples COLUMNS (list elements),
# not rows; slice_sample() below is the row-sampling equivalent.
sample(x=df, size=4)
?slice_sample
slice_sample(df, n=5)
slice_min(df, order_by = mpg, n=1)
df %>% select(gear, mpg, hp, disp) %>% group_by(gear) %>% summarise(across(everything(), mean))
df %>% group_by(gear) %>% summarise(across(c(mpg, hp, disp, qsec), mean), .groups = 'drop')
#arrange-------
#missingValues------
v1 = c(1, NA, 3, 4, NA, 6)
v1
sum(is.na(v1)) # count of missing values
mean(v1) # NA: mean propagates missing values
mean(v1, na.rm=T)
v1[is.na(v1)] = 99 # replace NAs with a sentinel value
v1
df2 = df
# knock out 10 random values in mpg and hp to practise imputation
df2[sample(1:32, size=10), 'mpg'] = NA
df2[sample(1:32, size=10), 'hp'] = NA
colSums(is.na(df2))
str(df2)
df2 %>% select(mpg, hp, gear)
# mean-imputation of missing mpg values
df2 %>% mutate(mpg = ifelse(is.na(mpg), mean(mpg, na.rm=T), mpg)) %>% select(mpg, hp, gear)
#statistics-----
summary(df)
names(df)
numCols = c('mpg','hp','disp','qsec', 'wt')
cor(df[numCols]) # pairwise correlation matrix
cor(df$wt, df$mpg)
sapply(df[,numCols], sd)
sapply(df[,numCols], mean)
sapply(df[,numCols], quantile)
sapply(df[,numCols], fivenum)
fivenum(df$wt) #min, lowerHinge, median, upperHinge, max
boxplot(df$wt, main=paste('Five Number Summary Box Plot \n min, lowerHinge, median, upperHinge, max \n', paste(fivenum(df$wt), collapse=',')))
abline(h=fivenum(df$wt))
boxplot(wt ~ am, data=df, ann=T, col=c('red','green'))
library(e1071)
skewness(df$wt)
kurtosis(df$wt)
library(Rfast)
colkurtosis(as.matrix(df[,numCols]))
#devtools::install_github('smin95/smplot')
#devtools::install_github('smin95/smplot2', force=T)
library(smplot2)
?smplot2
================================================
FILE: 0-class/hhe_d04.R
================================================
# HHE Day-4 : 19 Nov 2023 : Munaz
# Supervised and Non-Supervised Learning
# Hypothesis Testing
# Linear & Logistic Regression
# Market Basket & Clustering
# Missing Values
# Customer Churn & Customer Purchase Probability
# autoML
#SLM--------
# Simple linear model: predict weight from height (women dataset).
women
?women
head(women)
cov(women)
cor(women)
plot(women)
?lm
lm1 <- lm(formula = weight ~ height, data=women)
lm1
# weight = coeff + slope * height
# wt = -87.52 + 3.45 * ht
# y = c + mx
range(women$height)
library(dplyr)
women %>% arrange(height)
hist(women$height)
colMeans(women)
head(women)
-87.52 + 3.45 * 58 # manual prediction for height 58
plot(lm1) # diagnostic plots for the fitted model
summary(lm1)
residuals(lm1)
nrow(women)
cbind(women, residuals(lm1), fitted.values(lm1)) # actual vs fitted, side by side
#MLM-----
# Multiple linear model: mpg on several predictors (mtcars).
head(mtcars)
?mtcars
#predict mpg(DV) on other IVs(wt, hp, gear)
mt2 = mtcars %>% mutate(gear = factor(gear)) %>% select(mpg, wt, hp, gear)
summary(mt2)
lm2 = lm(formula = mpg ~ wt + hp + gear, data = mt2)
lm2
# mpg = 34.87 - 3.2 * wt + -0.034 * hp #G3
# mpg = 34.87 - 3.2 * wt + -0.034 * hp + 1.26 * gear #G4
# mpg = 34.87 - 3.2 * wt + -0.034 * hp + 1.86 * gear #G5
mt3 = mtcars %>% select(mpg, wt, hp, disp)
head(mt3)
lm3 <- lm(formula = mpg ~ wt + hp + disp, data = mt3)
lm3
summary(lm3)
lm4 <- lm(formula = mpg ~ wt + hp, data = mt3)
lm4
summary(lm4)
plot(mt3$mpg, mt3$hp)
lm5 <- lm(mpg ~ ., data=mtcars) # mpg on every other column
summary(lm5)
#train & Test ----
# Simulate a binary outcome + salary, then fit a logistic regression.
noYes = sample(x=c('No','Yes'), size=100, replace=T, prob=c(.6, .4))
# NOTE: the next line overwrites the character version with a 0/1 coding.
noYes = sample(x=c(0,1), size=100, replace=T, prob=c(.6, .4))
table(noYes)
prop.table(table(noYes))
salary = round(rnorm(n=100, mean=60, sd=10),0)
salary
df = data.frame(salary, noYes)
df
#df$noYes = factor(df$noYes)
summary(df)
# Splitting dataset
library(caTools)
library(ROCR)
#split DF into 70% and 30% rows
?sample.split
# sample.split keeps the class proportions similar in both partitions
split <- sample.split(df$noYes, SplitRatio = 0.7)
split
prop.table(table(split))
train_reg <- subset(df, split == "TRUE")
test_reg <- subset(df, split == "FALSE")
dim(train_reg)
dim(test_reg)
train_reg
prop.table(table(train_reg$noYes))
prop.table(table(test_reg$noYes))
# Training model
logistic_model <- glm(noYes ~ salary, data = train_reg, family = "binomial")
logistic_model
# Summary
summary(logistic_model)
#https://www.geeksforgeeks.org/logistic-regression-in-r-programming/
# type = "response" returns predicted probabilities (not log-odds)
predict_reg <- predict(logistic_model, test_reg, type = "response")
test_reg
#https://www.statology.org/logistic-regression-in-r/
#practise this-----
#https://www.geeksforgeeks.org/automated-machine-learning-for-supervised-learning-using-r/
predict_reg
cbind(test_reg, predict_reg)
# Changing probabilities
predict_reg2 <- ifelse(predict_reg > 0.5, 1, 0) # threshold at 0.5
cbind(test_reg, predict_reg, predict_reg2)
library(caret)
# confusionMatrix() expects the PREDICTIONS as `data` and the observed
# classes as `reference`; the original call passed them the other way
# round, which swaps sensitivity/specificity (and mislabels the table).
confusionMatrix(data = factor(predict_reg2), reference = factor(test_reg$noYes))
(13+5)/30 # manual accuracy check: (TN + TP) / total test observations
================================================
FILE: 0-class/hhe_d04b.R
================================================
# day4 - MAR - Munaz
#plan - Modeling
#What is Linear Regression ? , What does it do ?, How do we run the model in R ? How do we check if the model is good ?
#general eq : y = c + mx ; y = c + b1x1 + b2x2 ....
#x's are IVs, y is the DV, c - constant/intercept; m/b1/b2 - slopes/coefficients
women
?women
head(women)
#Does height depend on weight or vice-versa ?
#create relation ship between height & weight
lm1 = lm(weight ~ height, data=women)
summary(lm1)
print(lm1)
plot(weight ~ height, data=women, main='Height & Weight of Women')
abline(lm1) #line of prediction
#as height increase, weight increases almost linearly
predict(lm1, newdata = data.frame(height = c(50, 65.5, 75.5, 100)))
#prediction for ranges beyond IV is wrong
range(women$height)
#multiple Linear Regression ------
df = mtcars[, c('mpg', 'wt','hp', 'drat')]
head(df)
lm2 = lm(mpg ~ wt + hp + drat, data=df)
print(lm2)
summary(lm2)
sapply(df, range) # check ranges before predicting on new data
(df1A = data.frame(wt= c(2,3), hp=c(100, 200), drat = c(3,4)))
predict(lm2, newdata = df1A) #mpg predicted
cbind(df1A, mpgPredicted = predict(lm2, newdata = df1A))
library(car)
avPlots(lm2) # added-variable plots: each IV's partial effect on mpg
#about plot
#angle of the line in each plot matches the sign of coef from estimate equation
coef(lm2)
#x- single IV, y - DV (mpg)
#blue line - association bet IV & DV, keeping other IVs constant
#labelled points (2) - largest residuals & largest partial leverage (contribution of individual IV to total leverage/change)
#summary
summary(lm2)
#R2, Adjs R2, Coeff & their p value, FStat p Value
lm2$residuals # diff betw predicted and actual DV
(lm2_residuals = residuals(lm2))
hist(lm2_residuals) #it should look like normal
qqnorm(lm2_residuals) #it should be straight line
qqline(lm2_residuals) #few portions around the straight line
#multi-collinearity
library(ggcorrplot)
df_subset = df[, c('wt', 'hp', 'drat')]
(corr_matrix= round(cor(df_subset),2))
ggcorrplot(corr_matrix, hc.order=T, type='lower', lab=T)
#if there is strong corr (> .8) between two IVs, keep only 1 of them
#compare model----
anova(lm2)
#anova(model1, model2) #if pvalue < .05, choose model1
#another way to check assumptions
plot(lm2)
par(mfrow=c(2,2)) # show all four diagnostic plots in one 2x2 panel
plot(lm2)
#5key assumptions-----
#linear relationship, Multivariate normality, Little or no Multicollinearity, No Auto Correlation, Homoscedasticity
library(ggfortify)
autoplot(lm2) # ggplot2 version of the diagnostic plots
par(mfrow=c(1,1)) # reset the plotting panel
#residual vs Fitted - Linear Relationship
plot(lm2, 1)
#Normal Q-Q : Normally Distributed Residuals
plot(lm2, 2)
#Scale-Location (Spread) : Homoeneity of variance of residuals. it should b horizontal line
plot(lm2, 3)
#Residuals vs Leverage : Influential cases (outliers)
plot(lm2, 5)
#Cooks Distance
plot(lm2, 4) #3 extreme
plot(lm2, 4, id.n=5) #top 5 extreme values
================================================
FILE: 0-class/lm_AIC.R
================================================
# AIC Linear Regression
# Model selection on mtcars: compare nested models by AIC, then use
# stepwise selection (MASS::stepAIC and olsrr).
#https://bookdown.org/steve_midway/DAR/model-selection.html
mtcars.lm1 <- lm(mpg ~ disp, data=mtcars)
summary(mtcars.lm1)
broom::glance(mtcars.lm1) # one-row model summary: R2, AIC, BIC, ...
mtcars.lm2 <- lm(mpg ~ disp + wt, data=mtcars)
broom::glance(mtcars.lm2)
mtcars.lm3 <- lm(mpg ~ disp + wt + hp, data=mtcars)
broom::glance(mtcars.lm3)
mtcars.lm4 <- lm(mpg ~ disp + wt + hp + cyl, data=mtcars)
broom::glance(mtcars.lm4)
mtcars.lm5 <- lm(mpg ~ disp + wt + hp + cyl + gear, data=mtcars)
broom::glance(mtcars.lm5)
# lower AIC = better trade-off between fit and complexity
AIC(mtcars.lm1, mtcars.lm2, mtcars.lm3, mtcars.lm4, mtcars.lm5)
sapply(list(mtcars.lm1, mtcars.lm2, mtcars.lm3, mtcars.lm4, mtcars.lm5), broom::glance)
#http://www.sthda.com/english/articles/38-regression-model-validation/158-regression-model-accuracy-metrics-r-square-aic-bic-cp-and-more/
library(MASS)
data(mtcars)
summary(car_model <- lm(mpg ~., data = mtcars))
# stepwise search in both directions, starting from the full model
step_car <- stepAIC(car_model, trace = TRUE, direction= "both")
#The goal is to have the combination of variables that has the lowest AIC or lowest residual sum of squares (RSS).
step_car
#The last line is the final model that we assign to step_car object.
library(stargazer)
stargazer(car_model, step_car, type = "text") # side-by-side comparison table
#----
library(olsrr)
step_plot <- ols_step_both_aic(car_model)
plot(step_plot)
?ols_step_both_aic
================================================
FILE: 0-class/missingValues.R
================================================
#missing values
# Detecting, recoding, excluding and visualising NAs.
rm(list = ls()) # clear the workspace so the examples start clean
# (original had `is.na(x)` here BEFORE x was created, which errored with
# "object 'x' not found" right after rm(); define x first.)
x <- NA
is.na(x) # returns TRUE if x is missing
y <- c(1,2,3,NA)
is.na(y) # returns a logical vector (F F F T)
#Recoding Values to Missing
mt = mtcars
mt$v1 = 99 # add a column coded with the sentinel value 99
head(mt)
# recode 99 to missing for variable v1
# select rows where v1 is 99 and recode column v1
mt$v1[mt$v1==99] <- NA
mt
#Excluding Missing Values from Analyses
x <- c(1,2,NA,3)
mean(x) # returns NA: missing values propagate by default
mean(x, na.rm=TRUE) # returns 2
mt
mt[c(1,4,10,15,30), 'v1'] = 100
mt
mt[complete.cases(mt),] # keep only rows with no NA in any column
library(VIM)
sleep # VIM's mammal sleep dataset, with genuine missing values
is.na(sleep)
summary(sleep)
sum(is.na(sleep)) # total count of NAs
colSums(is.na(sleep)) # NAs per column
rowSums(is.na(sleep)) # NAs per row
complete.cases(sleep)
sleep[complete.cases(sleep),]
aggr(sleep, prop = F, numbers = T) # VIM plot of missingness patterns
library(visdat)
vis_miss(sleep) # heatmap of missing cells
================================================
FILE: 0-class/munaz.csv
================================================
"rollno","name","age"
"S01","Dhiraj",55
"S02","Munaz",42
"S03","HHE",30
================================================
FILE: 0-class/purchaseProb.R
================================================
# customer purchase probability
#https://www.masterdataanalysis.com/r/using-r-predict-customer-will-buy/
================================================
FILE: 01-IIM/10a-daily.R
================================================
#daily practise File
#keep yourself updated with common R commands and Modeling Techniques
================================================
FILE: 01-IIM/11-analyticLevels.R
================================================
#all levels of analytics - Descriptive, Diagnostic, Predictive, Prescriptive
# Walks one sales-vs-area dataset through all four analytics levels.
#import from ggsheet
library(gsheet)
slr1 = "https://docs.google.com/spreadsheets/d/1qLHa5qFTyWacta8F-IGo6J3Zpf-BVR9OrlqONuJDqYc/edit#gid=2023826519"
df = as.data.frame(gsheet2tbl(slr1)) # download the sheet into a data frame
names(df) = c('area','sales')
head(df)
df
#X-area in sqft, Y-sales in some unit currency
str(df)
linearmodel = lm(sales ~ area, data=df)
plot(df$area, df$sales)
abline(lm(sales ~ area, data=df), col='red') # fitted regression line
cor(df$area, df$sales)
cov(df$area, df$sales)
summary(linearmodel)
#Ho: (F Test) : No relationship between Y and any X
#Ha: There is relationship between Y and at least one X
# p < 0.05 Reject Ho in favour of Ha
coef(linearmodel)
#Y = 0.96 + 1.66 * X
range(df$area) #value of X to be betw this range : interpolation not extrapolation
(new2 = data.frame(area=c(1.5,2,3,4,5)))
(p2sales= predict(linearmodel, newdata= new2))
cbind(new2, p2sales)
summary(linearmodel)
head(df)
#residual
(sales = 0.96 + 1.66 * 2.2) # predicted sales at area = 2.2
(r = 5.6 - 4.61) # residual = actual - predicted at that point
plot(df$area, df$sales)
abline(lm(sales ~ area, data=df), col='red')
abline(v=2.2,h=c(5.6, 4.61)) # mark actual vs predicted on the plot
cbind(resid(linearmodel))
fitted(linearmodel) - df$sales
#assumptions
plot(linearmodel)
#descriptive ------
df
head(df)
summary(df)
hist(df$area)
hist(df$sales)
range(df$sales)
range(df$area)
#diagnostic -----
cor(df)
cov(df)
#predictive -------
model = lm(sales ~ area, data = df)
model
summary(model)
testdata = data.frame(area=c(3,4))
predicted = predict(model, newdata = testdata)
cbind(testdata, predicted)
#prescriptive-----
pacman::p_load(lpSolveAPI)
#we want min sales to be 11 units where area can be between 2 & 5 units
#model : (sales = 0.96 + 1.66 * area)
#First we create an empty model x.
(11 - 0.96) / 1.66 # ~6.05 units of area required: invert sales = 0.96 + 1.66*area
# (original `11/1.66 - .96` subtracted the intercept AFTER dividing, giving 5.67)
#max: 0.96 + 1.66 * area
# Linear program: maximise 1.66*area with area bounded, via lpSolveAPI.
lprec1 <- make.lp(0, 1) # 0 constraints, 1 decision variable
lprec1
set.objfn(lprec1, c(1.66)) # objective: 1.66 * area
lprec1
#maximise it
lp.control(lprec1, sense="max")
lprec1
set.bounds(lprec1, lower = c(2), columns = c(1))
lprec1
set.bounds(lprec1, upper = c(4.5), columns = c(1))
lprec1
ColNames <- c("area")
dimnames(lprec1)[[2]] <- list(ColNames)
lprec1
solve(lprec1) #[1] 0 ok
#get.dual.solution(lprec)
get.objective(lprec1) # optimal objective value (1.66 * area)
get.variables(lprec1) # optimal area
get.constraints(lprec1)
print(lprec1)
#another way------
# Set 0 constraints and 1 decision variables
lprec2 <- make.lp(nrow = 0, ncol = 1)
# Set the type of problem we are trying to solve
lp.control(lprec2, sense="max")
# Set type of decision variables
set.type(lprec2, 1, type=c("real"))
## Set the coefficients of the decision variables -> C
C <- c(1.66)
# Set objective function coefficients vector C
set.objfn(lprec2, C)
lprec2
# Add constraints
add.constraint(lprec2, c(1) , "<=", 4.5)
lprec2
add.constraint(lprec2, c(1) , ">=", 2)
lprec2
solve(lprec2)
get.objective(lprec2)
# NOTE(review): get.objective() already returns 1.66*area, so multiplying it
# by 1.66 again double-counts the slope; predicted sales should be
# .96 + get.objective(lprec2), or .96 + 1.66 * get.variables(lprec2).
.96 + 1.66 * get.objective(lprec2)
# Get the decision variables values
get.variables(lprec2)
# Get the value of the objective function
get.objective(lprec2)
# Note that the default boundaries on the decision variable are c(0, 0, 0) and c(Inf, Inf, Inf)
get.bounds(lprec2)
# Boundaries can be set with following function
#lpSolveAPI::set.bounds()
set.bounds(lprec2, lower=3, upper=NULL, columns = 1)
#contain a constant, but this constant does not change the optimization problem or the optimal solution.
4.5 * 1.66 + .96 # sales predicted at the upper bound area = 4.5
================================================
FILE: 01-IIM/11a1-start.R
================================================
#initial commands
# First session: assignment, help, built-in datasets, simple stats & plots.
# assign
x1 = 3 #press control + enter to run the line
x2 <- 3 # same
x3 < - 3 #incorrect: the space makes this a comparison `x3 < -3`, and errors (x3 undefined)
#which is better
x1
x2
y # deliberate error demo: y was never defined -> "object 'y' not found"
ls() #variables in env
women
?AirPassengers
data() # datasets available for use
library() # lists INSTALLED packages (use search() for currently loaded ones)
?mean #help
help(mean)
??mean #search through other sources
x=0:100000000
x
x <- c(0:10, 50,100,200)
x
mean(x)
xm <- mean(x)
xm
mean(x, trim = 0.10) # trimmed mean: drops 10% of values from each tail first
x=c(1,1,1,1,5,5,5,5,7,7)
mean(x)
mean(x, trim=.3)
x=c(1,5,5)
mean(x)
c(mean(x), mean(x, trim = 0.10))
1:10
1:10000000
x=c(1,34,5)
x
?c
version #version of R
Sys.Date() # todays date
getwd() # working directory
methods(class='matrix') #methods available for a class of object
plot(10:100) #basic command to plot
plot(women)
#vector
#how to run command
#check datasets
#install library
#check which libraries available
#.....
================================================
FILE: 01-IIM/11a2-packages1.R
================================================
#List of packages to be installed
# Catalogue of packages used in the course, grouped by topic, plus the
# pacman helpers for installing / loading / unloading them in bulk.
#Installing
listOfPackages = c('P1','P2') # placeholders: replace with real package names
install.packages(listOfPackages)
#first install package pacman
#loading multiple packages
library(pacman)
# pacman helpers need char= / character.only=TRUE when given a character
# vector; otherwise p_load(listOfPackages) looks for a package literally
# named "listOfPackages".
pacman::p_load(char = listOfPackages)
#install and load : first time only
pacman::p_install(listOfPackages, character.only = TRUE)
#unload packages
pacman::p_unload(char = listOfPackages)
#remove packages from PC
pacman::p_delete(listOfPackages, character.only = TRUE)
#datasets packages from packages
pacman::p_data(listOfPackages)
#installing
library(devtools) #installing from github/dev env
#data manipulation-----
library(dplyr) #summarise
library(data.table) #new DS for manipulating large data
library(tidyverse) # data manipulation (original had `ibrary(...)` -- typo)
#linear modeling
library(olsrr)
#decision tree-----
library(rpart)
library(rpart.plot)
library(randomForest)
library(CHAID) #library for performing CHAID decision tree
install.packages("CHAID", repos="http://R-Forge.R-project.org")
library("mlbench") #data for DT
library(partykit)
library(party)
library(ISLR) #data for DT
#association rule-----
library(arules)
library(arulesViz)
#clustering-----
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization
library(dendextend) # for comparing two dendrograms
library(fpc)
library(NbClust) # finding the optimal number of clusters
library(amap)
library(flexclust)
#statistics-----
library(survey) #data for stats
library(catspec) #categories
library(survival) #data for statistics
library(MASS)
library(caret) #train & test, accuracy
library(caTools) #sampling
library(Hmisc)
library(e1071) # skewness/kurtosis, SVM, naive Bayes (was `e10171` -- typo)
library(BSDA) #ztest
library(gmodels)
library(questionr)
library(fBasics)
library(outliers)
library(car)
library(mice) #missing value
library(mosaic)
#timeseries-----
# CRAN names are case-sensitive: the package is 'timeSeries', not 'timeseries'
tsp1 <- c('timeSeries','tseries','zoo','xts','quantmod','TTR','lubridate')
pacman::p_install(tsp1, character.only = TRUE) # was installing listOfPackages -- wrong vector
library(timeSeries)
library(tseries)
library(zoo)
library(xts)
library(quantmod)
library(TTR)
library(lubridate)
library(forecast)
library(astsa)
#graphs-----
library(ggplot2)
library(gridExtra) #ggplot graph combin
library(RColorBrewer) #color combinations
library(corrgram) #corrrgram
library(lattice)
#import/export-----
library(openxlsx) #excel
library(gsheet) #google sheet
library(readxl) #read excel files
library(xlsx) #read/write xlsx : difficult to use
library(rJava) #java reqd for some packages
#reproduce data (keep it small)----
recode_mtcars <- dput(head(mtcars))
dput(droplevels(iris[1:4, ])) #for df with factors
#copy to/from excel: clipboard
fromexcel <- dput(read.table("clipboard",sep="\t",header=TRUE))
#text mining-----
library(wordcloud2)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(RCurl)
library(XML)
library(tm)
library(syuzhet)
library(twitteR) # CRAN package is 'twitteR' (was `twitterR` -- typo)
library(ROAuth)
library(RTextTools)
library(rvest)
#misc----
library(rattle) #multi purpose
library(vcd) #data
library(VIM)
library(mice)
#others
================================================
FILE: 01-IIM/11a3-packages2.R
================================================
# Packages installation
# How to list, install, load, detach, uninstall and update packages,
# including bulk helpers (ipak, easypackages, lubripack, mise).
#List avl packages
library()
#Total Avl Packages
nrow(available.packages())
#Install Package amap
install.packages('amap')
#Load package
library(amap)
#Find functions in package
library(help=amap)
#Help wrt a package
help(package='amap') #see on right side pane
#Unload---
install.packages('tm')
library(tm)
library(VIM)
search() # packages currently attached to the search path
detach('package:tm', unload=TRUE)
detach(package:VIM, unload = T)
search()
#----------------- Part I Over
#Detach Multiple Packages
(detpkg = c('plyr','tm'))
library('plyr') #load lib
library('tm') #load lib
search() #check if loaded
# Vectorize() lets detach() run once per package name in detpkg
Vectorize(detach)(name=paste0("package:", detpkg), unload=TRUE, character.only=TRUE) #code to detach
search() # confirm if removed
#specify the argument unload=TRUE; otherwise, R removes the package from the search path but doesn't unload it.
#Detach from memory all packages
rm(list = ls(all = TRUE))
sessionInfo()
#Remove Packages ----- uninstall
remove.packages("tm")
require('tm')# check if unistalled
#multiple packages
#Function
# ipak: install any packages from `pkg` that are missing, then load them all.
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packagelist <- c('NLP','tm', 'lubridate')
ipak(packagelist)
# package lists
# Fixed CRAN name typos: 'e1071' (was 'e10171') and 'caTools' (was 'catools');
# CRAN package names are case-sensitive.
pkg1 = c('dplyr','plyr', 'data.table','xlsx', 'Hmisc','rJava','ggplot2','lattice','gsheet','curl','stringr', 'syuzhet','e1071', 'caTools','caret', 'olsrr' ,'swirl','sqldf','XML','VIM', 'outliers','car','MASS','DMwR','rvest')
pkg2 = c('forecast','rpart', 'rpart.plot', 'partykit','strucchange', 'didrooRFM')
pkg3 = c('zoo', 'astsa','lubridate','timeSeries','tseries','xts')
pkg4 = c('arules' , 'arulesViz')
pkg5 = c('twitteR','ROAuth','RGtk2','RTextTools','wordcloud') # 'twitteR' (was 'twitterR')
install.packages("CHAID", repos="http://R-Forge.R-project.org")
library(lubridate)
#multiple load
easypackages::libraries(pkg1)
#easypackages
#install.packages('easypackages')
library(easypackages)
#Install Multiple Packages
packages("plyr", "psych", "tm")
libraries("plyr", "psych", "tm")
#lubripack
install.packages('lubripack') #NA for some versions
#install older version of R packages
#https://support.rstudio.com/hc/en-us/articles/219949047-Installing-older-versions-of-packages
#https://rdrr.io/github/Espanta/lubripack/
require(devtools)
install_github("Espanta/lubripack")
library(lubripack)
lubripack("plyr", "psych", "tm", "quantmod")
# Inspecting dependency trees
pack <- available.packages()
pack["ggplot2","Depends"]
pack["ggplot2","Imports"]
pack["data.table","Depends"]
packrat:::recursivePackageDependencies("ggplot2",lib.loc = .libPaths()[1])
tools::dependsOnPkgs('ggplot2')
tools::dependsOnPkgs('dplyr')
tools::dependsOnPkgs("ggplot2",installed=available.packages())
library(rusk)
#Remove Package
remove.packages('quantmod')
library(quantmod)
#This will remove all/Detach all packages
library(mise)
search()
#mise(vars = TRUE, figs = TRUE, console = TRUE, pkgs = FALSE)
mise(pkgs=T)
search()
# list all packages where an update is available
old.packages()
# update all available packages
update.packages()
# update, without prompts for permission/clarification
update.packages(ask = FALSE)
================================================
FILE: 01-IIM/11a4-packages3.R
================================================
# Packages for the decision-tree / classification / association-rule sessions.
# CRAN names are case-sensitive: the sampling package is 'caTools' (was 'catools').
pinstall <- c('rpart','rpart.plot', 'caTools', 'caret','arules','arulesViz')
install.packages(pinstall)
================================================
FILE: 01-IIM/11a5-packages4.R
================================================
#Install packages for Data Analytics Course
# Install the course package sets, skipping anything already installed.
#install package
install.packages('packageName')
#multiple packages
install.packages(c('package1', 'package2'))
#load library( oneby one)
#library(package1)
library(dplyr)
#check invironment
ls()
.libPaths() # get library location
library() # see all packages installed
search() # see packages currently loaded
#remove library from environment
detach("package:dplyr", unload = TRUE)
search() # see packages currently loaded
#dplyr gone
#install java from web
#https://www.java.com/en/download/
#restart your laptop and then run this file to install
packages1 = c('rJava','xlsx','dplyr','plyr','tidyr', 'gsheet','psych', 'tm', 'ggplot2','quantmod', 'data.table','car', 'VIM')
# Fixed CRAN name typos: 'twitteR' (was 'twitterR'), 'caTools' (was 'catools');
# dropped 'ts' -- time series support is base R (stats::ts), not a CRAN package.
packages2 = c('lubridate','ISLR','amap','arules','arulesViz','forecast','rattle','rpart','rpart.plot','zoo','twitteR','ROAuth','partykit','strucchange','didrooRFM','curl','syuzhet','stringr','RTextTools','e1071','wordcloud','caret','caTools','olsrr')
#Installation steps : #one by change to packages1, package2, packages3
list.of.packages = packages1
# install only what is not already present
new.packages = list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
new.packages
if(length(new.packages)) install.packages(new.packages)
#next set
list.of.packages = packages2
new.packages = list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
#if no installation takes place that means they have been installed or they are not available for the version of R you are working
# some packages have to be installed from other repositories
install.packages("CHAID",repos="http://R-Forge.R-project.org")
================================================
FILE: 01-IIM/11b2-DS1.R
================================================
# Data Structures in R
# Part 1: vectors -- creation, coercion, indexing, modification, and
# sampling from random distributions.
#control+enter when you are in the line to execute
# Vectors-----
c(2,4,6)
?seq
seq(2,10,.5)
seq(by=.5, from=2,to=3)
rep(1:3,times=4) # whole pattern repeated 4 times
rep(1:3,each=4) # each element repeated 4 times
rep(c(3,6,7,2),each=4)
rep(c(3,6,7,2), times=4)
?rep
x=1:10 #create seq of nos from 1 to 10
x
(x1 <- 1:20) # brackets - assign & print
(x1=1:30)
(x2=c(1,2,13,4,5))
class(x2)
(x3=c('a',"ABC"))
class(x3)
(x3=letters[1:10])
class(x3)
LETTERS[1:26]
(x3b = c('a',"Henry",4))#should not combine numeric and character
class(x3b) # mixing coerces everything to character
(x4=c(T,FALSE,TRUE,T,F)) #logical
class(x4)
class(c(3,5))
(x5a = c(3,5.5))
class(x5a)
as.integer(x5a)
(x5=c(3L,5L, 100L)) # L suffix makes integer literals
class(x5)
x5a = c(3,5)
class(x5a)
(x5b = c(1, 'a',T, 4L)) # mixed types coerce to character
class(x5b)
#blank variable ?
x=3.5677
trunc(x)
round(x)
floor(x)
ceiling(x)
#access elements
?seq
(x6 = seq(0,100,by=3))
seq(0,100,3)
seq(to=100,from=0,by=3)
seq(1,5,2)
?seq
#[1] 0 2 4 6 8 10
ls() #variables in my environment
x6
length(x6)
x6[1]; x6[21]
x6[1:5]
x6[10:20]
x6[ seq(1,length(x6), 2)] # every alternate element
x6
x6[3] # access 3rd element
#[1] 4
x6[c(2, 4)] # access 2nd and 4th element
x6[-1] # access all but 1st element
x6[-c(1:10, 15:20)]
x6[c(2, -4)] # cannot mix positive and negative integers
#Error in x[c(2, -4)] : only 0's may be mixed with negative subscripts
x6[c(2.4, 3.54)] # real numbers are truncated to integers
x6[c(2,3)]
x6[-c(1,5,20)]
x6
x6[x6 > 30] # logical condition as index
x6[x6 > 30 & x6 < 40] # 31-39
x6[x6 != 30]
#or | and is & !
length(x6)
x6
x6[-(length(x6)-1)] # drop the second-to-last element
x2
(x7 = c(x6, x2)) # concatenate two vectors
#------
#modify
x6
set.seed(1234) # fix RNG for reproducible samples
(x6 = sample(1:50))
(x6b = sort(sample(1:50)))
sort(x6)
sort(x6[-c(1,2)])
sort(x6, decreasing=T)
x6
rev(x6)
seq(-3, 10, by=.2)
x6[-c(1:12)]
x6
x6[x6> 30 & x6 < 40]
(x = -3:2)
x6
x6[2:10] <- 99; x6 # modify elements 2 through 10
x6[x6 > 30 & x6 < 40] = 999
x6
x6
x7 = x6[1:4]; x7 # truncate x to first 4 elements
1:5
#equal partitions within a range
(x = seq(1,5, length.out = 15))
x
x = NULL # remove the value; x becomes NULL
x
#NULL
x[4]
#NULL
?distribution
?rnorm
(x = rnorm(100)) # standard normal draws
plot(density(x))
abline(v=c(-3,0,3))
mean(x)
(x1 = rnorm(100, mean=50, sd=5))
plot(density(x1))
abline(v=mean(x1),h=0.04)
hist(x1, breaks=7)
hist(x1)
hist(x1, freq=F) # density scale, so the density curve can overlay it
lines(density(x1), col=2)
summary(x1)
quantile(x1)
quantile(x1, seq(0,1,.25))
quantile(x1,c(.1, .5, .8))
quantile(x1,seq(0,1,.01)) # percentiles
stem(x1)
#Matrix----- 2-D data structure, single data type; creation, indexing, stats
100:111
length(100:111)
matrix(1,ncol=3, nrow=4) # single value is recycled to fill 4x3
(m1 = matrix(100:111, nrow=4)) # fills column-by-column (default)
(m2 = matrix(100:111, ncol=3, byrow=T)) # fills row-by-row
x=101:124
length(x)
matrix(x, ncol=6) # nrow inferred: 24/6 = 4
class(m1)
attributes(m1)
dim(m1)
m1
# access elements of matrix
m1[1,]  # first row
m1[,1]  # first column (drops to a vector)
m1[,1, drop=F] # keep matrix shape (one-column matrix)
m1[,-1] #remove 1st column
m1[1,2:3]
m1[c(1,3),]
m1[,-c(1,3), drop=F]
m1[m1> 105 & m1 < 108] # logical filter returns a vector
#names of cols and rows
m1
paste("C","D",sep="-")
paste("C",1:100,sep="-")
paste("C",1:3,sep='')
(colnames(m1) = paste('C',1:3, sep=''))
m1
(rownames(m1) = paste("R",1:4, sep=''))
m1
attributes(m1)
m1[,c('C1','C3')] # index by column name
m1[,c(1,3)]       # same selection by position
#Vector to Matrix
(m3 = 1:24)
m3
dim(m3)= c(6,4) # assigning dim turns the vector into a 6x4 matrix
m3
#access elements
m2
m2[1,] #first row
m2[c(1,3,4),] #1st,3rd,4th row
m2[,1] #first col
m2[,2:3] # 2nd to 3rd coln
m2[c(1,2),c(2,3)]
m2[,]
m2[-2,] # exclude 2nd row
m2
m2[1:5] # matrix is like vector
m2
m2[c(TRUE,F,T,F),c(F, T, T)] #logical indexing
m2[m2 > 5 & m2 < 10]
m1
m1[1:2,1:2]
m1[c('R1','R2'),c('C1','C2')]
m1[1:2,]
m1[c(T,T,F,F),]
m1
#modify Matrix
m2
m2[2,2]
m2[2,2] = 10
m2
m2[,2] = 10 # 10 recycled down the whole column
m2
m2[m2> 107] = 9999
m2
rbind(m2, c(50,60,70)) # append a row
rbind(m2,m2)
m2
cbind(m2, c(55,65,75,85)) # append a column
m2m2= cbind(m2,m2)
m2m2
m2
cbind(m2,m2)
rbind(m2,m2)
#row and col wise summary
m1
colSums(m1)
rowSums(m1)
colMeans(m1)
rowMeans(m1)
t(m1) # transpose
m1
sweep(m1, MARGIN = 1, STATS = c(2,3,4,5), FUN="+" ) #rowise : add 2,3,4,5 to rows 1..4
sweep(m1, MARGIN = 2, STATS = c(2,3,4), FUN="*" ) #colwise : multiply cols 1..3 by 2,3,4
#addmargins : append margin rows/cols computed by a function
m1
?addmargins
addmargins(m1,margin=1,sum) #colwise function : extra row of column sums
addmargins(m1,1,sd) #colwise function
addmargins(m1,2,mean) #rowwise function : extra column of row means
addmargins(m1,c(1,2),mean) #row & col wise function
?addmargins
(M1sum= addmargins(m1,c(1,2),list(list(mean,sum,max, min), list(var,sd, max, min)))) #row & col wise function
round(M1sum,0)
#Array----- more than 2 dimensions, single data type
length(100:123)
4*3*2
#2 coys, 3 products, 4 locations sold qty
(a1 = array(100:123, dim=c(4,3,2)))
(loc = paste('loc', 1:4,sep='-'))
(product = paste('p', 1:3,sep='@'))
(coy = paste('coy', 1:2,sep='%'))
dimnames(a1) = list(loc, product, coy)
a1
apply(a1,1, sum) #locationwise
apply(a1,2, sum) #productwise
apply(a1,c(1,2), sum) #product-location wise
apply(a1,c(2,3), sum) #product-coy wise
apply(a1,c(1,3), sum) #coy-location
apply(a1,3, sum) #coywise
sum(a1) #total
#DataFrame---- table: columns can differ in type, rows are observations
#create Vectors to be combined into DF
(rollno = 1:30)
(sname = paste('student',1:30,sep=''))
(gender = sample(c('M','F'), size=30, replace=T, prob=c(.7,.3))) # ~70% M
(marks1 = floor(rnorm(30,mean= 50,sd=10)))
(marks2 = ceiling(rnorm(30,40,5)))
(course = sample(c('BBA','MBA'), size=30, replace=T, prob=c(.5,.5)))
rollno; sname; gender
marks1 ; marks2; course
#create DF
df1= data.frame(rollno, sname, gender, marks1, marks2, course, stringsAsFactors = F)
str(df1) #structure of DF
head(df1) #top 6 rows
head(df1,n=3) #top 3 rows
tail(df1) #last 6 rows
class(df1) # DF
summary(df1) #summary
nrow(df1)
dim(df1)
length(df1) # number of columns
df1$course
df1$gender = factor(df1$gender) # convert character column to factor
df1$course = factor(df1$course)
#df1$sname = as.character(df1$sname)
str(df1)
summary(df1)
boxplot(marks1 ~ gender + course, data=df1)
df1 #full data
df1$gender # one column
head(df1[ , c(2,4)]) #multiple columns
df1[1:10 ,] #select rows, all columns
df1[1:5,1:4]
#as per conditions
# NOTE(review): the filters below use the global vectors marks1/gender, which
# happen to match df1's columns here; df1$marks1 etc. would be safer
df1[ marks1 > 50 & gender=='F', c('rollno', 'sname','gender', 'marks1')]
df1[ marks1 > 50 & gender=='F', c(1,2)]
df1[ marks1 > 50 | gender=='F', ]
names(df1) # names of columns
dim(df1) #Dimensions
aggregate(df1$marks1, by=list(df1$gender), FUN=sum)
aggregate(marks1 ~ gender, data=df1, FUN=max)
aggregate(cbind(marks1, marks2) ~ gender, data=df1, FUN=max)
(df2 = aggregate(cbind(marks1,marks2) ~ gender + course, data=df1, FUN=mean))
df2
df1
#List ----- ordered collection of possibly different types
g ="My First List"
h = c(25, 26,18,39)
j = matrix(1:10,nrow=2)
k = c('one','two','three')
# NOTE(review): the 4th element repeats h; k was likely intended here
mylist = list(title=g, ages=h, j, h)
mylist
mylist[2]    # single bracket -> sub-list
mylist[[2]]  # double bracket -> the element itself
mylist[['ages']]
mylist$ages
#Factor ----- categorical data, optionally ordered
(grades = sample(c('A','B','C','D'), size=30, replace=T, prob=c(.3,.2,.4,.1)))
summary(grades) # character vector: no category counts
table(grades)
(gradesFactor = factor(grades))
summary(gradesFactor) # factor: counts per level
(gradesFactorOrdered = factor(grades, ordered=T)) # alphabetical order A < B < C < D
summary(gradesFactorOrdered)
(gradesFactorOrderedLevels = factor(grades, ordered=T, levels=c('D','C','B','A'))) # custom order: D lowest
summary(gradesFactorOrderedLevels)
gradesFactor
gradesFactorOrdered
gradesFactorOrderedLevels
pie(c(10,15,17))
pie(summary(gradesFactorOrderedLevels))
barplot(summary(gradesFactorOrderedLevels), col=1:4)
class(grades)
class(gradesFactorOrdered)
class(gradesFactorOrderedLevels)
# Object Properties : inspecting structure of each data type
#vector
v1= 1:100
class(v1) ; typeof(v1)
v2=letters[1:10]
class(v2) ; typeof(v2)
length(v2)
summary(v1)
#matrix
m1= matrix(1:24,nrow=6)
class(m1)
summary(m1)
dim(m1)
str(m1)
#Array
a1 =array(1:24, dim=c(4,3,2))
class(a1)
str(a1)
dim(a1)
summary(a1)
#DF
#data() #built in datasets
df1= iris
str(df1)
summary(df1)
class(df1); dim(df1)
nrow(df1) ; names(df1) ;NROW(df1)
colnames(df1)
rownames(df1)
#list
list1 = list(v1,m1,a1,df1)
str(list1)
#Statistical Description
library(Hmisc)
describe(df1) # richer per-variable summary than summary()
#Next Topics
x= c(123.2234, 33333.544, 43243.8442)
floor(x)
ceiling(x)
trunc(x)
round(x,-2) # negative digits round to hundreds
round(x, digits = 5)
================================================
FILE: 01-IIM/11b3-DS2-factor.R
================================================
#Factors
#categories without (eg Gender) Order or with (eg Grades) Orders
(grades = sample(c(LETTERS[1:4]), size=30, replace=T, prob=c(.4,.2,.3,.1 )))
summary(grades) # character vector: no per-category counts
(gradesF = factor(grades))
summary(gradesF) # counts per level
table(grades)
table(gradesF)
class(gradesF)
(gradesFO = factor(grades, ordered=T)) # default alphabetical order A < B < C < D
(gradesFO1 = factor(grades, ordered=T, levels=c('B','C','A','D'))) # custom order: B lowest, D highest
summary(gradesFO1)
(marks = ceiling(rnorm(30, mean=60, sd=5)))
(gender = factor(sample(c('M', 'F'), size=30, replace=T)))
(student1 = data.frame(marks, gender, gradesFO1))
boxplot( marks ~ gradesFO1, data=student1) # one box per grade level
boxplot( marks ~ gradesFO1 + gender, data=student1) # grade x gender combinations
boxplot(marks)
summary(marks)
abline(h = summary(marks)) # overlay the six number-summary values on the boxplot
quantile(marks)
================================================
FILE: 01-IIM/11b4-DS4-reproduce.R
================================================
#Reproducible Code : sharing data as runnable R code via dput()
mtcars
head(mtcars)
recode_mtcars <- dput(head(mtcars)) # prints code that recreates the data frame
newdf <- recode_mtcars
newdf
#if Df has factors
recode_iris <- dput(droplevels(iris[1:4, ])) # droplevels removes unused factor levels
recode_iris
#One other caveat for dput is that it will not work for keyed data.table objects or for grouped tbl_df (class grouped_df) from dplyr. In these cases you can convert back to a regular data frame before sharing, dput(as.data.frame(my_data))
#links
#https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example
#trick
library(devtools)
# NOTE(review): source_url executes remote code from the network - verify the URL is trusted
source_url("https://raw.github.com/rsaporta/pubR/gitbranch/reproduce.R")
rec_mtcars2 <- reproduce(mtcars, cols=1:4)
?reproduce
#copy and paste the script
df2 <- structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 6, 8, 4), disp = c(160,160, 108, 258, 360, 225, 360, 145, 301, 121), hp = c(110, 110,93, 110, 175, 105, 245, 175, 335, 109)), class = "data.frame", row.names = c("Mazda RX4","Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout","Valiant", "Duster 360", "Ferrari Dino", "Maserati Bora", "Volvo 142E"))
df2
#To quickly create a dput of your data you can just copy (a piece of) the data to your clipboard and run the following in R:
#for data in Excel:
#go to excel and copy data
# NOTE(review): "clipboard" reading is Windows-specific; use pipe("pbpaste") on macOS
df5 <- dput(read.table("clipboard",sep="\t",header=TRUE))
df5
#for data in a txt file:
#go to text file seperated by space
df6 <- dput(read.table("clipboard",sep="",header=TRUE))
df6
#--------
library(reprex)
(y <- 1:4)
mean(y)
reprex() # renders the clipboard code+output as a shareable snippet
#------------
#now copy crom viewer and send it
#https://www.tidyverse.org/help/
#https://stackoverflow.com/help/minimal-reproducible-example
================================================
FILE: 01-IIM/11b5-DS3.R
================================================
#Data Structures - II
#Home Work
#Arrays : 3-D structure, single data type
length(100:123)
4*3*2
#2 coys, 3 products, 4 locations sold qty
(a1 = array(100:123, dim=c(4,3,2)))
(loc = paste('loc', 1:4,sep='-'))
(product = paste('p', 1:3,sep='@'))
(coy = paste('coy', 1:2,sep='%'))
dimnames(a1) = list(loc, product, coy)
a1
apply(a1,1, sum) #locationwise
apply(a1,2, sum) #productwise
apply(a1,c(1,2), sum) #product-location wise
apply(a1,c(2,3), sum) #product-coy wise
apply(a1,c(1,3), sum) #coy-location
apply(a1,3, sum) #coywise
sum(a1) #total
#Lists
#List ----- ordered collection of possibly different types
g ="My First List"
h = c(25, 26,18,39)
j = matrix(1:10,nrow=2)
k = c('one','two','three')
# NOTE(review): the 4th element repeats h; k was likely intended here
mylist = list(title=g, ages=h, j, h)
mylist
mylist[2]    # single bracket -> sub-list
mylist[[2]]  # double bracket -> the element itself
mylist[['ages']]
mylist$ages
================================================
FILE: 01-IIM/11e2_vectors1.R
================================================
#Case in Vector in R
#Vector is single dim
#creations of Vectors------
#Vector values from 1 to 100
x1 = 1:100
x1
#vector of values 1, 5, 11, 30
x2 = c(1, 5, 11, 30)
x2
#vector with integer values starting from 5 to 60 with a step of 3
x3 = seq(from=5, to=60, by=3)
x3
#vector with following values using rep & times: 1 2 3 1 2 3 1 2 3
x4 = rep(c(1,2,3), times=3)
x4
#vector with following values using rep & each : 5 5 5 8 8 8 9 9 9
x5 = rep(c(5,8,9), each=3)
x5
#vector with following values using seq & length.out : 10.0 12.5 15.0 17.5 20.0
x6 = seq(from=10, to=20, length.out=5)
x6
#vector with values between 25 and 100 and number of values drawn from another vector
x7a = 60:75
length(x7a)
x7b = seq(from=10, to=20, along.with = x7a) # same length as x7a
x7b
length(x7b)
#multiple functions-------
#vector drawn randomly from seq of values from 10 to 50
#sort the data; print the data in reverse order
x8a = 10:50
x8a
x8b = sample(x8a) # random permutation
x8b
sort(x8b)
rev(x8b)
#print the 5th to 10th element
x8b[5:10]
#print every alternate element between 4th and 12 element
x8b[seq(4,12,2)]
#print values between 30 & 40 and their location also
x8b[x8b >=30 & x8b <=40]
which(x8b >=30 & x8b <=40) # positions of matching values
x8b
#basic stats - mean, median of values
mean(x8b)
median(x8b)
#min and max
min(x8b)
max(x8b)
#range of values
range(x8b)
#plots : histogram
hist(x8b)
#with 5 bars only
hist(x8b, breaks=5)
#vector of categorical values of Male(M) & Female(F)
y1 = c('M','F','M','M')
y1
#vector of 100 with M & F, randomly selected
y2a= c('M','F')
y2a
y2b = sample(x=y2a, size=100, replace=T)
y2b
table(y2b) #count of M & F
# Male % approx 60%
y2c = sample(x=y2a, size=100, replace=T, prob=c(.6,.4))
y2c
table(y2c) #count of M & F
prop.table(table(y2c)) #count in Proportion
#barplot & pie
barplot(table(y2c), col=1:2)
pie(table(y2c), col=1:2)
#list only males
y2c[y2c=='M']
#mode
library(modeest)
mlv(y2c, method = "mfv") # most frequent value (statistical mode)
#other functions-----
#normal distribution values - 10000
n1 = rnorm(n=10000)
n1
#mean and std dev
mean(n1)
sd(n1)
#plot & histogram
hist(n1)
plot(density(n1))
hist(n1, freq=F); lines(density(n1), col='red')
#uniform distribution values - 100 students with marks between 65 and 90
n2 = runif(n=100, min=65, max=90)
n2
range(n2)
#round the marks to 1 decimal place
n2 = round(n2,digits=1)
n2
#mean and std dev, median, mode,summary, quantile
mean(n2)
sd(n2)
median(n2)
modeest::mlv(n2, method = "mfv") #mode
range(n2)
summary(n2)
quantile(n2) #quartiles (0, 25, 50, 75, 100%)
quantile(n2, p=c(.1, .7)) #10th & 70th percentile
library(moments)
moments::skewness(n2)
moments::kurtosis(n2)
#plot & histogram
hist(n2)
boxplot(n2)
plot(density(n2))
hist(n2, freq=F); lines(density(n2), col='red')
#how many values between 65 & 72
sum(n2 > 65 & n2 < 72) # TRUEs count as 1
#mean of marks of students who scored between 75 & 80
mean(n2[n2 > 75 & n2 < 80])
#class interval : 50-65, 65-75, 75-85, > 85
breaks1 = c(0,50,65,75,85,100)
n2cut = cut(n2, breaks= breaks1) # bin marks into intervals
table(n2cut)
#increase marks by 5 for students who scored < 75
n2
n2[n2 < 75]
which(n2 < 75)
n2[n2 < 75] = n2[n2 < 75] + 5
n2[n2 < 80]
#all of who have scored > 80, make them 82
n2[n2 > 80] = 82
n2
n2[n2 == 82]
#end
str(n2)
summary(n2)
length(n2)
range(n2)
class(n2)
?mean #help
================================================
FILE: 01-IIM/11e3_matrices1.R
================================================
#Matrices in R
#Matrix is a two dimensional data structure in R programming. Matrix is similar to vector but additionally contains the dimension attribute. All attributes of an object can be checked with the attributes() function (dimension can be checked directly with the dim() function).
#vector ----
(v1 = 1:24)
dim(v1) # NULL: plain vectors carry no dim attribute
attributes(v1)
class(v1)
#matrices
(m1 = matrix(data = 1:24, nrow=4)) #bycolumn
(m2 = matrix(data = 1:24, nrow=4, byrow=T)) #by row
(m3 = matrix(data = 1:24, ncol=4)) #by column, ncol
#what to do - filter, sum, other stats
length(m1)
length(m3) #no of elements
dim(m1)
dim(m3) #dimensions rows * cols
attributes(m1) #properties
class(m1) #matrix, array
m1
dimnames(m1)= list(c('Jan','Feb','Mar','Apr'), paste('coy',1:6, sep=''))
m1 #4 months, 6 coys
m3
month.abb #month abbreviations
#another way of giving names to row & col
colnames(m3) = paste('coy',1:4, sep='')
rownames(m3)= month.abb[1:6]
m3 #6 months, 4 coys
m1
m1[1,] #first row
m1[, 2] #second column
m1[,2 , drop=F] #see it is as column
m1[1:2,] #first 2 rows
m1[1, 2:3] #first row, 2 to 3rd col
m1[c(1,3),c(2,3,5)] #rows 1 & 3, columns 2, 3 & 5
m1[,-1] #skip first column
m1[,-c(2,5)] #skip columns 2 and 5
m1[1:24] #matrix as vector
class(m1[1:24]) #integer
m1
m1 > 5 & m1 < 15 #which position are True & False
m1[m1 > 5 & m1 < 15] # values
m1[m1 > 5 & m1 < 15] = 100 #change values
m1
#join matrics
#FIX: m1 is 4x6 but m3 is 6x4; cbind requires equal row counts,
#so transpose m3 first (t(m3) is 4x6) - cbind(m1, m3) would error
cbind(m1, t(m3)) #matching row numbers after transpose
cbind(m1, m2) #both 4x6
rbind(m1,m2)  #both have 6 columns
#stats
m1
colSums(m1)
rowSums(m1)
colMeans(m1)
rowMeans(m1)
m3
t(m3) #transpose
sweep(m3, MARGIN=1, STATS=c(2,3,4,5,6,7), FUN='+') #row + x
sweep(m3, MARGIN=2, STATS=c(1.2,1.3,1.4,1.5), FUN='*') #col * x
#addmargins
m3
addmargins(m3, margin=1, sum) #create a row with sum of columns
addmargins(m3, margin=2, sum) #create a col with sum of rows
addmargins(m3, margin=c(1,2), mean)
M3functions = addmargins(m3, margin=c(1,2), list(list(mean, sum, max, min), list(var, sd, max, min)))
M3functions
round(M3functions,1)
#array is more than 2 dim matrix, same data type
4*3*2
length(100:123)
#2 coy, 3 products, 4 months
(a1 = array(100:123, dim=c(4,3,2)))
#FIX: dimnames components are character vectors (c(...)), not lists
dimnames(a1) = list(c('Jan','Feb','Mar','Apr'), c('P1','P2','P3'), c('Coy1','Coy2'))
a1
apply(a1, MARGIN=1, FUN=sum) #rows : month sales
apply(a1, MARGIN=2, FUN=sum) #cols : product sales
apply(a1, MARGIN=3, FUN=sum) #matrix : coy sales
apply(a1, MARGIN=c(2,3), FUN=sum) #cols : product & coy sales
apply(a1, MARGIN=c(1,3), FUN=mean)
#matrix & array - > 2 dim
# same datatype
================================================
FILE: 01-IIM/11e4_dataframe1.R
================================================
#Data Frame in R
#A data frame is a table or a two-dimensional array-like structure in which each column contains values of one variable and each row contains one set of values from each column. ... The data stored in a data frame can be of numeric, factor or character type.
mtcars
class(mtcars)
#lets us create a DF of 50 students
sizeN=50
#rollno, name, gender, grade, marks1, marks2
(rollno = 1:50)
(name = paste('Student', 101:150, sep='_'))
sex = c('M','F')
(gender = sample(x=sex, size=sizeN, replace=T, prob=c(.6,.4)))
table(gender)
(courses = c('BBA', 'MBA'))
(course = sample(x=courses, size=sizeN, replace=T, prob=c(.4,.6)))
(gradetypes = LETTERS[1:4])
(grades = sample(x=gradetypes, size=sizeN, replace=T, prob=c(.2,.3,.4, .1)))
table(grades)
(marks1 = trunc(rnorm(n=sizeN, mean=75, sd=5 )))
(marks2 = trunc(runif(n=sizeN, min=60, max=95 )))
vectorList = list (rollno, name, gender, course, grades, marks1, marks2)
mapply(length, vectorList) #all are of length 50
#df
df1 = data.frame(rollno, name, gender, course, grades, marks1, marks2)
head(df1) #first 6
tail(df1,n=5) #last 5
class(df1) #dataframe
summary(df1) #summarys
str(df1) #structure of data
attributes(df1)
names(df1) # names of cols
dim(df1) # dimensions 50 * 7
length(df1) #no of columns
#selected
df1[1:4, 1:5]
df1[1:5, c('rollno', 'name', 'gender')]
df1[df1$gender == 'M', c('rollno', 'name','gender')]
df1[df1$gender == 'M' & df1$marks1 > 80, c('rollno', 'name','gender')]
df1[df1$grades %in% c('A','C') & df1$marks2 > 80 & df1$marks1 < 75, c('rollno', 'name','gender', 'grades', 'marks1', 'marks2')]
#summary
aggregate(df1$marks1, by= list(df1$gender), FUN=max)
aggregate(cbind(marks1, marks2) ~ gender + course , data=df1, FUN=mean)
#other
df1[order(df1$marks1),] #sort by marks1
df1[order(df1$gender, df1$marks1),] #sort by gender marks1
#factors
(genderF = factor(gender)) #nominal data
(courseF = factor(course)) #course data
(gradesOF = factor(grades, ordered=T, levels = c('C','B','A','D')))
#D is highest
barplot(table(grades), col=1:4) # alphabetical level order
barplot(table(gradesOF), col=1:4) # custom C < B < A < D order
summary(df1)
df2 = data.frame(rollno, name, genderF, courseF, gradesOF, marks1, marks2)
summary(df2)
#see the summary statistics
================================================
FILE: 01-IIM/12a3-impexp1.R
================================================
# Read Data into R Environment
#CSV Files---- local or network
#Read from CSV file in PC
head(mtcars)
rownames(mtcars)
write.csv(mtcars, "./data/mtcarsF.csv", row.names=F) #without rownames
write.csv(mtcars, "./data/mtcarsT.csv", row.names=T) #with rownames
head(iris)
write.csv(iris, "./data/irisF.csv", row.names=F) #without rownames
write.csv(iris, "./data/irisT.csv", row.names=T) #with rownames
#go to folder data and open these csv files
read.csv(file='./data/irisF.csv')
readiris = read.csv(file="./data/irisF.csv", header = TRUE,sep = ",")
readiris
readmtcars = read.csv(file="./data/mtcarsF.csv", header = TRUE,sep = ",")
head(readmtcars)
str(readmtcars)
class(readmtcars)
head(readmtcars)
#FIX: this script writes irisF.csv above; ./data/iris.csv was never created
read2 = read.table(file="./data/irisF.csv", header = TRUE,sep = ",")
str(read2); class(read2)
head(read2)
read3 = read.delim(file="./data/irisF.csv", header = TRUE,sep = ",")
str(read3) ; class(read3)
head(read3)
#difference is use of specify delimeter(read.csv takes default as comma)
#or location is different from Project Folders, or want to search for the file
read4 = read.csv(file=file.choose()) # interactive file picker
str(read4)
head(read4)
# From URL : Read CSV from Web----
read_web1 = read.csv('http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat')
head(read_web1)
library(data.table)
read_web2 = fread("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv") # fast reader, returns data.table
head(read_web2)
class(read_web2)
#Text file from Web-----
read_txt = read.table("https://s3.amazonaws.com/assets.datacamp.com/blog_assets/test.txt", header = FALSE)
head(read_txt)
#other ways - google sheets, excel
#end here
================================================
FILE: 01-IIM/12a4-datasets.R
================================================
#data sets in R
#https://lgatto.github.io/IntroMachineLearningWithR/example-datasets.html#edgar-andersons-iris-data
#Iris
data(iris)
data(mtcars)
#
# NOTE(review): biocLite was deprecated; current Bioconductor installs use
# BiocManager::install() - confirm before running
source("http://www.bioconductor.org/biocLite.R")
biocLite(c("MSnbsase", "pRoloc")) ## software
biocLite("pRolocdata") ## date
library("ggplot2")
data(diamonds)
library("mlbench")
data(Sonar)
#housing values
library("MASS")
data(Boston)
#customer churn
# NOTE(review): availability of the churn data varies by C50 version - verify
library("C50")
data(churn)
dim(churnTrain)
pacman::p_load(datarium) # installs + loads in one call
data(package='datarium')
data('marketing', package='datarium')
marketing
#it is a dataset containing the impact of three advertising medias (youtube, facebook and newspaper) on sales. The first three columns are the advertising budget in thousands of dollars along with the fourth column as sales. The advertising experiment has been repeated 200 times. Hence, it has 200 rows.
================================================
FILE: 01-IIM/12a4-impexp-xls.R
================================================
# Read Data into R Environment - to/fro XLS
#Excel----
#Create a excel file with data in 2 sheets
# first row contains variable names
#C:\Program Files\Java\jre1.8.0_261
# xlsx needs rJava; point JAVA_HOME at the installed JRE before loading
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_261')
library(xlsx)
library(rJava)
write.xlsx2(mtcars, file='./data/mtcars.xlsx', sheetName = 'mtcars1', row.names=F, append=F)
df_excel1 = read.xlsx( "./data/mtcars.xlsx", 1) # read sheet by index
df_excel1
head(iris)
write.xlsx2(iris, file='./data/mtcars.xlsx', sheetName = 'iris1', row.names=F, append=T) # append adds a sheet
df_excel2 = read.xlsx( "./data/mtcars.xlsx", 2)
df_excel2
write.xlsx2(iris, file='./data/mtcars.xlsx', sheetName = 'iris2', row.names=F, append=T)
df_excel2b = read.xlsx( "./data/mtcars.xlsx", 'iris2') # read sheet by name
df_excel2b
#see the excel sheet in data folder: windows explorer
library(readxl) #for reading only, faster (no Java dependency)
readxl::excel_sheets("./data/mtcars.xlsx") #names of sheets in workbook
df_rxl1 <- readxl::read_excel( "./data/mtcars.xlsx", 'iris1')
df_rxl2 <- readxl::read_excel( "./data/mtcars.xlsx", 2)
head(df_rxl1)
head(df_rxl2)
readxl::read_excel( "./data/mtcars.xlsx", 2, skip=1) # skip the first row
?`readxl-package`
?read_excel
#end here
================================================
FILE: 01-IIM/12e4-impexp-gs.R
================================================
#import Data from Google Sheets
#google sheets #publically shared
url1 = 'docs.google.com/spreadsheets/d/1I9mJsS5QnXF2TNNntTy-HrcdHmIF9wJ8ONYvEJTXSNo'
library(gsheet) #public gsheets (no authentication needed)
data1 = gsheet2tbl(url1)
head(data1)
url2 = "https://docs.google.com/spreadsheets/d/1Md_ro2t3M7nA9JMH1DsE12jfeX7qq-UPw6p8WQd6A2Y/edit#gid=216113907"
data2 = gsheet2tbl(url2)
data2
#private gsheets
library(googlesheets4) # requires Google OAuth sign-in on first use
url1 = 'docs.google.com/spreadsheets/d/1I9mJsS5QnXF2TNNntTy-HrcdHmIF9wJ8ONYvEJTXSNo'
gsDF = googlesheets4::read_sheet(url1, sheet=1) #give permissions
#accept the popup
head(gsDF)
================================================
FILE: 01-IIM/13a2-NAvalues.R
================================================
# Missing values
#missing values are indicate NA keyword
x = c(1, ,3) #wrong way to create missing values - this line throws a syntax error (deliberate demo)
x = c(NA, 1, NA, 2,3, NA) #introducing missing values
x
x1=x #make a copy
x1
is.na(x) # TRUE where value is missing
sum(is.na(x)) # count of NAs (TRUE counts as 1)
sum(c(T,F,T,F,F))
x
mean(x) #this will not work if NA values are present - returns NA
?mean
mean(x, na.rm=T) # drop NAs before computing
(1+2+3)/3
sum(x,na.rm=T)/3
x
x[is.na(x)] #list out missing values
mean(x, na.rm=T)
x[is.na(x)] = mean(x, na.rm=T) #replace these missing values with mean of other values
x
cbind(x,x1) #see where values have been filled
x1 = c(4,6,8,9)
length(x1[x1 >= 6])
sum(x1 >= 6)
x1 >= 6
x2 = rnorm(100000, mean=50, sd=5) #normal distributed values
x2
length(x2)
posn=sample(100000, size=30) # 30 random positions to blank out
posn
x2[posn] = NA
summary(x2)
is.na(x2)
sum(is.na(x2))
mean(x2) # NA: NAs present
mean(x2, na.rm=T)
x2[is.na(x2)] = mean(x2, na.rm=T) # mean imputation
sum(is.na(x2))
#install this library
library(VIM)
?sleep
data(sleep, package='VIM') # mammal sleep data with real missing values
head(sleep) #first few rows of sleep
dim(sleep) #dimensions of sleep data
complete.cases(sleep) # which row have complete data in T/ F
sum(complete.cases(sleep)) # no of rows have which no missing data
sum(!complete.cases(sleep)) # no of rows which have missing data
sleep[complete.cases(sleep),] #rows which are complete 42
sleep[!complete.cases(sleep),] #rows which have missing values 20
summary(sleep)
colSums(is.na(sleep)) #which column how many data missing
rowSums(is.na(sleep)) #which row how many data missing
head(sleep)
df= sleep #make a copy of sleep data
complete.cases(df) #complete cases
mean(df$Dream, na.rm=T) #mean of Dream Col
sum(is.na(df$Dream))
sum(!is.na(df$Dream))
df$Dream
summary(df)
df[ , 4:5]; df[ ,c(1,3,5)]
df[is.na(df$Dream), 'Dream'] #missing values in Dream Column
df[is.na(df$Dream), "Dream"] = mean(df$Dream, na.rm=T) #find and replace
df$Dream
# NOTE(review): == on floating-point values is fragile; the imputed mean may
# not be exactly 1.972 - confirm, or compare with a tolerance
sum(df$Dream == 1.972)
#use mice package
library(mice)
#https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
#MICE, Amelia, missForest, Hmisc, mi
================================================
FILE: 01-IIM/13b2-outliers.R
================================================
#Outliers
#https://www.statsandr.com/blog/outliers-detection-in-r/
#outliers--------------------------
#An outlier is a value or an observation that is distant from other observations, that is to say, a data point that differs significantly from other data points
data <- ggplot2::mpg
head(data)
data$hwy
summary(data$hwy)
range(data$hwy)
(breaks1 = sqrt(nrow(data))) #no of bins/ break point
hist(data$hwy, xlab = "hwy", main = "Histogram of hwy", breaks = breaks1)
#ggplot2
library(ggplot2)
ggplot(data) + aes(x = hwy) + geom_histogram(bins = 30L, fill = "#0c4c8a") + theme_minimal()
# there seems to be a couple of observations higher than all other observations (see the bar on the right side of the plot).
#boxplot-----
boxplot(data$hwy, ylab = "hwy")
ggplot(data) + aes(x = "", y = hwy) + geom_boxplot(fill = "#0c4c8a") + theme_minimal()
#there are 2 potential outliers (see the 2 points above the vertical line, at the top of the boxplot).
#box plot visualizes a quantitative variable by displaying five common location summary (minimum, median, first and third quartiles and maximum) and any observation that was classified as a suspected outlier using the interquartile range (IQR) criterion. The IQR criterion means that all observations < Q1 - 1.5IQR and > Q3 + 1.5IQR are classified as potential outliers
#IQR is the difference between the third and first quartile) are considered as potential outliers by R.
#extract the values of the potential outliers based on the IQR criterion
boxplot.stats(data$hwy)$out
#row no
#FIX: the data frame in this script is named `data`, not `dat` - `dat` was
#undefined and every reference below errored; renamed throughout
out <- boxplot.stats(data$hwy)$out
out_ind <- which(data$hwy %in% c(out)) # row numbers of the IQR outliers
out_ind
data[out_ind, ] #complete data
#print the values
boxplot(data$hwy,ylab = "hwy", main = "Boxplot of highway miles per gallon"
)
mtext(paste("Outliers: ", paste(out, collapse = ", ")))
#This method of outliers detection is based on the percentiles. With the percentiles method, all observations that lie outside the interval formed by the 2.5 and 97.5 percentiles will be considered as potential outliers. Other percentiles such as the 1 and 99, or the 5 and 95 percentiles can also be considered to construct the interval.
#The values of the lower and upper percentiles (and thus the lower and upper limits of the interval) can be computed with the quantile() function:
(lower_bound <- quantile(data$hwy, 0.025))
(upper_bound <- quantile(data$hwy, 0.975))
#all observations below 14 and above 35.175 will be considered as potential outliers. The row numbers of the observations outside of the interval can then be extracted with the which() function:
(outlier_ind <- which(data$hwy < lower_bound | data$hwy > upper_bound))
#values of highway miles per gallon can be printed:
data[outlier_ind, "hwy"]
data[outlier_ind, ]
#There are 11 potential outliers according to the percentiles method. To reduce this number, you can set the percentiles to 1 and 99:
(lower_bound <- quantile(data$hwy, 0.01))
(upper_bound <- quantile(data$hwy, 0.99))
(outlier_ind <- which(data$hwy < lower_bound | data$hwy > upper_bound))
#more
#https://www.statsandr.com/blog/outliers-detection-in-r/#introduction
#https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
#https://www.r-bloggers.com/2020/01/how-to-remove-outliers-in-r/
#https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
================================================
FILE: 01-IIM/15a2-GPH-basic.R
================================================
# Basic plots
#plot, histogram, pie, boxplot, linechart, correlation plot
#plot
women
plot(women)
(x= seq(1,10))
(x =c(1,2,3,4,5,6,7,8,9,10))
y= seq(11,20)
plot(x,y)
plot(x=women$height, y=women$weight)
plot(women)
plot(mtcars)
pairs(mtcars[,1:3])
?plot
plot(women, type='p', pch=4, col='blue')
plot(women, type='l')
plot(women, type='b')
plot(women, type='b', pch=18, lty=1, col=2, lwd=4)
plot(women, xlim=c(30,100), ylim=c(min(women$weight)-10, 200), pch=10)
#more features with plot
plot(x=women$weight, y=women$height, pch=15, xlab='Weight', ylab='Height', col='red', cex=2, type='b')
title(main='Main Title', sub='Sub Title')
#see cheat sheet on base graphs
mtcars$cyl
#plot(x=mtcars$wt, y=mtcars$mpg, col=mtcars$gear, pch=c(4,6,8), cex=c(1,2))
#as.numeric(levels(as.factor(mtcars$cyl)))
plot(women)
abline(lm(women$weight ~ women$height), col='red', lty=2, lwd=4) # regression line overlay
#boxplot-----
str(women)
boxplot(women$height)
summary(women$height)
abline(h=c(58, 62,65,68,72))
#FIX: 5 horizontal lines are drawn above, so pass 5 matching labels
#(previously 6 labels were supplied for 5 y positions)
text(x=1.2, y=c(58, 62,65,68,72) ,c('Min','1Q', 'Median','3Q','Max'), col='red' )
x= rnorm(100000, mean=40, sd=5)
boxplot(x, notch=T)
(sum1= as.vector(summary(x))) # six values: Min, 1Q, Median, Mean, 3Q, Max
abline(h=sum1)
text(x=1.3, y=sum1 ,c('Min','1Q', 'Median','Mean','3Q','Max'), col='red' )
hist(x)
range(x)
hist(x, breaks=c(0,40,50,60,100)) # explicit (unequal) bin edges
plot(density(x))
?hist
#draw lines on plot for number summary
summary(women)
quantile(women$height)
quantile(women$height, seq(0,1,.1))
quantile(women$height, seq(0,1,.01))
stem(women$height)
boxplot(women$height, col='green')
abline(h=quantile(women$height))
text(1, quantile(women$height), labels=c('min','1Q','median','3Q','max'))
#histogram
hist(women$height)
hist(women$height, breaks=10)
hist(women$height, breaks=5, col=1:5)
#histogram2
(x = rnorm(100,50,10))
hist(x)
hist(x, freq=F, col=1:10) # density scale for overlaying the density curve
lines(density(x))
#density plot : shape of data
plot(density(x), col='red')
#pie
gender= sample(c('M','F'), size=100, replace=T)
table(gender)
pie(table(gender))
x = c(10,20,40,50)
pie(x)
xlabels = c('A ','B ','C ','D ')
x/sum(x)
(labels2 = paste(xlabels, round(x/sum(x),2) * 100 , sep='-'))
(labels3 = paste0(labels2,"%%"))
(labels2 = paste0(xlabels, round(x/sum(x),2) * 100, '%'))
pie(x, labels=labels2) # slice labels with percentage share
x
#barplot
barplot(x,col=1:4)
barplot(x,col=1:4, horiz = T)
#correlation plot
pairs(women)
cor(women$height,women$weight)
cov(women$height, women$weight)
head(mtcars)
?mtcars
cor(mtcars) # pairwise correlation matrix
names(mtcars)
pairs(mtcars)
pairs(mtcars[1:4])
options(digits=4)
pairs(mtcars[c('mpg', 'wt','hp')])
================================================
FILE: 01-IIM/15a3-GPH-graphs.R
================================================
# Combined Plots
#plot, histogram, pie, boxplot, linechart, correlation plot
#plot
women
?women
str(women)
plot(women)
plot(x=women$height, y=women$weight)
?plot
plot(women, type='p', pch=17)
plot(women, type='l')
plot(women, type='b', pch=18, lty=2, col=2)
plot(women, xlim=c(30,100), ylim=c(min(women$weight)-10, 200), pch=10)
data()
#more features with plot
plot(y=women$height, x=women$weight, pch=15, xlab='Weight', ylab='Height', col='red', cex=2, type='b')
title(main='Main Title- PDU', sub='Sub Title')
#see cheat sheet on base graphs
plot(women)
abline(lm(women$weight ~ women$height), col='red', lty=2, lwd=4) # regression line overlay
abline(h = c(130, 150), col='green')
abline(v=c(62, 66, 70), col='blue')
abline(v=women$height, col='purple')
#boxplot
boxplot(women$height)
# NOTE(review): `df` is not defined in this file - this line errors unless a
# df with marks1/marks2 columns exists in the session (see the DF scripts)
boxplot(df$marks1)
abline(h=c(58, 62,65,68,72))
#draw lines on plot for number summary
summary(women)
quantile(women$height)
boxplot(women$height, col='green')
abline(h=quantile(women$height))
#histogram
hist(women$height)
hist(women$height, breaks=10)
hist(women$height, breaks=5, col=1:5)
hist(df$marks2, breaks=3) # NOTE(review): depends on the same session-level df
#histogram2
?rnorm
x = rnorm(n=100000,mean=50,sd=10)
hist(x)
hist(x, freq=F, col=1:5) # density scale for overlaying the density curve
lines(density(x))
#density plot : shape of data
plot(density(x), col='red')
#pie
x = c(10,20,40,50)
pie(x)
xlabels = c('A','B','C','D')
pie(x, labels=xlabels)
pie(x, labels=paste(round(x/sum(x) * 100,0),'%') ) # percentage labels
x
#barplot
barplot(x,col=1:4)
barplot(x,col=1:4, horiz = T)
#correlation plot
pairs(women)
cor(women$height,women$weight)
names(mtcars)
cor(mtcars) # pairwise correlation matrix
pairs(mtcars)
options(digits=4)
pairs(mtcars[1:4])
================================================
FILE: 01-IIM/15a4-GPH-advgraphs.R
================================================
#Advanced Graphs
library(corrgram)
cor(mtcars[1:4])
# correlogram: shaded lower panel, pie upper panel, variables reordered by PCA
corrgram(mtcars[1:4], order=TRUE, lower.panel=panel.shade, upper.panel=panel.pie, text.panel=panel.txt, main="Car Milage Data in PC2/PC1 Order")
#alternative of box plot
boxplot(mpg ~ cyl, data=mtcars)
library(corrplot)
relationship=cor(mtcars)
relationship
corrplot(relationship)
corrplot(relationship, type="upper") # upper triangle only (matrix is symmetric)
#-----
library(vioplot)
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
x1; x2; x3
vioplot(x1, x2, x3, names=c("4 cyl", "6 cyl", "8 cyl"), col="gold") # violin = boxplot + density
title("Violin Plots of Miles Per Gallon")
abline(h=c(15,20))
================================================
FILE: 01-IIM/16b1-GPH-wordcloud.R
================================================
#Word Cloud
#word & Freq; built in data sets
#-----
# World Cloud
#http://stat.ethz.ch/R-manual/R-devel/library/base/html/strsplit.html
#https://stackoverflow.com/questions/4350440/split-a-column-of-a-data-frame-to-multiple-columns
library(stringr)
library(wordcloud)
library(RColorBrewer)
library(tm)
library(SnowballC)
library(RCurl)
library(XML)
#How to use strsplit
strsplit('IIT-Gawahati', "-")
strsplit('IIT Gawahati', " ")
#read a word/explanation sheet from Google Sheets (needs network access)
library(gsheet)
url = 'https://docs.google.com/spreadsheets/d/1_GQ-h4bgdNlIxcAanwRp_ak1u3JoWI-Vx2HKYj4FstA/edit#gid=0'
#check for dashtype, they are not always same
df1 = as.data.frame(gsheet2tbl(url))
df1
head(df1)
df1$wordexplanation
base::strsplit(df1$wordexplanation,'-') #output as list need in DF
#Different Methods split and put in dataframe objects
#Stringr - str_split
stringr::str_split('IIT-Guwahati', "-")
#Method
(out1 = strsplit(as.character('IIT - Guwahati'),'-')) #if the word was not character
head(df1)
out = strsplit(df1$wordexplanation,'-')
head(out)
#transpose the split pieces into columns, one row per original entry
t(sapply(out[1:5], '['))
df2=data.frame(t(sapply(out[1:200], '['))) # NOTE(review): assumes the sheet has at least 200 rows — verify
head(df2)
df= cbind(df1,df2)
head(df)
names(df)[c(2,3)] = c('word','explanation') #rename columns
head(df)
# create random frequencies for all words in a col
df$freq = floor(runif(100, 1,100)) # NOTE(review): only 100 values, recycled to fill df's rows — works only if nrow(df) is a multiple of 100
head(df)
#Method for word cloud
library(wordcloud)
#shrink plot margins so the cloud uses the whole device
par(mar = c(1, 1, 1, 1))
wc1 = wordcloud(df$word, df$freq, random.order=T)
par(mar = c(1, 1, 1, 1))
wc2 = wordcloud(df$word, df$freq, random.order=F, colors = topo.colors(10))
par(mar = c(1, 1, 1, 1))
#scale = size range largest..smallest word; cap at 50 words with freq >= 2
wc3 = wordcloud(df$word, df$freq, scale=c(4,.1), min.freq=2, max.words=50, random.order=F)
wc4 = wordcloud(df$word, df$freq, scale=c(4,.1), random.order=F)
#------
library(RColorBrewer)
library(tm)
pal <- brewer.pal(8,"Dark2")
wc5 = wordcloud(df$word, df$freq, scale=c(4,.1), random.order=F, colors=pal)
#load the function rquery.wordcloud
source('http://www.sthda.com/upload/rquery_wordcloud.r')
# FIX: filePath was used below but never defined in this script (it was only
# defined in the companion 16e3 script); define it before first use.
filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt"
res <- rquery.wordcloud(filePath, type ="file", lang = "english",min.freq = 1, max.words = 200)
# Reds color palette
res <- rquery.wordcloud(filePath, type ="file", lang = "english",colorPalette = "Reds")
# RdBu color palette
res <- rquery.wordcloud(filePath, type ="file", lang = "english", colorPalette = "RdBu")
# use unique color
res<-rquery.wordcloud(filePath, type ="file", lang = "english", colorPalette = "black")
#rquery.wordcloud returns a list; keep the term-document matrix and frequency table
tdm <- res$tdm
freqTable <- res$freqTable
# Show the top10 words and their frequency
head(freqTable, 10)
# Bar plot of the frequency for the top10
barplot(freqTable[1:10,]$freq, las = 2,
names.arg = freqTable[1:10,]$word,
col ="lightblue", main ="Most frequent words",
ylab = "Word frequencies")
#terms occurring at least 4 times, and terms correlated with "freedom"
findFreqTerms(tdm, lowfreq = 4)
findAssocs(tdm, terms = "freedom", corlimit = 0.3)
url = "http://www.sthda.com/english/wiki/create-and-format-powerpoint-documents-from-r-software"
rquery.wordcloud(x=url, type="url")
#------
#wordcloud2
#install.packages('wordcloud2')
library(wordcloud2)
#wordcloud2 expects a data frame with a word column and a freq column
df = data.frame(word=c('mdi','iim','imt'),freq=c(20,23,15))
df
par(mar=c(0,0,0,0))
wordcloud2(df)
#demoFreq is a sample word/freq data frame shipped with wordcloud2
head(demoFreq)
dim(demoFreq)
par(mar=c(0,0,0,0))
wordcloud2(demoFreq)
par(mar=c(0,0,0,0))
wordcloud2(demoFreq, size = 2, color = "random-light", backgroundColor = "grey")
names(demoFreq)
par(mar=c(0,0,0,0))
#equal min/max rotation forces every word to the same angle
wordcloud2(demoFreq, size = 2, minRotation = -pi/2, maxRotation = -pi/2)
wordcloud2(demoFreq, size = 2, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)
wordcloud2(demoFreq, size = 2, minRotation = -pi/6, maxRotation = pi/6, rotateRatio = 0.9)
par(mar=c(0,0,0,0))
#demoFreqC: Chinese-language sample data from wordcloud2
wordcloud2(demoFreqC, size = 2, color = "random-light", backgroundColor = "grey")
wordcloud2(demoFreqC, size = 2, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)
# Color Vector
?wordcloud2
#alternate red/skyblue per word, recycled to the number of rows
colorVec = rep(c('red', 'skyblue'), length.out=nrow(demoFreq))
wordcloud2(demoFreq, color = colorVec, fontWeight = "bold")
#color by threshold on the frequency column (column 2)
wordcloud2(demoFreq, color = ifelse(demoFreq[, 2] > 20, 'red', 'skyblue'))
#Example2 -----
#Word Cloud 2
#(https://www.r-graph-gallery.com/the-wordcloud2-library/)
# library : install it first
library(wordcloud2)
# have a look to the example dataset
head(demoFreq)
dim(demoFreq)
str(demoFreq)
#wordcloud
wordcloud2(demoFreq, size=1.6)
#most frequent words first
head(demoFreq[order(-demoFreq$freq),])
?wordcloud2
#create your own set of words
word = c('marketing','consumer', 'dhiraj','price','business','iimkashipur', 'sunder','vignesh', 'jyoti','finance', 'operations')
freq = c(30,20,15,36,15,13,11,44,13,44,34)
df1 = data.frame(word, freq)
#rownames(df1)= word
head(df1)
#df1 = head(demoFreq)
wordcloud2(df1, size=.4)
?wordcloud2
# Gives a proposed palette
wordcloud2(demoFreq, size=1.6, color='random-dark')
wordcloud2(df1, size=1, color='random-dark')
# or a vector of colors. vector must be same length than input data
wordcloud2(demoFreq, size=1.6, color=rep_len( c("green","blue"), nrow(demoFreq) ) )
# Change the background color
wordcloud2(demoFreq, size=1.6, color='random-light', backgroundColor="black")
# Change the shape:
wordcloud2(demoFreq, size = 0.7, shape = 'star')
#It is possible to change the shape of the wordcloud. Several shapes are available within the package: ‘circle’ (default), ‘cardioid’, ‘diamond’ (alias of square), ‘triangle-forward’, ‘triangle’, ‘pentagon’, and ‘star’).
wordcloud2(df1, size = 0.7, shape = 'pentagon')
#It is also possible to use any image you have as a mask! Just insert the image in the current working directory and use it as in the code below
# Change the shape using your image - not working
#wordcloud2(demoFreq, figPath = "peace.png", size = 1.5, color = "skyblue", backgroundColor="black")
#rotation
ww=wordcloud2(demoFreq, size = 2.3, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)
ww
#chinese
demoFreqC
head(demoFreqC)
wordcloud2(demoFreqC, size = 2, fontFamily = "微软雅黑", color = "random-light", backgroundColor = "grey")
wordcloud2(demoFreqC, size = 2, color = "random-light", backgroundColor = "grey")
?demoFreqC
#Try this... not working as of now
#https://unicode.org/charts/PDF/U0900.pdf : 0900–097F
#NOTE(review): the strings below are literal "<U+....>" escape text, not real Unicode characters — this is why the Hindi cloud does not render
V2 = c('<U+9000><U+9001>', '<U+9002><U+9003>', '<U+900D><U+900F>', '<U+6570><U+636E>')
V1 = c(2000, 1000, 500,1000)
hindi = data.frame(V2, V1)
hindi
wordcloud2(hindi, size = 2, color = "random-light", backgroundColor = "grey")
#notworking : clear the plot area
#The lettercloud function allows to use a letter or a word as a shape for the wordcloud.
letterCloud( demoFreq, word = "R", color='random-light' , backgroundColor="blue")
letterCloud( demoFreq, word = "PEACE", color="white", backgroundColor="pink")
#see the link : https://www.r-graph-gallery.com
#Example3 -----
#wordcloud2
#install.packages('wordcloud2')
library(wordcloud2)
#eg1
df = data.frame(word=factor(c('mdi','iim','imt','fms')),freq=c(20,23,105, 30))
df
wordcloud2(df)
#built in data set
head(demoFreq)
nrow(demoFreq)
wordcloud2(demoFreq, size = 2, color = "random-light", backgroundColor = "grey")
names(demoFreq)
#fixed rotation: min and max equal pins every word at one angle
wordcloud2(demoFreq, size = 2, minRotation = -pi/2, maxRotation = -pi/2)
wordcloud2(demoFreq, size = 2, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)
wordcloud2(demoFreq, size = 2, minRotation = -pi/6, maxRotation = pi/6, rotateRatio = 0.9)
wordcloud2(demoFreqC, size = 2, color = "random-light", backgroundColor = "grey")
wordcloud2(demoFreqC, size = 2, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)
# Color Vector
colorVec = rep(c('red', 'skyblue'), length.out=nrow(demoFreq))
wordcloud2(demoFreq, color = colorVec, fontWeight = "bold")
wordcloud2(demoFreq, color = ifelse(demoFreq[, 2] > 20, 'red', 'skyblue'))
#
#eg2 : subset these words
str(demoFreq)
#random sample of 10 rows (with replacement, so duplicates are possible)
df3 = demoFreq[ sample(1:nrow(demoFreq), size=10, replace=T), ]
df3
wordcloud2(df3)
#fix() opens an interactive editor; the edited data frame is reassigned
df3=fix(df3)
df3
wordcloud2(df3)
#Example4------
##http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
#https://www.r-graph-gallery.com/196-the-wordcloud2-library/
# library #install this
library(wordcloud2)
# have a look to the example dataset
head(demoFreq)
par(mar = c(1, 1, 1, 1))
wordcloud2(demoFreq, size=1)
#create your set of words and freq
df = data.frame(word=c('IIMS','IIMK','IIMKPV','IIMA','IIMC'),freq=c(70,13,35,40,21))
df
wordcloud2(df, size = 1)
# Gives a proposed palette
wordcloud2(demoFreq, size=1.6, color='random-dark')
# or a vector of colors. vector must be same length than input data
wordcloud2(demoFreq, size=1.6, color=rep_len( c("green","blue"), nrow(demoFreq) ) )
# Change the background color
wordcloud2(demoFreq, size=1.6, color='random-light', backgroundColor="black")
# Change the shape:
wordcloud2(demoFreq, size = 0.7, shape = 'star')
head(demoFreq)
?wordcloud2
# Change the shape using your image
wordcloud2(demoFreq, figPath = "india.jpg", size = 1.5, color = "skyblue", backgroundColor="black") # NOTE(review): needs india.jpg in the working directory
#myeg-----
word = factor(c('Assets', 'Liabilities', 'Expenses', 'CashFlow', 'BottomLine','ProfitandLoss', 'Strategy', 'Planning', 'Forecast','Business'))
length(word)
(freq= c(27,35,32,31,33,24,26,27,28,29))
#freq = as.integer(runif(length(word), 50,100))
(df2 = data.frame(word, freq))
row.names(df2) = word
head(df2)
wordcloud2(df2)
#color by frequency threshold (column 2 is freq)
wordcloud2(df2, color = ifelse(df2[, 2] > 30, 'red', 'skyblue'))
================================================
FILE: 01-IIM/16e0-GPH-wordcloud.R
================================================
#Word Cloud in R
library(wordcloud)
?wordcloud
#wordcloud(words,freq,scale=c(4,.5),min.freq=3,max.words=Inf, random.order=TRUE, random.color=FALSE, rot.per=.1, colors="black",ordered.colors=FALSE,use.r.layout=FALSE, fixed.asp=TRUE)
#62 tokens: lowercase letters, uppercase letters, digits
(words1 = c(letters, LETTERS, 0:9))
length(words1)
#62 evenly spaced frequencies from 1 to 1000, one per token
(freq1 = seq(1, 1000, len = 62))
wordcloud(words= words1, freq=freq1)
(words2 = c('Dhiraj','Vishnu', 'Nayak', 'Aishwarya', 'Kavita'))
#random frequencies (no seed set, so output varies per run)
(freq2 = runif(n=length(words2), min=10, max=50))
par(mar=c(0,0,0,0))
cbind(words2, freq2)
wordcloud(words= words2, freq=freq2)
#another library - data frame
library(wordcloud2)
?wordcloud2
#wordcloud2(data, size = 1, minSize = 0, gridSize = 0, fontFamily = 'Segoe UI', fontWeight = 'bold', color = 'random-dark', backgroundColor = "white", minRotation = -pi/4, maxRotation = pi/4, shuffle = TRUE, rotateRatio = 0.4, shape = 'circle', ellipticity = 0.65, widgetsize = NULL, figPath = NULL, hoverFunction = NULL)
wordcloud2(demoFreq)
wordcloud2(demoFreq, size = 2)
wordcloud2(demoFreq, size = 1,shape = 'pentagon')
wordcloud2(demoFreq, size = 1,shape = 'star')
wordcloud2(demoFreq, size = 2, color = "random-light", backgroundColor = "grey")
#synthetic quality-defect data: 100 defect labels with normal-ish counts
defect = paste('defect', 1:100, sep='_')
freq = round(rnorm(n=100, mean=40, sd=10))
quality = data.frame(defect, freq)
wordcloud2(quality, size = 1,shape = 'star')
(iimstudents = data.frame(word=words2, freq=freq2))
par(mar=c(0,0,0,0))
wordcloud2(iimstudents, size=.5, shuffle = F)
#https://cran.r-project.org/web/packages/wordcloud2/vignettes/wordcloud.html
#https://www.r-graph-gallery.com/wordcloud.html
================================================
FILE: 01-IIM/16e3-GPH-wordcloud-text1.R
================================================
# World Cloud 2
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(RCurl)
library(XML)
library(tm)
# Read the text file from file
#text = readLines(file.choose())
text = readLines(con= file("iim.txt")) # NOTE(review): iim.txt must exist in the working directory
text
text[1]
# Load the data as a corpus
docs = Corpus(VectorSource(text))
docs
#Text transformation
#replace a matched pattern with a space, wrapped as a tm transformer
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs
#Cleaning Text
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop word
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
docs
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming
docs <- tm_map(docs, stemDocument)
#Document Matrix
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
#total count of each term across all documents, most frequent first
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
#Generate Word Cloud
set.seed(1234)
par(mar=c(0,0,0,0))
wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
#terms occurring at least 4 times; terms correlated with "freedom"
findFreqTerms(dtm, lowfreq = 4)
findAssocs(dtm, terms = "freedom", corlimit = 0.3)
head(d, 10)
#Plot Freq
barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word, col ="lightblue", main ="Most frequent words", ylab = "Word frequencies")
#Example2------
# Word Cloud
##http://dni-institute.in/blogs/colorful-word-cloud-using-r/
# tm for text mining
# SnowballC for text stemming
# wordcloud for generating word cloud images
# RCurl and XML packages to download and parse web pages
# RColorBrewer for color palettes
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(RCurl)
library(XML)
library(tm)
source('http://www.sthda.com/upload/rquery_wordcloud.r')
#or #source('./TM/rquery_wordcloud.R')
filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt"
#filePath2 <- "./data/martin-luther-king-i-have-a-dream-speech.txt"
res<-rquery.wordcloud(filePath, type ="file", lang = "english")
#res<-rquery.wordcloud(filePath2, type ="file", lang = "english")
================================================
FILE: 01-IIM/16e4-GPH-wordcloud-text2.R
================================================
#Word Cloud Text File
# FIX: this line was missing its leading '#', which made sourcing the file
# fail with a syntax error.
#Step 1 : Install and load the required packages
# Load
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
#Step 2: Create/Read a text file------
#read from a text file
#text <- readLines(file.choose())
#or
# Read the text file from internet
filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt"
text <- readLines(filePath)
#Step 3 : Text mining------
# Load the data as a corpus
docs <- Corpus(VectorSource(text))
docs
inspect(docs)
#replace certain characters
#transformer that swaps any regex match for a space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs
inspect(docs)
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop word
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming
# docs <- tm_map(docs, stemDocument)
#Step 4 : Build a term-document matrix-----
#Document matrix is a table containing the frequency of the words. Column names are words and row names are documents. The function TermDocumentMatrix() from text mining package can be used as follow :
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
#per-term totals across all documents, sorted most frequent first
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
#Step 5 : Generate the Word cloud-----
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
#words : the words to be plotted
# freq : their frequencies
# min.freq : words with frequency below min.freq will not be plotted
# max.words : maximum number of words to be plotted
# random.order : plot words in random order. If false, they will be plotted in decreasing frequency
# rot.per : proportion words with 90 degree rotation (vertical text)
# colors : color words from least to most frequent. Use, for example, colors =“black” for single color.
#Misc
findFreqTerms(dtm, lowfreq = 4)
#correlation of freedom with
findAssocs(dtm, terms = "freedom", corlimit = 0.3)
#barplot of words
barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word, col ="lightblue", main ="Most frequent words", ylab = "Word frequencies")
#library wordcloud2
par(mar=c(0,0,0,0))
wordcloud2::wordcloud2(d)
wordcloud2::wordcloud2(d, shape='star')
#refer this
#http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
================================================
FILE: 01-IIM/17a2-STATS-freqtable.R
================================================
# Frequency Distribution
#Discrete Cat Data
(attend = c('A','P','P','A','P','A'))
table(attend)
cbind(table(attend)) # A & P on left side
#Discrete Numeric Data
color=c('Blue','Green','Magenta','Green','Black','Blue','Black')
x2 = table(color)
x2
#cbind turns the 1-d table into a one-column matrix (vertical display)
x2a = cbind(x2)
x2a
hist(x2a) # not relevant
barplot(x2a) # not good
barplot(x2a, beside=T) # Better
unique(color) ; length(unique(color))
barplot(x2a, beside=T, col=rainbow(length(unique(color))))
pie(x2a)
#Continuous Data
set.seed(1234)
x3 = runif(100,0,150) # 0 to 150 marks range, 100 values
x3
x3 = ceiling(x3) #round to higher value
x3
range(x3)
# Divide range into step of 15 ie 10 levels
breaks = seq(0,150,by=15)
breaks
length(breaks)
x3
#x3[1] = 60; x3[2] = 75
#cut() bins each value into its (lower, upper] interval
x3.cut = cut(x3, breaks)
x3.cut
table(x3.cut)
cbind(table(x3.cut)) #see it vertically
#give intervals a character values a, b..
(x3.cut = cut(x3, breaks, labels=letters[1:10]))
#(x3.cut = cut(x3, breaks, labels=letters[1:length(breaks)-1]))
x3.cut
(x3a = table(x3.cut))
(x3b = cbind(x3a))
#plot these freq Table: which is better
hist(x3b)
pie(x3b)
barplot(x3b, beside=T)
barplot(x3b, beside=T, names.arg =rownames(x3b))
plot(x3b)
# and so..on like previous eg
================================================
FILE: 01-IIM/17c1-STATS-datapartition.R
================================================
#partition the data into train and test set
mtcars
nrow(mtcars)
#train-70%, test-30%
(myvalues = 1:32)
#sample 70% of the row positions without replacement
selected = sample(x=myvalues, size=.7 * 32)
length(selected)
index = sample(x=1:nrow(mtcars), size=.7 * nrow(mtcars), replace=F)
index
mtcars[c(1,4),] #first & 4th row no
mtcars[-c(1,4),] #exclude 1 & 4
#positive indices select the train rows; negative indices give the complement
train= mtcars[index,]
test= mtcars[-index,]
nrow(train)
nrow(test)
nrow(train) + nrow(test)
#
(rollnos = 1:50)
rollnos[rollnos > 20]
rollnos[10:15]
set.seed(1234)
(select = sample(x=rollnos, size=.75 * length(rollnos)))
length(select)
#
#40 students - M & F
(gender = c('M','F','M'))
(gender = sample(x=c('M','F'), size=40, replace=T))
#weighted sampling: roughly 60% M, 40% F
(gender = sample(x=c('M','F'), size=10000, replace=T, prob=c(.6, .4)))
table(gender)
prop.table(table(gender))
#-------
## 75% of the sample size
smp_size <- floor(0.75 * nrow(mtcars))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(mtcars)), size = smp_size)
train <- mtcars[train_ind, ]
test <- mtcars[-train_ind, ]
# -----
#caTools: sample.split keeps the class ratio of am in both partitions
library(caTools)
set.seed(101)
sample = sample.split(mtcars$am, SplitRatio = .75)
sample
train = subset(mtcars, sample == TRUE)
test = subset(mtcars, sample == FALSE)
# FIX: these two statements were split as "train" / "; test" — a line cannot
# start with ";" in R, which was a syntax error.
train; test
nrow(train); nrow(test)
table(train$am); table(test$am)
# FIX: dplyr is attached here (it was only loaded at the bottom of the file,
# after %>% had already been used below).
library(dplyr)
mtcars$id <- 1:nrow(mtcars)
train <- mtcars %>% dplyr::sample_frac(.75)
#test = rows of mtcars whose id is not in train
test <- dplyr::anti_join(mtcars, train, by = 'id')
library(caret)
#stratified 70/30 split on the am factor
intrain<-createDataPartition(y=factor(mtcars$am),p=0.7,list=FALSE)
intrain
training<-mtcars[intrain,]
testing<-mtcars[-intrain,]
training
testing
table(training$am)
table(testing$am)
#data partition is used for modeling when you want to divide data in to parts - Train and Test.
#caret package is most popular
library(dplyr)
mtcars %>% group_by(am) %>% sample_n(5)
# FIX: sample_n() requires a whole-number count and errored on .2;
# sample_frac() is the fraction-based variant intended here.
mtcars %>% group_by(am) %>% sample_frac(.2)
================================================
FILE: 01-IIM/17d2-STATS-basicstats.R
================================================
# Basic Stats
#10k draws from N(60, 20), rounded up to integers
x = ceiling(rnorm(10000, mean=60, sd=20))
mean(x)
median(x)
#there is no mode function for mode stats
table(x)
sort(table(x), decreasing=T)
#mode
library(modeest)
mlv(x,method='shorth')
#quantile
quantile(x)
quantile(x,seq(.1,1,by=.1)) #decile
quantile(x,seq(.01,1,by=.01)) #percentile
# FIX: was "ccdlibrary(e1071)" — a typo calling a nonexistent function.
library(e1071) # load e1071
plot(density(x)) #density plot
e1071::skewness(x) # apply the skewness
kurtosis(x)
sd(x); var(x)
cov(women$weight, women$height)
# FIX: was cor(women$height, women$height), which is trivially 1;
# the covariance line above shows weight vs height was intended.
cor(women$weight, women$height)
stem(x)
#Freq Table
library(fdth) #fast way of creating FT
ftable1 = fdt(x)
ftable1
================================================
FILE: 01-IIM/18d1-DPLYR-mtcars1.R
================================================
#Data Summarisation using Dplyr
#dataset - mtcars
#dplyr - mtcars
#install.packages('dplyr')
library(dplyr) #install this library
head(mtcars)
names(mtcars)
#Plan how would you like to summarise this dataset
#dplyr - mtcars
library(dplyr)
df = mtcars
#expose the car names (row names) as a regular column
df$carnames = rownames(mtcars)
# FIX: this previously rebuilt df from mtcars (df = cbind(..., mtcars)),
# silently dropping the carnames column added above and breaking the
# select(carnames, ...) call below; cbind onto df keeps both columns.
df = cbind(cars=rownames(mtcars), df)
# FIX: "df %>% mutate(add_c)" referenced an undefined name add_c and errored;
# kept here as a commented-out placeholder.
#df %>% mutate(add_c)
head(df)
head(df)
#Filter----
filter(df, cyl == 8) %>% select(carnames, mpg,cyl)
filter(mtcars, cyl == 8)
filter(mtcars, cyl < 6)
# Multiple criteria
filter(mtcars, cyl < 6 & vs == 1)
filter(mtcars, cyl < 6 | vs == 1)
# Multiple arguments are equivalent to and
filter(mtcars, cyl < 6, vs == 1)
#Select rows
#by rownumber
filter(mtcars, row_number() == 1L)
filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()))
#slice-----
slice(mtcars, 1L)
slice(mtcars, n())
slice(mtcars, 5:n())
slice(mtcars, c(2,4,5,10))
#mutate----
#create new columns based on existing columns
mutate(mtcars, displ_l = disp / 61.0237) #keeps other col
#GroupBy summary
mtcars %>% group_by(am) %>% summarise(mean(mpg), max(wt))
mtcars %>% group_by(am) %>% summarise(MEANMPG = mean(mpg), MAXWT= max(wt))
mtcars %>% group_by(am, gear) %>% summarise_all(mean)
mtcars %>% group_by(am, gear)%>% summarise_all(c("min", "max"))
#specific columns
mtcars %>% summarise_at(c("mpg", "wt"), mean, na.rm = TRUE)
#select rows on the basis of sample
sample_frac(mtcars, 0.2, replace=T)
sample_n(mtcars, 60, replace=T) %>% select(mpg)
#Rows having least mpg (last 2)
top_n(mtcars,-2, mpg)
select(mtcars, mpg) %>% arrange(desc(mpg))
================================================
FILE: 01-IIM/18d3-DPLYR-mtcars2.R
================================================
# Data Summarisation - dplyr
#Home work
#Data Summarisation using Dplyr
#dataset - mtcars
#dplyr - mtcars
#install.packages('dplyr')
library(dplyr) #install this library
head(mtcars)
names(mtcars)
#Plan how would you like to summarise this dataset
#Filter----
filter(mtcars, cyl == 6)
filter(mtcars, cyl < 6)
# Multiple criteria
filter(mtcars, cyl < 6 & vs == 1)
filter(mtcars, cyl < 6 | vs == 1)
# Multiple arguments are equivalent to and
filter(mtcars, cyl < 6, vs == 1)
#row selection by position via row_number()
filter(mtcars, row_number() == 1L)
filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()))
#mutate----
mutate(mtcars, displ_l = disp / 61.0237) #keeps other col
transmute(mtcars, displ_l = disp / 61.0237) #removes other cols
mutate(mtcars, cyl = NULL) #do not display cyl
#slice-----
slice(mtcars, 1L)
slice(mtcars, n())
slice(mtcars, 5:n())
slice(mtcars, c(2,4,5,10))
(by_cyl <- group_by(mtcars, cyl)) # ???
#on a grouped table slice() applies per group: first two rows of each cyl group
slice(by_cyl, 1:2)
#structure----
tbl_df(mtcars) # convert to tbl class
glimpse(mtcars) # dense summary of tbl data
View(mtcars) # spreasheet like form base pacakge
#rownames----
df = tibble::rownames_to_column(df, var='cars') # NOTE(review): df must already exist (created earlier in the session) — verify before running
df2 = mtcars
names(df)
#has
tibble::has_rownames(mtcars)
tibble::has_rownames(df)
#remove rownames
#see without rownames
head(df2[1:5])
tibble::remove_rownames(df2)
#rowid as column
tibble::rowid_to_column(df, var = "rowid")
#column to rownames
head(df[1:5])
tibble::column_to_rownames(df, var = "cars")
mtcars %>% group_by(am)
#nothing - just separation
mtcars %>% group_by(am) %>% summarise(mean(mpg), max(wt))
#summarise----
summarise(mtcars, mean(disp))
summarise(group_by(mtcars, cyl), mean(disp))
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))
#summarise_all
mtcars %>% group_by(am, gear) %>% summarise_all(mean)
mtcars %>% group_by(am, gear) %>% summarise_all(c("min", "max"))
mtcars %>% group_by(am, gear) %>% summarise_all(funs(med = median)) # NOTE(review): funs() is deprecated in modern dplyr; list(med = median) is the replacement
mtcars %>% group_by(cyl, am) %>% summarise(max(mpg), mean(hp))
?mtcars
#without Group
mtcars %>% summarise(mean(mpg), max(wt))
mtcars %>% summarise_all(mean)
mtcars %>% select(wt, gear)%>% summarise_all(c("min", "max"))
mtcars %>% summarise_all(funs(med = median))
mtcars %>% summarise_if(is.numeric, mean, na.rm = TRUE)
iris %>% summarise_if(is.numeric, mean, na.rm = TRUE)
#specific columns
mtcars %>% summarise_at(c("mpg", "wt"), mean, na.rm = TRUE)
#------------------------------------
#unsortd----
dplyr::tbl_df(iris)
print(dplyr::tbl_df(mtcars), n=20) #display more columns and rows
#print(dplyr::tbl_df(mtcars), width=11)
tbl_df(mtcars) %>% print(n = Inf)
tbl_df(mtcars) %>% print(width = Inf)
tbl_df(mtcars) %>% as.data.frame(mtcars)
glimpse(mtcars)
df = mtcars
#drop row names so cars are plain numbered rows
row.names(df) = NULL
df %>% select(mpg)
#head(mtcars)
select(mtcars, mpg, vs)
mtcars %>% dplyr::select(vs, mpg, wt)
mtcars %>% group_by(cyl) %>% summarise(avgwt = mean(wt), meanhp = mean(hp)) %>% arrange( desc(meanhp), avgwt)
mtcars
names(mtcars)
filter(mtcars, mpg > 23 | wt < 2)
mtcars %>% filter(mpg > 23 & wt > 2)
mtcars %>% select(mpg, wt) %>% filter(mpg > 23)
# NOTE(review): the next two lines form one pipe that passes iris as an extra
# argument to filter() on mtcars — this errors; likely scratch/demo code
mtcars %>%
filter(iris, Sepal.Length > 7)
filter(mtcars, cyl == 4)
distinct(mtcars)
df = data.frame(a=c(2,2),b=c(2,2))
df
#both rows identical, so distinct() keeps one
distinct(df)
sample_frac(mtcars, 0.2, replace=T)
sample_n(mtcars, 60, replace=T) %>% select(mpg)
slice(mtcars,10:14)
top_n(mtcars,-2, mpg)
select(mtcars, mpg) %>% arrange(desc(mpg))
#Columns
select(mtcars, mpg, wt)
select(mtcars, contains('a'))
names(mtcars)
select(mtcars, contains ='vs') # NOTE(review): contains is a helper function, not a named argument — contains('vs') was probably intended
select(mtcars, everything())
mtcars %>% group_by(cyl, am) %>% summarise_all(mean)
df = data.frame(marks=c(1,2,3,7,1))
cbind(df, dplyr::mutate_each(df, funs(min_rank))) # NOTE(review): mutate_each()/funs() are deprecated in modern dplyr
mtcars %>% lead() %>% lag()
dplyr::n(mtcars) # NOTE(review): n() takes no arguments and only works inside verbs — this call errors
select(mtcars, mpg2 = mpg)
df = mtcars[1:4]
names(df) = c('MPG','C1','C2','C3')
df= rename(df, C5=C1)
names(df)
df
rename(df, marks2 = marks) # NOTE(review): df has no marks column here — this errors
df %>% mutate(marks2 = marks + 2, marks3 = marks + 4)
df %>% transmute(marks2 = marks + 2, marks3 = marks + 4)
library(nycflights13)
data(flights)
#per-destination counts: distinct aircraft and number of flights
destinations <- group_by(flights, dest)
destinations
summarise(destinations,
planes = n_distinct(tailnum),
flights = n()
)
select(iris, -ends_with("Width")) %>% head
vars <- c("Petal.Length", "Petal.Width1")
select(iris, from=1, to=n()) # NOTE(review): from/to are not select() arguments — this errors
filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()))
# FIX: positional row selection is slice(); filter() requires a logical
# condition and filter(1:3) errors.
mtcars %>% group_by(cyl) %>% slice(1:3)
# FIX: this was a pasted console line (leading ">" prompt = syntax error)
# using the long-defunct %.% pipe; pmin() guards groups with fewer than 10 rows.
mtcars %>% group_by(cyl) %>% slice(sample(n(), pmin(n(), 10)))
# FIX: integer_filter() never existed in released dplyr; slice() performs the
# intended by-position selection per group.
group_by( mtcars, cyl ) %>% slice(1:2)
?integer_filter # NOTE(review): no such help topic — integer_filter is not a dplyr function
# Select odd
mtcars %>% slice(from = 1, to = n(), by = 2) # NOTE(review): from/to/by are not slice() arguments; seq() inside slice() is the working form (shown further below)
# Select even
slice(mtcars, from = 2, to = n(), by = 2)
# Select first 10
slice(mtcars, from = 1, to = 10)
# Select last 10
slice(mtcars, n()-10: n()) # NOTE(review): ":" binds tighter than "-", so this is n() - (10:n()), not a tail range
slice(mtcars, 1:4)
mtcars
#5-row tibble with two grouping columns and shuffled values
df <- tibble(
g1 = c(1, 1, 2, 2, 2),
g2 = c(1, 2, 1, 2, 1),
a = sample(5),
b = sample(5)
)
df
df %>% slice(n()-2:n())
#quosures: quo() captures an expression plus its environment
var1 <- quo(letters[1:5])
var1
quo(toupper(!!var1))
# Here we capture `letters[1:5]` as an expression:
quo(toupper(letters[1:5]))
#> ~toupper(letters[1:5])
# Here we capture the value of `letters[1:5]`
quo(toupper(!!letters[1:5]))
#> ~toupper(c("a", "b", "c", "d", "e"))
quo(toupper(UQ(letters[1:5])))
#> ~toupper(c("a", "b", "c", "d", "e"))
#
toupper(letters[1:5])
quote(toupper(letters[1:5]))
head(mtcars)
slice(mtcars, 1:5)
slice(mtcars, 1) #rowno missing, first row
slice(mtcars, 1L)
tail(mtcars,n=5)
slice(mtcars, n()-5:n())
slice(mtcars, n()) #last row
slice(mtcars, n() - 1) #2nd last row
#top_n with no ordering column uses the last column; negative n = bottom rows
mtcars %>% top_n(2)
mtcars %>% top_n(-2)
mtcars %>% group_by(cyl) %>% tally(cyl) %>% top_n(1, cyl)
dim(mtcars)
#stack rows / columns of two data frames
bind_rows(mtcars, mtcars)
bind_cols(mtcars,mtcars)
gtable_combine(list(mtcars, mtcars)) # NOTE(review): gtable_combine belongs to the gtable package, not dplyr — errors unless gtable is attached
dim_desc(mtcars)
# combine applies the same coercion rules
f1 <- factor("a")
f2 <- factor("b")
c(f1, f2)
unlist(list(f1, f2))
gtable_combine(f1, f2)
gtable_combine(list(f1, f2))
slice( mtcars, c(1L,3L,2L,7L))
by_cyl <- mtcars %>% group_by(cyl)
# Select first row in each group
mtcars %>% slice(1)
by_cyl %>% slice(1)
# Select last row in each group
mtcars %>% slice(n())
by_cyl %>% slice(n())
# Rows not present in group silently ignored
mtcars %>% slice(10)
by_cyl %>% slice(10)
# Select arbitrary rows
mtcars %>% slice(1:9)
by_cyl %>% slice(1:3)
mtcars %>% slice(c(1, 3, 9))
by_cyl %>% slice(c(1, 3, 5))
# Select even rows
mtcars %>% slice(seq(2, n(), by = 2))
by_cyl %>% slice(seq(2, n(), by = 2)) %>% select(cyl, everything())
# Drop first row in each group
mtcars %>% group_by(cyl, am) %>% slice(1)
by_cyl %>% slice(1)
# Returns all values
by_cyl %>% slice()
df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
df %>% top_n(2)
# Negative values select bottom from group. Note that we get more
# than 2 values here because there's a tie: top_n() either takes
# all rows with a value, or none.
df %>% top_n(-2)
#gr
================================================
FILE: 01-IIM/18d4-DPLYR-dplyr.R
================================================
#dplyr - mtcars
library(dplyr)
#library(tidyverse)
#Filter----
filter(mtcars, cyl == 8)
filter(mtcars, cyl < 6)
# Multiple criteria
filter(mtcars, cyl < 6 & vs == 1)
filter(mtcars, cyl < 6 | vs == 1)
# Multiple arguments are equivalent to and
filter(mtcars, cyl < 6, vs == 1)
#positional selection through row_number()
filter(mtcars, row_number() == 1L)
filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()-2))
#mutate----
mutate(mtcars, displ_l = disp / 61.0237) #keeps other col
transmute(mtcars, displ_l = disp / 61.0237) #removes other cols
mutate(mtcars, cyl = NULL) #do not display cyl
#slice-----
slice(mtcars, 1L)
mtcars %>% slice(1L)
slice(mtcars, n())
slice(mtcars, 5:n())
slice(mtcars, c(2,4,5,10))
(by_cyl <- group_by(mtcars, cyl)) # ???
#on a grouped table slice() works per group
slice(by_cyl, 1:2)
mtcars %>% group_by(cyl) %>% slice(1:2)
#structure----
tbl_df(mtcars) # convert to tbl class
glimpse(mtcars) # dense summary of tbl data
View(mtcars) # spreasheet like form base pacakge
mtcars %>% group_by(am)
#nothing - just separation
mtcars %>% group_by(am) %>% summarise(mean(mpg), max(wt), min(wt))
#summarise----
summarise(mtcars, mean(disp))
summarise(group_by(mtcars, cyl), mean(disp))
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))
#summarise_all
mtcars %>% group_by(am, gear) %>% summarise_all(mean)
mtcars %>% group_by(am, gear)%>% summarise_all(c("min", "max")) %>% as.data.frame()
mtcars %>% group_by(am, gear)%>% summarise_all(list(med = median))
#without Group
mtcars %>% summarise(mean(mpg), max(wt))
mtcars %>% summarise_all(mean)
mtcars %>% select(wt, gear)%>% summarise_all(c("min", "max"))
mtcars %>% summarise_all(funs(med = median)) # NOTE(review): funs() is deprecated in modern dplyr; list(med = median) is the replacement
#summarise if :
mtcars %>% summarise_if(is.numeric, mean, na.rm = TRUE)
str(iris) #Species is a factor
iris %>% summarise_all(mean) # NOTE(review): warns/NA on the factor column Species — the summarise_if below is the safe form
iris %>% summarise_if(is.numeric, mean, na.rm = TRUE)
#specific columns
mtcars %>% summarise_at(c("mpg", "wt"), mean, na.rm = TRUE)
#------------------------------------
#unsorted----
dplyr::tbl_df(iris) #all rows not displayed
print(dplyr::tbl_df(mtcars), n=20) #display more columns and rows
#print(dplyr::tbl_df(mtcars), width=11)
tbl_df(mtcars) %>% print(n = Inf) #all rows
tbl_df(mtcars) %>% print(width = Inf)
tbl_df(mtcars) %>% as.data.frame(mtcars)
glimpse(mtcars)
df = mtcars
row.names(df) = NULL #remove rownames
df %>% select(mpg)
#head(mtcars)
select(mtcars, mpg, vs)
mtcars %>% dplyr::select(vs, mpg, wt)
mtcars %>% group_by(cyl) %>% summarise(avgwt = mean(wt), meanhp = mean(hp)) %>% arrange( desc(meanhp), avgwt)
mtcars
names(mtcars)
filter(mtcars, mpg > 23 | wt < 2)
mtcars %>% filter(mpg > 23 & wt > 2)
mtcars %>% select(mpg, wt) %>% filter(mpg > 23)
mtcars %>%
filter(iris, Sepal.Length > 7)
filter(mtcars, cyl == 4)
#distinct rows
distinct(mtcars)
(df3 = data.frame(a=c(2,2,3),b=c(2,2,1)))
distinct(df3) #drops the duplicated row
#sampling
sample_frac(mtcars, 0.2, replace=F) #20% of rows, without replacement
sample_n(mtcars, 2, replace=F)
#%>% select(mpg)
slice(mtcars,10:14)
sort(mtcars$mpg, decreasing = T)
top_n(mtcars,-2, mpg) #least 2 mpg
select(mtcars, mpg) %>% arrange(desc(mpg))
#Columns
select(mtcars, mpg, wt)
select(mtcars, contains('a'))
names(mtcars)
#contains() is a selection helper function, not an argument name
select(mtcars, contains('vs'))
select(mtcars, everything())
df= mtcars
df$names = rownames(mtcars)
head(df)
df %>% select(1:5,12) %>% arrange(mpg)
mtcars %>% group_by(cyl, am) %>% summarise_all(mean)
(df4 = data.frame(marks=c(1,2,2,3,7,1,100)))
#funs() is deprecated since dplyr 0.8; pass the function directly
cbind(df4, dplyr::mutate_all(df4, min_rank))
#shift the columns
mtcars %>% lead() #each column shifted up by one position (last row NA)
mtcars %>% lag()  #each column shifted down by one position (first row NA)
mtcars %>% summarise(n()) #row count
select(mtcars, mpg2 = mpg) #select + rename in one step
df = mtcars[1:4]
names(df) = c('MPG','C1','C2','C3')
df= rename(df, C5=C1) #rename(new = old)
names(df)
df
df = women
rename(df, HeightWomen = height)
df %>% mutate(height2 = height + 2, weight2 = weight + 4)
#does not show original columns
df %>% transmute(height2 = height + 2, weight2 = weight + 4)
library(nycflights13)
data(flights)
head(flights)
destinations <- group_by(flights, dest)
destinations
summarise(destinations, planes = n_distinct(tailnum), flights = n())
select(iris, -ends_with("Width")) %>% head
vars <- c("Petal.Length", "Petal.Width1")
#select() has no from/to arguments; use a column range instead
select(iris, 1:ncol(iris))
filter(mtcars, row_number() == n()) #last row
filter(mtcars, between(row_number(), 5, n())) #rows 5 through last
#filter() needs a logical condition; use slice() to pick rows by position
mtcars %>% group_by(cyl) %>% slice(1:3) #first 3 rows per group
#integer_filter() was only ever a dplyr proposal and does not exist;
#slice() is the supported way to pick rows by position within groups
group_by( mtcars, cyl ) %>% slice(1:2)
# Select odd rows : slice() has no from/to/by arguments;
# build the row positions with seq() instead
mtcars %>% slice(seq(1, n(), by = 2))
# Select even rows
slice(mtcars, seq(2, n(), by = 2))
# Select first 10
slice(mtcars, 1:10)
# Select last 10 : `:` binds tighter than `-`, so n()-10:n() means
# n() - (10:n()); parenthesise the start index
slice(mtcars, (n()-9):n())
slice(mtcars, 1:4)
mtcars
df <- tibble(
  g1 = c(1, 1, 2, 2, 2),
  g2 = c(1, 2, 1, 2, 1),
  a = sample(5),
  b = sample(5)
)
df
df %>% slice((n()-2):n()) #last 3 rows (parenthesised for precedence)
# rlang quosure demos: quo() captures an expression plus its environment;
# !! (unquote) forces evaluation at capture time instead.
var1 <- quo(letters[1:5])
var1
quo(toupper(!!var1)) # splice the captured expression into a larger one
# Here we capture `letters[1:5]` as an expression:
quo(toupper(letters[1:5]))
#> ~toupper(letters[1:5])
# Here we capture the value of `letters[1:5]`
quo(toupper(!!letters[1:5]))
#> ~toupper(c("a", "b", "c", "d", "e"))
# UQ() is the older spelling of !! (deprecated in recent rlang)
quo(toupper(UQ(letters[1:5])))
#> ~toupper(c("a", "b", "c", "d", "e"))
#
toupper(letters[1:5])
# base-R equivalent of expression capture (no environment attached)
quote(toupper(letters[1:5]))
head(mtcars)
slice(mtcars, 1:5)
slice(mtcars, 1) #rowno missing, first row
slice(mtcars, 1L)
tail(mtcars,n=5)
# `:` binds tighter than `-`; parenthesise to get the last 5 rows
slice(mtcars, (n()-4):n())
slice(mtcars, n()) #last row
slice(mtcars, n() - 1) #2nd last row
mtcars %>% top_n(2)  #top 2 rows by the last column (carb), with ties
mtcars %>% top_n(-2) #bottom 2 rows by the last column
mtcars %>% group_by(cyl) %>% tally(cyl) %>% top_n(1, cyl)
dim(mtcars)
bind_rows(mtcars, mtcars) #stack rows
bind_cols(mtcars,mtcars)  #place columns side by side
#gtable_combine() belongs to the gtable (grid layout) package and does not
#combine data frames; bind_rows() is the dplyr equivalent
bind_rows(list(mtcars, mtcars))
dim_desc(mtcars)
# combining applies the usual coercion rules
f1 <- factor("a")
f2 <- factor("b")
c(f1, f2) #R >= 4.1 combines factor levels (older R coerced to integer codes)
unlist(list(f1, f2)) #factor with the union of the levels
#base equivalents for combining factors (gtable_combine was the wrong tool)
unlist(list(f1, f2))
factor(c(as.character(f1), as.character(f2)))
slice( mtcars, c(1L,3L,2L,7L))
by_cyl <- mtcars %>% group_by(cyl)
# Select first row in each group
mtcars %>% slice(1)
by_cyl %>% slice(1)
# Select last row in each group
mtcars %>% slice(n())
by_cyl %>% slice(n())
# Rows not present in group silently ignored
mtcars %>% slice(10)
by_cyl %>% slice(10)
# Select arbitrary rows
mtcars %>% slice(1:9)
by_cyl %>% slice(1:3)
mtcars %>% slice(c(1, 3, 9))
by_cyl %>% slice(c(1, 3, 5))
# Select even rows
mtcars %>% slice(seq(2, n(), by = 2))
by_cyl %>% slice(seq(2, n(), by = 2)) %>% select(cyl, everything())
# Select (not drop) the first row within each (cyl, am) group;
# to drop the first row use slice(-1)
mtcars %>% group_by(cyl, am) %>% slice(1)
by_cyl %>% slice(1)
# Returns all values
by_cyl %>% slice()
df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
df %>% top_n(2)
# Negative values select bottom from group. Note that we get more
# than 2 values here because there's a tie: top_n() either takes
# all rows with a value, or none.
df %>% top_n(-2)
================================================
FILE: 01-IIM/21a1-SLM-women.R
================================================
#topics ----
#factors, env, import/export. package install
#rep, recode, split, partition, subset, loops, cast & melt
#missing values. duplicates, apply
#graphs - bar, multiple line, pie, box, corrgram
# predict weight for certain height
women
head(women)
?women
dim(women)
plot(women)
cor(women) #strength & direction of relations -1 to 0 to +1
cor(women$height, women$weight)
cov(women$height, women$weight)
#is height the cause of change in weight ???? - do linear regression
fit = lm(weight ~ height,data = women)
summary(fit)
range(women$height)
#fitted line in equation form (pseudo-code, not runnable R):
# y = m*x + c
# weight = m * height + c
# weight = 3.45 * height - 87
#cannot do extrapolation, only interpolate within IV range
(ndata = data.frame(height = c(60.5, 70.5, 71.5)))
(predictedwt = predict(fit, newdata = ndata))
cbind(ndata, predictedwt)
3.45 * 60.5 + - 87 #manual prediction for height 60.5
resid(fit)  #errors: actual - fitted
fitted(fit) #predicted weight at each observed height
cbind(women, fitted(fit), resid(fit))
#assumptions
plot(fit)
#multiple linear regression (select() needs dplyr loaded)
library(dplyr)
head(mtcars)
df <- mtcars %>% select(mpg, wt, hp)
head(df)
fit2 = lm(mpg ~ wt + hp, data = df)
summary(fit2)
#fitted equation (pseudo-code): mpg = 37 - 3.87*wt - 0.03*hp
37 + -3.87 * 2.620 + - 0.03 * 110 #manual prediction for the first car
resid(fit2)
df$mpg
fitted(fit2)
cbind(df$mpg, fitted(fit2), resid(fit2), df$mpg - fitted(fit2))
residuals(fit2) #same as resid(fit2)
df$mpg - fitted(fit2) #residuals computed by hand
resid(fit2) == residuals(fit2)
plot(fit2)
#effect of an outlier: append an extreme point (height 73, weight 500)
women
df2 = women
df3 = rbind(df2, c(73, 500))
fit2b = lm(weight ~ height, data = df3)
summary(fit2b) #compare R^2 with the clean fit below
summary(fit)
plot(fit2b)
df3
================================================
FILE: 01-IIM/21a2-SLM-women.R
================================================
# Regression Analysis
# Simple Linear with 1 IV and 1 DV
data(women)
women
names(women)
str(women)
cov(women$height, women$weight)
#69 : which show positive relationship between height and weight
cor(women$height, women$weight)
#0.995 : which shows Strong and Positive relationship betw height & weight
0.995^2 # also equal R^2 value in this case (Simple Linear Regression)
plot(x=women$height, y=women$weight, type='b')
#lm( y ~ x , data)
abline(lm(weight ~ height, data=women), col='red')
names(women) #x- IV, y -DV
fit1 = lm(weight ~ height, data=women) # creating a model
summary(fit1) #summary of the Model
#Ho: (F Test) : No relationship between Y and any X
#Ha: There is relationship between Y and at least one X
# p < 0.05 Reject Ho in favour of Ha
#linear relationship between X & Y exists
attributes(fit1) #output of model
#coefficients
coef(fit1)
#p values for b0 & b1 are significant as it is < 0.05
#pvalue1 = 1.71e-09 < 0.05
#pvalue2 = 1.09e-14 < 0.05
coef(fit1)
#Y = -87 + 3.4 * X # no extrapolations
(Y = -87 + 3.4 * 20) # weight cannot be negative : 20 is far outside the data
range(women$height)
(Y = -87 + 3.4 * 58)
(Y = -87 + 3.4 * 61.5)
(Y = -87 + 3.4 * 72)
range(women$height)
women
#R^2
(summary(fit1))$r.squared #0.991
(summary(fit1))$adj.r.squared #0.9903
#99% of variation in weight is explained by Height
#Good Linear Model for Prediction
#FStats : p value
(summary(fit1))$fstatistic
#pvalue : 1.09e-14 < 0.05 : Model exists
#There is at least 1 IV which explains variation in Y (DV)
#Prediction : for height = 65, 66
(new1 = data.frame(height=c(65,66,66.5)))
new1
(p1=predict(fit1, newdata = new1))
cbind(new1, p1)
#136.7333 140.1833
(new2 = data.frame(height=c(60,69)))
(p2=predict(fit1, newdata = new2))
cbind(new2, p2)
?predict
predict(fit1, newdata = new1, interval='confidence') #CI of the mean response
predict(fit1, newdata = new1, interval='prediction') #wider: CI for a single new observation
women
women$weight
fitted(fit1) # predicted values of all original Xs
# NOTE(review): data.frame(women$height) names its column women.height, not
# height - predict() may not find the model term; verify this call
predict(fit1, newdata = data.frame(women$height)) # same as above for single X
new3 = data.frame(women$height) # DF using height of original data
cbind(women, fitted(fit1), residuals(fit1)) # compare women, predicted, errors
residuals(fit1) # diff between actual and predicted values of weight
summary(fit1)
summary(residuals(fit1)) # residuals should be centred on 0
plot(fit1)
cor(fitted(fit1), women$weight) # ~0.995 : square root of R^2
summary(fit1)
================================================
FILE: 01-IIM/21a4-SLM-women.R
================================================
# Simple linear regression: model weight as a function of height (built-in
# women data), predict weight at two new heights, and inspect fit quality.
fit = lm(weight ~ height, data = women)
summary(fit)        # coefficients, R^2, overall F-test
range(women$height) # stay inside this range: interpolation only
(ndata = data.frame(height = c(58.5, 60.7)))
(p = predict(fit, newdata = ndata))
cbind(ndata, p)     # inputs alongside their predicted weights
plot(fit)           # residual / QQ / scale-location / leverage diagnostics
sum(resid(fit)^2)   # residual sum of squares, identical to sum((fitted - actual)^2)
================================================
FILE: 01-IIM/21a5-SLM-women-A.R
================================================
# Simple Linear Regression : Built in Data Set Women
# Check for assumptions of Regression in the data Set
women
?women
str(women)
fit = lm(weight ~ height, data=women)
?plot
#Initial Checks
cor(women$height, women$weight)
#there is Strong and Positive Relationship between height and weight
plot(women$height, women$weight)
par(mfrow=c(1,1))
plot(fit, which=2) #normal QQ plot of residuals
plot(women)
str(women)
head(women)
women[,2]
fitted(fit)
cbind(women, fitted(fit), predicted=3.45 * women$height - 87)
cbind(women, fitted(fit), residue=fitted(fit)-women$weight, resid(fit))
#fitted equation in words (not runnable R): weight = 3.4 * height - 87
plot(residuals(fit))
hist(women$height, breaks=4)
hist(residuals(fit))
hist(residuals(fit), freq=F) #density scale so the density curve overlays
lines(density(residuals(fit)))
#Prediction
#NOTE: 52.5 is below range(women$height) - this is extrapolation
ndata = data.frame(height=52.5)
predict(fit, newdata=ndata, type='response')
#Assumptions Regression
#Linearity----
#Linearity of the data. The relationship between the predictor (x) and the outcome (y) is assumed to be linear.
# component + residual plot
plot(women$height, fitted(fit))
plot(residuals(fit) ~ fitted(fit))
plot(fit, which=1)
#residuals should be randomly distributed and not increase or decrease
#Normality----
#Normality of residuals. The residual errors are assumed to be normally distributed.
plot(density(resid(fit)))
plot(fit, which=2)
#Homoscedasticity----
#Homogeneity of residuals variance. The residuals are assumed to have a constant variance (homoscedasticity - opposite of heteroscedasticity)
plot(fit, which=3)
#No funnel shape, random distribution of residuals
plot(fit, which=4)
#Auto-Correlation----
library(car)
#Independence of residuals error terms. (Not dependent on previous values)
durbinWatsonTest(fit)
#decision rule: if p-value < 0.05, reject Ho of no autocorrelation
#(the original comment mis-stated this rule)
#Outliers
plot(fit, which=4)
summary(fit)
#potential outliers are highlighted 1, 14, 15 row
women[c(1,14,15),]
#Lets remove these values and then find R2
fit2 = lm(weight ~ height, data=women[-c(1,14,15),])
summary(fit2)
(summary(fit))$r.squared
(summary(fit2))$r.squared
AIC(fit, fit2) #lower value of AIC is better
#Potential Problems
#Non-linearity of the outcome - predictor relationships
#Heteroscedasticity: Non-constant variance of error terms.
#Auto Collinearity, Multi-collinearity
#Presence of influential values in the data that can be:
#Outliers: extreme values in the outcome (y) variable
#High-leverage points: extreme values in the predictors (x) variable
#All these assumptions and potential problems can be checked by producing some diagnostic plots visualizing the residual errors.
plot(fit) #was plot(fit1): fit1 is not defined in this script, the model is fit
================================================
FILE: 01-IIM/21b1-SLM-sales.R
================================================
#Multiple Linear Regression
#Linear Modeling : DV vs more than 1 IVs
#sales Qty vs price & promotion
#Omni Store
#creating data using Vector
sales= c(4141,3842,3056,3519,4226, 4630,3507,3754, 5000,5120,4011, 5015,1916,675, 3636,3224,2295, 2730,2618,4421, 4113,3746, 3532, 3825,1096, 761,2088,820,2114, 1882,2159,1602,3354,2927)
price = c(59,59,59,59,59,59,59,59,59,59,59,59, 79,79,79,79,79,79,79,79,79, 79,79,79,99,99, 99,99,99,99,99,99,99,99)
promotion= c(200,200,200,200,400,400,400,400, 600,600,600,600,200,200,200,200, 400,400,400,400,600,600,600,600, 200,200,200,200,400,400,400,400,600,600)
omni1 = data.frame(sales, price, promotion)
head(omni1)
str(omni1)
#2nd Method : CSV file (interactive file picker - commented out so the
#script runs non-interactively; omni2 is not used below)
#omni2 = read.csv(file.choose())
#3rd Method : gsheet
library(gsheet)
url = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=1595306231"
omni3 = as.data.frame(gsheet::gsheet2tbl(url))
#Make one of data frames active
omni = omni1
head(omni)
?lm #see help of LM
#Simple Linear Model would look like this
slr1 = lm(formula = sales ~ price, data=omni) # sales depend on price of item
slr2 = lm(formula = sales ~ promotion, data=omni) # sales depend on promotion exp
summary(slr1)
summary(slr2)
#MLR Create Multiple Linear Regression
# we want to see how Sales Qty depend on Price and Promotion Values
mlrmodel1 = lm(sales ~ price + promotion, omni)
#mlrmodel1 = lm(omni, sales ~ price + promotion)
?lm
#how to give parameter values in different sequence, use arguments names if in different order
mlrmodel1 = lm( data=omni, formula = sales ~ price + promotion)
range(omni$sales)
summary(mlrmodel1) # summary statistics IMP STEP
#understand values : R2, AdjR2, Fstats pvalue, Coeff, ***, Residuals
coef(mlrmodel1) #coefficients b1, b2
#anova(mlrmodel1) #seeing from anova model
head(omni)
plot(y=omni$sales, x=omni$promotion)
#Predicted Values----
dim(omni)
fitted(mlrmodel1)
cbind(omni, fitted(mlrmodel1), residuals(mlrmodel1))
summary(mlrmodel1)
names(omni)
#create a dataframe of new sample values
(ndata1 = data.frame(price=c(60,70), promotion=c(300,400)))
range(omni$price); range(omni$promotion)
predict(mlrmodel1, newdata=ndata1)
#argument is type= ; the original predict='response' was a typo that
#predict.lm silently ignored via its ... argument
cbind(ndata1, Predict=predict(mlrmodel1, newdata=ndata1, type='response'))
#R2 and Adjs R2
names(mlrmodel1)
summary(mlrmodel1)
summary(mlrmodel1)$r.squared
summary(mlrmodel1)$adj.r.squared
#Manual Calculation of Adjs R2
(r2 = summary(mlrmodel1)$r.squared)
k = 2 # no of IVs
(n = nrow(omni)) # sample size
(adjr2 = 1 - ( (1 - r2) * ((n - 1)/ (n - k - 1))))
# Fstatistics
summary(mlrmodel1)$fstatistic[1] # from output of model
(df1 = k) ; (df2 = n-k-1)
qf(.95, df1, df2) # from table wrt df1 & df2
#Model Fstats > table(Fstat)
# Pvalue of Model
fstat = summary(mlrmodel1)$fstatistic
pf(fstat[1], fstat[2], fstat[3], lower.tail=FALSE)
# this is < 0.05 : Significant
#
#Plots of the Model
plot(mlrmodel1,1) # no pattern, equal variance
plot(mlrmodel1,2) # Residues are normally distributed
plot(mlrmodel1,3)
plot(mlrmodel1,4) # tells outliers which affect model
# Confidence Intervals
#Fitted values : Predicting on IVs using model
fitted(mlrmodel1)
residuals(mlrmodel1)
mlrmodel1$residuals
cbind(omni$sales, fitted(mlrmodel1), omni$sales - fitted(mlrmodel1), residuals(mlrmodel1))
#sqrt(sum((residuals(mlrmodel1)^2)))
names(mlrmodel1)
summary(mlrmodel1)
#Diagnostics Test for Checking Assumptions
#Should be Linear relationship between Residuals Vs Ypi, X1i, X2i
cbind(fitted(mlrmodel1), residuals(mlrmodel1))
plot(cbind(fitted(mlrmodel1), residuals(mlrmodel1)))
#not quadratic
plot(cbind(omni$price, residuals(mlrmodel1)))
plot(cbind(omni$promotion, residuals(mlrmodel1)))
#May indicate quadratic term of IVs
#Train and Test Data
# RMSE
omni
names(omni)
mlr2 = lm(sales ~ price + promotion, data= omni)
summary(mlr2)
new1=data.frame(price=60:70, promotion=400) #promotion recycled to length 11
predict(mlr2, newdata = new1)
cbind(new1,predict(mlr2, newdata = new1) )
================================================
FILE: 01-IIM/21b3-SLM-areasales.R
================================================
# SLR Area vs Sales
#https://www.statisticshowto.datasciencecentral.com/excel-regression-analysis-output-explained/
#import from ggsheet
library(gsheet)
slr1 = "https://docs.google.com/spreadsheets/d/1qLHa5qFTyWacta8F-IGo6J3Zpf-BVR9OrlqONuJDqYc/edit#gid=2023826519"
df = as.data.frame(gsheet2tbl(slr1))
head(df)
#X-area in sqft, Y-sales in some unit currency
str(df)
linearmodel = lm(Y ~ X, data=df)
plot(df$X, df$Y)
abline(lm(Y ~ X, data=df), col='red')
cor(df$X, df$Y)
cov(df$X, df$Y)
summary(linearmodel)
#Ho: (F Test) : No relationship between Y and any X
#Ha: There is relationship between Y and at least one X
# p < 0.05 Reject Ho in favour of Ha
coef(linearmodel)
#Y = 0.96 + 1.66 * X
range(df$X) #value of X to be betw this range : interpolation not extrapolation
(new2 = data.frame(X=c(1.5,2,3,4,5)))
(p2sales= predict(linearmodel, newdata= new2))
cbind(new2, p2sales)
summary(linearmodel)
head(df)
#residual : worked example for X=2.2 (actual Y=5.6, fitted ~4.61)
(Y = 0.96 + 1.66 * 2.2)
(r = 5.6 - 4.61)
plot(df$X, df$Y)
abline(lm(Y ~ X, data=df), col='red')
abline(v=2.2,h=c(5.6, 4.61)) # mark the example point and its fitted value
cbind(resid(linearmodel))
fitted(linearmodel) - df$Y # negative of the residuals
#assumptions
plot(linearmodel)
#second case
#for women dataset; make a linear model
#predict weight for ht=62.5 and 66.5
head(women)
names(women)
range(women$height)
df1 = women
head(df1)
model1 = lm(weight ~ height, data = df1)
summary(model1)
#F-stat pvalue < 0.05 : Lin Model Exists
#Coeff : Both are significant
#if ht is increased by 1 unit, wt incr 3.45 units
#Multiple r2 : 99% of variation in Wt is due to Ht
#Assumptions : plot ; linearity, normality, equal variance, outliers, no autocorrelation, no multi-collinearity
#predict (named arguments, so order does not matter)
(ndata2 = data.frame(height=c(62.5, 66.5)))
(p2wt = predict( level=.95,interval= 'confidence', newdata= ndata2, object=model1))
cbind(ndata2, p2wt)
?predict
================================================
FILE: 01-IIM/21b4-SLM-salesarea.R
================================================
#Simple Linear Regression - Case Study
# Regression : Areas vs Sales
#Given data of area and sales, predict value for sales for specific areas eg : (1.5,2,3,4,5)
#manual way of doing regression see online URL
#Create/ Import Data-----
#Method1 : creating data from Vectors
#X -is area sqft Y-sales in 1000s units; Find relationship betn X & Y
X = c(1.7,1.6,2.8,5.6,1.3,2.2,1.3,1.1,3.2,1.5,5.2,4.6,5.8,3 )
Y = c(3.7,3.9,6.7,9.5,3.4,5.6,3.7,2.7,5.5,2.9,10.7,7.6,11.8,4.1 )
df1 = data.frame(X,Y)
head(df1)
#2nd method of importing data
#import from ggsheet #pickup the correct url
library(gsheet)
area1 = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=2023826519"
df2 = as.data.frame(gsheet::gsheet2tbl(area1))
str(df2)
head(df2)
#Third method of importing data from csv (requires ./data/slr1.csv to exist)
df3 = read.csv('./data/slr1.csv')
str(df3)
#Method4 for importing from CSV file with choose location
#(interactive file picker - commented out so the script runs non-interactively;
#df4 is not used below)
#df4 = read.csv(file.choose())
#str(df4)
# Use Vector Data or method used to import data
#make one of the DF active
df = df1
df
#simple stats
mean(df$X); mean(df$Y)
sum(df$X); sum(df$Y)
sd(df$X) ; var(df$Y)
cov(df$X,df$Y); cov(df$Y,df$X)
cor(df$X,df$Y) ; cor(df$Y,df$X)
#cor.test(df$X,df$Y)
#some plots to understand pattern
plot(df$X, df$Y) #simple command to plot : Next with features
plot(y=df$Y, x=df$X,xlab='Area in sqft', ylab='Sales Amount', type='p', ylim=c(0, max(df$Y)+1), main='Plot of Area Vs Sales', xlim=c(0,max(df$X)+ 1), col='red',pch=17)
?plot
#use the active df (was hard-coded to df1) so the line matches the data in use
abline(lm(Y ~ X,data=df), lty=1, lwd=2, col='green') # with regression line
abline(v=c(3,5, min(df$X), max(df$X)),h=c(6,10, min(df$Y), max(df$Y)), col=c('red','blue','green','yellow')) # few straight lines at x & y axis
range(df$X)
#Model
fit1 = lm(Y ~ X, data=df) # create Simple Linear Model Y wrt X
fit1
summary(fit1)
range(df$X)
(newdata = data.frame(X= c(2.5,3.2)))
predict(fit1, newdata = newdata)
#few extras
names(fit1) # attributes(fit1)
system.time(lm(Y ~ X, data=df)) #time taken to compute linear regression
coef(fit1) # Coefficients of Equation Y = mX + C
fitted(fit1) # predicted values for all X in original data
predict(fit1, newdata=data.frame(X))
residuals(fit1) # diff between actual and predicted values - residuals
(R= df$Y - fitted(fit1))
plot(density(residuals(fit1)))
#residuals should be less :Diff of Y actual - Y predicted
#abline(h=coef(fit1)[1])
summary(fit1) # summary statistics of Linear Model(LM)
#understand the model values - R2, AdjR2, FStats, Residuals, Coeff p values - IMP STEP
names(fit1) #output variables names of LM
names(summary(fit1))
#u can select them to see output
summary(fit1)$r.squared
coef(fit1)[2] # slope or beta
fitted(fit1)
#combine the data with Ypredicted, errors
cbind(df, fitted(fit1), fitted(fit1)- df$Y, residuals(fit1))
#Mathematical Equation and predictions
(Y = 0.9645 + 1.6699 * 4) # Predict Y for X=4
#predict for area = 4
#using equation
summary(fit1)
coef(fit1)
(Y = coef(fit1)[1] + coef(fit1)[2] * 4)
#using model
range(df$X)
predict(fit1, newdata= data.frame(X=c(4,5)))
(new1 = data.frame(X=c(4,5,3,2)))
predicted2= predict(fit1, newdata= new1)
cbind(new1, predicted2)
#prediction can only be interpolated not extrapolated
range(df$X) #min to max value of X: area
#select new data given in the case
(new1 = data.frame(X=c(1.5,2,3,4,5)))
#sample data for X for prediction, should be between the range of X values
predict(fit1, newdata= new1) # Predict Function for 5 values of X
#columnbind with input and predicted values
cbind(new1, predictedY = predict(fit1, newdata= new1) )
library(forecast) #install it first
accuracy(fit1) #accuracy measures of the model (was `?accuracy(fit1)`, an odd help call)
#RMSE is generally used and should be least when selecting models
anova(fit1)
summary(fit1)$r.squared # R2 from Model
summary(fit1)$sigma #Residual Std Error SD along the LM Line
#---------------------------------------#Assumptions--------
#Assumption : Graphical Analysis : IMP STEP
plot(fit1)
par(mfrow=c(2,2))
plot(fit1)
par(mfrow=c(1,1))
plot(fit1, which=1)
# Linearity plot of residuals & X # No pattern for assumption that there is linearity betw X & Y
plot(df)
abline(h=0)
plot(residuals(fit1))
#Auto Collinearity : relation between successive values of Y
car::durbinWatsonTest(fit1)
?car::durbinWatsonTest
#if p-value > 0.05 : Do not reject Ho. that means there is no autocorrelation
#Normality of residuals
resid(fit1)
#qqplot(fitted(fit1),resid(fit1) )
plot(fit1, which=2)
#points to be around the straight line
#Equal Variance : 4th Assumption : homoscedasticity
plot(fit1, which=1)
#no funnel shape to show hetero-scedasticity
#Outlier Analysis
plot(fit1, which=4)
#abline(h=c(.5))
#no value of cooks distance > .5 : no data to be removed
#outlier values can affect the model
#see all diagnostic plots together
par(mfrow=c(2,2))
#multiple frames per row : rowwise filling, 2 rows, 2 columns
plot(fit1)
par(mfrow=c(1,1))
#SUMMARY ------
summary(fit1)$r.squared # 90% variation in Y explained by X
summary(fit1)
#F Stats pvalue < 0.05: Model exists : At least 1 indep variable has strong relationship with Dependent variable (Y)
#pvalue of Coef (X) < 0.05 : Significant X
#End of Simple Linear Regression
#Do different SLR on different data sets
#Learn what to do if there are violations of assumptions
df # dataset being used for LM
fit= lm(Y ~ X, data=df) #model creation
summary(fit) #summary of linear model
plot(fit) #diagnostic plots
predict(fit, newdata=data.frame(X=mean(df$X)))
#Multiple R2 explains the variation, model fitness
================================================
FILE: 01-IIM/21d2-MLM-mtcars1.R
================================================
#https://cran.r-project.org/web/packages/olsrr/olsrr.pdf
#install.packages('olsrr')
library(olsrr) #install it first
model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
summary(model)
#model <- lm(mpg ~ hp + wt , data = mtcars)
#summary(model)
k <- ols_step_all_possible(model) #fit every subset of the 4 predictors
#plot(k)
k
summary(lm(mpg ~ wt, data=mtcars))
summary(lm(mpg ~ wt+ hp, data=mtcars))
summary(lm(mpg ~ hp + wt + qsec, data=mtcars))
#library(olsrr)
fit = lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
k = ols_step_all_possible(fit)
plot(k)
k
#train/test were never defined in this script; partition mtcars 70/30 first
set.seed(123) #reproducible split
index = sample(seq_len(nrow(mtcars)), size = floor(0.7 * nrow(mtcars)))
train = mtcars[index, ]
test = mtcars[-index, ]
summary(lm(mpg ~ wt, data= train))
summary(lm(mpg ~ wt + hp, data= train))
finalmodel = lm(mpg ~ wt + hp, data= train)
library(gvlma)
gvmodel = gvlma(finalmodel) #global validation of model assumptions
gvmodel
finalmodel = lm(mpg ~ wt + hp, data= train)
#argument is newdata= ; the original `ndata=` was silently ignored and
#predict() returned fitted values on the training data instead
(predictedvalues = predict(finalmodel, newdata=test))
cbind(test$mpg, predictedvalues) #actual vs predicted on held-out rows
================================================
FILE: 01-IIM/22a4-MLM-allmodels.R
================================================
# All models - This code performs all modeling in quick method.
# for details go detail code.
library(dplyr)
#Linear Regression
head(women)
head(mtcars)
#predict weight for new height
plot(women)
plot(women, ylim=c(0, 160), xlim=c(0,90))
fit1 = lm(weight ~ height, data=women)
summary(fit1)
range(women$height) # prediction heights should fall inside this range
(ndata1=data.frame(height=c(59.5, 62.5)))
(predicted1 = predict(fit1, newdata= ndata1))
cbind(ndata1, predicted1)
plot(fit1) #check for assumptions
par(mfrow=c(2,2))
plot(fit1)
par(mfrow=c(1,1))
#Logistic Regression
data2 = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
head(data2)
str(data2)
summary(data2)
data2$rank = factor(data2$rank)  # rank is categorical, not numeric
data2$admit = factor(data2$admit) # binary outcome
table(data2$rank, data2$admit) #2 way table
xtabs(~admit + rank, data = data2)
#create Logistic Model
fit2 <- glm(admit ~ gre + gpa + rank,data=data2,family="binomial")
summary(fit2)
(ndata2 = sample_n(data2, 3)) #pick up sample rows
#Predict admit for input data
(predicted2=predict(fit2,newdata=ndata2, type="response")) #probabilities
(predictedclass2=ifelse(predicted2 > .5, 1,0)) #classify at 0.5 threshold
cbind(ndata2, predicted2, predictedclass2)
#Data Partition
#see detailed case
#----------------------------------------------------
#Decision Tree - Classification
url3 = 'https://raw.githubusercontent.com/DUanalytics/datasets/master/csv/titanic_train.csv'
data3a = read.csv(url3)
head(data3a)
names(data3a)
data3 = data3a[,c(2,3,5,6,7)] #select few columns only
head(data3)
#install & load libraries
library(rpart)
library(rpart.plot)
#Decision Tree
fit3 = rpart(Survived ~ ., data = data3, method = 'class')
fit3
rpart.plot(fit3, extra = 106, cex=.8,nn=T) #plot
printcp(fit3) #select complexity parameter
prunetree3 = prune(fit3, cp=.014) #make tree smaller
rpart.plot(prunetree3, cex=.8,nn=T)
prunetree3
#column name is capitalised: $survived does not exist and returned NULL
table(data3$Survived)
#Predict class category or probabilities
(ndata3 = sample_n(data3,4))
predicted3 = predict(prunetree3, newdata=ndata3, type='class')
predicted3b= predict(prunetree3, newdata=ndata3, type='prob')
cbind(ndata3, predicted3, predicted3b)
#predict on test set
url3b ='https://raw.githubusercontent.com/DUanalytics/datasets/master/csv/titanic_test.csv' #test set
data3b = read.csv(url3b)
head(data3b)
names(data3b)
data3b = data3b[, c('Pclass','Sex','Age','SibSp')]
#select few columns only
head(data3b)
#similarly Regression Tree can be made
#Clustering----
#install.packages("amap")
##Read the data in the file
url4 = 'https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=2073914016'
library(gsheet)
data4 = as.data.frame(gsheet2tbl(url4))
head(data4)
summary(data4)
str(data4)
nrow(data4)
###Verify the data
colnames(data4)
apply(data4, 2, FUN= class) #are all numeric
fit4 = kmeans(data4[,-1],centers=3) #k-means, 3 clusters, first column excluded
fit4$centers # group means
fit4$size #how rows in which cluster
fit4$withinss #math difference within each cluster; Which is more cohesive gp?
fit4$cluster # cluster assignment per row
table(fit4$cluster)
cluster2 = data4[ fit4$cluster == 2,] # rows assigned to cluster 2
head(cluster2)
cluster2[-1] %>% summarise_all(mean) # per-column means of cluster 2
write.csv(cluster2, file = "./data/data4cluster2.csv")
## Association Rules - Groceries data set----
library(arules) #install first
library(arulesViz) #install first
library(datasets) # no need to install, just load it reqd for Groceries
data('Groceries')
Groceries
arules::LIST(Groceries[1:6]) #different format
#Find Frequent Itemset
frequentItems = eclat(Groceries,parameter=list(supp = 0.01,minlen=3, maxlen=5))
inspect(frequentItems[1:10])
frequentItems
inspect(sort(frequentItems,by="count",decreasing=TRUE)[1:25])
#Support is : support(A&B) = n(A&B)/ N
#Plot the Frequency Plot
itemFrequencyPlot(Groceries,topN = 15,type="absolute")
itemFrequencyPlot(Groceries, topN = 10, type='relative')
abline(h=0.15)
# Create rules and the relationship between items
#parameters are min filter conditions
rules = apriori(Groceries, parameter = list(supp = 0.001, conf = 0.5, minlen=2))
rules
inspect (rules[1:5])
#Sort Rules by confidence, lift and see the data
rulesc <- sort (rules, by="confidence", decreasing=TRUE)
inspect(rulesc[1:5])
#similary it can be done for lift and support
#which items have strong confidence and lift
#How To Control The Number Of Rules in Output ?
#maxlen, minlen, supp, conf
#subset -----
#legend to condition commands
# lhs - means left hand side, or antecendent
# rhs - mean right hand side, or consequent
# items - items, that make up itemsets
# %in% - matches any
# %ain% - matches all
# %pin% - matches partially
# default - no restrictions applied
# & - additional restrictions on lift, confidence etc.
#Find what factors influenced an event ‘X’ :
#Find out what events were influenced by a given event
subset1a = subset(rules, subset=rhs %in% "whole milk")
inspect(subset1a[1:10])
subset1b = subset(rules, subset=rhs %in% 'bottled beer' )
inspect(subset1b) #no such rule with beer on rhs, change some parameters to such rules
#inspect(rules)
#Items in : all or any
subset2a = subset(rules, subset=lhs %ain% c('baking powder','soda') )
inspect(subset2a) #all items in 1 rule
subset2b = subset(rules, subset=lhs %in% c('baking powder','soda') )
inspect(subset2b[1:5]) #any of the items in the rule
#rhs- beer, confidence , sort by lift
subset3a = subset(rules, subset=rhs %in% 'bottled beer' & confidence > .7, by = 'lift', decreasing = T)
inspect(subset3a)
subset4 = subset(rules, subset=lhs %in% 'bottled beer' & rhs %in% 'whole milk' ) # lhs- beer, rhs- milk
inspect(subset4)
#Visualizing The Rules -----
#(fixed: the object is subset1a; `subset1` was never defined above)
plot(subset1a[1:10])
plot(subset1a[1:10], measure=c("support", "lift"), shading="confidence")
#
#install.packages('wordcloud2')
library(wordcloud2)
df = data.frame(word=c('cbap','cmap','iim','imt','calcutta'),freq=c(20,23,15,10,13))
df
wordcloud2(df)
head(demoFreq) #sample word-frequency data shipped with wordcloud2
wordcloud2(demoFreq, size = 2, color = "random-light", backgroundColor = "grey")
names(demoFreq)
================================================
FILE: 01-IIM/22c2-MLM-mtcars-olsrr.R
================================================
#https://cran.r-project.org/web/packages/olsrr/olsrr.pdf
#install.packages('olsrr')
library(olsrr)
head(mtcars)
names(mtcars)
model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
k <- ols_step_all_possible(model) # fits every subset of the 4 predictors
plot(k)
k # compare R2 / adj-R2 / Cp across all candidate models
summary(lm(mpg ~ wt, data=mtcars))
summary(lm(mpg ~ wt + hp + qsec, data=mtcars))
================================================
FILE: 01-IIM/22c3-MLM-cars.R
================================================
#http://r-statistics.co/Linear-Regression.html
head(cars)
fit2=lm(dist ~ speed, data=cars)
summary(fit2)
predict(fit2, newdata=data.frame(speed=c(5,8)))
plot(fit2)
nrow(cars)
set.seed(100) #make the 80/20 train/test split reproducible
index = sample(1:nrow(cars), size=0.8 * nrow(cars))
index
length(index)
length(unique(index)) #no duplicates: sampled without replacement
traindata = cars[index,]
traindata
testdata = cars[-index,]
testdata
nrow(traindata) + nrow(testdata) #all rows accounted for
model1 = lm(dist ~ speed, data = traindata)
coef(model1)
(P1 = predict(model1, newdata = testdata))
df_test = cbind(testdata, P1)
cor(df_test$dist, P1) #correlation of actual vs predicted on test set
(error = testdata$dist - P1)
AIC(model1)
BIC(model1)
(MAPE_error = mean(abs(error)/testdata$dist)) #mean absolute percentage error
library(forecast)
accuracy(model1) #training-set accuracy measures (RMSE, MAE, MAPE, ...)
#accuracy() expects forecasts and actuals, not a vector of raw errors
accuracy(P1, testdata$dist) #test-set accuracy
#https://www.ritchieng.com/machine-learning-evaluate-linear-regression-model/
#https://www.guru99.com/r-decision-trees.html
#http://www.sthda.com/english/articles/35-statistical-machine-learning-essentials/141-cart-model-decision-tree-essentials/
================================================
FILE: 01-IIM/22c3-MLM-salespromotion.R
================================================
#Multiple Linear Regression : DV vs more than 1 IVs
#sales Qty vs price & promotion
#Predict Sales Qty from Price and Promotion of the Product
#Omni Store
#creating data using Vector
sales= c(4141,3842,3056,3519,4226, 4630,3507,3754, 5000,5120,4011, 5015,1916,675, 3636,3224,2295, 2730,2618,4421, 4113,3746, 3532, 3825,1096, 761,2088,820,2114, 1882,2159,1602,3354,2927)
price = c(59,59,59,59,59,59,59,59,59,59,59,59, 79,79,79,79,79,79,79,79,79, 79,79,79,99,99, 99,99,99,99,99,99,99,99)
promotion= c(200,200,200,200,400,400,400,400, 600,600,600,600,200,200,200,200, 400,400,400,400,600,600,600,600, 200,200,200,200,400,400,400,400,600,600)
#Create a DF from 3 variables
omni1 = data.frame(sales,price,promotion)
head(omni1)
#2nd Method : from CSV file
#omni2 = read.csv(file.choose())
#3rd Method : from gsheet
library(gsheet)
url = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=1595306231"
omni3 = as.data.frame(gsheet::gsheet2tbl(url))
head(omni3)
#Make one of data frames active
omni = omni1
head(omni)
str(omni)
nrow(omni)
dim(omni)
#MLR Create Multiple Linear Regression
# we want to see how Sales Qty depend on Price and Promotion Values
fit2 = lm(sales ~ price + promotion, data=omni)
# summary statistics of model IMP STEP
summary(fit2)
#understand values : R2, AdjR2, Fstats pvalue, Coeff, ***, Residuals
#F Stats pvalue = 2.86e-10 < 0.05 : Model Exists
#At least 1 IV can be used to predict sales
names(summary(fit2))
summary(fit2)$adj.r.squared # Adjt R2 here > .6
#>74% of variation in sales is explained by price and promotion
#coefficients b1, b2
coef(fit2)
summary(fit2)
#price : -53 , pvalue = 9.2e-09 < 0.05 *** : Significant
#keeping promotion constant, if price is increased by 1 unit, salesqty decreases by 53 units
#promotion : +3.6 , pvalue = 9.82e-06 < 0.05 ***: Significant
#keeping price constant, if promotion is increased by 1 unit, salesqty increases by 3 units
fitted(fit2)#predicted sales values for input/ actual price and promotion
omni$sales #actual sales
residuals(fit2) #diff between predicted and actual sales
summary(residuals(fit2))
summary(fit2)
#Predict SalesQty for new combination of Values----
#create a dataframe of new sample values
#check ranges first so we predict within the data (avoid extrapolation)
range(omni$price) ; range(omni$promotion)
(ndata2 = data.frame(price=c(60,70,72), promotion=c(300,400,350)))
(p2sales = predict(fit2, newdata=ndata2, type='response'))
cbind(ndata2, p2sales) # new inputs side by side with predicted sales
head(omni)
#Assumptions : standard lm diagnostic plots
par(mfrow=c(2,2))  # show all 4 diagnostic plots in one window
plot(fit2)
par(mfrow=c(1,1))  # back to one plot per window
plot(fit2)
plot(fit2,which=1) # no pattern, equal variance (residuals vs fitted)
plot(fit2,2) # Residuals are normally distributed (Q-Q plot)
plot(fit2,3) # No hetero-scedascity (scale-location)
plot(fit2,4) # tells outliers which affect model (Cook's distance)
omni[c(11,14,15),] # inspect the high-influence rows flagged above
fit3 = lm(sales ~ price + promotion, data=omni[-c(11,14,15),]) # refit without them
plot(fit3,4)
summary(fit2)
summary(fit3) # compare: did dropping influential rows change the fit?
#End of Multiple Linear Regression
#when variables are large, select only significant variables
#Model with higher R2 to be selected
#other measures of model selection : AIC, BIC, RMSE
#Dataset can be divided into train(70%) and test(30%) set to check the accuracy
#create model with t
#questions
fit2
summary(fit2)
head(omni)
# predict on the original rows: newdata columns MUST be named exactly as in the
# model formula. (fix: data.frame(omni$price, omni$promotion) created columns
# 'omni.price'/'omni.promotion', so predict() silently fell back to the global
# 'price'/'promotion' vectors instead of using newdata)
cbind(omni, predict(fit2, newdata = data.frame(price = omni$price, promotion = omni$promotion)))
cbind(omni, fitted(fit2)) # fitted() gives the same in-sample predictions directly
cbind(omni, fitted(fit2), omni$sales - fitted(fit2), residuals(fit2))
#divided data into parts
#training set =70%
#test set - 30%
#
head(women)  # built-in dataset used to demo sampling
nrow(women)
library(dplyr)
women %>% sample_frac(.7) -> train1 # random 70% of rows as the training set
================================================
FILE: 01-IIM/22d3-MLM-omni.R
================================================
#Multiple Linear Regression
#Linear Modeling : DV vs more than 1 IVs
#sales Qty vs price & promotion
#Omni Store
#creating data using Vector
sales= c(4141,3842,3056,3519,4226, 4630,3507,3754, 5000,5120,4011, 5015,1916,675, 3636,3224,2295, 2730,2618,4421, 4113,3746, 3532, 3825,1096, 761,2088,820,2114, 1882,2159,1602,3354,2927)
price = c(59,59,59,59,59,59,59,59,59,59,59,59, 79,79,79,79,79,79,79,79,79, 79,79,79,99,99, 99,99,99,99,99,99,99,99)
promotion= c(200,200,200,200,400,400,400,400, 600,600,600,600,200,200,200,200, 400,400,400,400,600,600,600,600, 200,200,200,200,400,400,400,400,600,600)
omni1 = data.frame(sales, price, promotion)
head(omni1)
str(omni1)
#Make one of data frames active
omni = omni1
head(omni)
options(scipen=999) # turn off scientific notation in printed output
model2 = lm(sales ~ price + promotion, data = omni)
summary(model2)
#F-stats : pvalue < .05 : LM exists : Y is related to one of the Xs
#Adj R2 > .6 : 74% of variation in sales is captured by price and promotion
#coeff : all are significant
#keeping price constant, if we increase promotion by 1 unit, sales increase by 3.6 units
range(omni$price)
dim(omni)
# fix: %>% and sample_n() below come from dplyr, which was never loaded
# in this script and would error at run time
library(dplyr)
sdata <- omni %>% sample_n(2) # two random rows to demo prediction
sdata
ndata3 = data.frame(price, promotion) # NOTE(review): built but never used below
cbind(sdata, predict(model2, newdata= sdata))
plot(model2)          # standard lm diagnostic plots
plot(model2, which=4) # Cook's distance - influential observations
omni[14,]             # inspect the flagged row
model2b = lm(sales ~ price + promotion, data = omni[-14,]) # refit without it
summary(model2b)
AIC(model2b) # lower AIC indicates the better of the two fits
AIC(model2)
================================================
FILE: 01-IIM/22d4-MLM-sales-TV.R
================================================
#Linear Model : Sales - TV, Radio, Newspaper
library(dplyr)
# Advertising dataset from the PSU STAT 508 course site (needs network access)
df = read.table("https://online.stat.psu.edu/onlinecourses/sites/stat508/files/lesson01/Advertising.data", header = T)
df
head(df)
cor(df) # pairwise correlations - which channels move with Sales?
cov(df)
plot(df$TV, df$Sales)
plot(df$Radio, df$Sales)
plot(df$Newspaper, df$Sales)
linModel = lm(Sales ~ TV + Radio + Newspaper, data=df)
summary(linModel)
#Newspaper is not significant
linModel2 = lm(Sales ~ TV + Radio, data=df) # reduced model without Newspaper
summary(linModel2)
#all are significant; Adjust R2 > .89
anova(linModel, linModel2) # nested-model F test: is the extra term worth it?
AIC(linModel, linModel2)   # lower AIC -> preferred model
s1 <- sample_n(df,2)       # two random rows to demo prediction
predict(linModel2, newdata=s1, type='response')
df
================================================
FILE: 01-IIM/23c1-LOGR-logR.R
================================================
# Logistic Regression
#data() # datasets available for use in R
# Load the textbook R package
library(ISLR) #install the package
?Default # data set
# Load in the credit data
data("Default")
# See the properties
str(Default)
class(Default)
head(Default)
names(Default)
dim(Default)
summary(Default)
# How many people actual default?
(tmp = table(Default$default))
# NOTE(review): 333/10000 is hard-coded from the table above; would break if data changed
paste(333/10000,' people default in payment of credit card')
#How many student default
?table
table(Default$default, Default$student, dnn=c("default","students"))
ftable(Default$default, Default$student) # flat contingency table
t1= table(Default$default, Default$student, dnn=c("default","students"))
addmargins(t1) # add row/column totals
table(Default$student)
?glm
#Multiple Logistic Regression
logit1 = glm(default ~ income + balance + student, family='binomial', data=Default)
summary(logit1)
exp(coef(logit1)) # odds ratios (coefficients are on the log-odds scale)
#No Rs
#income is not significant - remove it
logit2 = glm(default ~ balance + student, family='binomial', data=Default)
summary(logit2)
coef(logit2)
exp(coef(logit2))
anova(logit2, logit1) # compare nested models
AIC(logit2, logit1)   # lower AIC -> preferred model
#if pvalue < 0.05, logit1 is better model otherwise logit2
#use logit2
# Predict : use sample values
head(Default)
seq(1, 10000,500) # every 500th row index
Default[c(1,501),]
ndata1= Default[seq(1, 10000,500),] # systematic sample of 20 rows
ndata1
nrow(Default[seq(1, 10000,500),])
10000/500
library(dplyr)
(ndata = (slice(Default, seq(1,n(),500)))) # same systematic sample via dplyr
ndata
slice(Default, seq(1,n(),1000)) # another way
head(ndata)
addmargins(prop.table(table(Default$default,Default$student)))
0.2817/0.9667; 0.0127/0.0333
options(digits=10)
# predicted default probabilities for the sampled rows
fitted.results = predict(logit2, newdata=ndata,type='response')
fitted.results = round(fitted.results,4)
head(fitted.results)
fitted.results
cbind(ndata, fitted.results)
ndata
# classify with the conventional 0.5 probability cutoff
p2 <- ndata %>% mutate(predictnew = ifelse(fitted.results < 0.5, 'No','Yes'))
cbind(p2,fitted.results)
fitted.results
ifelse(fitted.results < 0.05, 0,1)
# predict at the mean balance/income for a student vs a non-student
(ndata2 = data.frame(student=c('Yes','No'), balance=mean(Default$balance), income=mean(Default$income)))
(fitted.results2 <- predict(logit2, newdata=ndata2,type='response'))
cbind(ndata2, fitted.results2)
Default
xtabs( ~ default + student, data=Default)
table(Default$default, Default$student)
ftable(Default$default, Default$student)
#Accuracy of Model
library(caret)
set.seed(3456)
str(Default)
# stratified 67/33 split on the outcome
trainIndex <- createDataPartition(Default$default, p = .67, list = FALSE, times = 1)
Train <- Default[ trainIndex,]
Test <- Default[-trainIndex,]
head(Train)
head(Test)
# Logistic Regression Model
# fix: fit on the TRAINING set only. Fitting on the full Default data leaks the
# test rows into the model and inflates the accuracy estimate.
model = glm(default ~ student, data=Train, family='binomial')
Test$model_prob <- predict(model, Test, type = "response")
head(Test)
Test <- Test %>% mutate(default_pred = ifelse(model_prob > .5,'Yes','No'))
head(Test)
# accurate = 1 when the predicted class matches the actual class
Test <- Test %>% mutate(accurate = 1*(default == default_pred))
sum(Test$accurate)/nrow(Test) # proportion correct on held-out data
#96% Accuracy
================================================
FILE: 01-IIM/23d1-LGR-gre.R
================================================
#Logistic Regresion : GRE
#https://stats.idre.ucla.edu/r/dae/logit-regression/
#A researcher is interested in how variables, such as GRE (Graduate Record Exam scores), GPA (grade point average) and prestige of the undergraduate institution, effect admission into graduate school. The response variable, admit/don't admit, is a binary variable
inputData <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
## view the first few rows of the data
head(inputData)
inputData
dim(inputData)
summary(inputData)
sapply(inputData, sd) # sd(inputData$gre)
str(inputData)
data= inputData # make a copy for futher analysis
# rank and admit are categorical; convert before modelling
data$rank = factor(data$rank)
data$admit = factor(data$admit)
summary(data)
## 2way contingency table of cat outcome and predictors we want
## to make sure there are not 0 cells
xtabs(~ admit + rank, data = data)
xtabs(~ cyl + gear + am + vs , data = mtcars) # xtabs demo on a built-in dataset
head(data)
#create Logistic Model
mylogit <- glm(admit ~ gre + gpa + rank, data = data, family = "binomial")
summary(mylogit)
#gre,gpa, rank are statistically significant,
#For every one unit change in gre, the log odds of admission (versus non-admission) increases by 0.002.
#For a one unit increase in gpa, the log odds of being admitted to graduate school increases by 0.804.
#The indicator variables for rank have a slightly different interpretation. For example, having attended an undergraduate institution with rank of 2, versus an institution with a rank of 1, changes the log odds of admission by -0.675.
## odds ratios only
exp(coef(mylogit))
range(data$gre); range(data$gpa)
head(data)
data[data$admit==1,]
# predict admission probability for two hand-picked applicant profiles
(ndata4 = data.frame(gre=c(379,520), gpa=c(3.71,4), rank=factor(c(4,1))))
(p3 =predict(mylogit, newdata=ndata4, type="response"))
p3
cbind(ndata4, p3)
(p3b=ifelse(p3 < .5, 0, 1)) # classify at the 0.5 cutoff
cbind(ndata4, p3, p3b)
#divide data into 2 parts : train 70% & test 30%
# train
# NOTE(review): no set.seed() before sample(), so the split (and the hard-coded
# accuracy numbers below) will differ on every run
nrow(data) * .7
index = sample(nrow(data), size=nrow(data)* .7, replace=F)
length(index)
train = data[index, ]
dim(train)
# test
test = data[-index, ]
dim(test)
# create model using train set data
mylogit2 = glm(admit ~ gre + gpa + rank, data = train, family = "binomial")
summary(mylogit2)
# predict on test set data
(ptest = predict(mylogit2, newdata = test, type='response'))
# prob -> 0, 1 using if else statements
AIC(mylogit); AIC(mylogit2) #comparing models
(ptestb=ifelse(ptest < .5, 0, 1))
# test$admit compare with predicted admit values
compare = cbind(test, ptestb)
head(compare)
# confusion matrix
(cm=table(compare$admit, compare$ptestb))
(73 + 8) / (73 + 6 + 33 + 8) #Accuracy of model (counts hard-coded from one run)
library(caret)
confusionMatrix(cm)
#---------------------------------
#Predict admit for input data (in-sample predicted probabilities)
prob=predict(mylogit,type=c("response"))
cbind(data, prob)
#cutoff value
# NOTE(review): InformationValue::confusionMatrix masks caret::confusionMatrix
# from here on - the threshold= calls below use the InformationValue version
library(InformationValue)
(optCutOff <- optimalCutoff(data$admit, prob)[1] ) #.46
confusionMatrix(data$admit, prob, threshold = optCutOff)
(accuracy = (247+38)/ (sum(247+38+89+26))) # .715
confusionMatrix(data$admit, prob, threshold = .7)
(accuracy = (272+2)/ (sum(272+2+125+1))) #.685
confusionMatrix(data$admit, prob, threshold = .2)
## view data frame
library(dplyr)
sample_n(data,size=1)
(newdata1 = data.frame(gre=450, gpa=3.7, rank=factor(3) ))
(newdata1$admitPredicted <- predict(mylogit, newdata = newdata1, type = "response"))
(newdata1$admitClass = ifelse(newdata1$admitPredicted > .46,1,0))
newdata1 #b=not admitted to institute
#End of Logistic Regression
#also check for assumptions of residues, VIF, Multi-collinearity
#Parition the data into train and test
library(caret)
# stratified 70/30 split that preserves the admit class proportions
Index <- createDataPartition(y=data$admit, p=0.70, list=FALSE)
head(Index)
nrow(data)
trainData = data[Index ,]
testData = data[-Index, ]
table(data$admit); prop.table(table(data$admit))
summary(trainData$admit); summary(testData$admit)
nrow(trainData) ; nrow(testData); nrow(trainData) + nrow(testData)
prop.table(table(trainData$admit))
prop.table(table(testData$admit))
#same proportion of admit in test and train
str(testData)
#now construct a model with train and then test on testdata
================================================
FILE: 01-IIM/24b1-LOGR-purchase.R
================================================
# Logistic Regression : Predict Purchase
# Import the dataset
#df1 = read.csv('./data/logr2.csv')
#head(df1)
url="https://docs.google.com/spreadsheets/d/1Md_ro2t3M7nA9JMH1DsE12jfeX7qq-UPw6p8WQd6A2Y/edit#gid=120271978"
library(gsheet)
df2 = as.data.frame(gsheet2tbl(url)) # needs network access
head(df2)
dim(df2)
dataset=df2 #or df2 if data is imported from google sheets
head(dataset)
str(dataset)
summary(dataset)
dim(dataset)
View(dataset)
dataset$gender = factor(dataset$gender) # gender is categorical
summary(dataset)
# Split the dataset into the Training set and Test set
#install.packages('caTools')
library(caTools)
set.seed(2000) # reproducible split
# stratified split: 75% train, 25% test, preserving the purchased proportions
split = sample.split(dataset$purchased, SplitRatio = 0.75)
split
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
dim(dataset); dim(training_set); dim(test_set)
names(dataset)
prop.table(table(training_set$purchased))
prop.table(table(test_set$purchased))
#------------------------------------
# Logisitic Model on Training Set
logitmodel1 = glm(purchased ~ gender + age + salary, family = binomial, data = training_set)
summary(logitmodel1)
#AIC - 209.13
# gender not significant; dropped in the next model
logitmodel2 = glm(purchased ~ age + salary, family = binomial, data = training_set)
summary(logitmodel2)
#AIC - 209.95
#summary(logitmodel2)$coefficient # they are in log terms
head(training_set)
#predict on sample data
test_set2 = data.frame(age=c(40,65), gender=c('Male', 'Female'), salary=c(40000, 50000))
test_set2
(prob_pred2 = predict(logitmodel2, type = 'response', newdata = test_set2))
cbind(test_set2, prob_pred2)
#age=65 person likely to purchase
# Predicting the Test set results from testset
head(test_set)
prob_pred = predict(logitmodel2, type = 'response', newdata = test_set)
summary(prob_pred)
head(cbind(test_set,prob_pred ),10)
#if prob > 0.5 make it 1, else 0
y_pred = ifelse(prob_pred > 0.5, 1, 0)
head(cbind(test_set$purchased, y_pred),100)
# Making the Confusion Matrix
# NOTE(review): column 5 is assumed to be 'purchased' - verify against the sheet
cm = table(test_set[,5], y_pred)
cm
library(caret)
caret::confusionMatrix(cm)
names(dataset)
#predict binary classification
#84% accuracy in predict whether customer will purchase the product on the basis of IV variables - age & salary
coef(logitmodel1)
coef(logitmodel2)
================================================
FILE: 01-IIM/24c2-LOGR-adult.R
================================================
#Logistic Regression : Binary Cls : 0 or 1
#Case Study : predict if an individual will earn more than $50K using logistic regression based on demographic variables available in the adult data.
#Steps
# Import the data
# Check for class bias
# Create training and test samples
# Compute information value to find out important variables
# Build logit models and predict on test data
# Do model diagnostics
#Data Import ----
#from URL (needs network access)
inputData <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
head(inputData)
str(inputData)
names(inputData)
# drop columns not used in the model
removeColumns = c('FNLWGT','EDUCATION')
data= inputData[,-which(names(inputData) %in% removeColumns)]
names(data)
data$ABOVE50K = factor(data$ABOVE50K) # outcome as factor for classification
#train and test sets
library(caret)
# stratified 70/30 split preserving the ABOVE50K class proportions
# NOTE(review): no set.seed() here, so the split differs on every run
Index <- createDataPartition(y=data$ABOVE50K, p=0.70, list=FALSE)
head(Index)
nrow(data)
trainData = data[Index ,]
testData = data[-Index, ]
table(data$ABOVE50K); prop.table(table(data$ABOVE50K))
summary(trainData$ABOVE50K); summary(testData$ABOVE50K)
nrow(trainData) ; nrow(testData); nrow(trainData) + nrow(testData)
prop.table(table(trainData$ABOVE50K))
prop.table(table(testData$ABOVE50K))
str(testData)
#Logistic Regression on selected columns
names(data)
logitMod <- glm(ABOVE50K ~ RELATIONSHIP + AGE + CAPITALGAIN + OCCUPATION , data=trainData, family='binomial')
summary(logitMod)
AIC(logitMod)
#Check the probabilities predicted for test data - 2 methods
predicted <- plogis(predict(logitMod, testData)) # inverse-logit of the link-scale prediction
head(predicted)
predicted <- predict(logitMod, testData, type="response") # same result, direct
head(predicted)
#what should be the cutoff value between 0 and 1 to categorise them into 0 or 1, so that accuracy is high (correct splitting)
#cutoff value
library(InformationValue)
(optCutOff <- optimalCutoff(testData$ABOVE50K, predicted)[1] )
#Confusion Matrix (InformationValue version, probability threshold based)
(cm1= confusionMatrix(testData$ABOVE50K, predicted, threshold = optCutOff))
#0 classified as 0, 1 classified as 1
#diagnostics
car::vif(logitMod) # multi-collinearity check
#all X variables in the model to have VIF below 4.
misClassError(testData$ABOVE50K, predicted, threshold = optCutOff)
#Misclassification error is the percentage mismatch of predcited vs actuals, irrespective of 1's or 0's. The lower the misclassification error, the better is your model.
# ROC  (fix: this was a bare 'ROC' token, which errors as an undefined object;
# it was clearly intended as the section header for the plot below)
#Receiver Operating Characteristics Curve traces the percentage of true positives accurately predicted by a given logit model as the prediction probability cutoff is lowered from 1 to 0. For a good model, as the cutoff is lowered, it should mark more of actual 1's as positives and lesser of actual 0's as 1's. So for a good model, the curve should rise steeply, indicating that the TPR (Y-Axis) increases faster than the FPR (X-Axis) as the cutoff score decreases. Greater the area under the ROC curve, better the predictive ability of the model.
plotROC(testData$ABOVE50K, predicted)
#http://r-statistics.co/Logistic-Regression-With-R.html
#https://rpubs.com/H_Zhu/235617
================================================
FILE: 01-IIM/24d2-LOGR-gre.R
================================================
#Logistic Regresion : GRE
#https://stats.idre.ucla.edu/r/dae/logit-regression/
#A researcher is interested in how variables, such as GRE (Graduate Record Exam scores), GPA (grade point average) and prestige of the undergraduate institution, effect admission into graduate school. The response variable, admit/don't admit, is a binary variable
inputData <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
## view the first few rows of the data
head(inputData)
dim(inputData)
inputData
summary(inputData)
#sd of all columns - first the manual ways, then the apply-family shortcut
sd(inputData$admit)
sd(inputData[,1])
sd(inputData[,2])
sd(inputData[,3])
sd(inputData[,4])
for (i in 1:4) { print(sd(inputData[,i]))}
#use apply functions
sapply(inputData, sd) # simplified (vector) result
lapply(inputData, sd) # list result
quantile(inputData$gre)
quantile(inputData$gre, c(.1,.2, .3, .7,.8,1))
quantile(inputData$gre, seq(0,1,.1))
quantile(inputData$gre, seq(0,1,.01))
lapply(inputData, quantile)
sapply(inputData, quantile)
?sapply
str(inputData)
data= inputData # make a copy for futher analysis
# rank and admit are categorical; convert before modelling
data$rank = as.factor(data$rank)
data$admit = factor(data$admit)
str(data)
## 2way contingency table of cat outcome and predictors we want
## to make sure there are not 0 cells
table(data$rank, data$admit)
#which rank of institute are more successful (in nos/ %) in getting admitted - 2 /1
(t1= xtabs(~ admit + rank, data = data))
addmargins(t1)
prop.table(t1,2) # column proportions: admit rate within each rank
?prop.table
#xtabs(~ gear + cyl + am , data=mtcars)
#create Logistic Model
head(data)
mylogit = glm(admit ~ gre + gpa + rank, data = data, family = "binomial")
summary(mylogit)
#gre,gpa, rank are statistically significant, Star in each variable rows
#For every one unit change in gre, the log odds of admission (versus non-admission) increases by 0.002.
#For a one unit increase in gpa, the log odds of being admitted to graduate school increases by 0.804. (keeping others constants)
#The indicator variables for rank have a slightly different interpretation. For example, having attended an undergraduate institution with rank of 2, versus an institution with a rank of 1, changes the log odds of admission by -0.675.
## odds ratios only
exp(coef(mylogit))
library(dplyr)
set.seed(1234) # reproducible sample
(ndata1 = sample_n(data, 3))
(p1=predict(mylogit,newdata=ndata1, type=c("response")))
#Predict admit for input data
(ndata2 = data.frame(gre=c(600, 700), gpa=c(2,3), rank=factor(c(1,1))))
(p2= predict(mylogit,newdata=ndata2, type=c("response")))
cbind(ndata1, p1, predict=ifelse(p1 < .5, 0, 1)) # classify at the 0.5 cutoff
cbind(ndata2, p2, predict=ifelse(p2 < .5, 0, 1))
#predict for original values (in-sample)
(p3 = predict(mylogit, newdata=data, type='response'))
nrow(data)
(predicted3 = factor(ifelse(p3 < .5, 0, 1)))
(actual = data$admit)
df = data.frame(predicted3, actual)
#cutoff value
head(df)
df %>% mutate(same = predicted3 == actual) # row-wise hit/miss flag
caret::confusionMatrix(df$predicted3, df$actual)
(254+30)/(19+97+254+30) # accuracy (counts hard-coded from one run)
?confusionMatrix
## view data frame
library(dplyr)
sample_n(data,size=1)
(newdata1 = data.frame(gre=450, gpa=3.7, rank=factor(3) ))
(newdata1$admitPredicted <- predict(mylogit, newdata = newdata1, type = "response"))
(newdata1$admitClass = ifelse(newdata1$admitPredicted > .46,1,0))
newdata1 #b=not admitted to institute
#End of Logistic Regression
#also check for assumptions of residues, VIF, Multi-collinearity
#Parition the data into train and test
library(caret)
# stratified 70/30 split preserving the admit class proportions
Index <- createDataPartition(y=data$admit, p=0.70, list=FALSE)
head(Index) #rownumbers for T
Index ; length(Index)
length(Index)/ nrow(data)
nrow(data)
trainData = data[Index , ]
nrow(trainData)
testData = data[-Index, ]
nrow(testData)
nrow(trainData) ; nrow(testData); nrow(trainData) + nrow(testData)
table(data$admit); prop.table(table(data$admit)) #original table
prop.table(table(trainData$admit))
prop.table(table(testData$admit))
summary(trainData$admit); summary(testData$admit)
#proportion of Admit (0 and 1) are same/similar in train and test set
#this is partition of data
#trainData & testData
log1 = glm(admit ~ gre + gpa + rank, data=trainData, family='binomial')
summary(log1)
car::vif(log1) # multi-collinearity check
#we will use all variables
#predict on test set
(testData$predictNew = predict(log1, newdata = testData, type='response'))
library(InformationValue) #finds optimal cutoff values
# NOTE(review): InformationValue masks caret's confusionMatrix; the caret::
# prefix below is what keeps the right version in use
(optCutOff <- optimalCutoff(testData$admit, testData$predictNew)[1] )
str(testData)
#find class with optimal & .5 cutoff value
testData2 = cbind(testData, predictClass1 = factor(ifelse(testData$predictNew < optCutOff, 0, 1)) , predictClass2 = factor(ifelse(testData$predictNew < .5, 0, 1)))
caret::confusionMatrix(testData2$admit, testData2$predictClass1) #better
caret::confusionMatrix(testData2$admit, testData2$predictClass2)
head(testData2)
misClassError(testData2$admit, testData2$predictNew, threshold = optCutOff)
plotROC(testData2$admit, testData2$predictNew)  # ROC curve / AUC
Concordance(testData2$admit, testData2$predictNew)
#now construct a model with train and then test on testdata
#http://r-statistics.co/Logistic-Regression-With-R.html
#logistic Regression - cutoff selection on the full data
library(InformationValue)
# fix: 'prob' was never defined in this script (this section was copied from a
# sibling file); compute in-sample predicted probabilities first
prob = predict(mylogit, newdata = data, type = 'response')
(optCutOff <- optimalCutoff(data$admit, prob)[1] ) #.46
confusionMatrix(data$admit, prob, threshold = optCutOff)
(accuracy = (247+38)/ (sum(247+38+89+26))) # .715
confusionMatrix(data$admit, prob, threshold = .7)
(accuracy = (272+2)/ (sum(272+2+125+1))) #.685
confusionMatrix(data$admit, prob, threshold = .2)
# fix: removed 'print(summary(am.data))' - 'am.data' does not exist in this
# script and the line always errored
#end
================================================
FILE: 01-IIM/24e2-LOGR-general.R
================================================
#generic LogRegression template - edit the data path and column names, then run
#load data
# fix: 'read.csv(....)' was not runnable; point this at your CSV file
data <- read.csv("data.csv")  # <- replace with your data file
#create training and validation data from given data
#install.packages('caTools')  # install once, then just load the library
library(caTools)
set.seed(88) # reproducible split
# stratified split preserving the class proportions
split <- sample.split(data$class, SplitRatio = 0.75)
#get training and test data
train <- subset(data, split == TRUE)
test <- subset(data, split == FALSE)
#logistic regression model (remove ID coln if any)
model <- glm (class ~ ., data = train, family = binomial)
summary(model)
predict <- predict(model, type = 'response') # in-sample probabilities
#confusion matrix at the 0.5 cutoff
table(train$class, predict > 0.5)
#ROCR Curve
library(ROCR)
ROCRpred <- prediction(predict, train$class)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7))
#plot glm with any xvar and class
library(ggplot2)
# fix: stat_smooth() no longer accepts family= directly (removed in ggplot2 2.0);
# pass it through method.args instead
ggplot(train, aes(x=xvar, y=class)) + geom_point() +
  stat_smooth(method="glm", method.args=list(family="binomial"), se=FALSE)
================================================
FILE: 01-IIM/24g1-LOGR-cancer.R
================================================
#logistic Regression
#https://www.machinelearningplus.com/machine-learning/logistic-regression-tutorial-examples-r/
# Load data
# install.packages('mlbench')
library(mlbench)
data(BreastCancer, package="mlbench")
bc <- BreastCancer[complete.cases(BreastCancer), ] # keep complete rows
# remove id column
bc <- bc[,-1]
# convert to numeric (the predictors load as factors)
for(i in 1:9) {
bc[, i] <- as.numeric(as.character(bc[, i]))
}
# Change Y values to 1's and 0's (1 = malignant)
bc$Class <- ifelse(bc$Class == "malignant", 1, 0)
bc$Class <- factor(bc$Class, levels = c(0, 1))
# Prep Training and Test data.
library(caret)
'%ni%' <- Negate('%in%') # define 'not in' func
options(scipen=999) # prevents printing scientific notations.
set.seed(100) # reproducible split
trainDataIndex <- createDataPartition(bc$Class, p=0.7, list = F)
trainData <- bc[trainDataIndex, ]
testData <- bc[-trainDataIndex, ]
# Class distribution of train data
table(trainData$Class)
table(testData$Class)
prop.table(table(trainData$Class))
prop.table(table(testData$Class))
# Down Sample: balance classes by shrinking the majority class
set.seed(100)
down_train <- downSample(x = trainData[, colnames(trainData) %ni% "Class"], y = trainData$Class)
table(down_train$Class)
?downSample
# Up Sample (optional): balance classes by replicating the minority class
set.seed(100)
up_train <- upSample(x = trainData[, colnames(trainData) %ni% "Class"],
y = trainData$Class)
table(up_train$Class)
# Build Logistic Model on the balanced (down-sampled) training data
logitmod <- glm(Class ~ Cl.thickness + Cell.size + Cell.shape, family = "binomial", data=down_train)
summary(logitmod)
pred <- predict(logitmod, newdata = testData, type = "response")
pred
# Recode factors: classify at the 0.5 probability cutoff
y_pred_num <- ifelse(pred > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
y_act <- testData$Class
# Accuracy
mean(y_pred == y_act) # 94%
================================================
FILE: 01-IIM/24g2-LOGR-sample1.R
================================================
#generic LogRegression - demo on the built-in mtcars data
#load data
# fix: removed 'data <- read.csv(....)' - the '....' placeholder always errored
# and the result was immediately overwritten by mtcars below anyway
data = mtcars
#create training and validation data from given data
#install.packages('caTools')  # install once, then just load the library
library(caTools)
set.seed(88) # reproducible split
#which category to be predicted: transmission 0/1
data$am = factor(data$am)
table(data$am)
# stratified split preserving the am proportions
split <- sample.split(data$am, SplitRatio = 0.75)
split
#get training and test data
train <- subset(data, split == TRUE)
test <- subset(data, split == FALSE)
table(train$am); table(test$am)
#logistic regression model (remove ID coln if any)
# NOTE(review): am ~ . on 24 rows will likely warn about perfect separation
model <- glm (am ~ ., data = train, family = binomial)
summary(model)
predict <- predict(model, type = 'response') # in-sample probabilities
#confusion matrix at the 0.5 cutoff
table(train$am, predict > 0.5)
#ROCR Curve
library(ROCR)
ROCRpred <- prediction(predict, train$am)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7))
#plot glm with any xvar and class
library(ggplot2)
# fix: stat_smooth() no longer accepts family= directly (removed in ggplot2 2.0);
# pass it through method.args instead
ggplot(train, aes(x=mpg, y=am)) + geom_point() + stat_smooth(method="glm", method.args=list(family="binomial"), se=FALSE)
================================================
FILE: 01-IIM/31a1-DT-cart-split.R
================================================
#Understanding Splitting and selection of variables
#install the libraries
pacman::p_load(rpart, rpart.plot)
#library(rpart); library(rpart.plot)
#---
(gender = c(rep('M', 60), rep('F', 40)))
(play = c(rep(c('Yes','No'), c(30,30)), rep(c('Yes','No'), c(20,20))))
students = data.frame(gender, play)
head(students)
(t1=table(students$gender, students$play))
prop.table(t1)
addmargins(t1) # 50% of each category play and don't play
dtree1 = rpart(play ~ gender, data=students, control = list(cp=-1, minsplit=5))
dtree1
rpart.plot(dtree1)
#no decision tree as proportion is 50%
#----------------------
(gender2 = c(rep('M', 60), rep('F', 40)))
(play2 = c(rep(c('Yes','No'), c(40,20)), rep(c('Yes','No'), c(20,20))))
students2 = data.frame(gender2, play2)
head(students2)
(t2=table(students2$gender2, students2$play2))
addmargins(t2)
prop.table(t2) #more % of males play
prop.table(t1)
addmargins(prop.table(t2))
#how many play - .6, 60% from 100% values
dtree2 = rpart(play2 ~ gender2, data=students2, control = list(cp=-1, minsplit=5))
dtree2
rpart.plot(dtree2, extra=104, nn=T)
rpart.plot(dtree2)
table(students2$play2)
rpart.plot(dtree2, extra=104, cex=1)
rpart.plot(dtree2, extra=104, yesno=2, left=F, xflip=T, yflip=T,faclen=3, cex=1.5)
predict(dtree2, newdata = data.frame(gender2='M'), type='class')
predict(dtree2, newdata = data.frame(gender2='M'), type='prob')
#explore below document
#https://cran.r-project.org/web/packages/rpart.plot/rpart.plot.pdf
prop.table(t2,1) #40/60
#majority play (at root node)
#Eg3----------------------
# two candidate split variables: gender and marital status
(gender3 = c(rep('M', 60), rep('F', 40)))
(play3 = c(rep(c('Yes','No'), c(40,20)), rep(c('Yes','No'), c(20,20))))
(married3 = c(rep(c('Md','Sg'), c(50,10)), rep(c('Md','Sg'), c(10,30))))
students3 = data.frame(gender3, play3, married3)
head(students3)
(t3a= table(students3$play3, students3$gender3))
(t3b= table(students3$play3, students3$married3))
addmargins(t3a)
addmargins(t3b)
dtree3 = rpart(play3 ~ gender3 + married3, data=students3, control = list(cp=-1, minsplit=5))
dtree3
rpart.plot(dtree3)
addmargins(prop.table(table(students3$play3, students3$married3)))
#decision tree as proportion of M is not 50%
table(students3$play3)
#majority play (at root node)
#rpart(y~x+z, data=df, parms=list(split='gini'))
#Variable with lower Gini Index value, should be chosen as a variable that gives best split. The next step would be to take the results from the split and further partition.
table(students3$play3)
table(students3$married3)
#married Variable : Gini Index
table(students3$play3, students3$married3)
(gini_md_sg = 1 - (30/40)^2 - (10/40)^2)  # impurity of the Single node (n=40)
(gini_md_md = 1 - (10/60)^2 - (50/60)^2)  # impurity of the Married node (n=60)
# fix: the Gini index of a split is the child impurities WEIGHTED by node size
# (n_child / n_total), not their plain sum
(gini_md = (40/100)*gini_md_sg + (60/100)*gini_md_md)
#gender Variable : Gini Index
table(students3$play3, students3$gender3)
(gini_gd_m = 1 - (20/60)^2 - (40/60)^2)   # impurity of the Male node (n=60)
(gini_gd_f = 1 - (20/40)^2 - (20/40)^2)   # impurity of the Female node (n=40)
(gini_gd = (60/100)*gini_gd_m + (40/100)*gini_gd_f)
gini_md < gini_gd
#gini_md is lower hence should be selected as split variable
#After Splitting : look at left tree ie. Single students
#singles = split(students3, married3="Sg")
================================================
FILE: 01-IIM/31a2-DT-outlook.R
================================================
#Decision Tree - Book Eg : Weather - Predict Play (classic Play-Tennis data)
outlook =c('Sunny', 'Sunny', 'Overcast', 'Rain','Rain', 'Rain','Overcast', 'Sunny', 'Sunny','Rain','Sunny', 'Overcast','Overcast','Rain')
temperature = c( 'Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild')
# fix: the humidity and wind vectors were swapped - humidity held Weak/Strong
# and wind held High/Normal. In the textbook data humidity is High/Normal and
# wind is Weak/Strong; the two data vectors are exchanged here to match.
humidity = c( 'High','High','High','High','Normal','Normal','Normal','High','Normal','Normal','Normal','High','Normal','High')
wind = c('Weak','Strong','Weak','Weak','Weak','Strong','Strong','Weak','Weak','Weak','Strong','Strong','Weak','Strong')
play = c('No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No')
game = data.frame(outlook, temperature, humidity, wind, play)
head(game)
library(rpart)
library(rpart.plot)
dt_game1 = rpart(play ~ . , data=game)
dt_game1
#no tree (default control needs more rows than these 14)
#cp=-1 forces a split at every opportunity
dt_game2 = rpart(play ~ . , data=game, control=rpart.control(cp = -1, minsplit=1, minbucket = 1))
dt_game2
rpart.plot(dt_game2,cex=.8, nn=T)
#make tree smaller
dt_game3 = rpart(play ~ . , data=game, control=rpart.control(cp = -1, minsplit=3, minbucket = 2, split = "gini"))
dt_game3
rpart.plot(dt_game3,cex=.8, nn=T)
?rpart
#make tree smaller (information-gain splitting instead of gini)
dt_game3 = rpart(play ~ . , data=game, control=rpart.control(cp = -1, minsplit=3, minbucket = 2, split = "information"))
dt_game3
attributes(dt_game3)
rpart.plot(dt_game3,cex=.8, nn=T)
# inspect the fitted rpart object's components
dt_game3$parms
dt_game3$variable.importance
dt_game3$method
dt_game3$cptable
dt_game3$control
dt_game3$splits
dt_game3$numresp
dt_game3$y
game$play
dt_game3$ordered
================================================
FILE: 01-IIM/31a3-DT-general.R
================================================
#Basic Steps in CART
#Collect Data
#Identify if it is Classification or Regression Model
#Write the formula : y ~ x` + x2 ....
#load libraries #install packages - rpart, rpart.plot, RColorBrewer
library(rpart)# doing classification
library(rpart.plot) # visualising the tree
#model= rpart(y ~ ., method='class', data= , control=rpart.control(minsplit=4, cp=0.00001))
rpart.plot()
#Set any control Parameters
control_1 = rpart.control(cp = 0.005, minsplit=10, minbucket = 5)
control_2 = rpart.control(cp = -1)
# # Finding how people take decisions to buy products
# Creating Decision Tree
#Students : Gender - (Male & Female) buy a product
#Variable Gender
# Seed fixed so the simulated sample is reproducible
set.seed(123)
n=1000
# Simulate gender for n students, roughly 50/50 Male/Female
gender = sample(x=c('Male','Female'), size=n, replace=T, prob=c(0.5,0.5) )
head(gender)
table(gender)
#Variable- Buy : Decision
set.seed(135)
# Simulate the buying decision: ~41% Buy, ~59% NotBuy
buy = sample(x=c('Buy','NotBuy'), size=n, replace=T, prob=c(.41,.59) )
head(buy)
table(buy)
prop.table(table(buy))
#create Data Frame from the two simulated vectors
students1 = data.frame(gender, buy)
head(students1)
xtabs(~ gender + buy,data = students1)
#Table : several equivalent ways to cross-tabulate gender vs buy
table(students1)
prop.table(table(students1))
addmargins(prop.table(table(students1)))
(t1= table(students1$gender, students1$buy))
addmargins(t1)
prop.table(table(students1$gender, students1$buy))
addmargins(prop.table(table(students1$gender, students1$buy))
)
#Model1 : predict buy from gender alone
head(students1)
fit = rpart(buy ~ gender, data= students1)
fit
table(students1$buy)
dev.off()
rpart.plot(fit,nn=T)
# Loosen the defaults so a split is attempted on this weak predictor
fit1 = rpart(buy ~ gender, data=students1, minsplit=4, minbucket=2)
#'minsplit' is min 4 obsv reqd to split a node
#'minbucket' determines the minimal number of observations per leaf ('minbucket')
fit1 #print(fit1)
table(students1$gender, students1$buy)
head(students1)
library(rpart.plot)
# type=4: label all nodes; extra=104: class probabilities plus % of observations
rpart.plot(fit1, main='Classification Tree', nn=T, type=4, extra=104)
fit1
# Predict new cases; default returns class probabilities, type='class' returns the label
predict(fit1, newdata = data.frame(gender='Female'))
predict(fit1, newdata = data.frame(gender='Female'), type=c('class'))
predict(fit1, newdata = data.frame(gender=c('Male','Female','Male')), type='class')
#---- Part -2 Add another column----
set.seed(1234)
# Simulate marital status: ~60% Married, ~40% Single
married = sample(x=c('Married','Single'), size=1000, replace=T, prob=c(0.6,0.4) )
table(married)
#new data frame combining all three simulated variables
students2 = data.frame(gender, married, buy)
# BUG FIX: original line was `rownames(students2) = rollno`, but 'rollno' is never
# defined anywhere in this script and the assignment errored; use a plain sequence
rownames(students2) = 1:n
head(students2)
str(students2)
prop.table(ftable(students2))
table(students2$buy) # Majority - Don't Buy
addmargins(prop.table(table(students2)))
#write.csv(students2, 'dtdata.csv')
# Model2 : two predictors (gender + married)
#library(rpart)
names(students2)
head(students2)
# cp=-1 grows the full tree; minsplit=10 still requires 10 obs to attempt a split
fit2 = rpart(buy ~ gender + married, data=students2, minsplit=10, cp=-1)
#fit2 = rpart(buy ~ gender + married, data=students2, minsplit=10)
summary(fit2)
fit2
# type=2: split labels under the nodes; extra=104: class probabilities + % obs
rpart.plot(fit2,type=2,extra=104, tweak=1.2, under=T, shadow=c('brown', 'green','red'), nn=T)
fit2
prp(fit2)
# prp() is rpart.plot's lower-level plotting routine; demo of its many options
prp(fit2, main="An Example",
type=4, fallen=T, branch=.3, round=0, leaf.round=9,
clip.right.labs=F, under.cex=1,
box.palette="GnYlRd",
prefix="Student\n", branch.col="gray", branch.lwd=2,
extra=101, under=T, lt=" < ", ge=" >= ", cex.main=1.5)
prp(fit2, branch.type=5)
labels(fit2)
#Plot----
library(RColorBrewer)
rpart.plot::rpart.plot(fit2, main='Classification Tree')
rpart.plot::rpart.plot(fit2, extra=104, box.palette="GnBu", branch.lty=3, shadow.col="gray", nn=TRUE)
rpart.plot::prp(fit2,fallen.leaves = F)
prp(fit2, type=2)
#Predict : same new case under the different 'type' output formats
predict(fit2, newdata = data.frame(gender='Male', married='Married'), type='prob')
predict(fit2, newdata = data.frame(gender='Male', married='Married'), type='class')
predict(fit2, newdata = data.frame(gender='Male', married='Married'), type='vector')
predict(fit2, newdata = data.frame(gender='Male', married='Married'))
# Score all four gender x married combinations at once
testdata = data.frame(gender=c('Male','Male','Female','Female'), married=c('Married','Single','Married','Single'))
testdata
(p1 = predict(fit2, newdata = testdata, type='vector')) #node/level
#play=2, notplay=1
(p2 = predict(fit2, newdata = testdata, type='class')) #factor
(p3 = predict(fit2, newdata = testdata, type='prob')) # prob
cbind(testdata, p1, p2, p3)
#level number, class frequencies, probabilities
predict(fit2, newdata= testdata, type = "matrix")
head(students2)
head(students2)
#Parameters Setting : CP
printcp(fit2)
getOption('digits')
plotcp(fit2)
names(fit2)
fit2$where #which row at which node no
students2[1:5,]
cbind(students2, nodeno=rownames(fit2$frame) [ fit2$where])
fit2
rpart.plot(fit2)
pfit= prune(fit2, cp=0.02) # from cptable
pfit
rpart.plot(pfit)
#Interactive----
new.tree <- prp(fit2, snip=TRUE)$obj # interactively trim the tree
prp(new.tree) # display the new tree
#click on quit
# Demonstrate both control presets on the built-in iris data.
set.seed(123)
# Classification tree under the conservative control preset
iris_tree_small <- rpart(Species ~ ., data = iris, control = control_1)
iris_tree_small
rpart.plot(iris_tree_small)
printcp(iris_tree_small)
#--------
# Same model but fully grown (cp = -1 preset)
iris_tree_full <- rpart(Species ~ ., data = iris, control = control_2)
iris_tree_full
rpart.plot(iris_tree_full, cex=.8)
#----------------------
#regression Tree-----
names(iris) #which is numeric column - Length
# method="anova" makes rpart fit a regression tree on the numeric response
iris_reg_tree <- rpart(Sepal.Length ~ ., data = iris, method="anova", control = control_1)
iris_reg_tree
rpart.plot(iris_reg_tree)
printcp(iris_reg_tree)
# Prune back using a cp chosen from the printed cp table
iris_reg_pruned <- prune(iris_reg_tree, cp=.008)
rpart.plot(iris_reg_pruned)
================================================
FILE: 01-IIM/31a3-DT-practiseCase.R
================================================
#DT - simple case
#https://www.gormanalysis.com/blog/decision-trees-in-r-using-rpart/
#https://en.wikipedia.org/wiki/Staged_crash
#RearEnd crash - Fraud
library(rpart)
library(rpart.plot)
# Tiny training set: 3 insurance claims; was the crash a rear-end, was it fraud
train <- data.frame(
ClaimID = c(1,2,3),
RearEnd = c(TRUE, FALSE, TRUE),
Fraud = c(TRUE, FALSE, TRUE)
)
train
# Classification tree predicting Fraud from RearEnd
mytree <- rpart( Fraud ~ RearEnd, data = train, method = "class" )
mytree
#Notice the output shows only a root node. This is because rpart has
gitextract_w2ho1432/ ├── .gitignore ├── 0-Practise/ │ ├── day1.R │ ├── day2.R │ ├── day3.R │ ├── first.R │ ├── htmlimport.R │ ├── iims2.R │ ├── import2.R │ ├── kt1.R │ ├── lm-sim-test1.R │ ├── practise.R │ ├── practise2.R │ ├── rough.R │ └── vector.R ├── 0-Rdata/ │ ├── madata.Rdata │ ├── student.rds │ ├── student1.rds │ ├── twitter authentication.Rdata │ └── twitterauthentication.Rdata ├── 0-class/ │ ├── AR-groceries.R │ ├── CLUST-customer.R │ ├── DT-CART-sales.R │ ├── NAvalues.R │ ├── autoML1.R │ ├── bigQuery.R │ ├── hhe_d01.R │ ├── hhe_d02.R │ ├── hhe_d03.R │ ├── hhe_d04.R │ ├── hhe_d04b.R │ ├── lm_AIC.R │ ├── missingValues.R │ ├── munaz.csv │ └── purchaseProb.R ├── 01-IIM/ │ ├── 10a-daily.R │ ├── 11-analyticLevels.R │ ├── 11a1-start.R │ ├── 11a2-packages1.R │ ├── 11a3-packages2.R │ ├── 11a4-packages3.R │ ├── 11a5-packages4.R │ ├── 11b2-DS1.R │ ├── 11b3-DS2-factor.R │ ├── 11b4-DS4-reproduce.R │ ├── 11b5-DS3.R │ ├── 11e2_vectors1.R │ ├── 11e3_matrices1.R │ ├── 11e4_dataframe1.R │ ├── 12a3-impexp1.R │ ├── 12a4-datasets.R │ ├── 12a4-impexp-xls.R │ ├── 12e4-impexp-gs.R │ ├── 13a2-NAvalues.R │ ├── 13b2-outliers.R │ ├── 15a2-GPH-basic.R │ ├── 15a3-GPH-graphs.R │ ├── 15a4-GPH-advgraphs.R │ ├── 16b1-GPH-wordcloud.R │ ├── 16e0-GPH-wordcloud.R │ ├── 16e3-GPH-wordcloud-text1.R │ ├── 16e4-GPH-wordcloud-text2.R │ ├── 17a2-STATS-freqtable.R │ ├── 17c1-STATS-datapartition.R │ ├── 17d2-STATS-basicstats.R │ ├── 18d1-DPLYR-mtcars1.R │ ├── 18d3-DPLYR-mtcars2.R │ ├── 18d4-DPLYR-dplyr.R │ ├── 21a1-SLM-women.R │ ├── 21a2-SLM-women.R │ ├── 21a4-SLM-women.R │ ├── 21a5-SLM-women-A.R │ ├── 21b1-SLM-sales.R │ ├── 21b3-SLM-areasales.R │ ├── 21b4-SLM-salesarea.R │ ├── 21d2-MLM-mtcars1.R │ ├── 22a4-MLM-allmodels.R │ ├── 22c2-MLM-mtcars-olsrr.R │ ├── 22c3-MLM-cars.R │ ├── 22c3-MLM-salespromotion.R │ ├── 22d3-MLM-omni.R │ ├── 22d4-MLM-sales-TV.R │ ├── 23c1-LOGR-logR.R │ ├── 23d1-LGR-gre.R │ ├── 24b1-LOGR-purchase.R │ ├── 24c2-LOGR-adult.R │ ├── 24d2-LOGR-gre.R │ ├── 24e2-LOGR-general.R │ ├── 
24g1-LOGR-cancer.R │ ├── 24g2-LOGR-sample1.R │ ├── 31a1-DT-cart-split.R │ ├── 31a2-DT-outlook.R │ ├── 31a3-DT-general.R │ ├── 31a3-DT-practiseCase.R │ ├── 31b1-DT-CART-carseats.R │ ├── 31b2-DT-CART-sales.R │ ├── 31b3-DT-CART-titanic.R │ ├── 31b4-DT-CART-loan.R │ ├── 31b5-DT-CART-gre.R │ ├── 31b5-DT-loanapproved1.R │ ├── 31b5-DT-rpart-iris.R │ ├── 31b7-DT-party.R │ ├── 31f1-DT-cart2.R │ ├── 32a4-DT-cart1.R │ ├── 32a4-DT-rf-4.R │ ├── 32c2-DT-chaid-usvote.R │ ├── 32d1-DT-CHAID-usvote.R │ ├── 32d5-DT-CART-RF.R │ ├── 32k1-CLS-svm1.R │ ├── 33g3-CLS-randomForest1.R │ ├── 34a1-CLUST-clustering.R │ ├── 34a1-CLUST-samplecase.R │ ├── 34b2-CLUST-customer.R │ ├── 34d1-CLUST-creditData.R │ ├── 34e4-clust-NOC-iris.R │ ├── 34g1-CLUST-segmentation.R │ ├── 34h1-CLUST-clustering.R │ ├── 34h1-CLUST-clustering2.R │ ├── 34j2-CLUST-NOC.R │ ├── 35d1-CLUST-hclust1.R │ ├── 35d2-CLUST-hclust2.R │ ├── 37a2-AR-eg.R │ ├── 37b1-AR-groceries.R │ ├── 37b2-AR-samplecase.R │ ├── 37b3-AR-groceries-subset.R │ ├── 37b5-AR-finproducts.R │ ├── 37b5-AR-groceries.R │ ├── 37c3-AR-redundant.R │ ├── 44b1-TS-data.R │ ├── 44b1-TS-dates.R │ ├── 44c2-TS-dates-lubridate.R │ ├── 44c4-TS-xts-data.R │ ├── 45d2-TS-airpassengers.R │ ├── 45d3-TS-components-airp.R │ ├── 45d4-TS-johnson.R │ ├── 45g3-TS-TTR-ma.R │ ├── 46e3-TS-auto-arima-johnson.R │ ├── 55c1-TM-twitter.R │ ├── 55e1-TM-twitter1.R │ ├── 61b1-LP-marketingspend.R │ ├── 61c2-LP-marketingspend-case.R │ ├── 61d2-LP-tpt.R │ ├── 61e2-LP-machassign.R │ ├── 61e5-LP-farmer1.R │ ├── 77a1-FA-quandl.R │ ├── 77a2-FA-quandl2.R │ ├── 77a5-FA-quantmod.R │ ├── 77a6-FA-quantmod-I-stocks.R │ ├── 77f2-FA-quantmod1.R │ ├── 77f3-FA-indianstocks.R │ ├── 91ab3-Case-student1.R │ ├── 91b4-Case-dencoCase.R │ ├── 91b5-Case-denco.R │ ├── 91g4-CASE-dencoCase.R │ ├── revision1.R │ ├── test.R │ ├── x55a1-TM-tweets.R │ ├── x55a2-TM-twitter.R │ └── x55a3-TM-twitter2.R ├── 02-IIMcases/ │ ├── DT_diabetis.R │ ├── case-denco.R │ ├── case-dplyr-mtcars.R │ ├── rev_iima20_1.R │ ├── revision1.R │ ├── 
senitmentTwitter.R │ └── textMiningSentence.R ├── 03-setup/ │ ├── 11a-start.R │ ├── 11b-gettingstarted.R │ ├── 12-packageInstall.R │ ├── 15a-envrm.R │ ├── 15b-renv.R │ ├── 15e-rjava.R │ ├── 16a-pathconfig.R │ ├── 17a-rstudio.R │ ├── 18a-processtime.R │ ├── 21a-floorceiling1.R │ ├── 21b-options.R │ ├── 24a-github.R │ ├── 25a-help.R │ ├── 51c-deletefiles.R │ ├── envVar.R │ ├── help.R │ └── pathconfig.R ├── 04-lib/ │ ├── 10a-fBasics.R │ ├── 11a-pysch.R │ ├── 21b-installpackages.R │ ├── 21e-installFmGit.R │ ├── 21g-packages1.R │ ├── 31b-datasets.R │ ├── 31c-datasets.R │ ├── 41-purrr1.R │ ├── 42-purr2.R │ ├── 43-purrr3.R │ ├── 44-purrr4.R │ ├── 45-purrr5.R │ ├── 51-plyr1.R │ ├── 61-splitapplycombine1.R │ ├── 62-splitapplycombine2.R │ ├── 71-broom1.R │ ├── packages1.R │ ├── switchr.R │ ├── useful.R │ └── useful2.R ├── 05-dataIE/ │ ├── 14a-readcsv.R │ ├── 14b-readcsv.R │ ├── 14c-importweb.R │ ├── 14d-importweb.R │ ├── 14e-readothers.R │ ├── 15b-datawrangling.R │ ├── 20a-importgg.R │ ├── 21b-googlesheet1.R │ ├── 22b-ggsheets2.R │ ├── 31a-export.R │ ├── 32c-writecsv.R │ ├── datasets.R │ └── importExcel.R ├── 06-DS/ │ ├── 0FileList.R │ ├── 10a-TOC │ ├── 13b-ds-blank.R │ ├── 14b-Basic_R_v01.R │ ├── 14b-objectsmethods.R │ ├── 14c-ds1.R │ ├── 15a-objects.R │ ├── 16b-datatypes.R │ ├── 16c-basicDT.R │ ├── 16d-ds1.R │ ├── 20a-vectors1.R │ ├── 20b-vectors2.R │ ├── 20c-vectors3.R │ ├── 20d-vectors4.R │ ├── 20f-vectors5.R │ ├── 20g-vectors6.R │ ├── 25a-matrices.R │ ├── 25c-matrices.R │ ├── 25d-matrices.R │ ├── 25e-matrices.R │ ├── 27a-arrays.R │ ├── 27b-arrays.R │ ├── 27d-arrays.R │ ├── 30c-basicdatatypes.R │ ├── 30d-ds1.R │ ├── 30e-datatypes.R │ ├── 33b-df.R │ ├── 33c-df.R │ ├── 35a-lists.R │ ├── 35b-lists.R │ ├── 35e-lists.R │ ├── 38a-factors.R │ ├── 38b-factors.R │ ├── 38c-factors.R │ ├── 38e-factors.R │ └── factors_iims.R ├── 07-excel/ │ ├── excelData.xlsx │ ├── impExpExcel.R │ ├── importFmExcel.R │ ├── importFmGS.R │ ├── mtcars.xlsx │ ├── student1.R │ └── student2.xlsx ├── 
08-MBB/ │ ├── 01-MBB.R │ ├── 02-MBB.R │ ├── 03-MBB.R │ ├── 04-MBB.R │ ├── 05-MBB.R │ └── 06-MBB.R ├── 10-packages/ │ ├── analyzer.R │ ├── autoreg.R │ ├── dttr2.R │ ├── ggpmisc.R │ ├── packages.txt │ ├── timeplyr.R │ └── tsutils.R ├── 11-stats/ │ ├── 10-statslinks.R │ ├── 10a-distributions.R │ ├── 10c-allstats.R │ ├── 11a-normal.R │ ├── 11b-normalq.R │ ├── 11c-normald.R │ ├── 12a-binomial.R │ ├── 13a-mean.R │ ├── 14a-median.R │ ├── 15a-mode.R │ ├── 15b-mode.R │ ├── 16a-range.R │ ├── 17a-sd.R │ ├── 18a-covariance.R │ ├── 19a-correlation.R │ ├── 20a-coev.R │ ├── 37a-sample1.R │ ├── 40a-missing1.R │ ├── 40b-missing1.R │ ├── 40c-missing2.R │ ├── 40d-missing3.R │ ├── 42a-outlier1.R │ ├── 43a-outliers1.R │ ├── 45a-sampling.R │ ├── 55a-traintest1.R │ ├── 60a-kurtosis.R │ ├── 60b-kurtosis.R │ ├── 64a-skewness.R │ ├── 64b-skewness.R │ ├── 71a-xtabs.R │ ├── ave1.R │ ├── interactions.R │ ├── mean1.R │ ├── mean2.R │ ├── mean3.R │ ├── meandev.R │ ├── meanwt1.R │ ├── median1.R │ ├── mode1.R │ ├── normal_height.R │ ├── normality.R │ ├── normality2.R │ ├── outlier2.R │ ├── outliers1.R │ ├── poiss1.R │ └── quantile1.R ├── 13-statsH/ │ ├── 20b-distributions.R │ ├── 23b-ztest-bsda.R │ ├── 25c-tdistribution.R │ ├── 26b-ttestindep.R │ ├── 26c-ttestpaired.R │ ├── 27b-TTS1-case1.R │ ├── 28b-TTS2-case1.R │ ├── 28c-TTS1-case3.R │ ├── 28e-TT-sample1i.R │ ├── 31b-chisqdistr.R │ ├── 32b-HT-chisq1.R │ ├── 32c-HT-chisq2.R │ ├── 33b-HT-chisq.R │ ├── 33d-chisqtest1.R │ ├── 33e-chisqtest2.R │ ├── 34b-goodnessfit.R │ ├── datadistr.R │ ├── htestnd1.R │ ├── randomdistr.R │ ├── shadeareainplot.R │ └── tests1.R ├── 15-sampling/ │ ├── 12b-samples.R │ ├── 12e-sampleint.R │ ├── 14b-stratified.R │ ├── 15b-samplesplit.R │ ├── 15d-datapartition.R │ └── 15f-partitionfolds.R ├── 22-summary/ │ ├── 22b-aggregate.R │ ├── 22c-aggregate2.R │ ├── 23b-freqdistr1.R │ ├── 23c-freqdistr2.R │ ├── 23d-freqdistr3.R │ ├── 23f-FD.R │ ├── 23f-freqdistr.R │ ├── 24b-freqdistr4.R │ ├── 24f-freqdistr5.R │ ├── 25g-freqdistr6.R │ 
├── 31c-rowsums1.R │ ├── 32b-addmargin1.R │ ├── 32c-margintable1.R │ ├── 32d-proptable.R │ ├── 32d-tableprop2.R │ ├── 35b-crosstab.R │ ├── 99a-Pskim.R │ ├── 99a-studentdata1.R │ └── descriptive.R ├── 23-functions/ │ ├── 21b-rep.R │ ├── 21c-seq.R │ ├── 21g-replicate.R │ ├── 21g-seqdates.R │ ├── 22b-letters.R │ ├── 25b-interval.R │ ├── 25c-midpoint.R │ ├── 27b-recode-car.R │ ├── 29b-subset.R │ ├── 29c-split1.R │ ├── 29d-splitdata.R │ ├── 29e-partitiondata.R │ ├── 31b-rowcol1.R │ ├── 33b-sortorder.R │ ├── 33c-order.R │ ├── 33c-sortorderrank.R │ ├── 33d-rank.R │ ├── 34b-castmelt1.R │ ├── 34c-castmelt2.R │ ├── 37a-mtcars-subset.R │ ├── 37b-duplicates1.R │ ├── 37c-unique.R │ ├── 38b-scale1.R │ ├── 41b-randnos1.R │ ├── 41c-randnos.R │ ├── 42b-normdist.R │ ├── 45b-forloop1.R │ ├── 45e-ifelse2.R │ ├── 45v-switch1.R │ ├── 46b-withoutapply.R │ ├── 46c-applyForCompare.R │ ├── 46d-applyfamily.R │ ├── 46e-applytype.R │ ├── 46f-while1.R │ ├── 47b-apply1.R │ ├── 47c-apply.R │ ├── 47d-apply1.R │ ├── 47h-tapply1.R │ ├── 47j-lapply1.R │ ├── 47m-mapply1.R │ ├── 47n-mapply2.R │ ├── 47o-rapply.R │ ├── 47on-eapply.R │ ├── 47p-sapply1.R │ ├── 47q-sapply2.R │ ├── 47s-tapply2.R │ ├── 47t-vapply1.R │ ├── 49b-replicate1.R │ ├── 49c-replicate.R │ ├── 49e-by.R │ ├── 49f-by.R │ ├── 49g-bywith.R │ ├── 51b-myfunc.R │ ├── 51c-functions1.R │ ├── 53b-cbindrbind1.R │ ├── 53c-joinDFs.R │ ├── 53c-merge1.R │ ├── 54b-combination.R │ ├── 54d-expandgrid.R │ ├── 55b-sweep1.R │ ├── 55d-sweep2.R │ ├── 56b-outer1.R │ ├── 56c-outer2.R │ ├── 57b-stack1.R │ ├── 58-DF-common.R │ ├── 58-df-matching1.R │ ├── 58-df2.R │ ├── 58-hmisc.R │ ├── 58-pmatchchar.R │ ├── 61c-missing1.R │ ├── 61c-missing2.R │ ├── 61c-missing3.R │ ├── 62b-outlier.R │ └── 62c-outlier2.R ├── 24-Strings/ │ ├── abvn.R │ ├── latex.R │ ├── output.txt │ ├── paste1.R │ ├── setop1.R │ ├── strcmpt1.R │ ├── string1.R │ ├── strjoin.R │ ├── strlength.R │ ├── strman1.R │ ├── strman2.R │ ├── strman3.R │ ├── strman4.R │ ├── strman5.R │ ├── strman6.R │ ├── 
strman7.R │ ├── strman9.R │ ├── strprint1.R │ ├── strreplace1.R │ ├── strsearch.R │ ├── strsplit1.R │ ├── strsplit2.R │ ├── strsplit3.R │ ├── strsplit4.R │ └── tidyr-strseperate.R ├── 31-graphs/ │ ├── 1bubblechart.R │ ├── 1bubblechart2.R │ ├── multipleplots1.R │ └── tableGrob.R ├── 32-basicGraphs/ │ ├── 10a-graphs.R │ ├── 10b-graphs.R │ ├── 12b-graphs2.R │ ├── 12d-title1.R │ ├── 12e-text.R │ ├── 12f-abline.R │ ├── 12g-legend.R │ ├── 12k-tick.R │ ├── 12m-axis1.R │ ├── 13e-multipleplots1.R │ ├── 13f-multipleplots.R │ ├── 13g-subplot.R │ ├── 15a-graphdata1.R │ ├── 15b-graph1.R │ ├── 21b-plot-hist1.R │ ├── 21c-plot.R │ ├── 23b-line.R │ ├── 23c-lines2.R │ ├── 24b-histogram.R │ ├── 24c-histogram2.R │ ├── 25b-barplot.R │ ├── 25c-barplot2.R │ ├── 26b-boxplot.R │ ├── 26c-boxplot2.R │ ├── 26d-boxplot2.R │ ├── 27b-pie.R │ ├── 27c-pie2.R │ ├── 29b-corrgram1.R │ ├── 32b-freqdistr.R │ ├── 33b-dotplot.R │ ├── 33b-matrixplots.R │ ├── 37b-scatter.R │ ├── 42b-intplots1.R │ ├── 43b-mosaic.R │ ├── 43c-corrplot.R │ ├── 43c-ggally.R │ ├── 44b-textplots.R │ ├── 45b-violinplot.R │ ├── ria2g1.R │ ├── ria2g2.R │ ├── ria2g3.R │ └── ria3g3.R ├── 33-AdvGraphs/ │ ├── cowplot1.R │ ├── donut.R │ ├── donut2.R │ ├── esquisse.R │ ├── lattice.R │ ├── lattice1.R │ ├── plotsToWord.R │ ├── survey.R │ ├── symbols.R │ ├── vtree1.R │ ├── waffle.R │ └── xxggsubplot.R ├── 33-DT/ │ ├── 0-DTsummary.R │ ├── 1-dt1.R │ ├── 2-DT.R │ └── 3-DT.R ├── 34-ggplots/ │ ├── circbarplot.R │ ├── gg-apexcharts.R │ ├── gg-bar1.R │ ├── gg-bar2.R │ ├── gg-box2.R │ ├── gg-boxhist.R │ ├── gg-boxplot1.R │ ├── gg-halves.R │ ├── gg-heatmap.R │ ├── gg-hist1.R │ ├── gg-hline.R │ ├── gg-hvlines.R │ ├── gg-labelend.R │ ├── gg-legend1.R │ ├── gg-line.R │ ├── gg-pie1.R │ ├── gg-slope.R │ ├── gg-slope2.R │ ├── ggbarplot.R │ ├── ggbarplots.R │ ├── ggboxhist.R │ ├── ggboxplot2.R │ ├── gghistogram.R │ ├── ggp2.R │ ├── ggplot-DU1.R │ ├── ggplot-legend1.R │ ├── ggplot3.R │ ├── ggplot5.R │ ├── ggplot6.R │ ├── ggplot7.R │ └── twoaxis-gg.R ├── 
35-tidyverse/ │ ├── 20a-dplyr.R │ ├── 21a-dplyr-select.R │ ├── 21b-dplyr-slice1.R │ ├── 21c-dplyr-mutate1.R │ ├── 21d-dplyr-summarise.R │ ├── 21e-dplyr-filter1.R │ ├── 21f-dplyr-str.R │ ├── 21g-dplyr-arrange.R │ ├── 22b-dplyr-seperate1.R │ ├── 22b-group.R │ ├── 22c-summarise.R │ ├── 22g-tibble-rownames.R │ ├── 25b-magrittr.R │ ├── 26c-tidyr-DSR1.R │ ├── 26d-tidyr-DSR-who.R │ ├── 31b-plyr1.R │ ├── 32b-plyr-mutate.R │ ├── 33d-dplyr-joins.R │ ├── 33f-dplyr-split.R │ ├── plyr-ddply-gpsum.R │ ├── tidyr1.R │ ├── zz-dplyr1.R │ └── zz-tidy-dataformating.R ├── 41-LM/ │ ├── 10a-lm-women2.R │ ├── 10b-lm-salesarea2.R │ ├── 10c-MLR-omni.R │ ├── 10e-lm-errorplot.R │ ├── 13b-lm-commands.R │ ├── 16b-SLM-women2.R │ ├── 16c-SLM-women1.R │ ├── 16e-SLM-women-A.R │ ├── 16f-SLM-women-V.R │ ├── 16f-SLM-women.R │ ├── 16m-SLM-women2.R │ ├── 17a-LM-case1.R │ ├── 17b-LM-stock1.R │ ├── 18a-SLM-salesarea.R │ ├── 18b-SLM-salesarea.R │ ├── 18c-SLM-salesarea.R │ ├── 23a-MLM-omni.R │ ├── 23c-MLM-omni.R │ ├── 24a-MLM-pcsales.R │ ├── 25a-MLM-mtcars.R │ ├── 25c-MLM-mtcars.R │ ├── 25c-MLM-mtcars1.R │ ├── 25d-MLM-mtcars-A.R │ ├── 26a-MLM-airquality.R │ ├── 27a-MLM-marketing.R │ ├── 35a-MLM-case1.R │ ├── 37a-LM-dummy-fireplace.R │ ├── 37b-dummy1.R │ ├── 38c-LM-dummy1.R │ ├── 41c-LM-assumptions.R │ ├── 42b-LM-linearity.R │ ├── 42c-LM-normality.R │ ├── 42d-LM-variance.R │ ├── 42e-LM-outliers.R │ ├── 42f-LM-autocorr.R │ ├── 42g-LM-influentialvariables.R │ ├── 42h-LM-multicollinearity.R │ ├── 42j-gvlma.R │ ├── 43a-LM-graphs.R │ ├── LM-all-mtcars1.R │ ├── ProbDist.R │ ├── Simulation.R │ ├── confusionmatrix.R │ ├── contrasts1.R │ ├── dummies.R │ ├── homosecadicity.R │ ├── lm-broom.R │ ├── lm-dummy1.R │ ├── lm-housing.R │ ├── lm-mtcars1.R │ ├── lm-mtcars2.R │ ├── lm-plot1.R │ ├── lm-segments1.R │ ├── mlm-state77.R │ ├── multvariate1.R │ ├── plotcoef1.R │ └── regrplot1.R ├── 44-LogR/ │ ├── 24c-LR-default.R │ ├── 24d-LR-default.R │ ├── 24e-LR-default.R │ ├── 24g-LR-default-accuracy.R │ ├── 26b-LR-germancredit.R 
│ ├── 27b-LR-gre.R │ ├── 28b-LR-subscribe.R │ ├── 28c-LR-subscribe.R │ ├── 29b-LR-ads.R │ ├── 31b-LR-income.R │ ├── 31c-income.R │ ├── 33b-LR-purchase.R │ ├── 45b-compareAUC.R │ ├── 45c-roc-default.R │ ├── 45e-roc-general.R │ ├── 45f-roc1.R │ ├── 45h-roc2.R │ ├── 46c-accuracy.R │ ├── 48b-auc1.R │ ├── 48c-auc1.R │ ├── 48d-auc.R │ ├── 48e-auc.R │ ├── 49c-thresholdvalue.R │ ├── pdpu.R │ └── zz--logR.R ├── 51-DT/ │ ├── cls-gen │ ├── cls1M-cancer.R │ ├── dt-multiplemodels.R │ ├── giniIndex.R │ └── rattle.R ├── 52-CART/ │ ├── 10-CART-gen.R │ ├── 11-cart-understandsplit.R │ ├── 12-DT-outlook.R │ ├── 12-IIMBG-wksp.R │ ├── 12-IIMJ-wksp.R │ ├── 12-IIMS-wksp.R │ ├── 12-IITB-wksp.R │ ├── CARTR_sales.R │ ├── CART_Regression Tree v01.R │ ├── DT-germanCredit.R │ ├── DT-rpart-claims.R │ ├── c-dt-rpart-Case-DU1.R │ ├── c-dt-rpart-iris.R │ ├── c-dt-rpart-sales1.R │ ├── cls-cart-churn2.R │ ├── cls-rpart-plot2.R │ ├── dt-car.R │ ├── dt-general.R │ ├── dt-glaucoma.R │ ├── dt-ionos1.R │ ├── dt-iris1.R │ ├── dt-kyphosis.R │ ├── dt-loanapproved1.R │ ├── dt-rpart-du.R │ ├── dt-rpart-du1.R │ ├── dt-rpart-du2.R │ ├── dt-rpart-du3.R │ ├── dt-rpart-metal.R │ ├── dt-rpart-student1.R │ ├── dt-rpart-text1.R │ ├── dt-rpart-varimp1.R │ ├── dt-rpart-varimp2.R │ ├── dt-sleep.R │ ├── dt-tree-car1.R │ ├── dt3-eyes.R │ ├── entropy.R │ ├── multimodel.R │ ├── tree-houseprices.R │ └── zz-test.R ├── 53-splitcriteria/ │ ├── cls-entropy.R │ ├── dt-rpart-criteria.R │ ├── splitcriteria1.R │ ├── splitcriteria2.R │ └── splitcriteria3.R ├── 54-KNN/ │ ├── knn1_cancer.R │ ├── knn2.R │ ├── knn3.R │ ├── knn4.R │ ├── knn_diamonds.R │ └── knn_iris.R ├── 54-NLM/ │ ├── nlm1.R │ └── nlm2-mtcars.R ├── 55-CHAID/ │ ├── CHAID-nps2.R │ ├── CHAID-xsell1.R │ ├── c-dt-chaid-nps.R │ ├── c-dt-chaid-usvote1.R │ ├── chaid-attrition.R │ ├── chaid-cancer.R │ ├── chaid-usvote.R │ ├── chaid2.R │ ├── chaid4.R │ ├── chisq.R │ └── chisqtest2.R ├── 56-ctree/ │ ├── CTREE NPS R code v01.R │ ├── ctree-KyCU.R │ ├── ctree-airquality.R │ ├── 
ctree-churn2.R │ ├── ctree-clsregr-party.R │ ├── ctree-clsregr.R │ ├── ctree-readingskills.R │ ├── ctree2-iris.R │ ├── ctreee-iris.R │ └── dt-ctree-playYes.R ├── 57-GLM/ │ ├── Logr-party.R │ ├── crossfold.R │ ├── crossval1.R │ ├── cv-houseprices.R │ ├── cv-women1.R │ ├── cv3.R │ ├── cvlm2.R │ ├── glm-affairs1.R │ ├── glm-affairs2.R │ ├── glm-cars.R │ ├── glm-titanic1.R │ ├── logR1.R │ ├── logpos1.R │ ├── logr-mtcars.R │ ├── logr-mtcars1.R │ ├── logrMaths.R │ ├── logreg-iris1.R │ ├── multinominal.R │ ├── multinominal2.R │ ├── multinominal3.R │ ├── multinominal4.R │ ├── multinominal5.R │ ├── multinominal6.R │ └── nls1.R ├── 57-RF/ │ ├── dt-caret-xxx.R │ ├── dt-rf-DU3.R │ ├── dt-rf-eg2.R │ ├── dt-rf-eg3.R │ └── dt-rf-kyphosis1.R ├── 57-naive/ │ ├── naivbayes1.R │ └── naivbayes2.R ├── 61-clust/ │ ├── 10-clust-packages.R │ ├── 16b-km-withinss.R │ ├── 17b-clust-noclusters1.R │ ├── 17c-clust-numbers-iris.R │ ├── 17d-noc-mclust.R │ ├── 19b-clust-distances.R │ ├── 19c-clust-distances.R │ ├── 19d-clust-scaling.R │ ├── 20b-clust-plots.R │ ├── 20c-clust-plots2.R │ ├── 23b-km-marks1.R │ ├── 23c-km-marks2.R │ ├── 23d-km-amap-marks3.R │ ├── 23e-km-student2.R │ ├── 24b-clust-women.R │ ├── 25b-km-iris.R │ ├── 25c-km-iris2.R │ ├── 25f-km-iris2.R │ ├── 26h-km-attitude.R │ ├── 27c-clust-som1.R │ ├── 33c-hc-nutrients1.R │ ├── 33c-hc-vegan-dune1.R │ ├── 33d-hc-protein.R │ ├── 33f-hc-marks.R │ ├── 33g-hc-sample.R │ ├── 35d-pam-iris.R │ ├── 35e-pam-nutrient.R │ ├── 40b-mixedclust1.R │ ├── 40c-clust-dendgm.R │ ├── 43b-clust-mixedDataTypes1.R │ ├── 45c-clustering-exist1.R │ ├── 45e-clustering-animation1.R │ ├── 50b-clust-ma1.R │ ├── 50c-clust-ma2.R │ ├── 50d-clust-ma3.R │ ├── 61b-clust-custsegm.R │ ├── animation2.R │ ├── clust-allcustering.R │ ├── clust-case-liberty.R │ ├── clust-class-differences.R │ ├── clust-compare.R │ ├── clust-distance-calc.R │ ├── clust-distance2.R │ ├── clust-entropy.R │ ├── clust-iterations.R │ ├── clust-kselect.R │ ├── clustering-seeds-dunn.R │ ├── 
clustering-women.R │ ├── clusters3.R │ ├── hclust-USarrests.R │ ├── hier-simplecase.R │ ├── hier-usarrests.R │ ├── iris.R │ ├── kmeans-bankdata.R │ ├── kmeans-pcalike.R │ ├── kmeans-plots.R │ ├── kmeans-randomness.R │ └── pam1.R ├── 65-AR/ │ ├── 11a-measures1.R │ ├── 12a-ar-samplecase.R │ ├── 12b-ar-samplecase2.R │ ├── 14a-ar-datastr.R │ ├── 15-ar-groceries.R │ ├── 15a-ar-Groceries1.R │ ├── 15b-ar-Groceries.R │ ├── 16b-groceries-summary.R │ ├── 16d-ar-groceries-subset.R │ ├── 16f-ar-groceries-vis.R │ ├── 16f-ar-groceries-vis2.R │ ├── 16f-ar-groceries-vis3.R │ ├── 16f-ar-groceries-vis4.R │ ├── 16f-ar-groceries-vis5.R │ ├── 16k-ar-grocery-DT.R │ ├── 17a-ar-transactionformat.R │ ├── 17d-ar-matrix-transactions.R │ ├── 17e-ar-df-transcations.R │ ├── 17f-ar-csv-transactions.R │ ├── 17f-ar-csv2-transactions.R │ ├── 17g-ar-list-transcations.R │ ├── 17h-ar-dataformats.R │ ├── 18a-arules1.R │ ├── 20a-ar-DU1.R │ ├── 20b-ar-DU2.R │ ├── 20c-ar-DU3.R │ ├── 22a-ar-edn.R │ ├── 22b-ar-elective.R │ ├── 22d-ar-subjects.R │ ├── 22e-ar-placement.R │ ├── 22f-myAR1.R │ ├── 25a-ar-income.R │ ├── 25b-ar-medical.R │ ├── 25c-ar-titanic.R │ ├── 29a-ar-Adult.R │ ├── 29b-ar-Adult-NW.R │ ├── 29c-ar-Adult-Draft.R │ ├── 30a-ar-Finance1.R │ ├── 30b-ar-Finance.R │ ├── 32a-ar-visual.R │ ├── 33a-ar-redundant.R │ ├── 33b-redundantrules.R │ ├── 40a-ar-multilevel-Groceries.R │ ├── 43a-ar-patterns.R │ ├── 45a-ar-rulesextract.R │ ├── 99-ar-NW.R │ ├── 99-ar-OnlineSales.R │ ├── 99-ar-basketanalysis2.R │ ├── 99-ar-policechecks.R │ ├── AR-Weka │ ├── EDA-placement1.R │ ├── ar-case-liberty.R │ ├── ar-groceries2.R │ ├── ar-practise.R │ └── my_basket1.txt ├── 75-OR-LP/ │ ├── 15b-lpsolveAPI.R │ ├── 15c-lpassign.R │ ├── 21b-LP-mach-prod.R │ ├── 21c-LP-mach-prod.R │ ├── 22b-LP-case1.R │ ├── 22c-LP-assign-case3.R │ ├── 22d-LP-Case-carmanufacturing.R │ ├── 25b-LPassign-job.R │ ├── 30a-LP-tpt-function.R │ ├── 31b-LP-tpt1.R │ ├── 31c-LP-tpt2.R │ ├── 31d-LP-tpt3.R │ ├── 33d-proptable.R │ ├── 41b-pricing.R │ ├── 
51b-LP-marketing.R │ ├── lp-ss.R │ ├── model.lp │ ├── zz-LP-clplite.R │ └── zz-LP-general.R ├── 77-TS/ │ ├── 11-tsdata.R │ ├── 12b-TS-add-mult.R │ ├── 14-ts-zoo.R │ ├── 16-ts-xts.R │ ├── 16c-dates-split1.R │ ├── 16d-dates1.R │ ├── 16d-ts-xts.R │ ├── 23b-TS-Case-sales.R │ ├── 23b-lubridate1.R │ ├── 24b-Data-DFtoTS.R │ ├── 24b-timeseries1.R │ ├── 24c-timeseries2.R │ ├── 24f-ts-data.R │ ├── 26b-ts-components-airp.R │ ├── 26c-ts-components.R │ ├── 27b-ts-johnson.R │ ├── 27c-ts-lm-uscons.R │ ├── 28c-ts-lubridate1.R │ ├── 31c-TS-airp.R │ ├── 33b-zoo-ts.R │ ├── 35b-LSM-beer1.R │ ├── 38b-tsplots2a.R │ ├── 38c-tsplots3.R │ ├── 38f-plot-zz.R │ ├── 41b-arima1.R │ ├── 41c-arima2.R │ ├── 41d-arima-airp.R │ ├── 41d-arima-jj-nile.R │ ├── 45b-TS-arima.R │ ├── 52c-Case1-complete.R │ ├── 53b-sales-ts.R │ ├── 55b-ts-case-xxx2.R │ ├── 55c-ts-case-xxxx.R │ ├── SMA-nile.R │ ├── TS-P-fpp.R │ ├── TS-c02.R │ ├── TS-data-DU1.R │ ├── TS-fpp-seasonplot.R │ ├── TS-kings.R │ ├── TS-links │ ├── TS-movag1.R │ ├── TS-nile.R │ ├── TS-xts.R │ ├── TS-zoo.R │ ├── UDFdates.R │ ├── blank.R │ ├── ts-P-highfreq.R │ ├── ts-P-openair.R │ ├── ts-P-padr.R │ ├── ts-beer2.R │ ├── ts-case1.R │ ├── ts-case2.R │ ├── ts-lubridate2.R │ ├── ts-rollingvalues.R │ ├── ts-rollingvalues2.R │ ├── ts-splitdate.R │ ├── ts-timestamp.R │ ├── ts-yoy.R │ └── tsforecast-exp.R ├── 78-nlp/ │ ├── SM-rtexttools1.R │ ├── TM-zz.R │ ├── downloadfile.R │ ├── facebook1.R │ ├── facebook2.R │ ├── fms.txt │ ├── linkedin1.R │ ├── linkedin3.R │ ├── pagerank.R │ ├── readpdf.R │ ├── rowling.txt │ ├── rquery_wordcloud.R │ ├── sentiment-tidyr1.R │ ├── sentiment2.R │ ├── textmining-DU1.R │ ├── tm-worldcloud4.R │ ├── twitter-hotel.R │ ├── twitter-keys.R │ ├── twitter-sentiment2.R │ ├── twitter1-DU1.R │ ├── twitter1-DU2.R │ ├── twitter1-authen.R │ ├── twitter1.R │ ├── twitter2.R │ ├── twitteracct │ ├── wordcloud1.R │ ├── wordcloud2.R │ ├── wordcloud3.R │ └── worldcloud2.R ├── 78-textdocs/ │ └── vit.txt ├── 80-SIM/ │ ├── montecarlo1.R │ ├── 
montecarlo2.R │ ├── mvsim1.R │ ├── randomNos.R │ ├── simLinks │ └── simple1.R ├── 81-case-sum/ │ ├── 31b-DA-dencoCase.R │ ├── 31c-DA-dencoCase2.R │ ├── 31d-dsum-denco.R │ ├── 31e-dencoCase2.R │ ├── 31f-dencoCase.R │ ├── 33c-basicDM-mtcars.R │ ├── 33c-dplyr-mtcars.R │ ├── 33f-DA-bakerydata1.R │ ├── 34b-sales1.R │ ├── 34c-sales2.R │ ├── 35b-DA-student1.R │ ├── 35c-dm-student1.R │ ├── 36b-dsum-Case1.R │ ├── 36c-dsum-Case2.R │ ├── 36f-DSA-case2.R │ ├── 37b-dsum-iris1.R │ ├── 38b-dsum-haireyecolor1.R │ ├── 42b-case-sum-graphs.R │ └── dataexplore.R ├── 83-MA/ │ ├── CA.R │ ├── campaign.csv │ ├── caseStudy_juice.R │ ├── data1.R │ ├── graph1.R │ ├── maregression1.R │ ├── pricing1.R │ └── tree1.R ├── 84-HR/ │ └── hr_churn.R ├── 85-RFM/ │ ├── rfm1.R │ └── rfm3.R ├── 87-FA/ │ ├── 10-FAlinks.R │ ├── Insurance Loss v01.R │ ├── InsuranceLosses.csv │ ├── Packages Pre-requisites_v03.R │ ├── aapl.csv │ ├── fa-iitg-dataanalysis.R │ ├── finTS1.R │ ├── findata1.R │ ├── finstmts1.R │ ├── finstmts2.R │ ├── finstmts3.R │ ├── gtrends1.R │ ├── intrino1.R │ ├── intrino2.R │ ├── lag1.R │ ├── logistic_regression.R │ ├── qf1.R │ ├── sentianalysistrading1.R │ ├── shares1.R │ ├── shares2.R │ ├── stock3.R │ ├── stockanalysis1.R │ ├── stockanalysis2.RData │ ├── stocks5.R │ ├── stocksanalysis3.R │ ├── stocksanalysis4.R │ └── volatity1.R ├── 88-Network/ │ ├── NetSciX 2016 Workshop.R │ ├── network1.R │ ├── network2.R │ └── traveltime1.R ├── 89-rVideos/ │ └── clustering-rV.R ├── 92-wksp2/ │ ├── 1a1-start.R │ ├── 1a3-packages1.R │ ├── 1b2-ds.R │ ├── 1b3-factor.R │ ├── 1d2-basicstats.R │ ├── 1d2-dm-student1.R │ ├── 1d3-dencoCase.R │ ├── 1d4-DA-dencoCase.R │ ├── 1e-graphs-basic.R │ ├── 1e2-graphs.R │ ├── 1e3-advgraphs.R │ ├── 1f-SLR-women.R │ ├── 1h1-dplyr.R │ ├── 1h2-freqtable.R │ ├── 2a-importExport.R │ ├── 2b-SLR-salesarea.R │ ├── 2b-allmodels.R │ ├── 2b2-SLM-women.R │ ├── 2b3-SLM-women-A.R │ ├── 2b4-LM-cars.R │ ├── 2b4-SLR-women.R │ ├── 2c3-MLM-salespromotion.R │ ├── 2c4-MLM-mtcars1.R │ ├── 
2d1-missingvalues.R │ ├── 2d3-datapartition.R │ ├── 2e1-logR-purchase.R │ ├── 2e2-LOGR-adult.R │ ├── 2e3-LOGR-gre.R │ ├── 3b1-DT-CART-carseats.R │ ├── 3b2-DT-CART-R-sales.R │ ├── 3b3-DT-CART-titanic.R │ ├── 3b4-DT-CART-R-loan.R │ ├── 3b5-DT-loanapproved1.R │ ├── 3b5-DT-rpart-iris.R │ ├── 3d1-DT-CHAID-usvote.R │ ├── 3e1-clust-customer.R │ ├── 3e1-clustering.R │ ├── 3e2-clust-samplecase.R │ ├── 3e3-clust-segmentation.R │ ├── 3e4-clust-noOfclusters.R │ ├── 4b1-AR-groceries.R │ ├── 4b2-AR-samplecase.R │ ├── 4b3-AR-groceries-subset.R │ ├── 4b5-AR-finproducts.R │ ├── 4e1-twitter1.R │ ├── 4e2-wordcloud.R │ ├── 4e3-worldcloud2.R │ ├── 4e5-wordcloud3.R │ ├── 4f2-quantmod1.R │ ├── 4f3-indianstocks.R │ ├── 5-wordcloud2-New.R │ ├── 5b-LP-marketingspend.R │ ├── 5c2-LP-marketingspend-case.R │ ├── 5d-wordcloud2.R │ ├── 5d2-LP-tpt.R │ ├── 5e2-LP-machassign.R │ ├── 5e5-LP-farmer1.R │ ├── 6b1-dates.R │ ├── 6b1-ts-data.R │ ├── 6c2-dates-lubridate.R │ ├── 6d-TS-airpassengers.R │ ├── 6d-ts-components-airp.R │ ├── 6d-ts-johnson.R │ ├── 6d-ts-xts-data.R │ ├── 6e-TS-auto-arima-johnson.R │ ├── 6g-ts-TTR-ma.R │ ├── 8-fa-quandl.R │ ├── 8-fa-quandl2.R │ ├── 8-fa-quantmod.R │ ├── 8-quantmod-I-stocks.R │ └── zz-practise.R ├── 93-wksp3/ │ ├── Graph-matrixplots.R │ ├── LMtrainTest.R │ ├── Links_DAR │ ├── Links_DAR.R │ ├── TS-arima-johnson.R │ ├── TS-components-airpassengers.R │ ├── TS-data.R │ ├── TS-dates.R │ ├── TS-lubridate.R │ ├── TS-movavg-Nile.R │ ├── TS-movavg.R │ ├── assocrule1.R │ ├── assocrule2.R │ ├── assocrule3.R │ ├── decisiontree1.R │ ├── decisiontree2.R │ ├── decisiontree3.R │ ├── decisiontree4.R │ ├── decisiontree5.R │ ├── decisiontree5CHAID.R │ ├── df.R │ ├── environ.R │ ├── freqtable.R │ ├── lm-salesarea.R │ ├── lm-salesqty.R │ ├── lm-women-simple.R │ ├── lm.R │ ├── logR.R │ ├── logr-gre.R │ ├── matrix.R │ ├── missingvalues.R │ ├── packages1.R │ ├── packages2.R │ ├── stats2.R │ ├── twitter.R │ ├── vectors.R │ ├── wordcloud1.R │ └── wordcloud2.R ├── 95-studqueries/ │ ├── 
Sapient_Big Data.R │ ├── achal1.R │ ├── achal1.csv │ ├── achal2.R │ ├── achal2.csv │ ├── deepak.R │ ├── hitesh-dec18.R │ ├── hitesh1.R │ ├── hitesh2.R │ ├── hitesh3.R │ ├── hitesh4.R │ ├── hiteshJul18.R │ ├── lalit1.R │ ├── meena1 │ ├── meena2.R │ ├── meena3.R │ ├── missingValue.R │ ├── sidana2.R │ ├── tanviTS1.R │ └── vivekIIMLN.R ├── 96-cancer/ │ ├── data-cancer.R │ ├── rf-cancer.R │ ├── svm-cancer1.R │ └── svm-examples.R ├── 96-iris/ │ └── sumgraph1.R ├── 96-mtcars/ │ ├── 10b-datastructures.R │ ├── 11b-mtcars.R │ ├── 11c-mtcars-filter.R │ ├── 11d-mtcars-descp.R │ ├── 11f-mtcars-loops.R │ ├── 11g-mtcars-sort.R │ ├── 11h-mtcars-dplyr.R │ ├── 12d-mtcars-graph1.R │ ├── 12e-mtcars-graph2.R │ ├── 12e-mtcars-summarise-dplyr.R │ ├── 12f-diag-ggplot2-mtcars.R │ ├── 12f-ggplot2-mtcars.R │ ├── 13b-mtcars-lm1.R │ ├── 13c-mtcars-lm2.R │ ├── 13e-mtcars-lm3.R │ ├── 14b-mtcars-logr.R │ ├── 15b-mtcars-DT-class.R │ ├── 15c-mtcars-DT-anova.R │ ├── 16b-mtcars-cluster1.R │ ├── 16c-mtcars-cluster2.R │ ├── 22f-tidyr-mtcars.R │ ├── mtcars-clust1.R │ ├── mtcars-hclust.R │ └── s1.R ├── 97-artwork/ │ ├── AuctionsData - artwork.csv │ ├── AuctionsData - set1.csv │ ├── artwork-cls1.R │ ├── artwork-descp.R │ ├── artwork-eda1.R │ ├── artwork-eda2.R │ ├── artwork-rought.R │ ├── artwork1.R │ ├── artwork2.R │ ├── artwork4.R │ ├── awdata1.R │ ├── density.R │ └── file2.R ├── 99-GD/ │ ├── aboutSL │ ├── gradientdescent1.R │ ├── gradientdescent2.R │ ├── gradientdescent3.R │ ├── gradientdescent4.R │ └── regr1.R ├── 99-HTML/ │ └── aboutUSL ├── 99-Misc/ │ ├── dhmethods.R │ ├── funcpgm1.R │ ├── h2o.R │ └── skimr-package.R ├── 99-json/ │ ├── 21b-json-format.R │ ├── 23b-xml-import.R │ ├── 25a-httr1.R │ ├── 25c-httr2.R │ ├── json-1.R │ └── json2.R ├── 99-phd/ │ ├── attendance2.R │ └── grades.R ├── 99-sports/ │ ├── cricket1.R │ └── cricket2-york.R ├── 99-weka/ │ ├── cls-ID3.R │ ├── cls-c45weka.R │ ├── clsW-iris.R │ └── clsW-iris2.R ├── MBArules.csv ├── README.md ├── Unsorted/ │ ├── CLT.R │ ├── R-Exercise.R │ 
├── RCommander.R.R │ ├── basiclm1.R │ ├── binomial.R │ ├── boxplot.R │ ├── c.R │ ├── central1.R │ ├── colstats1.R │ ├── complextables.R │ ├── cor1.R │ ├── crossfold1.R │ ├── cut1.R │ ├── cutprety1.R │ ├── datalevels.R │ ├── dbconnection.R │ ├── dec17.R │ ├── dec17b.R │ ├── demo1.R │ ├── density2.R │ ├── descstatsgraphs1.R │ ├── dplyr1.R │ ├── ds1.R │ ├── env1.R │ ├── examB.R │ ├── extra.R │ ├── fd1.R │ ├── fd2.R │ ├── fd3.R │ ├── fd4.R │ ├── fd5-means.R │ ├── googleS.R │ ├── knitr.R │ ├── kurtosis.R │ ├── lm-sales.R │ ├── lm1-sales.R │ ├── lm1.R │ ├── miscscripts.R │ ├── nd1.R │ ├── normal.R │ ├── paneldata1.R │ ├── plot1.R │ ├── practise-dec17c.R │ ├── practise.R │ ├── rattle1.R │ ├── rattle2.R │ ├── rcdr1.R │ ├── rjava.R │ ├── rle1.R │ ├── sample1.R │ ├── sample2.R │ ├── scripting1.R │ ├── skewness1.R │ ├── skewness2.R │ ├── skewness3.R │ ├── smpdist1.R │ ├── summary1.R │ ├── sumstats1.R │ ├── ttest1.R │ └── vaibhavi.R ├── cacert.pem ├── caseStudies/ │ └── allCases.R ├── data/ │ ├── AuctionsData - set1.csv │ ├── Churn.csv │ ├── Computers.csv │ ├── Dataset1-Media-Example-EDGES.csv │ ├── Dataset1-Media-Example-NODES.csv │ ├── Dataset2-Media-User-Example-EDGES.csv │ ├── Dataset2-Media-User-Example-NODES.csv │ ├── ItemList.csv │ ├── MA.RData │ ├── MBA.csv │ ├── MBArules.csv │ ├── MMM_raw_data_v02.csv │ ├── NPS Data Food Order v01.csv │ ├── Predict Merchant_Sales v01.csv │ ├── Prostate_Cancer.csv │ ├── Rules_20.csv │ ├── Sales.csv │ ├── Sales_files/ │ │ ├── 6006907 │ │ ├── frameworks-95aff0b550d3fe338b645a4deebdcb1b.css │ │ ├── frameworks-b3cd8fa1481bc34c4b18cf307ca75438.js.download │ │ ├── github-542f291c828bb453339765ba3a54c144.js.download │ │ └── github-cdaf214b636e7d0581fce94eda9de4bd.css │ ├── Segmentation_Data v01.csv │ ├── Social_Network_Ads.csv │ ├── StudentPassFail.csv │ ├── StudentTid1.csv │ ├── StudentTid2.csv │ ├── airpsng.csv │ ├── ar14.csv │ ├── ar14b.csv │ ├── artwork.rds │ ├── arulesfin.csv │ ├── attendance1.csv │ ├── attendance2.csv │ ├── 
attrition.csv │ ├── badata.Rdata │ ├── bakery.csv │ ├── bank.csv │ ├── binary.csv │ ├── bitsgoa.csv │ ├── cclogr.csv │ ├── clscredit.csv │ ├── clsplay.csv │ ├── clust_custseg.csv │ ├── dar1.csv │ ├── dar1w.csv │ ├── dar1w.csv.arff │ ├── dar2.csv │ ├── dar3.csv │ ├── dar3a.csv │ ├── dar3b.csv │ ├── data1.R │ ├── data4cluster2.csv │ ├── data_clus_2.csv │ ├── dataiitb.csv │ ├── dateformat1.R │ ├── denco.csv │ ├── dhiraj.csv │ ├── dtdata.csv │ ├── fintransactions.csv │ ├── grades.csv │ ├── groceries.csv │ ├── heart_tidy.csv │ ├── hhe.txt │ ├── iimS.xlsx │ ├── iima.csv │ ├── iimc1.csv │ ├── iimtrichy.csv │ ├── iitgfa.csv │ ├── iitgfa.xlsx │ ├── iitgfa2.xlsx │ ├── iris.csv │ ├── iris.xlsx │ ├── irisF.csv │ ├── irisT.csv │ ├── itemlist1 │ ├── km5_c2.csv │ ├── logr2.csv │ ├── msales.csv │ ├── mtcars.csv │ ├── mtcars.sas7bdat │ ├── mtcars.xlsx │ ├── mtcars1.csv │ ├── mtcarsF.csv │ ├── mtcarsT.csv │ ├── mushrooms.csv │ ├── my_basket │ ├── myexcel.xlsx │ ├── myitems1.csv │ ├── myrules1.csv │ ├── mytextcars.txt │ ├── myworkbook.xlsx │ ├── node1.csv │ ├── onsen.csv │ ├── pumba.csv │ ├── rep2.csv │ ├── rep4.csv │ ├── rep5.csv │ ├── revision1.csv │ ├── rules.csv │ ├── rulesR.csv │ ├── s1.csv │ ├── salesdata.csv │ ├── salesdatamonth.csv │ ├── salesslr.csv │ ├── slr1.csv │ ├── splitData1.R │ ├── stock.csv │ ├── stock1.csv │ ├── stock1.txt │ ├── student.csv │ ├── student1.csv │ ├── student1.xlsx │ ├── student2.xlsx │ ├── student3.xlsx │ ├── student3a.xlsx │ ├── studentdata.R │ ├── studentdata.csv │ ├── studentdata2.csv │ ├── studentdata3.txt │ ├── studentdata4.csv │ ├── students.csv │ ├── students3.csv │ ├── talltransactions.csv │ ├── tendulkar.csv │ ├── titanic.csv │ ├── titanic.raw.rdata │ ├── women.sav │ └── ximb.csv ├── dates/ │ ├── 11-date1.R │ ├── 12-dates-seq.R │ ├── 12c-dates-seq2.R │ ├── 13-dates-format.R │ ├── 14-dates-format2.R │ ├── 15-date-subset-arithmetic.R │ ├── 17-time-chron.R │ ├── 17c-time-posixt.R │ ├── 19-datetime-lubridate.R │ ├── 19c-datetime-lubridate.R │ ├── 
19d-datetime-lubridate.R │ ├── 19e-datetime-lubridate.R │ ├── 19f-datetime-lubridate.R │ └── 30-datetime-zzz.R ├── download/ │ ├── fms.txt │ ├── iris.csv │ ├── iris.xlsx │ ├── rowling.txt │ └── vector.R ├── iim.txt ├── misc/ │ ├── 1-ds.R │ ├── cswr.R │ ├── fms.txt │ ├── importcsv.R │ ├── mysqlR.R │ ├── nest.R │ └── timeszones.R ├── munaz.csv ├── mycars.csv ├── myexcelcars.xlsx ├── myrules1.csv ├── practise1.R ├── rAnalytics.Rproj ├── report/ │ ├── knit2.R │ ├── knitr-minimal.R │ ├── sample1.R │ └── sample2.R ├── trg/ │ ├── bennett1.R │ ├── bennett3.R │ ├── d1-fmssrcc.R │ ├── d2a-fmssrcc.R │ ├── d2b-fmssrcc.R │ ├── iima-d1.R │ ├── iima-d3.R │ ├── iima-d4.R │ ├── iimkpg-d6.R │ ├── iimkpv-d1.R │ ├── iimkpv-d4.R │ └── ximb-r.R └── twitter authentication.Rdata
Copy disabled (too large)
Download .json
Condensed preview — 1497 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (12,537K chars).
[
{
"path": ".gitignore",
"chars": 59,
"preview": ".Rproj.user\n.Rhistory\n.RData\n.Ruserdata\n*.json\n.httr-oauth\n"
},
{
"path": "0-Practise/day1.R",
"chars": 2625,
"preview": "# Day 1\n\nlibrary(ISLR)\ndata('Default')\nstr(Default)\nLR1 = glm(default ~ ., family='binomial', data=Default)\nsummary(LR1)"
},
{
"path": "0-Practise/day2.R",
"chars": 523,
"preview": "# Day 3 - Online batch of MA\n\n#attach function of R\nwomen\nnames(women)\nheight\nattach(women)\nheight\nweight\nwomen$height\n\n"
},
{
"path": "0-Practise/day3.R",
"chars": 779,
"preview": "attach(mtcars)\nplot(wt, mpg)\nabline(lm(mpg~wt))\ntitle(\"Regression of MPG on Weight\")\ndetach(mtcars)\n\ndose <- c(20, 30, 4"
},
{
"path": "0-Practise/first.R",
"chars": 421,
"preview": "# First File in R\nx1 <- c(1, 5, 4, 9, 0) # <- is assignment x to have value 1,5,4,9,0\n#control + enter\nx2 = c(1, 5, 4, 9"
},
{
"path": "0-Practise/htmlimport.R",
"chars": 3543,
"preview": "#Installing the web scraping package rvest\n#install.packages(\"rvest\")\nlibrary(rvest)\n#Specifying the url for desired web"
},
{
"path": "0-Practise/iims2.R",
"chars": 585,
"preview": "# data structures in R\n\n#way of storing and manipulating data in any programming languages\n\n#vector----\nx = c(1,2,3,6) #"
},
{
"path": "0-Practise/import2.R",
"chars": 1594,
"preview": "#web scrapping\n#https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands"
},
{
"path": "0-Practise/kt1.R",
"chars": 650,
"preview": "?which\n\nLETTERS\nwhich(LETTERS == \"K\") #11th alphabet\n\nmarks = c(10,30,40, 60)\nwhich(marks > 30) #position\nmarks[which("
},
{
"path": "0-Practise/lm-sim-test1.R",
"chars": 2052,
"preview": "#LM Simulation\n\nlibrary(car)\nlibrary(PerformanceAnalytics)\nlibrary(corrgram)\nlibrary(corrplot)\n\n#Data\nset.seed(1234); x1"
},
{
"path": "0-Practise/practise.R",
"chars": 4298,
"preview": "#List\nx; m1; a1; df1\ng =\"My First List\"\nh = c(25, 26,18,39)\nj = matrix(1:10,nrow=2)\nk = c('one','two','three')\nmylist = "
},
{
"path": "0-Practise/practise2.R",
"chars": 38,
"preview": "#misc practise\n\nx = 1:5\ndata.entry(x)\n"
},
{
"path": "0-Practise/rough.R",
"chars": 1226,
"preview": "# Rough Work\n?cat\n?dput\n?dget\n?dump\n?write\n?write.table\n?save\n?detach\n?attach\n?dir\n?ls\n?rm\n?attr\n?attributes\n\ndata1 = c("
},
{
"path": "0-Practise/vector.R",
"chars": 225,
"preview": "#Data Structure - Vectors\n\nx = c(1,5,7,8,4)\nx2 <- c(2,5,7,8,4)\nx\nx2\nx4 = c('M','F','M','F','M')\nx4\n(x5 = 1:100)\n(x6 = se"
},
{
"path": "0-class/AR-groceries.R",
"chars": 4648,
"preview": "# Association Rules - Groceries data set ####\n\nlibrary(arules) #install first\nlibrary(arulesViz) #install first\nlibrary"
},
{
"path": "0-class/CLUST-customer.R",
"chars": 1026,
"preview": "# HH MA example - customer\n\n#install.packages(\"amap\")\nlibrary(amap)\n##Read the data in the file\nurl = 'https://docs.goo"
},
{
"path": "0-class/DT-CART-sales.R",
"chars": 1334,
"preview": "# CART Models - HH Case Study - Regression\nlibrary(rpart)\nlibrary(rpart.plot)\nlibrary(forecast)\n\n\nlibrary(gsheet)\nurl='h"
},
{
"path": "0-class/NAvalues.R",
"chars": 1978,
"preview": "# Missing values\n\n#missing values are indicate NA keyword\nx = c(1, ,3) #wrong way to create missing values\nx = c(NA, 1"
},
{
"path": "0-class/autoML1.R",
"chars": 1094,
"preview": "# Auto ML in R\n\npacman::p_load(caret, randomForest)\nset.seed(123)\nn = 100\n\nsData <- data.frame(X1=rnorm(n), X2= rnorm(n)"
},
{
"path": "0-class/bigQuery.R",
"chars": 500,
"preview": "# big query\n#https://bigrquery.r-dbi.org/dev/\n#install.packages(\"bigrquery\")\n\nlibrary(bigrquery)\n\nlibrary(DBI)\n\nbilling "
},
{
"path": "0-class/hhe_d01.R",
"chars": 2180,
"preview": "#HHE Batch : Nov 2023\n#05Nov2023\nmtcars\n#control+enter to run a line and move ahead\n#customise shortcuts \n\n#vector----"
},
{
"path": "0-class/hhe_d02.R",
"chars": 3356,
"preview": "# HHE Batch : MA with R ; Munaz\n#day2------\n\n#vectors-----\n(rollNo = 1:100)\npaste('student',1:100)\n(name = paste('stud"
},
{
"path": "0-class/hhe_d03.R",
"chars": 4234,
"preview": "#HHE MAR - Day3\nlibrary(dplyr) #load the library\n\n#with and within-----\n\ndf = mtcars\n\n#with(DF, expr) : new Col\n#withi"
},
{
"path": "0-class/hhe_d04.R",
"chars": 2697,
"preview": "# HHE Day-4 : 19 Nov 2023 : Munaz\n# Supervised and Non-Supervised Learning\n# Hypothesis Testing\n# Linear & Logistic Regr"
},
{
"path": "0-class/hhe_d04b.R",
"chars": 2706,
"preview": "# day4 - MAR - Munaz\n#plan - Modeling\n\n#What is Linear Regression ? , What does it do ?, How do we run the model in R ? "
},
{
"path": "0-class/lm_AIC.R",
"chars": 1294,
"preview": "# AIC Linear Regression\n\n#https://bookdown.org/steve_midway/DAR/model-selection.html\n\nmtcars.lm1 <- lm(mpg ~ disp, data="
},
{
"path": "0-class/missingValues.R",
"chars": 758,
"preview": "#missing values\n\nrm(list = ls())\nis.na(x) # returns TRUE of x is missing\nx <- NA\nis.na(x) # returns TRUE of x is missing"
},
{
"path": "0-class/munaz.csv",
"chars": 72,
"preview": "\"rollno\",\"name\",\"age\"\n\"S01\",\"Dhiraj\",55\n\"S02\",\"Munaz\",42\n\"S03\",\"HHE\",30\n"
},
{
"path": "0-class/purchaseProb.R",
"chars": 106,
"preview": "# customer purchase probability\n\n#https://www.masterdataanalysis.com/r/using-r-predict-customer-will-buy/\n"
},
{
"path": "01-IIM/10a-daily.R",
"chars": 90,
"preview": "#daily practise File\n#keep yourself updated with common R commands and Modeling Techniques"
},
{
"path": "01-IIM/11-analyticLevels.R",
"chars": 3286,
"preview": "#all levels of analytics - Descriptive, Diagnostic, Predictive, Prescriptive\n\n#import from ggsheet\nlibrary(gsheet)\nslr1 "
},
{
"path": "01-IIM/11a1-start.R",
"chars": 843,
"preview": "#initial commands\n\n# assign\nx1 = 3 #press control + enter to run the line\nx2 <- 3 # same \nx3 < - 3 #incorrect\n#which is"
},
{
"path": "01-IIM/11a2-packages1.R",
"chars": 2903,
"preview": "#List of packages to be installed\n\n#Installing\nlistOfPackages = c('P1','P2')\ninstall.packages(listOfPackages)\n\n#first in"
},
{
"path": "01-IIM/11a3-packages2.R",
"chars": 3210,
"preview": "# Packages installation\n\n#List avl packages\nlibrary()\n\n\n#Total Avl Packages\nnrow(available.packages())\n\n#Install Package"
},
{
"path": "01-IIM/11a4-packages3.R",
"chars": 107,
"preview": "\npinstall <- c('rpart','rpart.plot', 'catools', 'caret','arules','arulesViz')\n\ninstall.packages(pinstall)\n\n"
},
{
"path": "01-IIM/11a5-packages4.R",
"chars": 1642,
"preview": "#Install packages for Data Analytics Course\n\n#install package\ninstall.packages('packageName')\n#multiple packages\ninstall"
},
{
"path": "01-IIM/11b2-DS1.R",
"chars": 7405,
"preview": "# Data Structures in R\n\n#control+enter when you are in the line to execute\n# Vectors-----\nc(2,4,6)\n?seq\nseq(2,10,.5)\nseq"
},
{
"path": "01-IIM/11b3-DS2-factor.R",
"chars": 716,
"preview": "#Factors\n#categories without (eg Gender) Order or with (eg Grades) Orders\n\n(grades = sample(c(LETTERS[1:4]), size=30, re"
},
{
"path": "01-IIM/11b4-DS4-reproduce.R",
"chars": 1694,
"preview": "#Reproducible Code\n\nmtcars \nhead(mtcars)\nrecode_mtcars <- dput(head(mtcars))\nnewdf <- recode_mtcars\nnewdf\n#if Df has fac"
},
{
"path": "01-IIM/11b5-DS3.R",
"chars": 718,
"preview": "#Data Structures - II\n#Home Work\n\n#Arrays\nlength(100:123)\n4*3*2\n#2 coys, 3 products, 4 locations sold qty\n(a1 = array(10"
},
{
"path": "01-IIM/11e2_vectors1.R",
"chars": 3215,
"preview": "#Case in Vector in R\n#Vector is single dim\n\n#creations of Vectors------\n#Vector values from 1 to 100\nx1 = 1:100\nx1\n\n#vec"
},
{
"path": "01-IIM/11e3_matrices1.R",
"chars": 2512,
"preview": "#Matrices in R\n#Matrix is a two dimensional data structure in R programming. Matrix is similar to vector but additionall"
},
{
"path": "01-IIM/11e4_dataframe1.R",
"chars": 2212,
"preview": "#Data Frame in R\n\n#A data frame is a table or a two-dimensional array-like structure in which each column contains value"
},
{
"path": "01-IIM/12a3-impexp1.R",
"chars": 1577,
"preview": "# Read Data into R Environment\n#CSV Files---- local or network\n#Read from CSV file in PC\nhead(mtcars)\nrownames(mtcars)\nw"
},
{
"path": "01-IIM/12a4-datasets.R",
"chars": 852,
"preview": "#data sets in R\n#https://lgatto.github.io/IntroMachineLearningWithR/example-datasets.html#edgar-andersons-iris-data\n\n#Ir"
},
{
"path": "01-IIM/12a4-impexp-xls.R",
"chars": 1140,
"preview": "# Read Data into R Environment - to/fro XLS\n\n#Excel----\n#Create a excel file with data in 2 sheets\n# first row contains "
},
{
"path": "01-IIM/12e4-impexp-gs.R",
"chars": 591,
"preview": "#import Data from Google Sheets\n\n#google sheets #publically shared\nurl1 = 'docs.google.com/spreadsheets/d/1I9mJsS5QnXF2T"
},
{
"path": "01-IIM/13a2-NAvalues.R",
"chars": 1978,
"preview": "# Missing values\n\n#missing values are indicate NA keyword\nx = c(1, ,3) #wrong way to create missing values\nx = c(NA, 1"
},
{
"path": "01-IIM/13b2-outliers.R",
"chars": 3333,
"preview": "#Outliers\n#https://www.statsandr.com/blog/outliers-detection-in-r/\n#outliers--------------------------\n#An outlier is a "
},
{
"path": "01-IIM/15a2-GPH-basic.R",
"chars": 2493,
"preview": "# Basic plots\n#plot, histogram, pie, boxplot, linechart, correlation plot\n\n#plot\nwomen\nplot(women)\n\n(x= seq(1,10))\n(x =c"
},
{
"path": "01-IIM/15a3-GPH-graphs.R",
"chars": 1590,
"preview": "# Combined Plots\n\n#plot, histogram, pie, boxplot, linechart, correlation plot\n\n#plot\nwomen\n?women\nstr(women)\nplot(women)"
},
{
"path": "01-IIM/15a4-GPH-advgraphs.R",
"chars": 640,
"preview": "#Advanced Graphs\nlibrary(corrgram)\ncor(mtcars[1:4])\ncorrgram(mtcars[1:4], order=TRUE, lower.panel=panel.shade, upper.pan"
},
{
"path": "01-IIM/16b1-GPH-wordcloud.R",
"chars": 9244,
"preview": "#Word Cloud\n#word & Freq; built in data sets\n\n#-----\n# World Cloud\n#http://stat.ethz.ch/R-manual/R-devel/library/base/ht"
},
{
"path": "01-IIM/16e0-GPH-wordcloud.R",
"chars": 1580,
"preview": "#Word Cloud in R\n\nlibrary(wordcloud)\n\n?wordcloud\n#wordcloud(words,freq,scale=c(4,.5),min.freq=3,max.words=Inf, random.or"
},
{
"path": "01-IIM/16e3-GPH-wordcloud-text1.R",
"chars": 2473,
"preview": "# World Cloud 2\nlibrary(wordcloud)\nlibrary(RColorBrewer)\nlibrary(SnowballC)\nlibrary(RCurl)\nlibrary(XML)\nlibrary(tm)\n\n# R"
},
{
"path": "01-IIM/16e4-GPH-wordcloud-text2.R",
"chars": 2857,
"preview": "#Word Cloud Text File\n\nStep 1 : Install and load the required packages\n# Load\nlibrary(\"tm\")\nlibrary(\"SnowballC\")\nlibrary"
},
{
"path": "01-IIM/17a2-STATS-freqtable.R",
"chars": 1199,
"preview": "# Frequency Distribution\n\n#Discrete Cat Data\n(attend = c('A','P','P','A','P','A'))\ntable(attend)\ncbind(table(attend)) #"
},
{
"path": "01-IIM/17c1-STATS-datapartition.R",
"chars": 1829,
"preview": "#partition the data into train and test set\nmtcars\nnrow(mtcars)\n#train-70%, test-30%\n(myvalues = 1:32)\nselected = sample"
},
{
"path": "01-IIM/17d2-STATS-basicstats.R",
"chars": 625,
"preview": "# Basic Stats\nx = ceiling(rnorm(10000, mean=60, sd=20))\nmean(x)\nmedian(x)\n#there is no mode function for mode stats\ntabl"
},
{
"path": "01-IIM/18d1-DPLYR-mtcars1.R",
"chars": 1581,
"preview": "#Data Summarisation using Dplyr \n#dataset - mtcars\n\n#dplyr - mtcars\n#install.packages('dplyr')\nlibrary(dplyr) #install t"
},
{
"path": "01-IIM/18d3-DPLYR-mtcars2.R",
"chars": 6825,
"preview": "# Data Summarisation - dplyr\n#Home work\n\n#Data Summarisation using Dplyr \n#dataset - mtcars\n\n#dplyr - mtcars\n#install.pa"
},
{
"path": "01-IIM/18d4-DPLYR-dplyr.R",
"chars": 6614,
"preview": "#dplyr - mtcars\nlibrary(dplyr)\n#library(tidyverse)\n#Filter----\n\nfilter(mtcars, cyl == 8)\nfilter(mtcars, cyl < 6)\n\n# Mult"
},
{
"path": "01-IIM/21a1-SLM-women.R",
"chars": 1497,
"preview": "#topics ----\n#factors, env, import/export. package install\n#rep, recode, split, partition, subset, loops, cast & melt\n#m"
},
{
"path": "01-IIM/21a2-SLM-women.R",
"chars": 2330,
"preview": "# Regression Analysis\n# Simple Linear with 1 IV and 1 DV\n\ndata(women)\nwomen\nnames(women)\nstr(women)\n\ncov(women$height, w"
},
{
"path": "01-IIM/21a4-SLM-women.R",
"chars": 215,
"preview": "\nfit = lm(weight ~ height, data=women)\nsummary(fit)\nrange(women$height)\n(ndata = data.frame(height= c(58.5, 60.7)))\n(p ="
},
{
"path": "01-IIM/21a5-SLM-women-A.R",
"chars": 2553,
"preview": "# Simple Linear Regression : Built in Data Set Women\n# Check for assumptions of Regression in the data Set\nwomen\n?women\n"
},
{
"path": "01-IIM/21b1-SLM-sales.R",
"chars": 3961,
"preview": "#Multiple Linear Regression \n#Linear Modeling : DV vs more than 1 IVs\n#sales Qty vs price & promotion\n\n#Omni Store\n#crea"
},
{
"path": "01-IIM/21b3-SLM-areasales.R",
"chars": 1802,
"preview": "# SLR Area vs Sales\n#https://www.statisticshowto.datasciencecentral.com/excel-regression-analysis-output-explained/\n\n#im"
},
{
"path": "01-IIM/21b4-SLM-salesarea.R",
"chars": 5461,
"preview": "#Simple Linear Regression - Case Study\n# Regression : Areas vs Sales\n#Given data of area and sales, predict value for sa"
},
{
"path": "01-IIM/21d2-MLM-mtcars1.R",
"chars": 828,
"preview": "#https://cran.r-project.org/web/packages/olsrr/olsrr.pdf\n#install.packages('olsrr')\nlibrary(olsrr) #install it first\n\nmo"
},
{
"path": "01-IIM/22a4-MLM-allmodels.R",
"chars": 5854,
"preview": "# All models - This code performs all modeling in quick method.\n# for details go detail code.\n\nlibrary(dplyr)\n\n#Linear R"
},
{
"path": "01-IIM/22c2-MLM-mtcars-olsrr.R",
"chars": 309,
"preview": "#https://cran.r-project.org/web/packages/olsrr/olsrr.pdf\n#install.packages('olsrr')\nlibrary(olsrr)\nhead(mtcars)\nnames(mt"
},
{
"path": "01-IIM/22c3-MLM-cars.R",
"chars": 931,
"preview": "\n\n#http://r-statistics.co/Linear-Regression.html\nhead(cars) \nfit2=lm(dist ~ speed, data=cars)\nsummary(fit2)\npredict(fit2"
},
{
"path": "01-IIM/22c3-MLM-salespromotion.R",
"chars": 3469,
"preview": "#Multiple Linear Regression : DV vs more than 1 IVs\n#sales Qty vs price & promotion\n#Predict Sales Qty from Price and Pr"
},
{
"path": "01-IIM/22d3-MLM-omni.R",
"chars": 1370,
"preview": "#Multiple Linear Regression \n#Linear Modeling : DV vs more than 1 IVs\n#sales Qty vs price & promotion\n\n#Omni Store\n#crea"
},
{
"path": "01-IIM/22d4-MLM-sales-TV.R",
"chars": 615,
"preview": "#Linear Model : Sales - TV, Radio, Newspaper\n\nlibrary(dplyr)\ndf = read.table(\"https://online.stat.psu.edu/onlinecourses/"
},
{
"path": "01-IIM/23c1-LOGR-logR.R",
"chars": 2819,
"preview": "# Logistic Regression\n#data() # datasets available for use in R\n\n# Load the textbook R package\nlibrary(ISLR) #install"
},
{
"path": "01-IIM/23d1-LGR-gre.R",
"chars": 4039,
"preview": "#Logistic Regresion : GRE\n#https://stats.idre.ucla.edu/r/dae/logit-regression/\n#A researcher is interested in how variab"
},
{
"path": "01-IIM/24b1-LOGR-purchase.R",
"chars": 2205,
"preview": "# Logistic Regression : Predict Purchase\n\n# Import the dataset\n#df1 = read.csv('./data/logr2.csv')\n#head(df1)\n\nurl=\"http"
},
{
"path": "01-IIM/24c2-LOGR-adult.R",
"chars": 2962,
"preview": "#Logistic Regression : Binary Cls : 0 or 1\n\n#Case Study : predict if an individual will earn more than $50K using logis"
},
{
"path": "01-IIM/24d2-LOGR-gre.R",
"chars": 5354,
"preview": "#Logistic Regresion : GRE\n#https://stats.idre.ucla.edu/r/dae/logit-regression/\n#A researcher is interested in how variab"
},
{
"path": "01-IIM/24e2-LOGR-general.R",
"chars": 887,
"preview": "#generic LogRegession\n\n\n#load data\ndata <- read.csv(....)\n\n#create training and validation data from given data\ninstall."
},
{
"path": "01-IIM/24g1-LOGR-cancer.R",
"chars": 1715,
"preview": "#logistic Regression\n#https://www.machinelearningplus.com/machine-learning/logistic-regression-tutorial-examples-r/\n# Lo"
},
{
"path": "01-IIM/24g2-LOGR-sample1.R",
"chars": 1015,
"preview": "#generic LogRegession\n\n\n#load data\ndata <- read.csv(....)\ndata = mtcars\n\n#create training and validation data from given"
},
{
"path": "01-IIM/31a1-DT-cart-split.R",
"chars": 3071,
"preview": "#Understanding Splitting and selection of variables\n\n#install the libraries\npacman::p_load(rpart, rpart.plot)\n#library(r"
},
{
"path": "01-IIM/31a2-DT-outlook.R",
"chars": 1514,
"preview": "#Decision Tree - Book Eg : Weather - Predict Play\n\noutlook =c('Sunny', 'Sunny', 'Overcast', 'Rain','Rain', 'Rain','Overc"
},
{
"path": "01-IIM/31a3-DT-general.R",
"chars": 5175,
"preview": "#Basic Steps in CART\n\n#Collect Data\n#Identify if it is Classification or Regression Model\n#Write the formula : y ~ x` + "
},
{
"path": "01-IIM/31a3-DT-practiseCase.R",
"chars": 4546,
"preview": "#DT - simple case\n#https://www.gormanalysis.com/blog/decision-trees-in-r-using-rpart/\n#https://en.wikipedia.org/wiki/Sta"
},
{
"path": "01-IIM/31b1-DT-CART-carseats.R",
"chars": 1482,
"preview": "# Decision Trees : - regression tree\n#install and load this library\nlibrary(ISLR)\ndata(Carseats)\n?Carseats\ndata = Carsea"
},
{
"path": "01-IIM/31b2-DT-CART-sales.R",
"chars": 1334,
"preview": "# CART Models - HH Case Study - Regression\nlibrary(rpart)\nlibrary(rpart.plot)\nlibrary(forecast)\n\n\nlibrary(gsheet)\nurl='h"
},
{
"path": "01-IIM/31b3-DT-CART-titanic.R",
"chars": 2033,
"preview": "# Decision Tree - Classification\n#we want predict for combination of input variables, is a person likely to survive or n"
},
{
"path": "01-IIM/31b4-DT-CART-loan.R",
"chars": 433,
"preview": "#Decision Tree - Not completed\n#(https://rpubs.com/fabiorocha5150/decisiontreemodel)\n\nurl='https://raw.githubusercontent"
},
{
"path": "01-IIM/31b5-DT-CART-gre.R",
"chars": 2762,
"preview": "#Decision Tree - GRE\nlibrary(dplyr)\nlibrary(rpart)\nlibrary(rpart.plot)\nlibrary(ggplot2)\n\n#https://stats.idre.ucla.edu/r"
},
{
"path": "01-IIM/31b5-DT-loanapproved1.R",
"chars": 1144,
"preview": "# Decision Tree # loanapproved = age + job + house + credit\n\nlibrary(rpart)\nlibrary(rpart.plot)\nlibrary(dplyr)\n\nloanappr"
},
{
"path": "01-IIM/31b5-DT-rpart-iris.R",
"chars": 1563,
"preview": "#CART Regression Tree\n\n#Load Libraries\nlibrary(rpart) #does only binary splits; CART\nlibrary(rpart.plot)\n\n#DataSet\nstr("
},
{
"path": "01-IIM/31b7-DT-party.R",
"chars": 1719,
"preview": "#Decision Tree 2\n#libraries - partykit, strucchange\n\n#ctree - Classification and Regression\nlibrary(partykit) # use this"
},
{
"path": "01-IIM/31f1-DT-cart2.R",
"chars": 1473,
"preview": "#outlook\n\noutlook =c('Sunny', 'Sunny', 'Overcast', 'Rain','Rain', 'Rain','Overcast', 'Sunny', 'Sunny','Rain','Sunny', 'O"
},
{
"path": "01-IIM/32a4-DT-cart1.R",
"chars": 3566,
"preview": "#Decision Tree\n\n#CART : Classification and Regression Tree\nlibrary(rpart)\nlibrary(rpart.plot)\nlibrary(dplyr)\n#model= rpa"
},
{
"path": "01-IIM/32a4-DT-rf-4.R",
"chars": 466,
"preview": "#https://lgatto.github.io/IntroMachineLearningWithR/supervised-learning.html#random-forest\n\nlibrary(\"mlbench\")\ndata(Sona"
},
{
"path": "01-IIM/32c2-DT-chaid-usvote.R",
"chars": 809,
"preview": "#CHAID - dataset USvote #multisplit\n# require(rsample) # for dataset and splitting also loads broom and tidyr\ninstall.pa"
},
{
"path": "01-IIM/32d1-DT-CHAID-usvote.R",
"chars": 987,
"preview": "#CHAID - dataset USvote #multisplit\n# require(rsample) # for dataset and splitting also loads broom and tidyr\n#install.p"
},
{
"path": "01-IIM/32d5-DT-CART-RF.R",
"chars": 4084,
"preview": "# Classification using DT and RF\n\n#load libraries\nlibrary(ggplot2)\nlibrary(randomForest)\nlibrary(rpart)\nlibrary(rpart.pl"
},
{
"path": "01-IIM/32k1-CLS-svm1.R",
"chars": 1025,
"preview": "#SVM using caret package\n#http://dataaspirant.com/2017/01/19/support-vector-machine-classifier-implementation-r-caret-pa"
},
{
"path": "01-IIM/33g3-CLS-randomForest1.R",
"chars": 5465,
"preview": "# Random Forests\n#http://rpubs.com/Jeffery/titanic\n\n#import data from online site\npath = 'https://raw.githubusercontent."
},
{
"path": "01-IIM/34a1-CLUST-clustering.R",
"chars": 3827,
"preview": "#Clustering\n#sample data, iris, no of clusters\nlibrary(cluster)\nlibrary(fpc)\nlibrary(dplyr)\n#kmeans -\n#sample Data\nmarks"
},
{
"path": "01-IIM/34a1-CLUST-samplecase.R",
"chars": 719,
"preview": "# Clustering\n\nset.seed(1234)\nsubject1 = trunc(rnorm(30, mean=60, sd=15))\nrange(subject1)\nsubject1\nmarks = data.frame(sub"
},
{
"path": "01-IIM/34b2-CLUST-customer.R",
"chars": 1024,
"preview": "# HH MA example - customer\n\n#install.packages(\"amap\")\nlibrary(amap)\n##Read the data in the file\nurl = 'https://docs.goo"
},
{
"path": "01-IIM/34d1-CLUST-creditData.R",
"chars": 4152,
"preview": "#Clustering - Credit Data\nlibrary(dplyr)\nlibrary(ggplot2)\nlibrary(gridExtra)\nlibrary(cluster) # clustering algorithms"
},
{
"path": "01-IIM/34e4-clust-NOC-iris.R",
"chars": 1375,
"preview": "#Optimal Number of Clusters in data\n#Reduce total within ss\niris\ndim(iris)\nhead(iris)\ntable(iris$Species)\n\ndata = iris[-"
},
{
"path": "01-IIM/34g1-CLUST-segmentation.R",
"chars": 1028,
"preview": "# HH MA example - customer\n\n#install.packages(\"amap\")\nlibrary(amap)\n##Read the data in the file\nurl = 'https://docs.goo"
},
{
"path": "01-IIM/34h1-CLUST-clustering.R",
"chars": 1006,
"preview": "## Clustering\n\n#install.packages(\"amap\")\nlibrary(amap)\n##Read the data in the file\nurl = 'https://docs.google.com/spread"
},
{
"path": "01-IIM/34h1-CLUST-clustering2.R",
"chars": 695,
"preview": "#clustering - segmentation into groups\n\n(salary = rnorm(20, mean=60, sd=10))\n(age = rnorm(20, mean=30, sd=5))\ndf1 = data"
},
{
"path": "01-IIM/34j2-CLUST-NOC.R",
"chars": 1403,
"preview": "#Optimal Number of Clusters in data\n#Reduce total within ss\nlibrary(NbClust)\n\niris\ndim(iris)\nhead(iris)\ntable(iris$Speci"
},
{
"path": "01-IIM/35d1-CLUST-hclust1.R",
"chars": 3686,
"preview": "# RinA - Hierarchical Clustering\n\n#Packages Required - flexclust,cluster, NbClust\n#Dataset - fkexclust::nutrient\n#data(n"
},
{
"path": "01-IIM/35d2-CLUST-hclust2.R",
"chars": 1152,
"preview": "#simple case of hierarchial clustering\n\nlibrary(flexclust)\ndata(nutrient, package='flexclust')\nhead(nutrient)\nnutrient.s"
},
{
"path": "01-IIM/37a2-AR-eg.R",
"chars": 1845,
"preview": "# Association Rule - Simple Example Case\n# read this pdf for help\n#https://cran.r-project.org/web/packages/arules/arules"
},
{
"path": "01-IIM/37b1-AR-groceries.R",
"chars": 4648,
"preview": "# Association Rules - Groceries data set ####\n\nlibrary(arules) #install first\nlibrary(arulesViz) #install first\nlibrary"
},
{
"path": "01-IIM/37b2-AR-samplecase.R",
"chars": 2802,
"preview": "# Association Rule - Simple Example Case\n# read this pdf for help\n#https://cran.r-project.org/web/packages/arules/arules"
},
{
"path": "01-IIM/37b3-AR-groceries-subset.R",
"chars": 3625,
"preview": "#AR - Groceries - Subset\n\n#Subsetting rules and itemsets\nrules <- apriori(Groceries, parameter = list(support=.001, conf"
},
{
"path": "01-IIM/37b5-AR-finproducts.R",
"chars": 3747,
"preview": "# AR data for Finance\n\nlibrary(\"arules\")\n\nset.seed(101)\ntransactionID = sample(1:500, 1000, replace=T)\ntransactionID\ntab"
},
{
"path": "01-IIM/37b5-AR-groceries.R",
"chars": 4456,
"preview": "# Association Rules - Groceries data set ####\n#load the libraries\n#load the data set - transaction format built in - Gro"
},
{
"path": "01-IIM/37c3-AR-redundant.R",
"chars": 2830,
"preview": "# Association Rule - Simple Example Case\n# read this pdf for help\n#https://cran.r-project.org/web/packages/arules/arules"
},
{
"path": "01-IIM/44b1-TS-data.R",
"chars": 2334,
"preview": "# create a time series data\n\n#first create a vector of numerical values\n# 36 observations \nset.seed(1234)\n(sales = round"
},
{
"path": "01-IIM/44b1-TS-dates.R",
"chars": 3833,
"preview": "#Data Format in R \n#When we import data into R, dates and times are usually stored as character or factor by default due"
},
{
"path": "01-IIM/44c2-TS-dates-lubridate.R",
"chars": 4640,
"preview": "#Package Lubridate https://data.library.virginia.edu/working-with-dates-and-time-in-r-using-the-lubridate-package/\n# Dat"
},
{
"path": "01-IIM/44c4-TS-xts-data.R",
"chars": 662,
"preview": "# xts - create object and export data\n\nlibrary(xts)\n\n#create matrix : 1 col for 1 share\n(stockprices = matrix(c(100,103"
},
{
"path": "01-IIM/45d2-TS-airpassengers.R",
"chars": 3657,
"preview": "# Time Series Case Study - Decomposition\n\n#https://rpubs.com/emb90/137525\n# Data Set - AirPassengers\nx=c(9.23221232,5.34"
},
{
"path": "01-IIM/45d3-TS-components-airp.R",
"chars": 3469,
"preview": "# Time Series Case Study - Decomposition\n\n#https://rpubs.com/emb90/137525\n# Data Set - AirPassengers\nx=c(9.23221232,5.34"
},
{
"path": "01-IIM/45d4-TS-johnson.R",
"chars": 908,
"preview": "#Johnson Case - TS\n#time series analysis\n#plot, decompose, forecast, \nJohnsonJohnson\n?JohnsonJohnson\n\nmonthplot(JohnsonJ"
},
{
"path": "01-IIM/45g3-TS-TTR-ma.R",
"chars": 1319,
"preview": "#Time Series - SMA\n#packages used - TTR and forecast\n\nlibrary(forecast)\n#MA\n#https://www.rdocumentation.org/packages/for"
},
{
"path": "01-IIM/46e3-TS-auto-arima-johnson.R",
"chars": 2667,
"preview": "#Times Series Analysis \n# is the price of Johnson and Johnson shares change over time\n# are there quarterly effects with"
},
{
"path": "01-IIM/55c1-TM-twitter.R",
"chars": 7282,
"preview": "#new Twitter\n#Learning Business Analytics #businessanalytics using R @R_Programming at @IIMBodhGaya with @curtisak"
},
{
"path": "01-IIM/55e1-TM-twitter1.R",
"chars": 3941,
"preview": "#Twitter 1 - Configure Tweets and Download them\n#@dupadhyaya #Working using my Keys\n#---Learning Business Analytics #b"
},
{
"path": "01-IIM/61b1-LP-marketingspend.R",
"chars": 149,
"preview": "#LP in R : Marketing Spend\n#https://analyticsprofile.com/business-analytics/how-to-optimise-digital-marketing-spend-usin"
},
{
"path": "01-IIM/61c2-LP-marketingspend-case.R",
"chars": 620,
"preview": "# LP - Marketing Spend\n## Code to solve LP\n\n#install.packages(\"linprog\")\nlibrary(linprog)\n\nMax_ROI = c(0.07, 0.03, 0.15,"
},
{
"path": "01-IIM/61d2-LP-tpt.R",
"chars": 1209,
"preview": "# LP - Transportation Problem \n#https://docs.google.com/spreadsheets/d/1G6-iPDoD_i4THQAHwBeOLeiTfuqn7a6Q7MrOg9v1C5U/edit"
},
{
"path": "01-IIM/61e2-LP-machassign.R",
"chars": 1232,
"preview": "#----------------------------------------------#\n#Another Method\n#https://cran.r-project.org/web/packages/lpSolveAPI/lpS"
},
{
"path": "01-IIM/61e5-LP-farmer1.R",
"chars": 2592,
"preview": "#Farmer Problem in LP\n#A farmer plans to plant two crops, A and B. The cost of cultivating crop A is $40/acre, whereas t"
},
{
"path": "01-IIM/77a1-FA-quandl.R",
"chars": 4103,
"preview": "# Finance Stock Analysis\n#Stock Download\n\nlibrary(Quandl)\n#https://www.quandl.com/account/api 4D8hkYAV4WEkcTmD9LMW\n\nQuan"
},
{
"path": "01-IIM/77a2-FA-quandl2.R",
"chars": 3687,
"preview": "# Finance Stock Analysis\n\n#Install Packages \n#pckgs<-c(\"Quandl\",\"Sweep\",\"tidyverse\")\n#install.packages(pckgs,dependencie"
},
{
"path": "01-IIM/77a5-FA-quantmod.R",
"chars": 3748,
"preview": "#https://ntguardian.wordpress.com/2017/03/27/introduction-stock-market-data-r-1/\n#Stock Analysis \n\n# Get quantmod\nif (!"
},
{
"path": "01-IIM/77a6-FA-quantmod-I-stocks.R",
"chars": 1822,
"preview": "#Indian Stocks\n\n# Indian Stocks\n#stocks2 - Quantitive Financial Modeling\n#https://cran.r-project.org/web/packages/quantm"
},
{
"path": "01-IIM/77f2-FA-quantmod1.R",
"chars": 3588,
"preview": "#https://ntguardian.wordpress.com/2017/03/27/introduction-stock-market-data-r-1/\n#Stock Analysis \n\n# Get quantmod\nif (!"
},
{
"path": "01-IIM/77f3-FA-indianstocks.R",
"chars": 863,
"preview": "# Indian Stocks\n\nlibrary(quantmod)\nstart <- as.Date(\"2017-01-01\")\nend <- as.Date(\"2018-10-01\")\ngetSymbols(\"SBIN.NS\", src"
},
{
"path": "01-IIM/91ab3-Case-student1.R",
"chars": 5943,
"preview": "# Data Manipulation : Academic Data\n\n#Method1 : gsheet\nlibrary(gsheet)\nurl= \"https://docs.google.com/spreadsheets/d/1qLH"
},
{
"path": "01-IIM/91b4-Case-dencoCase.R",
"chars": 4430,
"preview": "# Case Study - Denco \n#Manufacturing Firm with sales data of partnum and customer with region wise sales\n\n# Should know"
},
{
"path": "01-IIM/91b5-Case-denco.R",
"chars": 3868,
"preview": "#Case Study - Manufacturing Coy\n\nlibrary(gsheet)\nurl = \"https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40T"
},
{
"path": "01-IIM/91g4-CASE-dencoCase.R",
"chars": 3394,
"preview": "# Case Study - Denco \n\n#read file : Method1\nsales1 = read.csv(\"./data/denco.csv\")\nstr(sales1)\n\n#read file : Method2\nsal"
},
{
"path": "01-IIM/revision1.R",
"chars": 18153,
"preview": "#Revision of Topics\n\nmtcars\n?mtcars #help on mtcars\nclass(mtcars)\nx= 1:5\nclass(x)\nx\ny=c(1, 3.5,5)\ny\nclass(y)\nclass(as."
},
{
"path": "01-IIM/test.R",
"chars": 121,
"preview": "\nmtcars\nprint(tail(mtcars))\niris\nprint(head(iris))\nwomen\nplot(women)\n\n\nlapply(mtcars, FUN=mean)\nsapply(mtcars, FUN=mean)"
},
{
"path": "01-IIM/x55a1-TM-tweets.R",
"chars": 142,
"preview": "#TWeets\n\n\nLearning Business Analytics #businessanalytics using R @R_Programming at @IIMBodhGaya with @curtisakshay"
},
{
"path": "01-IIM/x55a2-TM-twitter.R",
"chars": 3404,
"preview": "#Twitter 1 - Configure Tweets and Download them\n#@dupadhyaya #Working using my Keys\n#Load libraries\nlibrary(\"curl\")\nlib"
},
{
"path": "01-IIM/x55a3-TM-twitter2.R",
"chars": 6389,
"preview": "\n#https://cran.r-project.org/web/packages/rtweet/vignettes/intro.html\n#https://rtweet.info/\n#https://www.rdocumentation."
},
{
"path": "02-IIMcases/DT_diabetis.R",
"chars": 1901,
"preview": "#Decision Tree : Similar eg in Python\n#https://www.kaggle.com/aungpyaeap/diabetes-test-using-decision-tree\n\nurl='https:/"
},
{
"path": "02-IIMcases/case-denco.R",
"chars": 1547,
"preview": "#Case Study - Manufacturing Coy\n\nlibrary(gsheet)\nurl = \"https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40T"
},
{
"path": "02-IIMcases/case-dplyr-mtcars.R",
"chars": 6616,
"preview": "#dplyr - mtcars\nlibrary(dplyr)\nmtcars\nhead(mtcars)\n\n#Filter----\n\nfilter(mtcars, cyl == 8)\nfilter(mtcars, cyl < 6)\nmtcars"
},
{
"path": "02-IIMcases/rev_iima20_1.R",
"chars": 608,
"preview": "#Revision Session - IIMA - Sep 2020\n\n#vectors-----\nx = 1:10000\nclass(x)\n(marks = rnorm(n=100, mean=60, sd=10))\n\n#matrice"
},
{
"path": "02-IIMcases/revision1.R",
"chars": 5194,
"preview": "#Revision - IIM\n\n#data Structures - vectors, matrix, dataframe, factors\n(vector1 = c('a','b','E', 'Dhiraj', 'Vishnu', '"
},
{
"path": "02-IIMcases/senitmentTwitter.R",
"chars": 2684,
"preview": "#Sentiment - twitter\n\n# Tweeter Download and Analysis\n#connect all libraries\nlibrary(plyr)\nlibrary(dplyr)\nlibrary(string"
},
{
"path": "02-IIMcases/textMiningSentence.R",
"chars": 4869,
"preview": "#Text Mining\n\n#Loading Packages\nlibrary(tm)\nlibrary(wordcloud)\nlibrary(RColorBrewer)\n#Now in order to process or clean t"
},
{
"path": "03-setup/11a-start.R",
"chars": 432,
"preview": "# R Environment\n\n# The operators <- and = assign into the environment in which they are # evaluated. The operator <- can"
},
{
"path": "03-setup/11b-gettingstarted.R",
"chars": 186,
"preview": "# Getting Started- Basics/Shortcuts in R\n# For Help\n?mean\n\nsessionInfo()\nlibrary(dplyr)\nsessionInfo()\n\ndetach(\"package:d"
},
{
"path": "03-setup/12-packageInstall.R",
"chars": 1510,
"preview": "#packages install\n\n#Total Avl Packages\nnrow(available.packages()) #C\n#16000\n\n#install r tools\n\ninstall.packages('pacman'"
},
{
"path": "03-setup/15a-envrm.R",
"chars": 984,
"preview": "# Workspace & Environment\n\n# Identify the directory\ngetwd()\n\n# List all the objects in directory\ndir()\n\n# How to run the"
},
{
"path": "03-setup/15b-renv.R",
"chars": 383,
"preview": "# Sys Env\n\n## whether HOST is set will be shell-dependent e.g. Solaris' csh does not.\nSys.getenv(c(\"R_HOME\", \"R_PAPERSIZ"
},
{
"path": "03-setup/15e-rjava.R",
"chars": 208,
"preview": "#rJava Settings\n\n\n#install java https://java.com/en/download/win10.jsp\n\n\nlibrary(xlsx)\nlibrary(rJava)\n\nSys.setenv(JAVA_H"
},
{
"path": "03-setup/16a-pathconfig.R",
"chars": 284,
"preview": "# Lib paths\n\n.libPaths()\n\nfile.exists(\"~/.Rprofile\")\nfile.edit(\"~/.Rprofile\")\n# Add these lines to Rprofile\n# .First = f"
},
{
"path": "03-setup/17a-rstudio.R",
"chars": 109,
"preview": "# rstudio\n\n#Tools -> Global Options\n#code -> editing -> Soft Wrap\n\n# themes\n\n#fonts\n\n#colors\n\n#size of Fonts\n"
},
{
"path": "03-setup/18a-processtime.R",
"chars": 1678,
"preview": "# Execution Time\nproc.time()\ng <- rnorm(100000)\nh <- rep(NA, 100000)\n\n# Start the clock!\nptm <- proc.time()\n\n# Loop thro"
},
{
"path": "03-setup/21a-floorceiling1.R",
"chars": 690,
"preview": "# R Tips1\n\n#round----\nround(14.5378, digits=2) # 14.54\nround(14.5378, digits=1) # 14.5\nround(14.5378) #15\n\n#Significant-"
},
{
"path": "03-setup/21b-options.R",
"chars": 1204,
"preview": "#Options\n#Allow the user to set and examine a variety of global options which affect the way in which R computes and dis"
},
{
"path": "03-setup/24a-github.R",
"chars": 8,
"preview": "# github"
},
{
"path": "03-setup/25a-help.R",
"chars": 1579,
"preview": "# Misc Commands\n#https://www.r-project.org/help.html\n\n#library for help-----\nlibrary(swirl) # for online help\nswirl()\n\n"
},
{
"path": "03-setup/51c-deletefiles.R",
"chars": 393,
"preview": "# Delete Files from command\n\nwrite.csv(mtcars, 'mcars.csv')\nfile.exists('mcars.csv')\nunlink('mcars.csv')\nfile.exists('mc"
},
{
"path": "03-setup/envVar.R",
"chars": 290,
"preview": "#Env Variables\nSys.getenv(\"PATH\")\nSys.getenv(\"JAVA_HOME\")\nSys.setenv(JAVA_HOME='C\\\\Program Files\\\\Java\\\\jre1.8.0_291\\\\bi"
},
{
"path": "03-setup/help.R",
"chars": 128,
"preview": "# Misc Commands\n\nlibrary(swirl) # for online help\nswirl()\n\ndata() # to see in built data sets\n\nmean(mtcars$mgp)\n?mean\n"
},
{
"path": "03-setup/pathconfig.R",
"chars": 283,
"preview": "# Lib paths\n\n.libPaths()\nfile.exists(\"~/.Rprofile\")\nfile.edit(\"~/.Rprofile\")\n# Add these lines to Rprofile\n# .First = fu"
},
{
"path": "04-lib/10a-fBasics.R",
"chars": 990,
"preview": "#Package fBasics\n#http://math.furman.edu/~dcs/courses/math47/R/library/fBasics/html/015A-BasicStatistics.html\n\nlibrary(f"
},
{
"path": "04-lib/11a-pysch.R",
"chars": 65,
"preview": "#Descriptive Values of data set\n\nlibrary(psych)\ndescribe(mtcars)\n"
},
{
"path": "04-lib/21b-installpackages.R",
"chars": 696,
"preview": "#Install packages for Data Analytics Course\n\n\npackages1 = c('rJava','xlsx','dplyr')\npackages2 = c('plyr', 'psych', 'tm',"
},
{
"path": "04-lib/21e-installFmGit.R",
"chars": 692,
"preview": "#install packages from Git Hub\n\n#lubripack\ninstall.packages('lubripack') #NA for some versions\n#install older version o"
},
{
"path": "04-lib/21g-packages1.R",
"chars": 2864,
"preview": "# Packages installation\n#https://www.rstudio.com/products/rpackages/\n\n#List avl packages\nlibrary()\n\n#Total Avl Packages\n"
},
{
"path": "04-lib/31b-datasets.R",
"chars": 1126,
"preview": "# Data Sets\n#https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html\n\n#built in datasets in base\nmtcars"
},
{
"path": "04-lib/31c-datasets.R",
"chars": 448,
"preview": "#Datasets\n\n?datasets # Using R's built in data sets\ndata()\n\nlibrary(help=datasets)\n\ndata(mtcars) "
},
{
"path": "04-lib/41-purrr1.R",
"chars": 672,
"preview": "#purrr\n#https://www.weirdfishes.blog/blog/practical-purrr/\n\ndf = mtcars\n\nlibrary(purrr)\nlibrary(dplyr)\n\n#functions\n\nmu ="
},
{
"path": "04-lib/42-purr2.R",
"chars": 960,
"preview": "#purrr\n\ndf = mtcars\n\nlibrary(purrr)\nlibrary(dplyr)\n\ndf %>% split(.$cyl)\n\ndf %>% split(.$cyl) %>% walk(print)\n\n\ndf %>% s"
},
{
"path": "04-lib/43-purrr3.R",
"chars": 1156,
"preview": "#purrr3\n\nlibrary(purrr)\nlibrary(dplyr)\n\ncar_data <- transform(aggregate(. ~ cyl, data = subset(mtcars, hp > 100), FUN ="
},
{
"path": "04-lib/44-purrr4.R",
"chars": 1832,
"preview": "#purrr4\n\n#purrr is designed to help with “functional programming”, which you can take broadly as trying to use functions"
},
{
"path": "04-lib/45-purrr5.R",
"chars": 1765,
"preview": "#purrr5\n#https://adv-r.hadley.nz/functionals.html\n\nlibrary(purrr)\nlibrary(dplyr)\n\ntriple <- function(x) x * 3\nmap(1:3, t"
},
{
"path": "04-lib/51-plyr1.R",
"chars": 1525,
"preview": "# split - apply - combine\n\n#https://vita.had.co.nz/papers/plyr.pdf\na*ply(.data, .margins, .fun, ..., .progress = \"none\")"
},
{
"path": "04-lib/61-splitapplycombine1.R",
"chars": 440,
"preview": "#plyr, dplyr, \n\n#http://krlmlr.github.io/pdlyr/vignettes/pdlyr.html\n\n#https://coolbutuseless.bitbucket.io/2018/03/03/spl"
},
{
"path": "04-lib/62-splitapplycombine2.R",
"chars": 457,
"preview": "#split apply combine\n\ndf = mtcars\nbaseball\nlibrary(plyr)\nbaseball.1 <- ddply(baseball, .(id), transform, cyear = year - "
},
{
"path": "04-lib/71-broom1.R",
"chars": 1082,
"preview": "\nlibrary(tidyr)\nlibrary(dplyr)\nlibrary(broom)\nmdply(expand.grid(mean = 1:5, sd = 1:5), as.data.frame(rnorm), n = 10)\nmtc"
},
{
"path": "04-lib/packages1.R",
"chars": 3162,
"preview": "# Packages installation\n\n#List avl packages\nlibrary()\n\n\n#Total Avl Packages\nnrow(available.packages())\n\n#Install Package"
},
{
"path": "04-lib/switchr.R",
"chars": 553,
"preview": "#switchR for switching between libraries\n#https://cran.r-project.org/web/packages/switchr/switchr.pdf\n\nlibrary(switchr)\n"
},
{
"path": "04-lib/useful.R",
"chars": 2317,
"preview": "\n\n\n\n\n#library - useful\nlibrary(useful)\nk1 <- kmeans(x=iris[, 1:4], centers=3)\nplot(k1)\nplot(k1, data=iris)\n\ntoCheck <- c"
},
{
"path": "04-lib/useful2.R",
"chars": 4433,
"preview": "#useful Functions\n\nlibrary(useful)\n\n\nbinary.flip(c(1,1,0,1,0,0,1))\n\nclassdf(mtcars, cols=c(1:5))\nclassdf(mtcars)\n\nbottom"
},
{
"path": "05-dataIE/14a-readcsv.R",
"chars": 408,
"preview": "# Read csv\n\n# How to read CSV File\n\n#read.csv(file, header = TRUE, sep = \",\", quote = \"\\\"\", dec = \".\", fill = TRUE, comm"
},
{
"path": "05-dataIE/14b-readcsv.R",
"chars": 202,
"preview": "# reading from file\n\n#used generally when u don't want to path to the file\n#or location is different from Project Folder"
},
{
"path": "05-dataIE/14c-importweb.R",
"chars": 478,
"preview": "# read from Internet \n\n#read.csv(url(\"http://some.where.net/data/foo.csv\"))\n#url is optional\n\ndf2 = read.csv('http://www"
}
]
// ... and 1297 more files (download for full content)
About this extraction
This page contains the full source code of the DUanalytics/rAnalytics GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1497 files (11.2 MB in total, approximately 3.0 million tokens). Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.