Full Code of dupadhyaya/analytics for AI

Repository: dupadhyaya/analytics
Branch: master
Commit: d92d4ae2937c
Files: 1258
Total size: 18.8 MB

Directory structure:
analytics/

├── .RDataTmp
├── .gitignore
├── 0-Practise/
│   ├── day1.R
│   ├── day2.R
│   ├── day3.R
│   ├── first.R
│   ├── htmlimport.R
│   ├── import2.R
│   ├── practise.R
│   ├── practise2.R
│   ├── rough.R
│   └── vector.R
├── 0-Rdata/
│   ├── madata.Rdata
│   ├── student.rds
│   ├── student1.rds
│   ├── twitter authentication.Rdata
│   └── twitterauthentication.Rdata
├── 00-toc.R
├── 02-lms/
│   ├── 1-ds.R
│   ├── fms.txt
│   └── importcsv.R
├── 03-wksp1/
│   ├── 1a1-start.R
│   ├── 1a3-packages1.R
│   ├── 1b2-ds.R
│   ├── 1b3-factor.R
│   ├── 1d2-basicstats.R
│   ├── 1d2-dm-student1.R
│   ├── 1d3-dencoCase.R
│   ├── 1d4-DA-dencoCase.R
│   ├── 1e-graphs-basic.R
│   ├── 1e2-graphs.R
│   ├── 1e3-advgraphs.R
│   ├── 1f-SLR-women.R
│   ├── 1h1-dplyr.R
│   ├── 1h2-freqtable.R
│   ├── 2a-importExport.R
│   ├── 2b-SLR-salesarea.R
│   ├── 2b-allmodels.R
│   ├── 2b2-SLM-women.R
│   ├── 2b3-SLM-women-A.R
│   ├── 2b4-LM-cars.R
│   ├── 2b4-SLR-women.R
│   ├── 2c3-MLM-salespromotion.R
│   ├── 2c4-MLM-mtcars1.R
│   ├── 2d1-missingvalues.R
│   ├── 2d3-datapartition.R
│   ├── 2e1-logR-purchase.R
│   ├── 2e2-LOGR-adult.R
│   ├── 2e3-LOGR-gre.R
│   ├── 3b1-DT-CART-carseats.R
│   ├── 3b2-DT-CART-R-sales.R
│   ├── 3b3-DT-CART-titanic.R
│   ├── 3b4-DT-CART-R-loan.R
│   ├── 3b5-DT-loanapproved1.R
│   ├── 3b5-DT-rpart-iris.R
│   ├── 3d1-DT-CHAID-usvote.R
│   ├── 3e1-clust-customer.R
│   ├── 3e1-clustering.R
│   ├── 3e2-clust-samplecase.R
│   ├── 3e3-clust-segmentation.R
│   ├── 3e4-clust-noOfclusters.R
│   ├── 4b1-AR-groceries.R
│   ├── 4b2-AR-samplecase.R
│   ├── 4b3-AR-groceries-subset.R
│   ├── 4b5-AR-finproducts.R
│   ├── 4e1-twitter1.R
│   ├── 4e2-wordcloud.R
│   ├── 4e3-worldcloud2.R
│   ├── 4e5-wordcloud3.R
│   ├── 4f2-quantmod1.R
│   ├── 4f3-indianstocks.R
│   ├── 5-wordcloud2-New.R
│   ├── 5b-LP-marketingspend.R
│   ├── 5c2-LP-marketingspend-case.R
│   ├── 5d-wordcloud2.R
│   ├── 5d2-LP-tpt.R
│   ├── 5e2-LP-machassign.R
│   ├── 5e5-LP-farmer1.R
│   ├── 6b1-dates.R
│   ├── 6b1-ts-data.R
│   ├── 6c2-dates-lubridate.R
│   ├── 6d-TS-airpassengers.R
│   ├── 6d-ts-components-airp.R
│   ├── 6d-ts-johnson.R
│   ├── 6d-ts-xts-data.R
│   ├── 6e-TS-auto-arima-johnson.R
│   ├── 6g-ts-TTR-ma.R
│   ├── 8-fa-quandl.R
│   ├── 8-fa-quandl2.R
│   ├── 8-fa-quantmod.R
│   ├── 8-quantmod-I-stocks.R
│   └── zz-practise.R
├── 04-wksp2/
│   ├── Graph-matrixplots.R
│   ├── LMtrainTest.R
│   ├── Links_DAR
│   ├── Links_DAR.R
│   ├── TS-arima-johnson.R
│   ├── TS-components-airpassengers.R
│   ├── TS-data.R
│   ├── TS-dates.R
│   ├── TS-lubridate.R
│   ├── TS-movavg-Nile.R
│   ├── TS-movavg.R
│   ├── assocrule1.R
│   ├── assocrule2.R
│   ├── assocrule3.R
│   ├── decisiontree1.R
│   ├── decisiontree2.R
│   ├── decisiontree3.R
│   ├── decisiontree4.R
│   ├── decisiontree5.R
│   ├── decisiontree5CHAID.R
│   ├── df.R
│   ├── environ.R
│   ├── freqtable.R
│   ├── lm-salesarea.R
│   ├── lm-salesqty.R
│   ├── lm-women-simple.R
│   ├── lm.R
│   ├── logR.R
│   ├── logr-gre.R
│   ├── matrix.R
│   ├── missingvalues.R
│   ├── packages1.R
│   ├── packages2.R
│   ├── stats2.R
│   ├── twitter.R
│   ├── vectors.R
│   ├── wordcloud1.R
│   └── wordcloud2.R
├── 10a-setup/
│   ├── 11a-start.R
│   ├── 11b-gettingstarted.R
│   ├── 15a-envrm.R
│   ├── 15b-renv.R
│   ├── 15e-rjava.R
│   ├── 16a-pathconfig.R
│   ├── 17a-rstudio.R
│   ├── 18a-processtime.R
│   ├── 21a-floorceiling1.R
│   ├── 21b-options.R
│   ├── 24a-github.R
│   ├── 25a-help.R
│   ├── 51c-deletefiles.R
│   ├── help.R
│   └── pathconfig.R
├── 10d-excel/
│   ├── student1.R
│   └── student2.xlsx
├── 10e-impexp/
│   ├── 14a-readcsv.R
│   ├── 14b-readcsv.R
│   ├── 14c-importweb.R
│   ├── 14d-importweb.R
│   ├── 14e-readothers.R
│   ├── 15b-datawrangling.R
│   ├── 20a-importgg.R
│   ├── 21b-googlesheet1.R
│   ├── 22b-ggsheets2.R
│   ├── 31a-export.R
│   └── 32c-writecsv.R
├── 12a-packages1/
│   ├── 21b-installpackages.R
│   ├── 21e-installFmGit.R
│   ├── 21g-packages1.R
│   ├── 31b-datasets.R
│   ├── 31c-datasets.R
│   ├── 41-purrr1.R
│   ├── 42-purr2.R
│   ├── 43-purrr3.R
│   ├── 44-purrr4.R
│   ├── 45-purrr5.R
│   ├── 51-plyr1.R
│   ├── 61-splitapplycombine1.R
│   ├── 62-splitapplycombine2.R
│   ├── 71-broom1.R
│   └── packages1.R
├── 13a-Packages2/
│   ├── 10a-fBasics.R
│   └── 11a-pysch.R
├── 15a-DS/
│   ├── 0FileList.R
│   ├── 10a-TOC
│   ├── 13b-ds-blank.R
│   ├── 14b-Basic_R_v01.R
│   ├── 14b-objectsmethods.R
│   ├── 14c-ds1.R
│   ├── 15a-objects.R
│   ├── 16b-datatypes.R
│   ├── 16c-basicDT.R
│   ├── 16d-ds1.R
│   ├── 20a-vectors.R
│   ├── 20b-vectors2.R
│   ├── 20c-vectors.R
│   ├── 20d-vectorfunctions.R
│   ├── 20f-vectors.R
│   ├── 20g-valuegenerate.R
│   ├── 20h-vectors2.R
│   ├── 25a-matrices.R
│   ├── 25c-matrices.R
│   ├── 25d-matrices.R
│   ├── 25e-matrices.R
│   ├── 27a-arrays.R
│   ├── 27b-arrays.R
│   ├── 27d-arrays.R
│   ├── 30c-basicdatatypes.R
│   ├── 30d-ds1.R
│   ├── 30e-datatypes.R
│   ├── 33b-df.R
│   ├── 33c-df.R
│   ├── 35a-lists.R
│   ├── 35b-lists.R
│   ├── 35e-lists.R
│   ├── 38a-factors.R
│   ├── 38b-factors.R
│   ├── 38c-factors.R
│   └── 38e-factors.R
├── 15b-DM/
│   ├── 21b-rep.R
│   ├── 21c-seq.R
│   ├── 21g-replicate.R
│   ├── 21g-seqdates.R
│   ├── 22b-letters.R
│   ├── 25b-interval.R
│   ├── 25c-midpoint.R
│   ├── 27b-recode-car.R
│   ├── 29b-subset.R
│   ├── 29c-split1.R
│   ├── 29d-splitdata.R
│   ├── 29e-partitiondata.R
│   ├── 31b-rowcol1.R
│   ├── 33b-sortorder.R
│   ├── 33c-order.R
│   ├── 33c-sortorderrank.R
│   ├── 33d-rank.R
│   ├── 34b-castmelt1.R
│   ├── 34c-castmelt2.R
│   ├── 37a-mtcars-subset.R
│   ├── 37b-duplicates1.R
│   ├── 37c-unique.R
│   ├── 38b-scale1.R
│   ├── 41b-randnos1.R
│   ├── 41c-randnos.R
│   ├── 42b-normdist.R
│   ├── 45b-forloop1.R
│   ├── 45e-ifelse2.R
│   ├── 45v-switch1.R
│   ├── 46b-withoutapply.R
│   ├── 46c-applyForCompare.R
│   ├── 46d-applyfamily.R
│   ├── 46e-applytype.R
│   ├── 46f-while1.R
│   ├── 47b-apply1.R
│   ├── 47c-apply.R
│   ├── 47d-apply1.R
│   ├── 47h-tapply1.R
│   ├── 47j-lapply1.R
│   ├── 47m-mapply1.R
│   ├── 47n-mapply2.R
│   ├── 47o-rapply.R
│   ├── 47on-eapply.R
│   ├── 47p-sapply1.R
│   ├── 47q-sapply2.R
│   ├── 47s-tapply2.R
│   ├── 47t-vapply1.R
│   ├── 49b-replicate1.R
│   ├── 49c-replicate.R
│   ├── 49e-by.R
│   ├── 49f-by.R
│   ├── 49g-bywith.R
│   ├── 51b-myfunc.R
│   ├── 51c-functions1.R
│   ├── 53b-cbindrbind1.R
│   ├── 53c-joinDFs.R
│   ├── 53c-merge1.R
│   ├── 54b-combination.R
│   ├── 54d-expandgrid.R
│   ├── 55b-sweep1.R
│   ├── 55d-sweep2.R
│   ├── 56b-outer1.R
│   ├── 56c-outer2.R
│   ├── 57b-stack1.R
│   ├── 58-DF-common.R
│   ├── 58-df-matching1.R
│   ├── 58-df2.R
│   ├── 58-hmisc.R
│   ├── 58-pmatchchar.R
│   ├── 61c-missing1.R
│   ├── 61c-missing2.R
│   ├── 61c-missing3.R
│   ├── 62b-outlier.R
│   └── 62c-outlier2.R
├── 15c-Summary/
│   ├── 22b-aggregate.R
│   ├── 22c-aggregate2.R
│   ├── 23b-freqdistr1.R
│   ├── 23c-freqdistr2.R
│   ├── 23d-freqdistr3.R
│   ├── 23f-FD.R
│   ├── 23f-freqdistr.R
│   ├── 24b-freqdistr4.R
│   ├── 24f-freqdistr5.R
│   ├── 25g-freqdistr6.R
│   ├── 31c-rowsums1.R
│   ├── 32b-addmargin1.R
│   ├── 32c-margintable1.R
│   ├── 32d-proptable.R
│   ├── 32d-tableprop2.R
│   ├── 35b-crosstab.R
│   ├── 99a-Pskim.R
│   ├── 99a-studentdata1.R
│   └── descriptive.R
├── 16a-tidyverse/
│   ├── 20a-dplyr.R
│   ├── 21a-dplyr-select.R
│   ├── 21b-dplyr-slice1.R
│   ├── 21c-dplyr-mutate1.R
│   ├── 21d-dplyr-summarise.R
│   ├── 21e-dplyr-filter1.R
│   ├── 21f-dplyr-str.R
│   ├── 21g-dplyr-arrange.R
│   ├── 22b-dplyr-seperate1.R
│   ├── 22b-group.R
│   ├── 22c-summarise.R
│   ├── 22g-tibble-rownames.R
│   ├── 25b-magrittr.R
│   ├── 26c-tidyr-DSR1.R
│   ├── 26d-tidyr-DSR-who.R
│   ├── 31b-plyr1.R
│   ├── 32b-plyr-mutate.R
│   ├── 33d-dplyr-joins.R
│   ├── 33f-dplyr-split.R
│   ├── plyr-ddply-gpsum.R
│   ├── tidyr1.R
│   ├── zz-dplyr1.R
│   └── zz-tidy-dataformating.R
├── 16b-DT/
│   ├── 0-DTsummary.R
│   ├── 1-dt1.R
│   ├── 2-DT.R
│   └── 3-DT.R
├── 17a-Stats/
│   ├── 10-statslinks.R
│   ├── 10a-distributions.R
│   ├── 11a-normal.R
│   ├── 11b-normalq.R
│   ├── 11c-normald.R
│   ├── 12a-binomial.R
│   ├── 13a-mean.R
│   ├── 14a-median.R
│   ├── 15a-mode.R
│   ├── 15b-mode.R
│   ├── 16a-range.R
│   ├── 17a-sd.R
│   ├── 18a-covariance.R
│   ├── 19a-correlation.R
│   ├── 20a-coev.R
│   ├── 37a-sample1.R
│   ├── 40a-missing1.R
│   ├── 40b-missing1.R
│   ├── 40c-missing2.R
│   ├── 40d-missing3.R
│   ├── 42a-outlier1.R
│   ├── 43a-outliers1.R
│   ├── 45a-sampling.R
│   ├── 55a-traintest1.R
│   ├── 60a-kurtosis.R
│   ├── 60b-kurtosis.R
│   ├── 64a-skewness.R
│   ├── 64b-skewness.R
│   ├── ave1.R
│   ├── interactions.R
│   ├── mean1.R
│   ├── mean2.R
│   ├── mean3.R
│   ├── meandev.R
│   ├── meanwt1.R
│   ├── median1.R
│   ├── mode1.R
│   ├── normal_height.R
│   ├── normality.R
│   ├── normality2.R
│   ├── outlier2.R
│   ├── outliers1.R
│   ├── poiss1.R
│   └── quantile1.R
├── 18a-HypoTests/
│   ├── 20b-distributions.R
│   ├── 23b-ztest-bsda.R
│   ├── 25c-tdistribution.R
│   ├── 26b-ttestindep.R
│   ├── 26c-ttestpaired.R
│   ├── 27b-TTS1-case1.R
│   ├── 28b-TTS2-case1.R
│   ├── 28c-TTS1-case3.R
│   ├── 28e-TT-sample1i.R
│   ├── 31b-chisqdistr.R
│   ├── 32b-HT-chisq1.R
│   ├── 32c-HT-chisq2.R
│   ├── 33b-HT-chisq.R
│   ├── 33d-chisqtest1.R
│   ├── 33e-chisqtest2.R
│   ├── 34b-goodnessfit.R
│   ├── datadistr.R
│   ├── htestnd1.R
│   ├── randomdistr.R
│   ├── shadeareainplot.R
│   └── tests1.R
├── 19a-sum-cases/
│   ├── 31b-DA-dencoCase.R
│   ├── 31c-DA-dencoCase2.R
│   ├── 31d-dsum-denco.R
│   ├── 31e-dencoCase2.R
│   ├── 31f-dencoCase.R
│   ├── 33c-basicDM-mtcars.R
│   ├── 33c-dplyr-mtcars.R
│   ├── 33f-DA-bakerydata1.R
│   ├── 34b-sales1.R
│   ├── 34c-sales2.R
│   ├── 35b-DA-student1.R
│   ├── 35c-dm-student1.R
│   ├── 36b-dsum-Case1.R
│   ├── 36c-dsum-Case2.R
│   ├── 36f-DSA-case2.R
│   ├── 37b-dsum-iris1.R
│   ├── 38b-dsum-haireyecolor1.R
│   ├── 42b-case-sum-graphs.R
│   └── dataexplore.R
├── 19c-mtcars/
│   ├── 10b-datastructures.R
│   ├── 11b-mtcars.R
│   ├── 11c-mtcars-filter.R
│   ├── 11d-mtcars-descp.R
│   ├── 11f-mtcars-loops.R
│   ├── 11g-mtcars-sort.R
│   ├── 11h-mtcars-dplyr.R
│   ├── 12d-mtcars-graph1.R
│   ├── 12e-mtcars-graph2.R
│   ├── 12e-mtcars-summarise-dplyr.R
│   ├── 12f-diag-ggplot2-mtcars.R
│   ├── 12f-ggplot2-mtcars.R
│   ├── 13b-mtcars-lm1.R
│   ├── 13c-mtcars-lm2.R
│   ├── 13e-mtcars-lm3.R
│   ├── 14b-mtcars-logr.R
│   ├── 15b-mtcars-DT-class.R
│   ├── 15c-mtcars-DT-anova.R
│   ├── 16b-mtcars-cluster1.R
│   ├── 16c-mtcars-cluster2.R
│   ├── 22f-tidyr-mtcars.R
│   ├── mtcars-clust1.R
│   └── s1.R
├── 19d-iris/
│   └── sumgraph1.R
├── 20a-BasicGraphs/
│   ├── 10a-graphs.R
│   ├── 10b-graphs.R
│   ├── 12b-graphs2.R
│   ├── 12d-title1.R
│   ├── 12e-text.R
│   ├── 12f-abline.R
│   ├── 12g-legend.R
│   ├── 12k-tick.R
│   ├── 12m-axis1.R
│   ├── 13e-multipleplots1.R
│   ├── 13f-multipleplots.R
│   ├── 13g-subplot.R
│   ├── 15a-graphdata1.R
│   ├── 15b-graph1.R
│   ├── 21b-plot-hist1.R
│   ├── 21c-plot.R
│   ├── 23b-line.R
│   ├── 23c-lines2.R
│   ├── 24b-histogram.R
│   ├── 24c-histogram2.R
│   ├── 25b-barplot.R
│   ├── 25c-barplot2.R
│   ├── 26b-boxplot.R
│   ├── 26c-boxplot2.R
│   ├── 26d-boxplot2.R
│   ├── 27b-pie.R
│   ├── 27c-pie2.R
│   ├── 29b-corrgram1.R
│   ├── 32b-freqdistr.R
│   ├── 33b-dotplot.R
│   ├── 33b-matrixplots.R
│   ├── 37b-scatter.R
│   ├── 42b-intplots1.R
│   ├── 43b-mosaic.R
│   ├── 43c-corrplot.R
│   ├── 43c-ggally.R
│   ├── 44b-textplots.R
│   ├── 45b-violinplot.R
│   ├── ria2g1.R
│   ├── ria2g2.R
│   ├── ria2g3.R
│   └── ria3g3.R
├── 20d-AdvGraphs/
│   ├── cowplot1.R
│   ├── donut.R
│   ├── donut2.R
│   ├── esquisse.R
│   ├── lattice.R
│   ├── lattice1.R
│   ├── survey.R
│   ├── symbols.R
│   └── vtree1.R
├── 20f-ggplots/
│   ├── circbarplot.R
│   ├── gg-bar1.R
│   ├── gg-bar2.R
│   ├── gg-box2.R
│   ├── gg-boxhist.R
│   ├── gg-boxplot1.R
│   ├── gg-heatmap.R
│   ├── gg-hist1.R
│   ├── gg-legend1.R
│   ├── gg-line.R
│   ├── gg-slope.R
│   ├── gg-slope2.R
│   ├── ggp2.R
│   ├── ggplot-DU1.R
│   ├── ggplot3.R
│   ├── ggplot5.R
│   ├── ggplot6.R
│   ├── ggplot7.R
│   └── twoaxis-gg.R
├── 20g-Network/
│   ├── NetSciX 2016 Workshop.R
│   ├── network1.R
│   ├── network2.R
│   └── traveltime1.R
├── 21a-OneGday/
│   ├── 1bubblechart.R
│   ├── 1bubblechart2.R
│   ├── multipleplots1.R
│   └── tableGrob.R
├── 23a-Strings/
│   ├── abvn.R
│   ├── latex.R
│   ├── output.txt
│   ├── paste1.R
│   ├── setop1.R
│   ├── strcmpt1.R
│   ├── string1.R
│   ├── strjoin.R
│   ├── strlength.R
│   ├── strman1.R
│   ├── strman2.R
│   ├── strman3.R
│   ├── strman4.R
│   ├── strman5.R
│   ├── strman6.R
│   ├── strman7.R
│   ├── strman9.R
│   ├── strprint1.R
│   ├── strreplace1.R
│   ├── strsearch.R
│   ├── strsplit1.R
│   ├── strsplit2.R
│   ├── strsplit3.R
│   ├── strsplit4.R
│   └── tidyr-strseperate.R
├── 24a-LM/
│   ├── 10a-lm-women2.R
│   ├── 10b-lm-salesarea2.R
│   ├── 10c-MLR-omni.R
│   ├── 10e-lm-errorplot.R
│   ├── 13b-lm-commands.R
│   ├── 16b-SLM-women2.R
│   ├── 16c-SLM-women1.R
│   ├── 16e-SLM-women-A.R
│   ├── 16f-SLM-women-V.R
│   ├── 16f-SLM-women.R
│   ├── 16m-SLM-women2.R
│   ├── 17a-LM-case1.R
│   ├── 17b-LM-stock1.R
│   ├── 18a-SLM-salesarea.R
│   ├── 18b-SLM-salesarea.R
│   ├── 18c-SLM-salesarea.R
│   ├── 23a-MLM-omni.R
│   ├── 23c-MLM-omni.R
│   ├── 24a-MLM-pcsales.R
│   ├── 25a-MLM-mtcars.R
│   ├── 25c-MLM--mtcars1.R
│   ├── 25c-MLM-mtcars.R
│   ├── 25d-MLM-mtcars-A.R
│   ├── 26a-MLM-airquality.R
│   ├── 27a-MLM-marketing.R
│   ├── 35a-MLM-case1.R
│   ├── 37a-LM-dummy-fireplace.R
│   ├── 37b-dummy1.R
│   ├── 38c-LM-dummy1.R
│   ├── 41c-LM-assumptions.R
│   ├── 42b-LM-linearity.R
│   ├── 42c-LM-normality.R
│   ├── 42d-LM-variance.R
│   ├── 42e-LM-outliers.R
│   ├── 42f-LM-autocorr.R
│   ├── 42g-LM-influentialvariables.R
│   ├── 42h-LM-multicollinearity.R
│   ├── 42j-gvlma.R
│   ├── 43a-LM-graphs.R
│   ├── LM-all-mtcars1.R
│   ├── ProbDist.R
│   ├── Simulation.R
│   ├── confusionmatrix.R
│   ├── contrasts1.R
│   ├── dummies.R
│   ├── homosecadicity.R
│   ├── lm-broom.R
│   ├── lm-dummy1.R
│   ├── lm-housing.R
│   ├── lm-mtcars1.R
│   ├── lm-mtcars2.R
│   ├── lm-plot1.R
│   ├── lm-segments1.R
│   ├── mlm-state77.R
│   ├── multvariate1.R
│   ├── plotcoef1.R
│   └── regrplot1.R
├── 24c-NLM/
│   ├── nlm1.R
│   └── nlm2-mtcars.R
├── 28a-LogR/
│   ├── 24c-LR-default.R
│   ├── 24d-LR-default.R
│   ├── 24e-LR-default.R
│   ├── 24g-LR-default-accuracy.R
│   ├── 26b-LR-germancredit.R
│   ├── 27b-LR-gre.R
│   ├── 28b-LR-subscribe.R
│   ├── 28c-LR-subscribe.R
│   ├── 29b-LR-ads.R
│   ├── 31b-LR-income.R
│   ├── 31c-income.R
│   ├── 33b-LR-purchase.R
│   ├── 45b-compareAUC.R
│   ├── 45c-roc-default.R
│   ├── 45e-roc-general.R
│   ├── 45f-roc1.R
│   ├── 45h-roc2.R
│   ├── 46c-accuracy.R
│   ├── 48b-auc1.R
│   ├── 48c-auc1.R
│   ├── 48d-auc.R
│   ├── 48e-auc.R
│   ├── 49c-thresholdvalue.R
│   ├── pdpu.R
│   └── zz--logR.R
├── 29a-GLM/
│   ├── Logr-party.R
│   ├── crossfold.R
│   ├── crossval1.R
│   ├── cv-houseprices.R
│   ├── cv-women1.R
│   ├── cv3.R
│   ├── cvlm2.R
│   ├── glm-affairs1.R
│   ├── glm-affairs2.R
│   ├── glm-cars.R
│   ├── glm-titanic1.R
│   ├── logR1.R
│   ├── logpos1.R
│   ├── logr-mtcars.R
│   ├── logr-mtcars1.R
│   ├── logrMaths.R
│   ├── logreg-iris1.R
│   ├── multinominal.R
│   ├── multinominal2.R
│   ├── multinominal3.R
│   ├── multinominal4.R
│   ├── multinominal5.R
│   ├── multinominal6.R
│   └── nls1.R
├── 30a-CLS/
│   ├── cls-gen
│   ├── cls1M-cancer.R
│   ├── dt-multiplemodels.R
│   ├── giniIndex.R
│   └── rattle.R
├── 30b-CART/
│   ├── 10-CART-gen.R
│   ├── 11-cart-understandsplit.R
│   ├── 12-DT-outlook.R
│   ├── CARTR_sales.R
│   ├── CART_Regression Tree v01.R
│   ├── DT-rpart-claims.R
│   ├── c-dt-rpart-Case-DU1.R
│   ├── c-dt-rpart-iris.R
│   ├── c-dt-rpart-sales1.R
│   ├── cls-cart-churn2.R
│   ├── cls-rpart-plot2.R
│   ├── dt-car.R
│   ├── dt-general.R
│   ├── dt-glaucoma.R
│   ├── dt-ionos1.R
│   ├── dt-iris1.R
│   ├── dt-kyphosis.R
│   ├── dt-loanapproved1.R
│   ├── dt-rpart-du.R
│   ├── dt-rpart-du1.R
│   ├── dt-rpart-du2.R
│   ├── dt-rpart-du3.R
│   ├── dt-rpart-metal.R
│   ├── dt-rpart-student1.R
│   ├── dt-rpart-text1.R
│   ├── dt-rpart-varimp1.R
│   ├── dt-rpart-varimp2.R
│   ├── dt-sleep.R
│   ├── dt-tree-car1.R
│   ├── dt3-eyes.R
│   ├── entropy.R
│   ├── multimodel.R
│   ├── tree-houseprices.R
│   └── zz-test.R
├── 30c-Ctree/
│   ├── CTREE NPS R code v01.R
│   ├── ctree-KyCU.R
│   ├── ctree-airquality.R
│   ├── ctree-churn2.R
│   ├── ctree-clsregr-party.R
│   ├── ctree-clsregr.R
│   ├── ctree-readingskills.R
│   ├── ctree2-iris.R
│   ├── ctreee-iris.R
│   └── dt-ctree-playYes.R
├── 30d-CHAID/
│   ├── CHAID-nps2.R
│   ├── CHAID-xsell1.R
│   ├── c-dt-chaid-nps.R
│   ├── c-dt-chaid-usvote1.R
│   ├── chaid-attrition.R
│   ├── chaid-cancer.R
│   ├── chaid-usvote.R
│   ├── chaid2.R
│   ├── chaid4.R
│   ├── chisq.R
│   └── chisqtest2.R
├── 30d-splitcriteria/
│   ├── cls-entropy.R
│   ├── dt-rpart-criteria.R
│   ├── splitcriteria1.R
│   ├── splitcriteria2.R
│   └── splitcriteria3.R
├── 31b-KNN/
│   ├── knn1_cancer.R
│   ├── knn3_KKNN.R
│   ├── knn4.R
│   ├── knn_diamonds.R
│   └── knn_iris.R
├── 31c-naive/
│   ├── naivbayes1.R
│   └── naivbayes2.R
├── 31d-randomforest/
│   ├── dt-caret-xxx.R
│   ├── dt-rf-DU3.R
│   ├── dt-rf-eg2.R
│   ├── dt-rf-eg3.R
│   └── dt-rf-kyphosis1.R
├── 31d-weka/
│   ├── cls-ID3.R
│   ├── cls-c45weka.R
│   ├── clsW-iris.R
│   └── clsW-iris2.R
├── 31e-Case-Cancer/
│   ├── data-cancer.R
│   ├── rf-cancer.R
│   ├── svm-cancer1.R
│   └── svm-examples.R
├── 40a-CLUST/
│   ├── 10-clust-packages.R
│   ├── 16b-km-withinss.R
│   ├── 17b-clust-noclusters1.R
│   ├── 17c-clust-numbers-iris.R
│   ├── 17d-noc-mclust.R
│   ├── 19b-clust-distances.R
│   ├── 19c-clust-distances.R
│   ├── 19d-clust-scaling.R
│   ├── 20b-clust-plots.R
│   ├── 20c-clust-plots2.R
│   ├── 23b-km-marks1.R
│   ├── 23c-km-marks2.R
│   ├── 23d-km-amap-marks3.R
│   ├── 23e-km-student2.R
│   ├── 24b-clust-women.R
│   ├── 25b-km-iris.R
│   ├── 25c-km-iris2.R
│   ├── 25f-km-iris2.R
│   ├── 26h-km-attitude.R
│   ├── 27c-clust-som1.R
│   ├── 33c-hc-nutrients1.R
│   ├── 33c-hc-vegan-dune1.R
│   ├── 33d-hc-protein.R
│   ├── 33f-hc-marks.R
│   ├── 33g-hc-sample.R
│   ├── 35d-pam-iris.R
│   ├── 35e-pam-nutrient.R
│   ├── 40b-mixedclust1.R
│   ├── 40c-clust-dendgm.R
│   ├── 43b-clust-mixedDataTypes1.R
│   ├── 45c-clustering-exist1.R
│   ├── 45e-clustering-animation1.R
│   ├── 50b-clust-ma1.R
│   ├── 50c-clust-ma2.R
│   ├── 50d-clust-ma3.R
│   ├── 61b-clust-custsegm.R
│   ├── animation2.R
│   ├── clust-allcustering.R
│   ├── clust-case-liberty.R
│   ├── clust-class-differences.R
│   ├── clust-compare.R
│   ├── clust-distance-calc.R
│   ├── clust-distance2.R
│   ├── clust-entropy.R
│   ├── clust-iterations.R
│   ├── clust-kselect.R
│   ├── clustering-women.R
│   ├── clusters3.R
│   ├── hier-simplecase.R
│   ├── hier-usarrests.R
│   ├── iris.R
│   ├── kmeans-bankdata.R
│   ├── kmeans-pcalike.R
│   ├── kmeans-plots.R
│   ├── kmeans-randomness.R
│   └── pam1.R
├── 45a-AR/
│   ├── 11a-measures1.R
│   ├── 12a-ar-samplecase.R
│   ├── 12b-ar-samplecase2.R
│   ├── 14a-ar-datastr.R
│   ├── 15-ar-groceries.R
│   ├── 15a-ar-Groceries1.R
│   ├── 15b-ar-Groceries.R
│   ├── 16b-groceries-summary.R
│   ├── 16d-ar-groceries-subset.R
│   ├── 16f-ar-groceries-vis.R
│   ├── 16f-ar-groceries-vis2.R
│   ├── 16f-ar-groceries-vis3.R
│   ├── 16f-ar-groceries-vis4.R
│   ├── 16f-ar-groceries-vis5.R
│   ├── 16k-ar-grocery-DT.R
│   ├── 17a-ar-transactionformat.R
│   ├── 17d-ar-matrix-transactions.R
│   ├── 17e-ar-df-transcations.R
│   ├── 17f-ar-csv-transactions.R
│   ├── 17f-ar-csv2-transactions.R
│   ├── 17g-ar-list-transcations.R
│   ├── 17h-ar-dataformats.R
│   ├── 18a-arules1.R
│   ├── 20a-ar-DU1.R
│   ├── 20b-ar-DU2.R
│   ├── 20c-ar-DU3.R
│   ├── 22a-ar-edn.R
│   ├── 22b-ar-elective.R
│   ├── 22d-ar-subjects.R
│   ├── 22e-ar-placement.R
│   ├── 22f-myAR1.R
│   ├── 25a-ar-income.R
│   ├── 25b-ar-medical.R
│   ├── 25c-ar-titanic.R
│   ├── 29a-ar-Adult.R
│   ├── 29b-ar-Adult-NW.R
│   ├── 29c-ar-Adult-Draft.R
│   ├── 30a-ar-Finance1.R
│   ├── 30b-ar-Finance.R
│   ├── 32a-ar-visual.R
│   ├── 33a-ar-redundant.R
│   ├── 33b-redundantrules.R
│   ├── 40a-ar-multilevel-Groceries.R
│   ├── 43a-ar-patterns.R
│   ├── 45a-ar-rulesextract.R
│   ├── 99-ar-NW.R
│   ├── 99-ar-OnlineSales.R
│   ├── 99-ar-basketanalysis2.R
│   ├── 99-ar-policechecks.R
│   ├── AR-Weka
│   ├── ar-case-liberty.R
│   ├── ar-groceries2.R
│   └── my_basket1.txt
├── 46a-GD/
│   ├── aboutSL
│   ├── gradientdescent1.R
│   ├── gradientdescent2.R
│   ├── gradientdescent3.R
│   ├── gradientdescent4.R
│   └── regr1.R
├── 47A-TS/
│   ├── 12b-TS-add-mult.R
│   ├── 16c-dates-split1.R
│   ├── 16d-dates1.R
│   ├── 23b-TS-Case-sales.R
│   ├── 23b-lubridate1.R
│   ├── 24b-Data-DFtoTS.R
│   ├── 24b-timeseries1.R
│   ├── 24c-timeseries2.R
│   ├── 24f-ts-data.R
│   ├── 26b-ts-components-airp.R
│   ├── 26c-ts-components.R
│   ├── 27b-ts-johnson.R
│   ├── 27c-ts-lm-uscons.R
│   ├── 28c-ts-lubridate1.R
│   ├── 31c-TS-airp.R
│   ├── 33b-zoo-ts.R
│   ├── 35b-LSM-beer1.R
│   ├── 38b-tsplots2a.R
│   ├── 38c-tsplots3.R
│   ├── 38f-plot-zz.R
│   ├── 41b-arima1.R
│   ├── 41c-arima2.R
│   ├── 41d-arima-airp.R
│   ├── 41d-arima-jj-nile.R
│   ├── 45b-TS-arima.R
│   ├── 52c-Case1-complete.R
│   ├── 53b-sales-ts.R
│   ├── 55b-ts-case-xxx2.R
│   ├── 55c-ts-case-xxxx.R
│   ├── SMA-nile.R
│   ├── TS-P-fpp.R
│   ├── TS-c02.R
│   ├── TS-data-DU1.R
│   ├── TS-fpp-seasonplot.R
│   ├── TS-kings.R
│   ├── TS-links
│   ├── TS-movag1.R
│   ├── TS-nile.R
│   ├── TS-xts.R
│   ├── TS-zoo.R
│   ├── ts-P-highfreq.R
│   ├── ts-P-openair.R
│   ├── ts-P-padr.R
│   ├── ts-beer2.R
│   ├── ts-case1.R
│   ├── ts-case2.R
│   ├── ts-lubridate2.R
│   ├── ts-rollingvalues.R
│   ├── ts-rollingvalues2.R
│   ├── ts-splitdate.R
│   ├── ts-timestamp.R
│   └── tsforecast-exp.R
├── 48A-HTML/
│   └── aboutUSL
├── 48c-TM/
│   ├── SM-rtexttools1.R
│   ├── TM-zz.R
│   ├── downloadfile.R
│   ├── facebook1.R
│   ├── facebook2.R
│   ├── fms.txt
│   ├── linkedin1.R
│   ├── linkedin3.R
│   ├── pagerank.R
│   ├── rowling.txt
│   ├── rquery_wordcloud.R
│   ├── sentiment-tidyr1.R
│   ├── sentiment2.R
│   ├── textmining-DU1.R
│   ├── tm-worldcloud4.R
│   ├── twitter-hotel.R
│   ├── twitter-keys.R
│   ├── twitter-sentiment2.R
│   ├── twitter1-DU1.R
│   ├── twitter1-DU2.R
│   ├── twitter1-authen.R
│   ├── twitter1.R
│   ├── twitter2.R
│   ├── twitteracct
│   ├── wordcloud1.R
│   ├── wordcloud2.R
│   ├── wordcloud3.R
│   └── worldcloud2.R
├── 48g-textdocs/
│   └── vit.txt
├── 51a-OR-LP/
│   ├── 15b-lpsolveAPI.R
│   ├── 15c-lpassign.R
│   ├── 21b-LP-mach-prod.R
│   ├── 21c-LP-mach-prod.R
│   ├── 22b-LP-case1.R
│   ├── 22c-LP-assign-case3.R
│   ├── 22d-LP-Case-carmanufacturing.R
│   ├── 25b-LPassign-job.R
│   ├── 30a-LP-tpt-function.R
│   ├── 31b-LP-tpt1.R
│   ├── 31c-LP-tpt2.R
│   ├── 31d-LP-tpt3.R
│   ├── 33d-proptable.R
│   ├── 41b-pricing.R
│   ├── 51b-LP-marketing.R
│   ├── model.lp
│   ├── zz-LP-clplite.R
│   └── zz-LP-general.R
├── 62A-MA/
│   ├── CA.R
│   ├── campaign.csv
│   ├── data1.R
│   ├── graph1.R
│   ├── maregression1.R
│   ├── pricing1.R
│   └── tree1.R
├── 62c-RFM/
│   ├── rfm1.R
│   └── rfm3.R
├── 63A-FA/
│   ├── 10-FAlinks.R
│   ├── Insurance Loss v01.R
│   ├── InsuranceLosses.csv
│   ├── Packages Pre-requisites_v03.R
│   ├── aapl.csv
│   ├── fa-iitg-dataanalysis.R
│   ├── finTS1.R
│   ├── findata1.R
│   ├── finstmts1.R
│   ├── finstmts2.R
│   ├── finstmts3.R
│   ├── gtrends1.R
│   ├── intrino1.R
│   ├── intrino2.R
│   ├── lag1.R
│   ├── logistic_regression.R
│   ├── qf1.R
│   ├── sentianalysistrading1.R
│   ├── shares1.R
│   ├── shares2.R
│   ├── stock3.R
│   ├── stockanalysis1.R
│   ├── stockanalysis2.RData
│   ├── stocks5.R
│   ├── stocksanalysis3.R
│   ├── stocksanalysis4.R
│   └── volatity1.R
├── 70a-report/
│   ├── report1.Rmd
│   └── report1.tex
├── 70d-myTest/
│   └── lm-sim-test1.R
├── 70e-phd/
│   ├── attendance2.R
│   └── grades.R
├── 75a-sports/
│   ├── cricket1.R
│   └── cricket2-york.R
├── 76a-Misc/
│   ├── dhmethods.R
│   ├── funcpgm1.R
│   ├── h2o.R
│   └── skimr-package.R
├── 78b-json/
│   ├── 21b-json-format.R
│   ├── 23b-xml-import.R
│   ├── 25a-httr1.R
│   ├── 25c-httr2.R
│   ├── json-1.R
│   └── json2.R
├── 80a-artwork/
│   ├── AuctionsData - artwork.csv
│   ├── AuctionsData - set1.csv
│   ├── artwork-cls1.R
│   ├── artwork-descp.R
│   ├── artwork-eda1.R
│   ├── artwork-eda2.R
│   ├── artwork-rought.R
│   ├── artwork1.R
│   ├── artwork2.R
│   ├── artwork4.R
│   ├── awdata1.R
│   ├── density.R
│   └── file2.R
├── 80c-studqueries/
│   ├── Sapient_Big Data.R
│   ├── achal1.R
│   ├── achal1.csv
│   ├── achal2.R
│   ├── achal2.csv
│   ├── deepak.R
│   ├── hitesh-dec18.R
│   ├── hitesh1.R
│   ├── hitesh2.R
│   ├── hitesh3.R
│   ├── hitesh4.R
│   ├── hiteshJul18.R
│   ├── lalit1.R
│   ├── meena1
│   ├── meena2.R
│   ├── meena3.R
│   └── sidana2.R
├── Data/
│   ├── AuctionsData - set1.csv
│   ├── Churn.csv
│   ├── Computers.csv
│   ├── MA.RData
│   ├── MMM_raw_data_v02.csv
│   ├── NPS Data Food Order v01.csv
│   ├── Predict Merchant_Sales v01.csv
│   ├── Prostate_Cancer.csv
│   ├── Rdatasets.R
│   ├── Sales.csv
│   ├── Sales_files/
│   │   ├── 6006907
│   │   ├── frameworks-95aff0b550d3fe338b645a4deebdcb1b.css
│   │   ├── frameworks-b3cd8fa1481bc34c4b18cf307ca75438.js.download
│   │   ├── github-542f291c828bb453339765ba3a54c144.js.download
│   │   └── github-cdaf214b636e7d0581fce94eda9de4bd.css
│   ├── Segmentation_Data v01.csv
│   ├── Social_Network_Ads.csv
│   ├── airpsng.csv
│   ├── artwork.rds
│   ├── arulesfin.csv
│   ├── attendance1.csv
│   ├── attendance2.csv
│   ├── badata.Rdata
│   ├── bakery.csv
│   ├── bank.csv
│   ├── binary.csv
│   ├── bitsgoa.csv
│   ├── cclogr.csv
│   ├── clscredit.csv
│   ├── clsplay.csv
│   ├── clust_custseg.csv
│   ├── data4cluster2.csv
│   ├── data_clus_2.csv
│   ├── denco.csv
│   ├── dhiraj.csv
│   ├── dtdata.csv
│   ├── fintransactions.csv
│   ├── grades.csv
│   ├── hhe.txt
│   ├── iimc1.csv
│   ├── iimtrichy.csv
│   ├── iitgfa.csv
│   ├── iitgfa.xlsx
│   ├── iitgfa2.xlsx
│   ├── iris.csv
│   ├── itemlist1
│   ├── km5_c2.csv
│   ├── logr2.csv
│   ├── msales.csv
│   ├── mtcars.csv
│   ├── mtcars.sas7bdat
│   ├── mtcars1.csv
│   ├── myexcel.xlsx
│   ├── myitems1.csv
│   ├── myrules1.csv
│   ├── myworkbook.xlsx
│   ├── node1.csv
│   ├── pumba.csv
│   ├── rules.csv
│   ├── rulesR.csv
│   ├── salesslr.csv
│   ├── slr1.csv
│   ├── student.csv
│   ├── student1.xlsx
│   ├── student2.xlsx
│   ├── student3.xlsx
│   ├── student3a.xlsx
│   ├── studentdata2.csv
│   ├── studentdata3.txt
│   ├── studentdata4.csv
│   ├── students3.csv
│   ├── talltransactions.csv
│   ├── tendulkar.csv
│   ├── women.sav
│   └── ximb.csv
├── Unsorted/
│   ├── CLT.R
│   ├── R-Exercise.R
│   ├── RCommander.R.R
│   ├── basiclm1.R
│   ├── binomial.R
│   ├── boxplot.R
│   ├── c.R
│   ├── central1.R
│   ├── colstats1.R
│   ├── complextables.R
│   ├── cor1.R
│   ├── crossfold1.R
│   ├── cut1.R
│   ├── cutprety1.R
│   ├── datalevels.R
│   ├── dbconnection.R
│   ├── dec17.R
│   ├── dec17b.R
│   ├── demo1.R
│   ├── density2.R
│   ├── descstatsgraphs1.R
│   ├── dplyr1.R
│   ├── ds1.R
│   ├── env1.R
│   ├── examB.R
│   ├── extra.R
│   ├── fd1.R
│   ├── fd2.R
│   ├── fd3.R
│   ├── fd4.R
│   ├── fd5-means.R
│   ├── googleS.R
│   ├── knitr.R
│   ├── kurtosis.R
│   ├── lm-sales.R
│   ├── lm1-sales.R
│   ├── lm1.R
│   ├── miscscripts.R
│   ├── nd1.R
│   ├── normal.R
│   ├── paneldata1.R
│   ├── plot1.R
│   ├── practise-dec17c.R
│   ├── practise.R
│   ├── rattle1.R
│   ├── rattle2.R
│   ├── rcdr1.R
│   ├── rjava.R
│   ├── rle1.R
│   ├── sample1.R
│   ├── sample2.R
│   ├── scripting1.R
│   ├── skewness1.R
│   ├── skewness2.R
│   ├── skewness3.R
│   ├── smpdist1.R
│   ├── summary1.R
│   ├── sumstats1.R
│   ├── ttest1.R
│   └── vaibhavi.R
├── _config.yml
├── all_letters.csv
├── analytics.Rproj
├── blank.R
├── cacert.pem
├── car.data
├── data/
│   ├── Dataset1-Media-Example-EDGES.csv
│   ├── Dataset1-Media-Example-NODES.csv
│   ├── Dataset2-Media-User-Example-EDGES.csv
│   ├── Dataset2-Media-User-Example-NODES.csv
│   ├── ItemList.csv
│   ├── MBA.csv
│   ├── MBArules.csv
│   ├── Prostate_Cancer.csv
│   ├── Rules_20.csv
│   ├── StudentPassFail.csv
│   ├── StudentTid1.csv
│   ├── StudentTid2.csv
│   ├── ar14.csv
│   ├── ar14b.csv
│   ├── dar1.csv
│   ├── dar1w.csv
│   ├── dar1w.csv.arff
│   ├── dar2.csv
│   ├── dar3.csv
│   ├── dar3a.csv
│   ├── dar3b.csv
│   ├── data1.R
│   ├── dateformat1.R
│   ├── groceries.csv
│   ├── mushrooms.csv
│   ├── my_basket
│   ├── onsen.csv
│   ├── splitData1.R
│   ├── student1.csv
│   ├── studentdata.R
│   ├── studentdata.csv
│   ├── titanic.csv
│   └── titanic.raw.rdata
├── download/
│   ├── fms.txt
│   ├── iris.csv
│   ├── iris.xlsx
│   ├── rowling.txt
│   └── vector.R
├── file1.R
├── fms.txt
├── iimc1.R
├── mdi1.R
├── packages/
│   ├── switchr.R
│   ├── useful.R
│   └── useful2.R
├── report/
│   ├── example1.Rmd
│   ├── knit2.R
│   ├── knitr-minimal.R
│   ├── knitr-minimal.tex
│   ├── mdreport1.Rmd
│   ├── report2.Rmd
│   ├── report3.Rmd
│   ├── report4.Rmd
│   ├── report4.html
│   ├── report4.log
│   ├── reportnotes
│   ├── sample1.R
│   ├── sample2.R
│   └── xbrl.Cache/
│       ├── aapl-20140927.xml
│       ├── aapl-20140927.xsd
│       ├── aapl-20140927_cal.xml
│       ├── aapl-20140927_def.xml
│       ├── aapl-20140927_lab.xml
│       ├── aapl-20140927_pre.xml
│       ├── country-2013-01-31.xsd
│       ├── currency-2014-01-31.xsd
│       ├── dei-2014-01-31.xsd
│       ├── exch-2014-01-31.xsd
│       ├── invest-2013-01-31.xsd
│       ├── naics-2011-01-31.xsd
│       ├── nonNumeric-2009-12-16.xsd
│       ├── numeric-2009-12-16.xsd
│       ├── ref-2006-02-27.xsd
│       ├── us-gaap-2014-01-31.xsd
│       ├── us-roles-2014-01-31.xsd
│       ├── us-types-2014-01-31.xsd
│       ├── xbrl-instance-2003-12-31.xsd
│       ├── xbrl-linkbase-2003-12-31.xsd
│       ├── xbrldt-2005.xsd
│       ├── xl-2003-12-31.xsd
│       └── xlink-2003-12-31.xsd
├── studentdata.csv
├── twitter authentication.Rdata
└── ximb.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.Rproj.user
.Rhistory
.RData
.Ruserdata
.httr-oauth
SQF 2012.csv


================================================
FILE: 0-Practise/day1.R
================================================
# Day 1

library(ISLR)
data('Default')
str(Default)
LR1 = glm(default ~ ., family='binomial', data=Default)
summary(LR1) # income is not significant; drop it in the next model
LR2 = glm(default ~ student + balance, family='binomial', data=Default)
summary(LR2)
#
range(Default$balance)

ndata3 = Default[c(1,60,700),]
predict(LR2,newdata=ndata3, type='response' )

#mtcars
str(mtcars)






#vectors, arrays, matrix, list, factor, dataframe

x = 1:5
x1 = c('a','b')
m1 = matrix(1:24, nrow=6)
m1
list1 = list(x, x1, m1)
list1
class(women)
women
str(women)
?women
women
head(women)
tail(women,n=3)
head(women, n=3)
names(women)
summary(women)
dim(women)
data()
library(MASS)
x = women$height
x
plot(x)
mean(x)
sd(x) ; var(x)
max(x)
median(x)
x
sort(x, decreasing = T)
table(x)
quantile(x)
x
seq(0,1,.1)
quantile(x, c(.1, .5, .8))
quantile(x,seq(0,1,.1) )
summary(x)
min(x); max(x)
boxplot(x)
abline(h= c(min(x), max(x),mean(x)+1, median(x)), col=1:5, lwd=4)
#


# LM
head(women)
names(women)
model1 = lm(weight ~ height, data=women)
plot(women)
?lm
#options(scipen=999)
summary(model1)
model1
#y = mx + c
y = 3.45 * x - 87  # slope and intercept from summary(model1)
women$height
fitted(model1)
cbind(women, fitted(model1))
residuals(model1)
cbind(women, fitted(model1), residuals(model1), diff= fitted(model1) - women$weight)
sqrt(sum(residuals(model1)^2)/nrow(women))
cbind(women, fitted(model1))
range(women$height)
new1= data.frame(height=c(57, 60.5,70))
p1=predict(model1, newdata = new1)
cbind(new1, p1)

#mtcars----
names(mtcars)
?mtcars
mtmodel_1 = lm(mpg ~ wt, data=mtcars )
mtmodel_2 = lm(mpg ~ wt + disp, data=mtcars )
mtmodel_3 = lm(mpg ~ wt + disp + cyl, data=mtcars )
mtmodel_4 = lm(mpg ~ ., data=mtcars )
summary(mtmodel_1)  #.745
summary(mtmodel_2)  #.766
summary(mtmodel_3)  #.766
summary(mtmodel_4)  #.807
AIC(mtmodel_1, mtmodel_2,mtmodel_3,mtmodel_4)
summary(mtmodel_4)  #.807
step(lm(mpg ~ ., data=mtcars ))
mtmodel_5= lm(mpg ~ wt + qsec + am, data=mtcars)
summary(mtmodel_5) #Adjusted R-squared:  0.834 


#

attendance = 1:20
marks = 1:20
summary(lm(marks ~ attendance))
cbind(attendance, marks)
cor(attendance, marks)
#
#y = mx + c
x
y = 3.45 * x - 87
x
head(women)
(y = 3.45 * 58 - 87)  # predicted weight at height 58
plot(women)
abline(model1, col='red', lwd=4)
abline(v=64) ; abline(h=150)



x2 = floor(runif(1000, 50, 100))
x2
x2a= sort(x2)
x2a[1000/2]  # value at position 500; for even n, median() averages the 500th and 501st values
median(x2)
sort(x)

t1= table(x2)
sort(t1, decreasing = T)

x1 = rep(10,10)
x1
sd(x1)




dim(mtcars)
mtlogmodel = glm(am ~ hp + wt, family='binomial', data=mtcars)
summary(mtlogmodel)
p1=predict(mtlogmodel, newdata=mtcars, type='response')
p2= round(p1, 3)
p3 = ifelse(p2<0.5,0,1)
cbind(mtcars$am, mtcars$hp, mtcars$wt, p2,p3, truefalse= mtcars$am == p3)
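
# a minimal sketch (extension of the example above, not in the original):
# summarise prediction quality with a confusion matrix and overall accuracy
(cm = table(actual = mtcars$am, predicted = p3))
sum(diag(cm)) / sum(cm)  # proportion of correct predictions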




================================================
FILE: 0-Practise/day2.R
================================================
# Day 2 - Online batch of MA

#attach function of R
women
names(women)
height  # error: object 'height' not found (women is not attached yet)
attach(women)
height  # now accessible via the attached data frame
weight
women$height

#List
g <- "My First List"  #scalar
h <- c(25, 26, 18, 39)  # numeric vector
j <- matrix(1:10, nrow=5) #matrix
k <- c("one", "two", "three") # character vector
mylist <- list(title=g, ages=h, j, k, women)
mylist

mylist[[2]]
mylist[[5]]


# plot
plot(x=height, y=weight, type='b', lty=5, pch=11, fg='red', bg='green', col.axis='purple', cex=1.5, cex.axis=2)
title(main='Henry Harvin', sub=' MA Course')


================================================
FILE: 0-Practise/day3.R
================================================
attach(mtcars)
plot(wt, mpg)
abline(lm(mpg~wt))
title("Regression of MPG on Weight")
detach(mtcars)

dose <- c(20, 30, 40, 45, 60)
drugA <- c(16, 20, 27, 40, 60)
drugB <- c(15, 18, 25, 31, 40)
plot(dose, drugA, type="l")
plot(dose, drugA, type="b")

par(no.readonly = T)
opar <- par(no.readonly=TRUE)
plot(dose, drugA, type="b")
par(lty=2, pch=17)
plot(dose, drugA, type="b")
plot(dose, drugB, type="b")
par(opar)
plot(dose, drugA, type="b")
plot(dose, drugA, type="b",fg='red', col='purple', col.axis='green')

library(RColorBrewer)
n <- 7
mycolors <- brewer.pal(n, "Set1")
barplot(rep(1,n), col=mycolors)
barplot(rep(1,n), col=1:7)

n <- 10
mycolors <- rainbow(n)
pie(rep(1, n), labels=mycolors, col=mycolors)
mygrays <- gray(0:n/n)
pie(rep(1, n), labels=mygrays, col=mygrays)


================================================
FILE: 0-Practise/first.R
================================================
# First File in R
x1 <- c(1, 5, 4, 9, 0) # <- is assignment x to have value 1,5,4,9,0
#control + enter
x2 = c(1, 5, 4, 9, 0)
x1
x2

x = c(1,2,3,4,5,6,7,8,9,10)
x
x = 1:100
x
x = runif(100, 50, 200)
?runif
x
x = rnorm(100, mean=50, sd=10)
x
trunc(x)
round(x,1)
floor(x)
ceiling(x)
hist(x)
as.integer(x)
plot(density(x))
abline(v=50)
?runif
head(x)
class(x);mode(x)
typeof(x)
summary(x)


# Types of Data Structures in R
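# The heading above has no examples in this file; a minimal sketch of the
# common structures follows:
v = c(10, 20, 30)                    # vector: elements of one type
m = matrix(1:6, nrow = 2)            # matrix: 2-dimensional, one type
l = list(nums = v, mat = m)          # list: mixed types, optionally named
df = data.frame(id = 1:3, val = v)   # data frame: equal-length columns
f = factor(c('M', 'F', 'M'))         # factor: categorical values
str(l); str(df); levels(f)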




================================================
FILE: 0-Practise/htmlimport.R
================================================
#Installing the web scraping package rvest
#install.packages("rvest")
library(rvest)
#Specifying the url for the desired website to be scraped
url <- 'http://pgdbablog.wordpress.com/2015/12/10/pre-semester-at-iim-calcutta/'


#Reading the HTML code from the website
webpage <- read_html(url)

#Know about the selector gadget
vignette("selectorgadget")

#Using CSS selectors to scrape the post date
post_date_html <- html_nodes(webpage,'.entry-date')

post_date_html <- html_nodes(webpage,'.published , .entry-title')
#Converting the post date to text
post_date <- html_text(post_date_html)

#Verify the date captured
post_date



url="www.imdb.com"

#Using the CSS selector (using ‘www.imdb.com’ website in this example)
rating_html=html_nodes(webpage,'.imdb-rating')   #’.imdb-rating’ is taken from CSS selector

#Converting the rating data to text
rating <- html_text(rating_html)

#Check the rating captured
rating

html <- read_html("http://www.imdb.com/title/tt1490017/")
cast <- html_nodes(html, "#titleCast .itemprop")
length(cast)
#> [1] 30
cast[1:2]


html <- read_html("http://www.imdb.com/title/tt1490017/")
cast <- html_nodes(html, ".quicklink")
length(cast)
#> [1] 15

html_text(cast)



url="https://www.timeshighereducation.com/world-university-rankings/2019/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats"
typeof(url)
length(url)
links= ".stats_female_male_ratio , .stats_pc_intl_students , .stats_student_staff_ratio , .stats_number_students , .ranking-institution-title"

html <- read_html("https://www.timeshighereducation.com/world-university-rankings/2019/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats")
cast <- html_nodes(html, ".stats_female_male_ratio")
length(cast)
#> [1] 30
cast[1:2]
cast
ranks <- html_nodes(html, ".ranking-institution-title")  # parse the html object, not the URL string




library(rvest)

URL <- "https://www.soccerstats.com/latest.asp?league=netherlands" #Feed page

WS <- read_html (URL) #reads webpage into WS variable

URLs <- WS %>% html_nodes("a:nth-child(1)") %>% html_attr("href") %>% as.character() # Get the CSS nodes & extract the URLs

URLs <- paste0("http://www.soccerstats.com/",URLs) 

oversdf <- data.frame(URLs=URLs)

rownames(oversdf)  #returns a vector of row names in the overs data.frame:

URLs <- subset(oversdf, grepl("pmatch", oversdf$URLs))  # keep match-page URLs; note subset() has no stringsAsFactors argument

write.csv(URLs,file=paste(getwd(),"/sportURLs.csv",sep=""),row.names=FALSE)

Catcher1 <- data.frame(FMatch=character(), TotalGoals=character(), stringsAsFactors = FALSE)

##################################
#start of workaround
n<-nrow(URLs)
URLs2<-character()
for (i in 1:n) {
  URLs2[i]<-as.character(URLs[i,1])
}


library(dplyr)
library(rvest)
web = read_html("https://news.google.com/?hl=en-IN&gl=IN&ceid=IN:en")

web %>% html_nodes(".VDXfz") %>% html_text()

library(rvest)
library(purrr)  # package name is purrr, not 'purr'
url_base = "https://www.cochranelibrary.com/cdsr/table-of-contents/2018/11"
#map_df(1:4)

page = read_html(url_base)
data.frame(paper = html_text(html_nodes(page, ".search-result-doi")))


url2 = "http://www.espncricinfo.com/india/content/player/28081.html"
library(rvest)
library(curl)
msd = read_html(url2)
msd

msd2 <- msd %>% html_nodes("table") %>% .[1] %>% html_table(fill=T)
msd2
str(msd2)

#-----
url3 = "https://www.timeshighereducation.com/world-university-rankings/2019/world-ranking#!/page/0/length/-1/sort_by/rank/sort_order/asc/cols/stats"
the = read_html(url3)
the

the3 <- the %>% html_nodes("table") %>% .[1] %>% html_table(fill=T)
the3
str(the3)

================================================
FILE: 0-Practise/import2.R
================================================
#web scraping
#https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/


#Loading the rvest package
library('rvest')

#Specifying the url for the desired website to be scraped
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'

#Reading the HTML code from the website
webpage <- read_html(url)
webpage

#Using CSS selectors to scrape the rankings section
rank_data_html <- html_nodes(webpage,'.text-primary')

#Converting the ranking data to text
rank_data <- html_text(rank_data_html)

#Let's have a look at the rankings
head(rank_data)

#Data-Preprocessing: Converting rankings to numerical
rank_data<-as.numeric(rank_data)

#Let's have another look at the rankings
head(rank_data)

#[1] 1 2 3 4 5 6


#Using CSS selectors to scrape the title section
title_data_html <- html_nodes(webpage,'.lister-item-header a')

#Converting the title data to text
title_data <- html_text(title_data_html)

#Let's have a look at the title
head(title_data)

#[1] "Sing"          "Moana"         "Moonlight"     "Hacksaw Ridge"
#[5] "Passengers"    "Trolls"

#Using CSS selectors to scrape the description section
description_data_html <- html_nodes(webpage,'.ratings-bar+ .text-muted')

#Converting the description data to text
description_data <- html_text(description_data_html)

#Let's have a look at the description data
head(description_data)

#Data-Preprocessing: removing '\n'
description_data<-gsub("\n","",description_data)

#Let's have another look at the description data 
head(description_data)


================================================
FILE: 0-Practise/practise.R
================================================
#List
x; m1; a1; df1  # objects created in later sections of this file
g ="My First List"
h = c(25, 26,18,39)
j = matrix(1:10,nrow=2)
k = c('one','two','three')
mylist = list(title=g, ages=h, j, h)
mylist2 = list(k, mylist)
mylist2
mylist
mylist[1]
mylist[2]
mylist[[2]]
mylist[['ages']]
mylist$ages

#List end


#Factors
df1
# category type - ordered or unordered
#gender,  course, color - unordered
#grades, division, position,likertscale, ratings
summary(df1)
(grades = sample(c('A', 'B', 'C'),size=10, replace=T, prob=c(.4,.3,.3)))

df1$grades = grades
df1
summary(df1)
df1$gender = factor(df1$gender)
summary(df1)
df1$grades =factor(df1$grades, ordered=T)
df1$grades
aggregate(df1$age, by=list(df1$grades), mean)
aggregate(df1$age, by=list(df1$gender), mean)
aggregate(df1$age, by=list(df1$course), mean)
(df1$grades =factor(df1$grades, ordered=T, levels=c('C','B','A')))

(division = sample(c('Excellent', 'Very Good', 'Sat'),size=10, replace=T, prob=c(.4,.3,.3)))
division
summary(division)
Fdivision = factor(division)
summary(Fdivision)
Fdivision2 = factor(division, ordered=T, levels=c('Sat', 'Very Good', 'Excellent'))
summary(Fdivision2)
Fdivision2
Fdivision3 = factor(division, ordered=T)
summary(Fdivision3)
Fdivision3
#factors end


#Data Frame
(rollno = 1:10)
(sname = paste('Student',1:10,sep='-'))
(age = floor(runif(10, 20, 30)))
(gender = c(rep('M',5),rep('F',5)))
(course = sample(c('Engg','Medical','MBA'), 10, replace=T, prob=c(.3, .4, .3)))
table(course)
(married = sample(c(TRUE, FALSE), 10, replace=T))
table(married)
rollno; sname; age ; gender; course; married

(df1 = data.frame(rollno, sname, age , gender, course, married))
df1[1:2,3:4]
df1$sname
df1[df1$married==T,  ]
df1[df1$course=='Engg' & df1$age > 25,  ]
df1[df1$married==T & df1$course=='Engg' & df1$age > 25, c('sname') ]
?aggregate
aggregate(df1$age, by=list(df1$gender), FUN=mean)
aggregate(df1$age, by=list(df1$course), FUN=mean)
aggregate(df1$age, by=list(df1$course, df1$gender), FUN=mean)


df1
summary(df1)

#DF end








#Array
?array
array(data = NA, dim = length(data), dimnames = NULL)  # the default signature from ?array
#Coys - 5, Products-3, Locations-4
ceiling(3.2);

(salesfig = floor(runif(60, 70, 100)))
(a1 = array(data = salesfig, dim = c(4,3,5), dimnames = list(paste('Loc',1:4),paste('Prod',1:3),paste('Coy',1:5))))
apply(a1,1, sum)# sum locationwise
apply(a1,2, sum)
apply(a1,3, sum)
(ma1 = apply(a1,c(1,3), sum))
colSums(ma1)
rowSums(ma1)

apply(a1,c(2,3), sum)




#arrayend


#Matrix
#row x columns
?rnorm
set.seed(1234)
(x = trunc(runif(24,100,500)))
(m1 = matrix(data=x, nrow=4,dimnames = list(c('delhi','mumbai','noida','chennai'),paste('Prod',1:6,sep="-"))))
colMeans(m1);rowMeans(m1)
colSums(m1); rowSums(m1)
pie(x=rowMeans(m1))
barplot(rowMeans(m1))  # barplot for locations
barplot(colMeans(m1))  #barlplot for products
barplot(colMeans(m1), horiz = T)
barplot(colMeans(m1), horiz = T, col=1:6)
m1
#Subset a Matrix
m1[ , 1:2]
m1[ ,c(1,4)]
m1[c(1,3) ,c(1,4)]
m1[c('delhi','mumbai'),c('Prod-3')]
m1[m1 > 300]
m1
m1[c('delhi'),]
sd(m1[c('delhi'),])
sum(m1[c('delhi','mumbai'),c('Prod-3','Prod-4')])

#m end

(m2 = matrix(data=x, nrow=4, byrow = T))
(m3= matrix(x, ncol=4 ))
(m4 = matrix(c(1,2,3,4), nrow=2, ncol=4,byrow=T))
m1




# Vectors
# snames = string (single char, multiple char)
# marks1 = numeric (integer, decimal)
# married = logical (TRUE or FALSE)
# gender = categories
snames = c('student1', "student2", 'student3')
snames
class(snames)
marks = c(10, 20 ,30)
marks
class(marks)
married = c(TRUE, FALSE, TRUE)
married
class(married)
snames; marks; married
(age = c(30,35,26))
age = c(30,35,26)
age
?class

#subsetting a Vector
(x1 = 1:100)
(x2 = seq(50,100,3))
?seq
x1
x1[10]
x1[20:30]
x2[2:5]
x1[x1 > 50]
x2
x2 > 70
x2[(x2 > 70) | (x2 < 60)]
x2[c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,  TRUE,  TRUE,  TRUE,TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE)]
x2
#operations on the vector
x3 = c('A','B','C')
x3[c(1,3)]
x3[c(TRUE, FALSE, TRUE)]
x3[(x3=='A') | (x3=='C')]

x2[(x2 > 60) & (x2 < 70)]
x2[x2==69]
x2==69

x2
length(x2)
length(x1)
mean(x2)
sum(x2)/length(x2)
sum(x2)
median(x2)
x2

set.seed(1)
(x4 = trunc(runif(20,5,100)))
median(x4)
(shirtcolors = c("red",'blue','green','blue','green', 'blue'))
mean(shirtcolors)
table(shirtcolors)

sort(x4)

mtcars$mpg
mean(mtcars$mpg)
women
data()
?mtcars

snames

x = rnorm(1000000, 50,10)
head(x)
mean(x)
hist(x)


================================================
FILE: 0-Practise/practise2.R
================================================
#misc practise

x = 1:5
data.entry(x)  # opens an interactive spreadsheet-style editor (requires a GUI)


================================================
FILE: 0-Practise/rough.R
================================================
# Rough Work
?cat
?dput
?dget
?dump
?write
?write.table
?save
?detach
?attach
?dir
?ls
?rm
?attr
?attributes

data1 = c(3, 60, 19, 9, 4 , 5)

labels1 = c('Building training sets', 'Cleaning and Organising Data', 'Collecting Data sets', 'Mining data for patterns', 'Refining Algorithms', 'Others')
pie(data1, labels = labels1)  # use labels1 here; pielabels is only defined further below

barplot(data1)
pie(data1, labels=NA, clockwise=TRUE,
    col=rainbow(6), border="white", radius=1.2,
    cex=0.8, main="Average Time Spent by Data Scientists")
legend("bottomright", legend=labels1, bty="n", # horiz = T,
       fill=rainbow(6))
barplot(data1, col=rainbow(6), names.arg=labels1,
        cex.names = 1, horiz=T, angle=90,
        main="Average Time Spent by Data Scientists")
text(1:6, data1, labels=labels1)
?barplot



#browsers<-read.table("browsers.txt",header=TRUE)  # browsers.txt is not in the repo; use data1 instead
browsers<-data1
browsers
pielabels <- sprintf("%s = %3.1f%s", labels1, browsers, "%")
pielabels
?sprintf
library(RColorBrewer)
pie(browsers,
    labels=NA,
    clockwise=TRUE,
    col=brewer.pal(6,"Set1"),
    border="white",
    radius=1,
    cex=0.8,
    main="Percentage Share of Internet Browser usage")
legend("bottomleft",legend=labels1,bty="n",
       fill=brewer.pal(6,"Set1"))


#?strde   # leftover; no such help topic
#states   # leftover; object not defined here


================================================
FILE: 0-Practise/vector.R
================================================
#Data Structure - Vectors

x = c(1,5,7,8,4)
x2 <- c(2,5,7,8,4)
x
x2
x4 = c('M','F','M','F','M')
x4
(x5 = 1:100)
(x6 = seq(1,100,by=3))

marks = rnorm(60, mean=60, sd=10)
marks
plot(density(marks))
matrix(marks, ncol=6)








================================================
FILE: 00-toc.R
================================================
# Table of Contents

#Folder Name - Topic

# lms - data structures and csv import (see folder 02-lms)

================================================
FILE: 02-lms/1-ds.R
================================================
# Data Structures

#vectors----
v1 = 1:100 #create vector from 1 to 100
v2 = c(1,4,5,10)
class(v1)
class(v2)
v3 = c('a','Dhiraj','Ashish')
v3   #print the vector
class(v3)
v4 = c(TRUE, FALSE, T, F, T)
class(v4)

#summary on vectors
mean(v1)
median(v1)
sd(v1)
var(v1)
hist(v1)
hist(women$height)
v2
v2[v2>=5]

x = rnorm(60, mean=60, sd=10)
x
plot(x)
hist(x)
plot(density(x))
abline(v=60)
#rectangles and density together
hist(x, freq=F)
lines(density(x))

hist(x, breaks=10, col=1:10)
length(x)
sd(x)

?sample
x1 = LETTERS[5:20]
x1

set.seed(1234)
y1 = sample(x1)
y1

set.seed(53)
(y2= sample(x1, size=5))

(gender=sample(c('M','F'), size=1000000, replace=TRUE, prob=c(.3,.7)))
(t1=table(gender))
prop.table(t1)
pie(t1)
barplot(t1, col=1:2, horiz=T)
#


#matrix----
(m1 = matrix(1:24, nrow=4))
(m2 = matrix(1:24, nrow=4, byrow=T))
(m3 = matrix(1:24, ncol=4, byrow=T))
(x=trunc(runif(60,60,100)))
#round, trunc, ceiling, floor
plot(density(x))
(m4 = matrix(x, ncol=6))
colSums(m4)
rowSums(m4)
rowMeans(m4)
colMeans(m4)
m4[m4> 67 & m4 < 86]
m4[8:10, ]
m4[8:10, c(1,3,5) ]
rowSums(m4[8:10, c(1,3,5) ])
m4[ ,c(1,5,6) ]
#
#array----
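# (a minimal sketch filling this empty section: a 3-D array = rows x columns x layers)
(a1 = array(1:24, dim = c(2, 3, 4)))
dim(a1)
a1[1, 2, 3]          # element at row 1, column 2, layer 3
apply(a1, 3, sum)    # sum of each layer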


#data.frame
#rollno, name, gender, course, marks1, marks2,grades
(rollno = 1:60)
(name=paste('student1',1:60,sep='-'))
(gender=sample(c('Male','Female'), size=60, replace=T,prob=c(.3,.7)))
(course=sample(c('BBA','MBA','FPM'), size=60, replace=T, prob=c(.4,.2,.1)))
(marks1 = ceiling(rnorm(60, mean=65, sd=7)))
(marks2 = ceiling(rnorm(60, mean=60, sd=11)))
(grades = sample(c('A','B','C'), size=60, replace=T))
students = data.frame(rollno, name,gender, course, marks1, marks2, grades, stringsAsFactors = F)
class(students )

summary(students)
students[, c('name')]
students[students$gender == 'Male', c('rollno','gender','marks1') ]
students[students$gender == 'Male' & students$grades == 'C', c('rollno','gender','marks1') ]
students[students$marks1 > 55 | students$marks1 < 75, c('name','marks1') ]
students$gender
t1=table(students$gender)
barplot(table(students$course), ylim=c(0,50), col=1:3)
text(1:3,table(students$course)+5,table(students$course) )
str(students)
nrow(students)
names(students)
dim(students)
head(students)
tail(students)
head(students,n=7)
#
#avg marks scored by each gender in marks1
#gender, marks1
aggregate(students$marks1, by=list(students$gender), FUN=mean)
aggregate(students$marks2, by=list(students$course), FUN=max)
aggregate(students$marks2, by=list(students$course, students$gender), FUN=mean)

#dplyr
library(dplyr)
students %>% group_by(gender) %>% summarise(mean(marks1))
students %>% group_by(course,gender) %>% summarise(meanmarks1= mean(marks1), min(marks2),max(marks2)) %>% arrange(desc(meanmarks1))
students %>% arrange(desc(marks1)) %>% filter(gender=='Male') %>% head(n=3)
students %>% sample_frac(.1) %>% arrange(course) %>% select(name, gender)
students %>% sample_n(2) 
students %>% arrange(desc(course), gender, marks1)
students %>% arrange(course, grades,marks1) %>% select(course, grades, marks1) %>% filter(course=='BBA')
#
students %>% group_by(course, gender) %>% arrange(marks1) %>% top_n(n=1)


#factor
names(students)
students$gender = factor(students$gender)
summary(students$gender)
summary(students$course)
students$course = factor(students$course,ordered=T)
summary(students$course)
students$course = factor(students$course,ordered=T, levels=c('FPM','MBA','BBA'))
summary(students$course)
students$grades
# C, A, B
students$grades = factor(students$grades, ordered=T, levels=c('C','A','B'))
students$grades
table(students$grades)
barplot(table(students$grades))

#

students
write.csv(students,'./data/iimtrichy.csv' )
students2 = read.csv('./data/iimtrichy.csv')
head(students2)
students3 = read.csv(file.choose())
head(students3)
students3 = students3[,-1]
head(students3)
#






#extra commands
name[1:10]
name[15:20]
name[c(15,20,37)]
name[-c(1:10)]
rev(name)
name[60:1]
name[-c(1:10, 35:40)]


mtcars
plot(women)
women
install.packages('car')
?women
?mean
x = 1:100
x
y = seq(1,100, by=2)



================================================
FILE: 02-lms/fms.txt
================================================
The Faculty of Management Studies focuses on management education more than just business management. The commitment is thought leadership with a deep understanding of business. The approach to pedagogy combines fieldwork, case studies and instrumented feedback with a strong emphasis on concepts and theory. The intent is to encourage intellectual curiosity and open minds to the adventure of ideas.

But much about the school is not just about what is taught within its confines. FMS has the unique privilege to be part of one of the premier universities world – the University of Delhi - with some of the finest departments in Economics, Law, Sociology, Commerce and Operations Research. Our collaborative approach involves inputs from various departments which gives the benefit of a much wider view and deeper understanding. This is indicated by the success of our alumni who are in positions of leadership in industries & governments across the world. The network of more than 10000 alumni gives students unmatched access to information, mentors and careers.

FMS rewards initiative, novelty and thinking outside the box. Our students are individualists of enormous intellectual energy with a talent for collaboration and teamwork. We are a diverse lot, possessing flair and dynamism that develops in a metropolis like Delhi and thrives on challenges both inside and outside the classroom.

Corporate recruiters value our graduates for their intellectual abilities, their collaborative mind-set, their individuality and their ability to hit the ground running.

We invite you to discover FMS for yourself.

================================================
FILE: 02-lms/importcsv.R
================================================
#import from csv

df1 = read.csv('pdpu.csv')
df1
head(df1)
names(df1)
#avg marks gender wise
aggregate(cbind(df1$marks1,df1$marks2), by=list(df1$gender), FUN=mean)
t1=table(df1$gender)
table(df1$batch)
barplot(table(df1$gender), col=1:2)
barplot(t1, col=c('red','green'))
pie(t1)


write.csv(df1,'pdpu2.csv')
write.csv(t1,'pdpu3.csv')


mtcars
str(mtcars)
df2 = mtcars
df3=rbind(df2,mtcars)
dim(df3)


================================================
FILE: 03-wksp1/1a1-start.R
================================================
#initial commands

# assign
x1 = 3 #press control + enter to run the line
x2 <- 3 # same 
#which is better
x1
x2
y  # error: object 'y' not found (nothing has been assigned to y)
ls() #variables in env
women

?AirPassengers
data() # datasets available for use

library()  # list installed packages (search() shows currently loaded ones)

?mean  #help
help(mean)
??mean  #search through other sources
x=0:10
x
x <- c(0:10, 50)
x
xm <- mean(x)
xm
mean(x, trim = 0.10)
x=c(1,1,1,1,5,5,5,5,7,7)
mean(x)
mean(x, trim=.3)
x=c(1,5,5)
mean(x)
c(mean(x), mean(x, trim = 0.10))

1:10
1:10000000
x=c(1,34,5)
x
?c
version #version of R

Sys.Date() # todays date

getwd()  # working directory

methods(class='matrix')  #methods available for a class of object

plot(10:100) #basic command to plot
plot(women)


================================================
FILE: 03-wksp1/1a3-packages1.R
================================================
# Packages installation

#List available (installed) packages
library()


#Total packages available on CRAN
nrow(available.packages())

#Install Package amap
install.packages('amap')

#Load package
library(amap)

#Find functions in package
library(help=amap)

#Help wrt a package
help(package='amap')  #see on right side pane


#Unload---
install.packages('tm')

library(tm)
library(VIM)
search()
detach('package:tm', unload=TRUE)
detach(package:VIM, unload = T) 
search()


#----------------- Part I Over
#Detach Multiple Packages
(detpkg = c('plyr','tm'))
library('plyr') #load lib
library('tm')  #load lib
search()  #check if loaded
Vectorize(detach)(name=paste0("package:", detpkg), unload=TRUE, character.only=TRUE)  #code to detach
search()  # confirm if removed
#specify the argument unload=TRUE; otherwise, R removes the package from the search path but doesn’t unload it.

#Detach from memory all packages
rm(list = ls(all = TRUE))
sessionInfo()


#Remove Packages ----- uninstall 
remove.packages("tm")
require('tm') # check if uninstalled (returns FALSE with a warning)



#multiple packages

#Function
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) 
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
packagelist <- c('NLP','tm', 'lubridate')
ipak(packagelist)

#  package lists
pkg1 = c('dplyr','plyr', 'data.table','xlsx', 'Hmisc','rJava','ggplot2','lattice','gsheet','curl','stringr', 'syuzhet','e1071', 'caTools','caret', 'olsrr' ,'swirl','sqldf','XML','VIM', 'outliers','car','MASS','DMwR','rvest')
pkg2 = c('forecast','rpart', 'rpart.plot', 'partykit','strucchange', 'didrooRFM')
pkg3 = c('zoo', 'astsa','lubridate','timeSeries','tseries','xts')
pkg4 = c('arules' , 'arulesViz')
pkg5 = c('twitteR','ROAuth','RGtk2','RTextTools','wordcloud')
install.packages("CHAID", repos="http://R-Forge.R-project.org")


library(lubridate)

#multiple load 
easypackages::libraries(pkg1)


#easypackages
#install.packages('easypackages')
library(easypackages)

#Install Multiple Packages
packages("plyr", "psych", "tm")
libraries("plyr", "psych", "tm")



#lubripack
install.packages('lubripack')  #NA for some versions
#install older version of R packages
#https://support.rstudio.com/hc/en-us/articles/219949047-Installing-older-versions-of-packages
#https://rdrr.io/github/Espanta/lubripack/

require(devtools)
install_github("Espanta/lubripack")
library(lubripack)
lubripack("plyr", "psych", "tm", "quantmod")



pack <- available.packages()
pack["ggplot2","Depends"]
pack["ggplot2","Imports"]
pack["data.table","Depends"]

packrat:::recursivePackageDependencies("ggplot2",lib.loc = .libPaths()[1])
tools::dependsOnPkgs('ggplot2')
tools::dependsOnPkgs('dplyr')
tools::dependsOnPkgs("ggplot2",installed=available.packages())
library(rusk)
#Remove Package
remove.packages('quantmod')
library(quantmod)



#This will detach all loaded packages
library(mise)
search()
#mise(vars = TRUE, figs = TRUE, console = TRUE, pkgs = FALSE)
mise(pkgs=T)
search()



# list all packages where an update is available
old.packages()

# update all available packages
update.packages()

# update, without prompts for permission/clarification
update.packages(ask = FALSE)


================================================
FILE: 03-wksp1/1b2-ds.R
================================================
# Data Structures in R

#control+enter when you are in the line to execute
# Vectors-----
c(2,4,6)
seq(2,3,.5)
seq(by=.5, from=2,to=3)
rep(1:3,times=4)
rep(c(3,6,7,2),each=4)
rep(c(3,6,7,2), times=4)

?rep

x=1:10   #create seq of nos from 1 to 10
x
(x1 <- 1:20)

(x1=1:30)
(x2=c(1,2,13,4,5))
class(x2)

(x3=c('a',"ABC"))
class(x3)
(x3=letters[1:10])
class(x3)
LETTERS[1:26]
(x3b = c('a',"Henry",4))
class(x3b)

(x4=c(T,FALSE,TRUE,T,F))
class(x4)
class(c(3,5))
x5a = c(3,5.5)
class(x5a)
as.integer(x5a)

x5=c(3L,5L)
class(x5)
x5a = c(3,5)
class(x5a)
(x5b = c(1, 'a',T, 4L))
class(x5b)
#blank variable ?
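#one possible answer to the question above (our addition, not in the original script):
#a "blank" variable can be a missing value (NA) or a zero-length vector
(xblank = NA)            #single missing value; logical by default
class(xblank)
(xempty = character(0))  #zero-length character vector
length(xempty)           #0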

#access elements
?seq
(x6 = seq(0,100,by=3))
seq(0,100,3)
seq(to=100,from=0,by=3)
seq(1,5,2)
#seq(1,5,2) gives: [1] 1 3 5
ls()  #variables in my environment
x6
length(x6)
x6[1:5]
x6[10:20]
x6[ seq(1,length(x6), 2)]
x6[3]  # access 3rd element
#[1] 6
x6[c(2, 4)]     # access 2nd and 4th element
x6[-1]          # access all but 1st element
x6[-c(1:10, 15:20)]
x6[c(2, -4)]    # cannot mix positive and negative integers
#Error in x6[c(2, -4)] : only 0's may be mixed with negative subscripts
x6[c(2.4, 3.54)]    # real numbers are truncated to integers
x6[c(2,3)]

x6[-c(1,5,20)]
x6
x6[x6 > 30]

x6[x6 > 30 & x6 < 40]  # 31-39
#or |
length(x6)
x6[-(length(x6)-1)]
(x7 = c(x6, x2))


#modify
x6
set.seed(1234)
(x6 = sample(1:50))
(x6b = sort(sample(1:50)))
sort(x6)
sort(x6[-c(1,2)])
sort(x6, decreasing=T)
x6
rev(x6)

seq(-3, 10, by=.2)
x6[-c(1:12)]
x6
x6[x6> 30 & x6 < 40]
(x = -3:2)
x6
x6[2:10] <- 99; x6        # modify elements 2 to 10
x6[x6 > 30 & x6 < 40] = 999
x6


x6
x7 = x6[1:4]; x7      # keep only the first 4 elements of x6

1:5
#equal partitions within a range
(x = seq(1,5, length.out = 15))
x
x = NULL
x
#NULL
x[4]
#NULL
?Distributions  #help on the distribution functions
?rnorm
(x = rnorm(100))
plot(density(x))
abline(v=c(-3,0,3))
mean(x)
(x1 = rnorm(100, mean=50, sd=5))
plot(density(x1))
abline(v=mean(x1),h=0.04)
hist(x1, breaks=7)
hist(x1)
hist(x1, freq=F)
lines(density(x1), col=2)
summary(x1)
quantile(x1)
quantile(x1, seq(0,1,.25))
quantile(x1,c(.1, .5, .8))
quantile(x1,seq(0,1,.01))
stem(x1)

#Matrix-----
100:111
length(100:111)
matrix(1,ncol=3, nrow=4)
(m1 = matrix(100:111, nrow=4))
(m2 = matrix(100:111, ncol=3, byrow=T))

x=101:124
length(x)
matrix(x, ncol=6)
class(m1)
attributes(m1)
dim(m1)
m1

# access elements of matrix
m1[1,]
m1[,1]
m1[,1, drop=F]
m1[,-1]  #remove 1st column
m1[1,2:3]
m1[c(1,3),]
m1[,-c(1,3), drop=F]
m1[m1> 105 & m1 < 108]

#names of cols and rows
m1

paste("C","D",sep="-")
paste("C",1:100,sep="-")
paste("C",1:3,sep='')
(colnames(m1) = paste('C',1:3, sep=''))
m1
(rownames(m1) = paste("R",1:4, sep=''))
m1
attributes(m1)
m1[,c('C1','C3')]
m1[,c(1,3)]
#Vector to Matrix
(m3 = 1:24)
m3
dim(m3)= c(6,4)
m3

#access elements
m2
m2[1,]  #first row
m2[c(1,3,4),]  #1st,3rd,4th row

m2[,1]  #first col
m2[,2:3] # 2nd to 3rd coln

m2[c(1,2),c(2,3)]
m2[,]
m2[-2,] # exclude 2nd row
m2
m2[1:5] # a matrix can also be indexed like a vector
m2
m2[c(TRUE,F,T,F),c(F, T, T)] #logical indexing
m2[m2 > 5 & m2 < 10]

m1
m1[1:2,1:2]
m1[c('R1','R2'),c('C1','C2')]
m1[1:2,]
m1[c(T,T,F,F),]
m1

#modify Matrix
m2
m2[2,2]
m2[2,2] = 10
m2
m2[,2] = 10
m2
m2[m2> 107] = 9999
m2
rbind(m2, c(50,60,70))
rbind(m2,m2)
m2
cbind(m2, c(55,65,75,85))
m2m2= cbind(m2,m2)
m2m2
m2
cbind(m2,m2)
rbind(m2,m2)
#row and col wise summary

m1
colSums(m1)
rowSums(m1)
colMeans(m1)
rowMeans(m1)

t(m1) # transpose
m1
sweep(m1, MARGIN = 1, STATS = c(2,3,4,5), FUN="+" ) #row-wise: add 2,3,4,5 to rows 1-4
sweep(m1, MARGIN = 2, STATS = c(2,3,4), FUN="*" ) #column-wise: multiply columns by 2,3,4

#addmargins
m1
?addmargins
addmargins(m1,margin=1,sum) #colwise function
addmargins(m1,1,sd) #colwise function

addmargins(m1,2,mean) #rowwise function
addmargins(m1,c(1,2),mean) #row & col wise function
?addmargins
(M1sum= addmargins(m1,c(1,2),list(list(mean,sum,max, min), list(var,sd, max, min)))) #row & col wise function
round(M1sum,0)

#Array-----
length(100:123)
4*3*2
#qty sold : 4 locations x 3 products x 2 companies
(a1 = array(100:123, dim=c(4,3,2)))
(loc = paste('loc', 1:4,sep='-'))
(product = paste('p', 1:3,sep='@'))
(coy = paste('coy', 1:2,sep='%'))
dimnames(a1) = list(loc, product, coy)
a1
apply(a1,1, sum) #locationwise
apply(a1,2, sum) #productwise
apply(a1,c(1,2), sum) #product-location wise
apply(a1,c(2,3), sum) #product-coy wise
apply(a1,c(1,3), sum) #coy-location
apply(a1,3, sum) #coywise
sum(a1) #total


#DataFrame----
#create Vectors to be combined into DF
(rollno = 1:30)
(sname = paste('student',1:30,sep=''))
(gender = sample(c('M','F'), size=30, replace=T, prob=c(.7,.3)))
(marks1 = floor(rnorm(30,mean= 50,sd=10)))
(marks2 = ceiling(rnorm(30,40,5)))
(course = sample(c('BBA','MBA'), size=30, replace=T, prob=c(.5,.5)))
rollno; sname; gender
marks1 ; marks2; course

#create DF
df1= data.frame(rollno, sname, gender, marks1, marks2, course, stringsAsFactors = F)
str(df1) #structure of DF
head(df1) #top 6 rows
head(df1,n=3) #top 3 rows
tail(df1) #last 6 rows
class(df1) # DF
summary(df1) #summary
nrow(df1) 
dim(df1)
length(df1)
df1$course
df1$gender = factor(df1$gender)
df1$course = factor(df1$course)
#df1$sname = as.character(df1$sname)
str(df1)
summary(df1)
boxplot(marks1 ~ gender + course, data=df1)

df1  #full data
df1$gender  # one column
head(df1[ , c(2,4)]) #multiple columns
df1[1:10 ,] #select rows, all columns
df1[1:5,1:4]
#as per conditions
df1[ marks1 > 50 & gender=='F', c('rollno', 'sname','gender', 'marks1')]
df1[ marks1 > 50 & gender=='F', c(1,2)]
df1[ marks1 > 50 | gender=='F', ]

names(df1)  # names of columns
dim(df1)  #Dimensions

aggregate(df1$marks1, by=list(df1$gender), FUN=sum)
aggregate(marks1 ~ gender, data=df1, FUN=max)
aggregate(cbind(marks1, marks2) ~ gender, data=df1, FUN=max)


(df2 = aggregate(cbind(marks1,marks2) ~ gender + course, data=df1, FUN=mean))
df2

df1





#List -----
g ="My First List"
h = c(25, 26,18,39)
j = matrix(1:10,nrow=2)
k = c('one','two','three')
mylist = list(title=g, ages=h, j, k)  #k, not h repeated
mylist
mylist[2]
mylist[[2]]
mylist[['ages']]
mylist$ages





#Factor -----

(grades = sample(c('A','B','C','D'), size=30, replace=T, prob=c(.3,.2,.4,.1)))
summary(grades)
table(grades)
(gradesFactor = factor(grades))
summary(gradesFactor)

(gradesFactorOrdered = factor(grades, ordered=T))
summary(gradesFactorOrdered)

(gradesFactorOrderedLevels = factor(grades, ordered=T, levels=c('D','C','B','A')))
summary(gradesFactorOrderedLevels)
gradesFactor
gradesFactorOrdered
gradesFactorOrderedLevels
pie(c(10,15,17))
pie(summary(gradesFactorOrderedLevels))
barplot(summary(gradesFactorOrderedLevels), col=1:4)

class(grades)
class(gradesFactorOrdered)
class(gradesFactorOrderedLevels)



# Object Properties
#vector
v1= 1:100
class(v1) ; typeof(v1)
v2=letters[1:10]
class(v2) ; typeof(v2)
length(v2)
summary(v1)

#matrix
m1= matrix(1:24,nrow=6)
class(m1)
summary(m1)
dim(m1)
str(m1)

#Array
a1 =array(1:24, dim=c(4,3,2))
class(a1)
str(a1)
dim(a1)
summary(a1)



#DF
#data() #built in datasets
df1= iris 
str(df1)
summary(df1)
class(df1); dim(df1)
nrow(df1) ; names(df1) ;NROW(df1)
colnames(df1)
rownames(df1)

#list
list1 = list(v1,m1,a1,df1)
str(list1)

#Statistical Description
library(Hmisc)
describe(df1)


#Next Topics
x= c(123.2234, 33333.544, 43243.8442)
floor(x)
ceiling(x)
trunc(x)
round(x,-2)
round(x, digits = 5)


================================================
FILE: 03-wksp1/1b3-factor.R
================================================


(grades = sample(c(LETTERS[1:4]), size=30, replace=T, prob=c(.4,.2,.3,.1 )))
summary(grades)
(gradesF = factor(grades))
summary(gradesF)
table(grades)
table(gradesF)
class(gradesF)
(gradesFO = factor(grades, ordered=T))
(gradesFO1 = factor(grades, ordered=T, levels=c('B','C','A','D')))
summary(gradesFO1)

(marks = ceiling(rnorm(30, mean=60, sd=5)))
(gender = factor(sample(c('M', 'F'), size=30, replace=T)))
(student1 = data.frame(marks, gender, gradesFO1))
boxplot( marks ~ gradesFO1, data=student1)      
boxplot( marks ~ gradesFO1 + gender, data=student1)      

boxplot(marks)
summary(marks)
abline(h = summary(marks))
quantile(marks)


================================================
FILE: 03-wksp1/1d2-basicstats.R
================================================
# Basic Stats
x = ceiling(rnorm(10000, mean=60, sd=20))
mean(x)
median(x)
#base R has no built-in function for the statistical mode
table(x)
sort(table(x), decreasing=T)
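#a minimal mode helper (our sketch; the name statmode is ours, returns the first value on ties):
statmode = function(v) as.numeric(names(which.max(table(v))))
statmode(x)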

#mode
library(modeest)
mlv(x,method='shorth')

#quantile
quantile(x)
quantile(x,seq(.1,1,by=.1)) #decile
quantile(x,seq(.01,1,by=.01)) #percentile

library(e1071)                    # load e1071 

plot(density(x))    #density plot
e1071::skewness(x)                # apply the skewness 
kurtosis(x)

sd(x); var(x)
cov(women$weight, women$height)
cor(women$weight, women$height)

stem(x)

#Freq Table
library(fdth)  #fast way of creating frequency distribution tables
ftable1 = fdt(x)
ftable1


================================================
FILE: 03-wksp1/1d2-dm-student1.R
================================================
# Data Manipulation : Academic Data

#Method1 : gsheet
library(gsheet)
url= "https://docs.google.com/spreadsheets/d/1qLHa5qFTyWacta8F-IGo6J3Zpf-BVR9OrlqONuJDqYc/edit#gid=2051155174"
student1 = as.data.frame(gsheet2tbl(url))

#Method2 : read.csv
student2 = read.csv('./data/student1.csv')
names(student2)

#save the imported data object into a new object
student=student1  # or student2
str(student)
names(student)
df1= student
lm(df1$btechmarks ~ df1$attnd, data=df1)

(colnames = names(df1))
#attributes(df1) = NULL
#df1 = as.data.frame(df1)
attributes(df1)
attr(df1, which='spec') = NULL
#names(df1) = colnames
str(df1)
#class of each column
class(df1$gender)
sapply(df1, class)

# convert character to factor
factorcols = c('gender', 'cat', 'class12', 'batch','batchyr', 'br', 'city', 'finalgrade', 'btechfinal')

df1[factorcols] = lapply(df1[factorcols] ,factor)
sapply(df1, class)
str(df1)


# Now do summarisation
#attach(df1)
names(df1)
table(df1$gender)
#combine it in single command 
(l1= lapply(df1[factorcols],factor,ordered=TRUE)) #ordering is not necessary for all
sapply(l1,table)

table(df1$gender)
table(df1$batchyr)
sapply(df1[factorcols],table)


sapply(lapply(df1[factorcols],factor,ordered=TRUE), table)  #Method1


sapply(df1[factorcols], table)   #Method2
sapply(df1[c('gender', 'cat', 'class12')], table)
?lapply
str(df1)
# Numeric Cols
sapply(df1, is.numeric)
(numcols = sapply(df1, is.numeric))
class(numcols)
(numcols = names(df1[numcols]))
#remove rollno
(numcols = numcols[-1])
head(df1[numcols])
colSums(df1[numcols])
colSums(df1[numcols],dims=1)
colMeans(df1[numcols],dims=1)
numcols[c(1,3)]
colMeans(df1[c('age','java', 'cpp')])
names(df1)
colMeans(df1[numcols[c(1:3)]])

names(df1)
#Look for other summarisation and grouping
aggregate(df1[numcols], by=list(df1$br), FUN=mean)
aggregate(cbind(sem1, sem2) ~ br + gender, data=student, FUN=mean)

aggregate(java + cbnst ~ br, data=df1, FUN=mean)  #mean of (java + cbnst) wrt branch
aggregate( cbnst ~ gender, data=df1, FUN=mean)
aggregate( cbnst ~ gender + br, data=df1, FUN=mean)



#using dplyr package
library(dplyr)

#Top 2 students from each Branch---- 
df1 %>% select(br, sname, btechmarks) %>% group_by(br) %>% arrange(desc(btechmarks)) %>% top_n(n=2)

# Average Fees Paid by Batch Yr----
df1 %>% group_by(batchyr) %>% select(batchyr, feepaid) %>% summarize(mean_fees = mean(feepaid, na.rm = TRUE))

names(df1)
# Avg BTech Marks and Min Marks in Java : Group by Gender, Branch and Final Grade ----
df1 %>% filter(finalgrade == 'A') %>% group_by(gender, br, finalgrade) %>% summarize(btechmks = mean(btechmarks, na.rm = TRUE), javamin = min(java, na.rm = TRUE))

#Count by Gender ----
df1 %>%   group_by(gender) %>%  tally()

#Filter by Final Grade = B : select only few columns ----
df1 %>% filter(finalgrade == 'B') %>% select(rollno, sname, finalgrade)

#sample : select a few rows at random ----
df1 %>% sample_frac(0.2, replace = TRUE)  %>% select(rollno, sname)  # % of Total
df1 %>% sample_frac(0.2, replace = F)  %>% select(rollno, sname) # % of total : replace=FALSE avoids repeated rows
df1 %>% sample_n(3, replace = F)  %>% select(rollno, sname) # select only 3 rows

#few rows : 10 to 15
slice(df1, 10:15)

library(tidyr) #Reshape
df1long <- df1 %>% select(sname, java, cbnst)
df1long <- df1 %>% select(sname, java, cbnst) %>% tidyr::gather(key = subject, value = marks, java, cbnst) 

head(df1long)
distinct(df1long)

tidyr::unite(df1, 'rollnoname', c(rollno, sname), sep="-") %>% select(1:5)%>% head


# dates
str(df1$dob)
as.Date("5-Apr-91", '%d-%b-%y')
df1$dob = as.Date(df1$dob, '%d-%b-%y')
df1 %>% select(sname, dob) %>% tidyr::separate(dob, c("y", "m", "d"))


# Rows
df1 %>% slice(5:n())  #row 5 to the last row
slice(df1, 1:5)

df1 %>% slice(1:n())
df1 %>% slice(1:10)
slice(df1, (n()-10):n())  #last 11 rows - note the parentheses

names(df1)
#defaults to the last variable in the tbl
df1 %>% group_by(gender) %>% top_n(2)  #ranks by the last column by default
# top_n(n = 5, wt = x)
df1 %>% group_by(gender) %>% top_n(2,wt=class10)  #rank wrt class10 marks

df1 %>% top_n(-2, wt=class10) %>% select(rollno, sname, class10, cgpa) #bottom 2 ranks - see 3 values
df1 %>% group_by(gender) %>% tally(java)  #total count
df1 %>% select(finalgrade, btechmarks, sname) %>% top_n(1, btechmarks)

#Selecting Columns
df1 %>% select(1:5)
df1 %>% select(contains('java'))
df1 %>% select(starts_with('btech'))
df1 %>% select(ends_with('s'))
df1 %>% select(everything())
df1 %>% select(finalgrade, everything()) %>% select(1:5) %>% select(-2)
names(df1)


# Summarise
df1 %>% summarise(avgjava = mean(java))
df1 %>% group_by(gender) %>% summarise(avgjava = mean(java),sumcbnst = sum(cbnst))
df1 %>% group_by(br, batch) %>% select(numcols) %>% summarise_each( funs(mean))

df1 %>% group_by(cat, gender) %>% count(class12, finalgrade)




#Summary Functions
v1 = df1$btechmarks
v1
v1 %>% first
v1 %>% last
v1 %>% sd
v1 %>% mean
v1 %>% n_distinct()


# Combine Data Sets

(a=data.frame(x1=c('A','B','C'), x2=c(1,2,3)))
(b=data.frame(x1=c('A','B','D'), x2=c('T','F','T')))

dplyr::left_join(a, b, by = "x1")
#Join matching rows from b to a.

dplyr::right_join(a, b, by = "x1")
#Join matching rows from a to b.

dplyr::inner_join(a, b, by = "x1")
#Join data. Retain only rows in both sets.

dplyr::full_join(a, b, by = "x1")
#Join data. Retain all values, all rows.

dplyr::semi_join(a, b, by = "x1")
#All rows in a that have a match in b.

cbind(a,b)

dplyr::anti_join(a, b, by = "x1")
#All rows in a that do not have a match in b.

#-----
y = data.frame(x1=c('A','B','C'), x2=c(1,2,3))
z = data.frame(x1=c('B','C','D'), x2=c(2,3,4))
cbind(y,z)

dplyr::intersect(y, z)  # B&C
#Rows that appear in both y and z.

dplyr::union(y, z)
#Rows that appear in either or both y and z.

dplyr::setdiff(y, z)
#Rows that appear in y but not z.

#Bind
dplyr::bind_rows(y, z)
#Append z to y as new rows.
dplyr::bind_cols(y, z)
#Append z to y as new columns.
#Caution: matches rows by position




================================================
FILE: 03-wksp1/1d3-dencoCase.R
================================================
# Case Study - Denco  
#Manufacturing Firm with sales data of partnum and customer with region wise sales

# Should know - import, table, dplyr, aggregate etc

#read file : Method1
sales1 = read.csv("./data/denco.csv")
str(sales1) #see if data is loaded and check its structure
head(sales1)
#read file : Method2 : when location is not in project folder
sales2 = read.csv(file.choose())
str(sales2)
head(sales2)

#read file: Method3
#install.packages('gsheet')
library(gsheet)
url = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=216113907"
sales3 = as.data.frame(gsheet2tbl(url))
str(sales3)
head(sales3)


# whichever you read the data, store it in sales object
sales = sales1  # keeping a backup
head(sales)  #top 6 rows
str(sales)  # structure
class(sales) # class type - Df
dim(sales)  # rows & columns
summary(sales) # summary- colname, type
names(sales) # col names
head(sales) # first 6 values
tail(sales) # last 6 values

# Some other functions
unique(sales$custname)  # unique customer names
length(unique(sales$custname)) # no of unique customers
length(unique(sales$region )) # no of distinct regions

#Information Required----
# 1. Who are the most loyal Customers - What to do : Improve repeated sales, Target customers with low sales Volumes
# 2. Which customers contribute the most to their revenue : How do I retain these customers & target incentives
# 3a. What part numbers bring in a significant portion of revenue : Maximise revenue from high value parts
# 3b. What parts have the highest profit margin : What parts are driving profits & what parts need to build further


#dplyr
names(sales)
library(dplyr)  #use this library to do summarisation

#Case-1 : Loyal Customers----
# Finding Frequency and sort them in descending order
names(sales)
(t1=table(sales$custname))  # freq table for all customers
class(t1); length(t1)
head(t1) # top 6 but not sorted
t2= sort(t1,decreasing=T )
head(t2)
#Ans: CHIZ BROS INC most loyal with 253 times purchase

#other methods through dplyr
library(dplyr)
sales %>% count(custname, sort=TRUE) %>% head(n=5)
#Ans: CHIZ BROS INC most loyal with 253 times purchase

sales %>% dplyr::group_by(custname) %>% dplyr::summarise(n = n()) %>% dplyr::arrange(desc(n)) %>% head(n=5)
#Ans: CHIZ BROS INC most loyal with 253 times purchase


#Case-2 : Customer- Highest Revenue---
sales %>% group_by(custname) %>%  summarize(Revenue = sum(revenue)) %>% arrange(desc(Revenue)) %>% head(n=5)
#Ans2: Triumph Insulation gave max revenue

# save this object and then analyse
salesrevenue = sales %>% group_by(custname) %>%  summarize(Revenue = sum(revenue)) %>% arrange(desc(Revenue))

#dplyr uses the tibble format and does not show all rows/cols by default
options(tibble.width = Inf) # displays all columns.
options(tibble.print_max = Inf) # to show all the rows.
salesrevenue[1:5,] # first 5 rows
#Ans2: here also Triumph Insulation gives max revenue = 35592K

#Case3 : Most Profitable PartsNums ----
# Look for freq, revenue and/ or profit margin
head(sales)
# Summarise by Part Num for frequency
sales %>% dplyr::group_by(partnum) %>% dplyr::summarise(n = n()) %>% dplyr::arrange(desc(n))  %>% head(n=5)
#Ans3a: Part Num- 764821000 was purchased max times -   122

# Summarise Partnum for Profit(margin) : sum(profit)
names(sales)
sales %>% group_by (partnum) %>% summarise(TotalMargin= sum(margin)) %>% arrange(desc(TotalMargin)) %>% head()
#Ans 3b: Part Num - 733648000 gave max margin profit - 11003367

# Summarise Partnum for revenue
#(practise it )
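#one possible answer (our sketch, mirroring the margin summary above):
sales %>% group_by(partnum) %>% summarise(Revenue = sum(revenue)) %>% arrange(desc(Revenue)) %>% head(n=5)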


#Case : Extra ----
#Find Regions wise sales

(salesregionrevenue = sales %>% group_by(region) %>% summarise(Revenue = sum(revenue)) %>% arrange(desc(Revenue)))

#Some Graphs related to Data Summarised
pie(x = salesregionrevenue$Revenue, labels=salesregionrevenue$region)  #labels must follow the sorted order
barplot(salesregionrevenue$Revenue, col=1:4)

# Filter Data----
#Rows where revenue >= 700000 : show only custname, revenue
sales %>% filter(revenue >= 700000 ) %>% select(custname, revenue)

#select 1% rows on random basis
sales %>% sample_frac(.01)  %>% select(custname, revenue)

#select 10 rows on random basis and sort them regionwise
sales %>% sample_n(10)  %>% select(custname, region) %>% arrange(region)
sample_n(sales, 10) # similar method

#End of session for Denco Case - Manufacturing Firm
#Descriptive Analysis
#Loyal Customers, Customer giving max revenue
#Profitable Partnums - Freq, Revenue and Margin



================================================
FILE: 03-wksp1/1d4-DA-dencoCase.R
================================================
# Case Study - Denco  

#read file : Method1
sales1 = read.csv("./data/denco.csv")
str(sales1)

#read file : Method2
sales2 = read.csv(file.choose())
str(sales2)

#read file: Method3
#install.packages('gsheet')
library(gsheet)
url = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=216113907"
sales3 = as.data.frame(gsheet2tbl(url))
str(sales3)


#using gsheet
library(gsheet)
denco2 = as.data.frame(gsheet2tbl(url))
str(denco2)

#head(sales1,n=7)
#names(sales1)

# whichever you read the data, store it in sales object
sales = sales1  # keeping a backup
str(sales)
class(sales)
str(sales)
?summary
summary(sales)

str(sales)
dim(sales) #dimensions of DF : rows & columns
unique(sales$custname)
length(unique(sales$custname))
length(unique(sales$region ))

# aggregation
aggregate(sales$revenue , by=list(sales$custname), FUN=sum)
df1 = aggregate(sales$revenue , by=list(sales$custname), FUN=sum)
head(df1)
str(df1)

df1=df1[order(df1$x, decreasing=TRUE),]
head(df1,10)

head(df1[order(df1$x, decreasing=TRUE),], 5)

aggregate(sales$revenue, by=list(sales$region), FUN=mean)
df2= aggregate(formula=revenue ~ region, data=sales, FUN=sum)
df2[order(df2$revenue, decreasing=F),]


#Aggregate Formula
(df2 = aggregate(revenue ~ custname + region, data=sales, FUN=sum))
head(df2[order(df2$revenue,decreasing=T),],10)

#List
list1= tapply(sales$revenue, sales$custname, FUN=sum)
head(list1)
list1
head(sort(list1, decreasing=T))
summary(list1)
str(list1)

#dplyr
names(sales)

library(dplyr)

sales %>% filter(margin > 10000) %>% arrange(region, desc(revenue))
filter(sales, margin > 1000000)

sales %>% filter(region == '01-East' & revenue > 400000) %>% select(partnum, region, revenue)

names(sales)
sales %>% group_by(custname) %>% 
  summarize(Revenue = sum(revenue)) %>% arrange(desc(Revenue))


library(data.table)
dt1 = as.data.table(sales)
dt2 = dt1[, sum(revenue), by=custname]
names(dt2)
dt2[order(-V1)]  #sort the totals in decreasing order (V1 is the default name for sum(revenue))

# Select
library(sqldf)
df5 =sqldf("Select custname, sum(revenue) from sales Group By custname order by sum(revenue) desc ")
head(df5)

# Frequency --------
names(sales)
t1=table(sales$custname)
class(t1)
length(t1)
head(t1)
t2= sort(t1,decreasing=T )
head(t2)
head(sort(table(sales$custname), decreasing=T), n=10)
tail(sort(table(sales$custname), decreasing=T), n=10)

#xtab
head(sort(xtabs(~ custname, sales), decreasing=T))

#
library(dplyr)
sales %>% dplyr::count(custname, sort=TRUE)

sales %>% dplyr::group_by(custname) %>% dplyr::summarise(n = n()) %>% dplyr::arrange(desc(n))



#plyr
df2a= plyr::count(sales, c('custname'))
str(df2a); names(df2a)
head(df2a[order(-df2a$freq),])

# Summarise by Part Num

df3a= aggregate(sales$revenue, by=list(sales$partnum), FUN=sum)
head(df3a)
str(df3a)
df3a[order(-df3a$x),][1:6,]
head(df3a[order(-df3a$x),])

df3b = aggregate(revenue ~ partnum, data=sales, FUN=sum)
head(df3b)
head(df3b[order(df3b$revenue, decreasing=T),])

sales %>% dplyr::group_by(partnum) %>% dplyr::summarise(n = n()) %>% dplyr::arrange(desc(n))


# which parts have highest Profit : partno - sum(profit)
names(sales)
df4a = aggregate(margin ~ partnum, data=sales, FUN=sum)
aggregate(margin ~ partnum, data=sales, FUN=sum)
head(df4a[order(df4a$margin, decreasing = T),])

sales %>% group_by (partnum) %>% summarise(TotalMargin= sum(margin)) %>% arrange(desc(TotalMargin)) %>% head()



================================================
FILE: 03-wksp1/1e-graphs-basic.R
================================================
# Combined Plots
#plot, histogram, pie, boxplot, linechart, correlation plot

#plot
women
plot(women)
?plot
plot(women, type='p', pch=16, col='red')
plot(women, type='l')
plot(women, type='b')
plot(women, type='b', pch=18, lty=1, col=2, lwd=4)
plot(women, xlim=c(30,100), ylim=c(min(women$weight)-10, 200), pch=10)

#more features with plot
plot(x=women$weight, y=women$height, pch=15, xlab='Weight', ylab='Height', col='red', cex=2, type='b')
title(main='Main Title', sub='Sub Title')
#see cheat sheet on base graphs
mtcars$cyl
#plot(x=mtcars$wt, y=mtcars$mpg, col=mtcars$gear, pch=c(4,6,8), cex=c(1,2))
#as.numeric(levels(as.factor(mtcars$cyl)))
plot(women)
abline(lm(women$weight ~ women$height), col='red', lty=2, lwd=4)


#boxplot
boxplot(women$height)
abline(h=c(58, 62,65,68,72))

#draw lines on the plot at the five-number summary
summary(women)
quantile(women$height)
quantile(women$height, seq(0,1,.1))
quantile(women$height, seq(0,1,.01))
stem(women$height)
boxplot(women$height, col='green')
abline(h=quantile(women$height))
text(1, quantile(women$height), labels=c('min','1Q','median','3Q','max'))

#histogram
hist(women$height)
hist(women$height, breaks=10)
hist(women$height, breaks=5, col=1:5)

#histogram2
(x = rnorm(100,50,10))
hist(x)

hist(x, freq=F, col=1:10)
lines(density(x))

#density plot : shape of data
plot(density(x), col='red')

#pie
gender= sample(c('M','F'), size=100, replace=T)
table(gender)
pie(table(gender))
x = c(10,20,40,50)
pie(x)
xlabels = c('A ','B ','C ','D ')
x/sum(x)
(labels2 = paste(xlabels, round(x/sum(x),2) * 100 , sep='-'))
(labels3 = paste0(labels2,"%%"))
(labels2 = paste0(xlabels, round(x/sum(x),2) * 100, '%'))

pie(x, labels=labels2)
x
#barplot
barplot(x,col=1:4)
barplot(x,col=1:4, horiz = T)

#correlation plot
pairs(women)
cor(women$height,women$weight)
cov(women$height, women$weight)
head(mtcars)
?mtcars

cor(mtcars)
names(mtcars)
pairs(mtcars)
pairs(mtcars[1:4])
options(digits=4)
pairs(mtcars[c('mpg', 'wt','hp')])


================================================
FILE: 03-wksp1/1e2-graphs.R
================================================
# Combined Plots

#plot, histogram, pie, boxplot, linechart, correlation plot


#plot
women
?women
str(women)
plot(women)
plot(x=women$height, y=women$weight)
?plot
plot(women, type='p', pch=17)
plot(women, type='l')
plot(women, type='b', pch=18, lty=2, col=2)
plot(women, xlim=c(30,100), ylim=c(min(women$weight)-10, 200), pch=10)
data()

#more features with plot
plot(y=women$height, x=women$weight, pch=15, xlab='Weight', ylab='Height', col='red', cex=2, type='b')
title(main='Main Title- PDU', sub='Sub Title')
#see cheat sheet on base graphs

plot(women)
abline(lm(women$weight ~ women$height), col='red', lty=2, lwd=4)
abline(h = c(130, 150), col='green') 
abline(v=c(62, 66, 70), col='blue')
abline(v=women$height, col='purple')

#boxplot
boxplot(women$height)
boxplot(df$marks1)  #assumes a df with marks1 from an earlier import

abline(h=c(58, 62,65,68,72))

#draw lines on plot for number summary
summary(women)
quantile(women$height)
boxplot(women$height, col='green')
abline(h=quantile(women$height))


#histogram
hist(women$height)
hist(women$height, breaks=10)
hist(women$height, breaks=5, col=1:5)
hist(df$marks2, breaks=3)  #assumes a df with marks2 from an earlier import
#histogram2
?rnorm
x = rnorm(n=100000,mean=50,sd=10)
hist(x)

hist(x, freq=F, col=1:5)
lines(density(x))

#density plot : shape of data
plot(density(x), col='red')


#pie
x = c(10,20,40,50)
pie(x)
xlabels = c('A','B','C','D')
pie(x, labels=xlabels)
pie(x, labels=paste(round(x/sum(x) * 100,0),'%') )

x
#barplot
barplot(x,col=1:4)
barplot(x,col=1:4, horiz = T)

#correlation plot
pairs(women)
cor(women$height,women$weight)

names(mtcars)
cor(mtcars)
pairs(mtcars)
options(digits=4)
pairs(mtcars[1:4])


================================================
FILE: 03-wksp1/1e3-advgraphs.R
================================================
#Advanced Graphs


library(corrgram)
cor(mtcars[1:4])
corrgram(mtcars[1:4], order=TRUE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Car Milage Data in PC2/PC1 Order")


#alternative to the box plot

boxplot(mpg ~ cyl, data=mtcars)


library(corrplot)
relationship=cor(mtcars)
relationship
corrplot(relationship)
corrplot(relationship, type="upper")

#-----

library(vioplot)
x1 <- mtcars$mpg[mtcars$cyl==4]
x2 <- mtcars$mpg[mtcars$cyl==6]
x3 <- mtcars$mpg[mtcars$cyl==8]
x1; x2; x3
vioplot(x1, x2, x3, names=c("4 cyl", "6 cyl", "8 cyl"), 
        col="gold")
title("Violin Plots of Miles Per Gallon")
abline(h=c(15,20))


================================================
FILE: 03-wksp1/1f-SLR-women.R
================================================
#topics ----
#factors, env, import/export, package install
#rep, recode, split, partition, subset, loops, cast & melt
#missing values, duplicates, apply
#graphs - bar, multiple line, pie, box, corrgram
# predict weight for certain height

head(women)
dim(women)
fit = lm(weight ~ height,data = women)
summary(fit)
range(women$height)
(ndata = data.frame(height = c(60.5, 70.5, 71.5)))
(predictedwt = predict(fit, newdata = ndata))
cbind(ndata, predictedwt)

resid(fit)
fitted(fit)
cbind(women, fitted(fit), resid(fit))
#assumptions

plot(fit)




================================================
FILE: 03-wksp1/1h1-dplyr.R
================================================
#dplyr - mtcars
library(dplyr)
#library(tidyverse)
#Filter----

filter(mtcars, cyl == 8)
filter(mtcars, cyl < 6)

# Multiple criteria
filter(mtcars, cyl < 6 & vs == 1)
filter(mtcars, cyl < 6 | vs == 1)

# Multiple arguments are equivalent to and
filter(mtcars, cyl < 6, vs == 1)


filter(mtcars, row_number() == 1L)
filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()-2))



#mutate----
mutate(mtcars, displ_l = disp / 61.0237) #keeps other col
transmute(mtcars, displ_l = disp / 61.0237) #removes other cols
mutate(mtcars, cyl = NULL) #drops the cyl column


#slice-----
slice(mtcars, 1L)
slice(mtcars, n())
slice(mtcars, 5:n())
slice(mtcars, c(2,4,5,10))

(by_cyl <- group_by(mtcars, cyl)) # a grouped tibble : same data, grouped by cyl
slice(by_cyl, 1:2)

#structure----
tbl_df(mtcars) # convert to tbl class
glimpse(mtcars)  # dense summary of tbl data
View(mtcars) # spreadsheet-like view (utils package)

mtcars %>% group_by(am) 
#nothing visible changes - grouping only marks the groups

mtcars %>% group_by(am) %>% summarise(mean(mpg), max(wt), min(wt))


#summarise----
summarise(mtcars, mean(disp))  
summarise(group_by(mtcars, cyl), mean(disp)) 
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))


#summarise_all
mtcars %>% group_by(am, gear) %>% summarise_all(mean)
mtcars %>% group_by(am, gear)%>% summarise_all(c("min", "max"))
mtcars %>% group_by(am, gear)%>% summarise_all(funs(med = median))



#without Group
mtcars %>% summarise(mean(mpg), max(wt))
mtcars %>% summarise_all(mean)
mtcars %>% select(wt, gear)%>% summarise_all(c("min", "max"))
mtcars %>% summarise_all(funs(med = median))



#summarise if : 
mtcars %>% summarise_if(is.numeric, mean, na.rm = TRUE)
str(iris)  #Species is a factor
iris %>% summarise_all(mean)

iris %>% summarise_if(is.numeric, mean, na.rm = TRUE)

#specific columns
mtcars %>% summarise_at(c("mpg", "wt"), mean, na.rm = TRUE)


#------------------------------------
#unsorted----
dplyr::tbl_df(iris)  #all rows not displayed
print(dplyr::tbl_df(mtcars), n=20)  #display more columns and rows
#print(dplyr::tbl_df(mtcars), width=11)
tbl_df(mtcars) %>% print(n = Inf)  #all rows
tbl_df(mtcars) %>% print(width = Inf)
tbl_df(mtcars) %>% as.data.frame()

glimpse(mtcars)
df = mtcars
row.names(df) = NULL  #remove rownames
df %>% select(mpg)
#head(mtcars)
select(mtcars, mpg, vs)
mtcars %>% dplyr::select(vs, mpg, wt)
mtcars %>% group_by(cyl) %>% summarise(avgwt = mean(wt), meanhp = mean(hp)) %>% arrange( desc(meanhp), avgwt)
mtcars

names(mtcars)
filter(mtcars, mpg > 23 | wt < 2)
mtcars %>% filter(mpg > 23 & wt > 2)
mtcars %>% select(mpg, wt) %>% filter(mpg > 23) 
filter(iris, Sepal.Length > 7)
filter(mtcars, cyl == 4)

#distinct rows
distinct(mtcars)
(df3  = data.frame(a=c(2,2,3),b=c(2,2,1)))
distinct(df3)

#sampling
sample_frac(mtcars, 0.2, replace=F)
sample_n(mtcars, 2, replace=F)
#%>% select(mpg)
slice(mtcars,10:14)
sort(mtcars$mpg, decreasing = T)
top_n(mtcars,-2, mpg)  #least 2 mpg

select(mtcars, mpg) %>% arrange(desc(mpg))

#Columns
select(mtcars, mpg, wt)
select(mtcars, contains('a'))
names(mtcars)
select(mtcars, contains('vs'))
select(mtcars, everything())

df= mtcars
df$names = rownames(mtcars)
head(df)
df %>% select(1:5,12) %>% arrange(mpg)

mtcars %>% group_by(cyl, am) %>% summarise_all(mean)

(df4 = data.frame(marks=c(1,2,2,3,7,1,100)))
cbind(df4, dplyr::mutate_all(df4, funs(min_rank)))

#shift the columns
mtcars %>% lead() 
mtcars %>% lag()
mtcars %>% summarise(n())

select(mtcars, mpg2 = mpg)

df = mtcars[1:4]
names(df) = c('MPG','C1','C2','C3')
df= rename(df, C5=C1)
names(df)
df

df = women
rename(df, HeightWomen = height)
df %>% mutate(height2 = height + 2, weight2 = weight + 4)
#does not show orginal columns
df %>% transmute(height2 = height + 2, weight2 = weight + 4)

library(nycflights13)
data(flights)
head(flights)
destinations <- group_by(flights, dest)
destinations
summarise(destinations, planes = n_distinct(tailnum), flights = n())


select(iris, -ends_with("Width")) %>% head
vars <- c("Petal.Length", "Petal.Width1")
select(iris, from=1, to=n()) 
filter(mtcars, row_number() == n())
filter(mtcars, between(row_number(), 5, n()))

mtcars %>% group_by(cyl) %>% filter(row_number() %in% 1:3)  #first 3 rows per group
#integer_filter() was only a dplyr proposal and never shipped; slice() covers this
# Select odd rows (slice() has no from/to/by arguments; use seq())
mtcars %>% slice(seq(1, n(), by = 2))
# Select even rows
slice(mtcars, seq(2, n(), by = 2))
# Select first 10
slice(mtcars, 1:10)
# Select last 11 (note the parentheses around n()-10)
slice(mtcars, (n()-10):n())
slice(mtcars, 1:4)
mtcars

df <- tibble(
  g1 = c(1, 1, 2, 2, 2),
  g2 = c(1, 2, 1, 2, 1),
  a = sample(5), 
  b = sample(5)
)
df
df %>% slice((n()-2):n())  #last 3 rows (note the parentheses)

var1 <- quo(letters[1:5])
var1
quo(toupper(!!var1))

# Here we capture `letters[1:5]` as an expression:
quo(toupper(letters[1:5]))
#> ~toupper(letters[1:5])

# Here we capture the value of `letters[1:5]`
quo(toupper(!!letters[1:5]))
#> ~toupper(c("a", "b", "c", "d", "e"))
quo(toupper(UQ(letters[1:5])))
#> ~toupper(c("a", "b", "c", "d", "e"))
#
toupper(letters[1:5])
quote(toupper(letters[1:5]))

head(mtcars)
slice(mtcars, 1:5)
slice(mtcars, 1)  #first row (numeric index)
slice(mtcars, 1L)

tail(mtcars,n=5)
slice(mtcars, (n()-5):n())  #last 6 rows
slice(mtcars, n()) #last row
slice(mtcars, n() - 1) #2nd last row

mtcars %>% top_n(2)
mtcars %>% top_n(-2)
mtcars %>% group_by(cyl) %>% tally(cyl) %>% top_n(1, cyl)
dim(mtcars)
bind_rows(mtcars, mtcars)
bind_cols(mtcars,mtcars)
bind_rows(list(mtcars, mtcars))  #bind_rows also accepts a list of data frames
dim_desc(mtcars)

# combine applies the same coercion rules
f1 <- factor("a")
f2 <- factor("b")
c(f1, f2)
unlist(list(f1, f2))

dplyr::combine(f1, f2)        #combines factors preserving levels (deprecated; vctrs::vec_c is the successor)
dplyr::combine(list(f1, f2))

slice( mtcars, c(1L,3L,2L,7L)) 

by_cyl <- mtcars %>% group_by(cyl)
# Select first row in each group
mtcars %>% slice(1)
by_cyl %>% slice(1)
# Select last row in each group
mtcars %>% slice(n())
by_cyl %>% slice(n())
# Rows not present in group silently ignored
mtcars %>% slice(10)
by_cyl %>% slice(10)

# Select arbitrary rows
mtcars %>% slice(1:9)
by_cyl %>% slice(1:3)

mtcars %>% slice(c(1, 3, 9))
by_cyl %>% slice(c(1, 3, 5))

# Select even rows
mtcars %>% slice(seq(2, n(), by = 2))
by_cyl %>% slice(seq(2, n(), by = 2)) %>% select(cyl, everything())

# Select first row in each group of cyl & am
mtcars %>% group_by(cyl, am) %>% slice(1)
by_cyl %>% slice(1)

# Returns all values
by_cyl %>% slice()




df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
df %>% top_n(2)

# Negative values select bottom from group. Note that we get more
# than 2 values here because there's a tie: top_n() either takes
# all rows with a value, or none.
df %>% top_n(-2)






================================================
FILE: 03-wksp1/1h2-freqtable.R
================================================
# Frequency Distribution

#Discrete Cat Data
(attend = c('A','P','P','A','P','A'))
table(attend)
cbind(table(attend))  # A & P on left side

#Discrete Categorical Data : colours

color=c('Blue','Green','Magenta','Green','Black','Blue','Black')
x2 = table(color)
x2
x2a = cbind(x2)
x2a
hist(x2a)  # not relevant 
barplot(x2a)  # not good
barplot(x2a, beside=T)  # Better
unique(color) ; length(unique(color))
barplot(x2a, beside=T, col=rainbow(length(unique(color))))
pie(x2a)

#Continuous Data 
set.seed(1234)
x3 = runif(100,0,150)  # 0 to 150 marks range, 100 values 
x3
x3 = ceiling(x3)  #round to higher value
x3
range(x3)
# Divide range into step of 15 ie 10 levels
breaks = seq(0,150,by=15)
breaks
length(breaks)
x3
#x3[1] = 60; x3[2] = 75
x3.cut = cut(x3, breaks)
x3.cut
table(x3.cut)
cbind(table(x3.cut))  #see it vertically

#give intervals a character values a, b..
(x3.cut = cut(x3, breaks, labels=letters[1:10]))
#(x3.cut = cut(x3, breaks, labels=letters[1:(length(breaks)-1)]))
x3.cut

(x3a = table(x3.cut))
(x3b = cbind(x3a))

#plot this freq table : which plot works best?
hist(x3b)
pie(x3b)
barplot(x3b, beside=T)
barplot(x3b, beside=T, names.arg =rownames(x3b))
plot(x3b)
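# a small extension (our addition): relative and cumulative frequencies from the same table
prop.table(x3a)         #relative frequency of each interval
cumsum(table(x3.cut))   #cumulative frequency across intervals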


# and so..on like previous eg




================================================
FILE: 03-wksp1/2a-importExport.R
================================================
# Read Data into R Environment

#CSV Files----
#Read from CSV file in PC
head(iris)
write.csv(iris, "./data/iris.csv", row.names=F)
read1 = read.csv(file="./data/iris.csv", header = TRUE,sep = ",")
read1 = read.csv(file="./data/dhiraj.csv", header = TRUE,sep = ",")

str(read1)
class(read1)
head(read1)
read2 = read.table(file="./data/iris.csv", header = TRUE,sep = ",")
str(read2); class(read2)
head(read2)
read3 = read.delim(file="./data/iris.csv", header = TRUE,sep = ",")
str(read3) ; class(read3)
head(read3)
#difference is the ability to specify the delimiter (read.csv defaults to comma)
#use file.choose() when the file is outside the project folder or you want to browse for it
read4 = read.csv(file=file.choose())
str(read4)
head(read4)


# From URL : Read CSV from Web----
read_web1 = read.csv('http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat')
head(read_web1)
library(data.table)
read_web2 = fread("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv")
head(read_web2)
class(read_web2)

#Text file from Web-----
read_txt = read.table("https://s3.amazonaws.com/assets.datacamp.com/blog_assets/test.txt", header = FALSE)
head(read_txt)

#Google Sheets-----
library(gsheet) #install it
install.packages('gsheet')
library(gsheet)
url_gsheet = "https://docs.google.com/spreadsheets/d/1QogGSuEab5SZyZIw1Q8h-0yrBNs1Z_eEBJG7oRESW5k/edit#gid=107865534"
df_gsheet = as.data.frame(gsheet2tbl(url_gsheet))
head(df_gsheet)

#Excel----
#Create a excel file with data in 2 sheets
# first row contains variable names
#C:\Program Files\Java\jre1.8.0_191
Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_191')
#library(rJava)
library(xlsx)
df_excel1 = read.xlsx( "./data/myexcel.xlsx", 1)
df_excel1
# read in the worksheet named mysheet
df_excel2a = read.xlsx("./data/myexcel.xlsx", sheetName = "bowlers")
df_excel2a
df_excel2b = read.xlsx( "./data/myexcel.xlsx", sheetIndex = 2)
df_excel2b


================================================
FILE: 03-wksp1/2b-SLR-salesarea.R
================================================
#Simple Linear Regression - Case Study
# Regression : Areas vs Sales
#Given data of area and sales, predict value for sales for specific areas eg : (1.5,2,3,4,5)
#manual way of doing regression see online URL

#Create/ Import Data-----
#Method1 : creating data from Vectors
#X is area in sqft, Y is sales in 1000s of units; find the relationship between X & Y
X = c(1.7,1.6,2.8,5.6,1.3,2.2,1.3,1.1,3.2,1.5,5.2,4.6,5.8,3 )
Y = c(3.7,3.9,6.7,9.5,3.4,5.6,3.7,2.7,5.5,2.9,10.7,7.6,11.8,4.1 )
df1 = data.frame(X,Y)
head(df1)

#2nd method of importing data
#import from ggsheet  #pickup the correct url
library(gsheet)
area1 = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=2023826519"
df2 = as.data.frame(gsheet::gsheet2tbl(area1))
str(df2)
head(df2)

#Third method of importing data from csv
df3 = read.csv('./data/slr1.csv')
str(df3)

#Method4 for importing from CSV file with choose location
df4 = read.csv(file.choose())
str(df4)

# Use Vector Data or method used to import data
#make one of the DF active
df = df1
df
#simple stats
mean(df$X); mean(df$Y)
sum(df$X); sum(df$Y)
sd(df$X) ; var(df$Y)
cov(df$X,df$Y); cov(df$Y,df$X)
cor(df$X,df$Y) ; cor(df$Y,df$X)
#cor.test(df$X,df$Y)

#some plots to understand pattern
plot(df$X, df$Y)  #simple command to plot : Next with features
plot(y=df$Y, x=df$X,xlab='Area in sqft', ylab='Sales Amount', type='p', ylim=c(0, max(df$Y)+1), main='Plot of Area Vs Sales', xlim=c(0,max(df$X)+ 1), col='red',pch=17)
?plot
abline(lm(Y ~ X,data=df1), lty=1, lwd=2, col='green') # with regression line
abline(v=c(3,5, min(df$X), max(df$X)),h=c(6,10, min(df$Y), max(df$Y)), col=c('red','blue','green','yellow')) # few straight lines at x & y axis
range(df$X)
#Model
fit1 = lm(Y ~ X, data=df) # create Simple Linear Model Y wrt X
fit1
summary(fit1)


#few extras
names(fit1) # attributes(fit1)
system.time(lm(Y ~ X, data=df)) #time taken to compute linear regression
coef(fit1)  # Coefficients of Equation Y = mX + C
fitted(fit1) # predicted values for all X in orginal data
predict(fit1, newdata=data.frame(X))
residuals(fit1) # diff between actual and predicted values - residuals
(R= df$Y - fitted(fit1))
plot(density(residuals(fit1)))
#residuals (actual Y minus predicted Y) should be small
#abline(h=coef(fit1)[1])

summary(fit1)  # summary statistics of Linear Model(LM)
#understand the model values - R2, AdjR2, FStats, Residuals, Coeff p values - IMP STEP

names(fit1)  #output variables names of LM
names(summary(fit1))
#you can select any of them to see its value

summary(fit1)$r.squared
coef(fit1)[2]  # slope or beta
fitted(fit1)
#combine the data with Ypredicted, errors
cbind(df, fitted(fit1), fitted(fit1)- df$Y, residuals(fit1))

#Mathematical Equation and predictions
(Y = 0.9645 + 1.6699 * 4)  # Predict Y for X=4
#predict for area = 4
#using equation
summary(fit1)
coef(fit1)
(Y = coef(fit1)[1] + coef(fit1)[2] * 4)
#using model
range(df$X)
(new1 = data.frame(X=c(4,5,3,2)))
predicted2= predict(fit1, newdata= new1)
cbind(new1, predicted2)
#predict only within the range of X (interpolation), not outside it (extrapolation)
range(df$X)  #min to max value of X: area

#select new data given in the case
(new1 = data.frame(X=c(1.5,2,3,4,5)))
#sample data for X for prediction, should be between the range of X values
predict(fit1, newdata= new1) # Predict Function for 4 values of X
#columnbind with input and predicted values
cbind(new1, predictedY = predict(fit1, newdata= new1) )

library(forecast) #install it first
accuracy(fit1)  #see ?accuracy for the metrics reported
#RMSE is generally used and should be least when selecting models
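#a manual RMSE check (our sketch; accuracy() reports the same figure):
sqrt(mean(residuals(fit1)^2))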

anova(fit1)
summary(fit1)$r.squared  # R2 from Model
summary(fit1)$sigma  #Residual Std Error SD along the LM Line

#---------------------------------------#Assumptions--------
#Assumption : Graphical Analysis : IMP STEP
plot(fit1)
par(mfrow=c(2,2))
plot(fit1)
par(mfrow=c(1,1))

plot(fit1, which=1)
# Linearity plot of residuals & X # No pattern for assumption that there is linearity betw X & Y
plot(df)
abline(h=0)
plot(residuals(fit1))

#Auto-correlation : relation between successive residuals
car::durbinWatsonTest(fit1)
?car::durbinWatsonTest
#pvalue > 0.05 : do not reject Ho, i.e. there is no autocorrelation

#Normality of residuals
resid(fit1)
#qqplot(fitted(fit1),resid(fit1) )
plot(fit1, which=2)
#points to be around the straight line

#Equal Variance : 4th Assumption : homoscedasticity
plot(fit1, which=1)
#no funnel shape, so no heteroscedasticity

#Outlier Analysis
plot(fit1, which=4)
#abline(h=c(.5))
#no value of cooks distance > .5 : no data to be removed
#outlier values can affect the model

#see all diagnostic plots together
par(mfrow=c(2,2))
#multiple frames per row : rowwise filling, 2 rows, 2 columns
plot(fit1)
par(mfrow=c(1,1))

#SUMMARY ------
summary(fit1)$r.squared  # 90% variation in Y explained by X
summary(fit1)
#F Stats pvalue < 0.05: Model exists : At least 1 indep variable has strong relationship with Dependent variable (Y)
#pvalue of Coef (X) < 0.05 : Significant X


#End of Simple Linear Regression
#Do different SLR on different data sets
#Learn what to do if there are violations of assumptions




df # dataset being used for LM
fit= lm(Y ~ X, data=df) #model creation
summary(fit)  #summary of linear model
plot(fit)  #diagnostic plots
predict(fit, newdata=data.frame(X=mean(df$X)))
#Multiple R2 explains the variation, model fitness



================================================
FILE: 03-wksp1/2b-allmodels.R
================================================
# All models
library(dplyr)

#Linear Regression
head(women)
head(mtcars)
#predict weight for new height
plot(women)
plot(women, ylim=c(0, 160), xlim=c(0,90))
fit1 = lm(weight ~ height, data=women)
summary(fit1)
range(women$height)
(ndata1=data.frame(height=c(59.5, 62.5)))
(predicted1 = predict(fit1, newdata= ndata1))
cbind(ndata1, predicted1)
plot(fit1) #check for assumptions
par(mfrow=c(2,2))
plot(fit1)
par(mfrow=c(1,1))


#Logistic Regression
data2 = read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
head(data2)
str(data2)
summary(data2)
data2$rank = factor(data2$rank)
data2$admit = factor(data2$admit)
table(data2$rank, data2$admit) #2 way table
xtabs(~admit + rank, data = data2)
#create Logistic Model
fit2 <- glm(admit ~ gre + gpa + rank,data=data2,family="binomial")
summary(fit2)
(ndata2 = sample_n(data2, 3))  #pick up sample rows
#Predict admit for input data
(predicted2=predict(fit2,newdata=ndata2, type="response"))
(predictedclass2=ifelse(predicted2 > .5, 1,0))
cbind(ndata2, predicted2, predictedclass2)
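#a quick confusion matrix on the training data (our sketch; the 0.5 cutoff is an assumption)
probs = predict(fit2, type = "response")            #probabilities for all rows
predclass = ifelse(probs > 0.5, 1, 0)
table(actual = data2$admit, predicted = predclass)  #rows = actual, cols = predicted
mean(predclass == as.character(data2$admit))        #raw accuracy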

#Data Partition




#Decision Tree - Classification 
url3 = 'https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/titanic_csv.csv'
data3a = read.csv(url3)
head(data3a)
names(data3a)
data3 = data3a[,c(2,3,5,6,7)]  #select few columns only
head(data3)

#install & load libraries
library(rpart)
library(rpart.plot)

#Decision Tree
fit3 = rpart(survived ~ ., data = data3, method = 'class')
fit3
rpart.plot(fit3, extra = 106, cex=.8,nn=T)  #plot
printcp(fit3) #select complexity parameter
prunetree3 = prune(fit3, cp=.014) #make tree smaller
rpart.plot(prunetree3, cex=.8,nn=T)
prunetree3
table(data3$survived)
#Predict class category or probabilities
(ndata3 = sample_n(data3,4))
predicted3 = predict(prunetree3, newdata=ndata3, type='class')
predicted3b= predict(prunetree3, newdata=ndata3, type='prob')
cbind(ndata3, predicted3, predicted3b)

#similarly Regression Tree can be made



#Clustering----
#install.packages("amap")
##Read the data in the file
url4 = 'https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=2073914016'
library(gsheet)
data4 = as.data.frame(gsheet2tbl(url4))
head(data4)
summary(data4)
str(data4)
nrow(data4)
###Verify the data
colnames(data4)
apply(data4, 2, FUN= class)  #are all numeric
fit4 = kmeans(data4[,-1],centers=3)
fit4$centers  # group means
fit4$size  #how many rows in each cluster
fit4$withinss  #within-cluster sum of squares; which cluster is most cohesive?
fit4$cluster
table(fit4$cluster)
cluster2 = data4[ fit4$cluster == 2,]
head(cluster2)
cluster2[-1] %>% summarise_all(mean)
write.csv(cluster2, file = "./data/data4cluster2.csv")
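#how was centers=3 chosen? a minimal elbow sketch (our addition):
wss = sapply(1:8, function(k) kmeans(data4[,-1], centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = 'b', xlab = 'k', ylab = 'total within-cluster SS')  #look for the bend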


## Association Rules - Groceries data set----
library(arules)  #install first
library(arulesViz) #install first
library(datasets)  # no need to install, just load it reqd for Groceries
data('Groceries')
Groceries
arules::LIST(Groceries[1:6]) #different format
#Find Frequent Itemset
frequentItems = eclat(Groceries,parameter=list(supp = 0.01,minlen=3, maxlen=5)) 
inspect(frequentItems[1:10])
frequentItems
inspect(sort(frequentItems,by="count",decreasing=TRUE)[1:25])

#Support is : support(A&B) = n(A&B)/ N
#Plot the Frequency Plot
itemFrequencyPlot(Groceries,topN = 15,type="absolute")
itemFrequencyPlot(Groceries, topN = 10, type='relative')
abline(h=0.15)

# Create rules and the relationship between items
#parameters are min filter conditions 
rules = apriori(Groceries, parameter = list(supp = 0.001, conf = 0.5, minlen=2))
rules
inspect (rules[1:5])
#Sort Rules by confidence, lift and see the data
rulesc <- sort (rules, by="confidence", decreasing=TRUE)
inspect(rulesc[1:5])
#similarly it can be done for lift and support
#which items have strong confidence and lift 

#How To Control The Number Of Rules in Output ?
#maxlen, minlen, supp, conf

#subset -----
#legend to condition commands 
# lhs - means left hand side, or antecedent
# rhs - means right hand side, or consequent
# items - items, that make up itemsets
# %in% - matches any
# %ain% - matches all
# %pin% - matches partially
# default - no restrictions applied
# & - additional restrictions on lift, confidence etc.
#Find what factors influenced an event ‘X’ :
#Find out what events were influenced by a given event
subset1a = subset(rules, subset=rhs %in% "whole milk")
inspect(subset1a[1:10])
subset1b = subset(rules, subset=rhs %in% 'bottled beer' )
inspect(subset1b)  #no such rule with beer on rhs; relax the parameters to get such rules
#inspect(rules)
#Items in : all or any
subset2a = subset(rules, subset=lhs %ain% c('baking powder','soda') )
inspect(subset2a) #all items in 1 rule
subset2b = subset(rules, subset=lhs %in% c('baking powder','soda') )
inspect(subset2b[1:5]) #any of the items in the rule
#rhs- beer, confidence , sort by lift
subset3a = subset(rules, subset=rhs %in% 'bottled beer' & confidence > .7, by = 'lift', decreasing = T)
inspect(subset3a)
subset4 = subset(rules, subset=lhs %in% 'bottled beer' & rhs %in% 'whole milk' ) # lhs- beer, rhs- milk
inspect(subset4)
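#%pin% (partial match) was listed above but not demonstrated; a small sketch (subset5 is our name):
subset5 = subset(rules, subset = items %pin% 'milk')  #any item whose name contains 'milk'
inspect(subset5[1:5])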

#Visualizing The Rules -----
plot(subset1a[1:10])
plot(subset1a[1:10], measure=c("support", "lift"), shading="confidence")

#

#install.packages('wordcloud2')
library(wordcloud2)

df = data.frame(word=c('cbap','cmap','iim','imt','calcutta'),freq=c(20,23,15,10,13))
df
wordcloud2(df)


head(demoFreq)
wordcloud2(demoFreq, size = 2, color = "random-light", backgroundColor = "grey")
names(demoFreq)


================================================
FILE: 03-wksp1/2b2-SLM-women.R
================================================
# Regression Analysis
# Simple Linear with 1 IV and 1 DV

data(women)
women
names(women)
str(women)

cov(women$height, women$weight)
#69 : which shows a positive relationship between height and weight

cor(women$height, women$weight)
#0.995 : a strong, positive relationship between height & weight
0.995^2 # also equals the R^2 value in this case (Simple Linear Regression)

plot(x=women$height, y=women$weight, type='b')
abline(lm(weight ~ height, data=women), col='red')
names(women)  #x- IV, y -DV
fit1 = lm(weight ~ height, data=women)  # creating a model
summary(fit1)  #summary of the Model

#Ho: (F Test) : No relationship between Y and any X
#Ha:  There is relationship between Y and at least one X
# p < 0.05 Reject Ho in favour of Ha

attributes(fit1)  #output of model

#coefficients
coef(fit1)
#p values for b0 & b1 are significant as they are < 0.05
#pvalue1 = 1.71e-09 < 0.05
#pvalue2 = 1.09e-14 < 0.05

coef(fit1)
#Y = -87 + 3.4 * X  # no extrapolations
(Y = -87 + 3.4 * 20)  # weight cannot be negative
range(women$height)
(Y = -87 + 3.4 * 58)
(Y = -87 + 3.4 * 61.5)
(Y = -87 + 3.4 * 72)
range(women$height)
women


#R^2
(summary(fit1))$r.squared  #0.991
(summary(fit1))$adj.r.squared  #0.9903
#99% of variation in weight is explained by Height
#Good Linear Model for Prediction


#FStats : p value
(summary(fit1))$fstatistic 
#pvalue : 1.09e-14  < 0.05 : Model exists
#There is at least 1 IV which explains variation in Y (DV)


#Prediction : for height = 65, 66

(new1 = data.frame(height=c(65,66,66.5)))
new1
(p1=predict(fit1, newdata = new1))
cbind(new1, p1)
#136.7333 140.1833

(new2 = data.frame(height=c(60,69)))
(p2=predict(fit1, newdata = new2))
cbind(new2, p2)

?predict
predict(fit1, newdata = new1, interval='confidence')
predict(fit1, newdata = new1, interval='prediction')

women
women$weight
fitted(fit1) # predicted values of all original Xs
predict(fit1, newdata = data.frame(height = women$height)) # same as above; the column must be named height

new3 = data.frame(height = women$height)  # DF using height of original data
cbind(women, fitted(fit1), residuals(fit1))  # compare women, predicted, errors
residuals(fit1)  # diff between actual and predicted values of weight
summary(fit1)
summary(residuals(fit1))

plot(fit1)
cor(fitted(fit1), women$weight)
summary(fit1)


================================================
FILE: 03-wksp1/2b3-SLM-women-A.R
================================================
# Simple Linear Regression : Built in Data Set Women
# Check for assumptions of Regression in the data Set
women
?women
str(women)
fit = lm(weight ~ height, data=women)
?plot
#Initial Checks
cor(women$height, women$weight)
#there is Strong and Positive Relationship between height and weight
plot(women$height, women$weight)
par(mfrow=c(1,1))
plot(fit, which=2)
plot(women)

str(women)
head(women)
women[,2]
fitted(fit)
cbind(women, fitted(fit), predicted=3.45 * women$height - 87)
cbind(women, fitted(fit), residue=fitted(fit)-women$weight, resid(fit))
#y = 3.4 * height - 87  #the fitted equation (height alone is not an object)
plot(residuals(fit))
hist(women$height, breaks=4)
hist(residuals(fit))

hist(residuals(fit), freq=F)
lines(density(residuals(fit)))

#Prediction
ndata = data.frame(height=52.5)
predict(fit, newdata=ndata, type='response')


#Assumptions Regression
#Linearity----
#Linearity of the data. The relationship between the predictor (x) and the outcome (y) is assumed to be linear.
# component + residual plot 
plot(women$height, fitted(fit))
plot(residuals(fit) ~ fitted(fit))
plot(fit, which=1)
#residuals should be randomly distributed and not increase or decrease 


#Normality----
#Normality of residuals. The residual errors are assumed to be normally distributed.
plot(density(resid(fit)))
plot(fit, which=2)

#Homoscedasticity----
#Homogeneity of residuals variance. The residuals are assumed to have a constant variance (homoscedasticity - opposite of heteroscedasticity)
plot(fit, which=3)
#No funnel shape, random distribution of residuals
plot(fit, which=4)

#Auto-Correlation----
#Independence of residuals error terms. (Not dependent on previous values)
car::durbinWatsonTest(fit)
#pvalue > 0.05 : Ho of no autocorrelation is not rejected

#Outliers
plot(fit, which=4)
#potential outliers are highlighted 1, 14, 15 row
women[c(1,14,15),]

#Lets remove these values and then find R2
fit2 = lm(weight ~ height, data=women[-c(1,14,15),])
summary(fit2)
(summary(fit))$r.squared
(summary(fit2))$r.squared
AIC(fit, fit2) #lower value of AIC is better


#Potential Problems
#Non-linearity of the outcome - predictor relationships
#Heteroscedasticity: Non-constant variance of error terms.
#Auto Collinearity, Multi-collinearity
#Presence of influential values in the data that can be:
#Outliers: extreme values in the outcome (y) variable
#High-leverage points: extreme values in the predictors (x) variable

#All these assumptions and potential problems can be checked by producing some diagnostic plots visualizing the residual errors.

plot(fit)


================================================
FILE: 03-wksp1/2b4-LM-cars.R
================================================

#http://r-statistics.co/Linear-Regression.html
head(cars) 
fit2=lm(dist ~ speed, data=cars)
summary(fit2)
predict(fit2, newdata=data.frame(speed=c(5,8)))
plot(fit2)
nrow(cars)

index = sample(1:nrow(cars), size=0.8 * nrow(cars))
index
length(index)
length(unique(index))
traindata = cars[index,]
traindata
testdata = cars[-index,]
testdata
nrow(traindata) + nrow(testdata)

model1 = lm(dist ~ speed, data = traindata)
coef(model1)
(P1 = predict(model1, newdata = testdata))
df_test = cbind(testdata, P1)
cor(df_test$dist, P1)
(error = testdata$dist - P1)

AIC(model1)
BIC(model1)
(MAPE_error = mean(abs(error)/testdata$dist))
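#RMSE on the test set (our addition, to complement MAPE above):
(RMSE_error = sqrt(mean(error^2)))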

library(forecast)
accuracy(model1)
accuracy(P1, testdata$dist)  #test-set accuracy : predicted vs actual

#https://www.ritchieng.com/machine-learning-evaluate-linear-regression-model/
#https://www.guru99.com/r-decision-trees.html
#http://www.sthda.com/english/articles/35-statistical-machine-learning-essentials/141-cart-model-decision-tree-essentials/
  

================================================
FILE: 03-wksp1/2b4-SLR-women.R
================================================

fit = lm(weight ~ height, data=women)
summary(fit)
range(women$height)
(ndata = data.frame(height= c(58.5, 60.7)))
(p = predict(fit, newdata = ndata))
cbind(ndata, p)
plot(fit)

sum((fitted(fit) - women$weight)^2)


================================================
FILE: 03-wksp1/2c3-MLM-salespromotion.R
================================================
#Multiple Linear Regression : DV vs more than 1 IVs
#sales Qty vs price & promotion
#Predict Sales Qty from Price and Promotion of the Product


#Omni Store
#creating data using Vector
sales= c(4141,3842,3056,3519,4226, 4630,3507,3754, 5000,5120,4011, 5015,1916,675, 3636,3224,2295, 2730,2618,4421, 4113,3746, 3532, 3825,1096, 761,2088,820,2114, 1882,2159,1602,3354,2927)
price = c(59,59,59,59,59,59,59,59,59,59,59,59, 79,79,79,79,79,79,79,79,79, 79,79,79,99,99, 99,99,99,99,99,99,99,99)
promotion= c(200,200,200,200,400,400,400,400, 600,600,600,600,200,200,200,200, 400,400,400,400,600,600,600,600, 200,200,200,200,400,400,400,400,600,600)
#Create a DF from 3 variables
omni1 = data.frame(sales,price,promotion)
head(omni1)

#2nd Method : from CSV file
#omni2 = read.csv(file.choose())

#3rd Method : from gsheet 
library(gsheet)
url = "https://docs.google.com/spreadsheets/d/1h7HU0X_Q4T5h5D1Q36qoK40Tplz94x_HZYHOJJC_edU/edit#gid=1595306231"
omni3 = as.data.frame(gsheet::gsheet2tbl(url))
head(omni3)
#Make one of data frames active
omni = omni1
head(omni)
str(omni)
nrow(omni)
dim(omni)
#MLR  Create Multiple Linear Regression
# we want to see how Sales Qty depend on Price and Promotion Values
fit2 = lm(sales ~ price + promotion, data=omni)

# summary statistics of model IMP STEP
summary(fit2)
#understand values : R2, AdjR2, Fstats pvalue, Coeff, ***, Residuals
#F Stats pvalue = 2.86e-10 < 0.05 : Model Exists
#At least 1 IV can be used to predict sales
names(summary(fit2))
summary(fit2)$adj.r.squared  # Adj R2 here > .6
#>74% of variation in sales is explained by price and promotion

#coefficients b1, b2
coef(fit2)
summary(fit2)
#price  : -53 , pvalue = 9.2e-09 < 0.05 *** : Significant
#keeping promotion constant, if price is increased by 1 unit, salesqty decreases by 53 units
#promotion  : +3.6 , pvalue = 9.82e-06 < 0.05 ***: Significant
#keeping price constant, if promotion is increased by 1 unit, salesqty increases by 3.6 units

fitted(fit2)
omni$sales
residuals(fit2)
summary(residuals(fit2))
summary(fit2)
#Predict SalesQty for new combination of Values----

#create a dataframe of new sample values
range(omni$price) ; range(omni$promotion)
(ndata2 = data.frame(price=c(60,70), promotion=c(300,400)))
p2sales = predict(fit2, newdata=ndata2)
cbind(ndata2, p2sales)

#Assumptions
par(mfrow=c(2,2))
plot(fit2)
par(mfrow=c(1,1))

plot(fit2,which=1)  # no pattern, equal variance
plot(fit2,2)  # Residuals are normally distributed
plot(fit2,3)  # No heteroscedasticity
plot(fit2,4)  # tells outliers which affect model
omni[c(11,14,15),]

fit3 = lm(sales ~ price + promotion, data=omni[-c(11,14,15),])
plot(fit3,4)
summary(fit3)

#End of Multiple Linear Regression

#when variables are large, select only significant variables
#Model with higher R2 to be selected
#other measures of model selection : AIC, BIC, RMSE
#Dataset can be divided into train(70%) and test(30%) set to check the accuracy
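#a minimal sketch of such a split on omni (assumption: 70/30 random split, RMSE on the test set)
set.seed(123)
idx = sample(1:nrow(omni), size=0.7 * nrow(omni))
omni_train = omni[idx,]; omni_test = omni[-idx,]
fit_train = lm(sales ~ price + promotion, data=omni_train)
p_test = predict(fit_train, newdata=omni_test)
sqrt(mean((omni_test$sales - p_test)^2))  #test RMSE: lower is better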

#create model with train data and check its accuracy on the test data (as sketched above)





#questions
fit2
summary(fit2)
head(omni)
cbind(omni, predict(fit2, newdata = data.frame(price=omni$price, promotion=omni$promotion)))
cbind(omni, fitted(fit2))
cbind(omni, fitted(fit2), omni$sales - fitted(fit2), residuals(fit2))


================================================
FILE: 03-wksp1/2c4-MLM-mtcars1.R
================================================
#https://cran.r-project.org/web/packages/olsrr/olsrr.pdf
#install.packages('olsrr')
library(olsrr) #install it first

model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
summary(model)
#model <- lm(mpg ~ hp + wt , data = mtcars)
#summary(model)
k <- ols_step_all_possible(model)
#plot(k)
k
summary(lm(mpg ~ wt, data=mtcars))
summary(lm(mpg ~ wt+ hp, data=mtcars))
summary(lm(mpg ~ hp + wt + qsec, data=mtcars))


#library(olsrr)
#'train' is assumed to come from a prior partition of mtcars (see 2d3-datapartition.R)
fit = lm(mpg ~ disp + hp + wt + qsec, data = train)
k = ols_step_all_possible(fit)
plot(k)
k
summary(lm(mpg ~ wt, data= train))
summary(lm(mpg ~ wt + hp, data= train))
finalmodel = lm(mpg ~ wt + hp, data= train)
library(gvlma)
gvmodel = gvlma(finalmodel)
gvmodel

finalmodel = lm(mpg ~ wt + hp, data= train)
(predictedvalues = predict(finalmodel, newdata=test))
cbind(test$mpg, predictedvalues)



================================================
FILE: 03-wksp1/2d1-missingvalues.R
================================================
# Missing values

x = c(NA, 1, NA, 2,3, NA)
is.na(x)
sum(is.na(x))
sum(c(T,F,T,F,F))
mean(x)
?mean
mean(x, na.rm=T)
x
x[is.na(x)]
x[is.na(x)] = mean(x, na.rm=T)
x
x1 = c(4,6,8,9)
length(x1[x1 >= 6])
sum(x1 >= 6)
x1 >= 6

x2 = rnorm(100000, mean=50, sd=5)
length(x2)
posn=sample(100000, size=30)
x2[posn] = NA
sum(is.na(x2))

#install this library
library(VIM)
?sleep

head(sleep) #first few rows of sleep
dim(sleep)  #dimensions of sleep data
complete.cases(sleep)  #which rows have complete data (TRUE/FALSE)
sum(complete.cases(sleep))  #no of rows with no missing data
sum(!complete.cases(sleep))  #no of rows with missing data
sleep[complete.cases(sleep),]  #rows which are complete
sleep[!complete.cases(sleep),] #rows which have missing values
colSums(is.na(sleep))  #missing count per column
rowSums(is.na(sleep))  #missing count per row
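#a minimal mean-imputation sketch (assumption: fill NAs in the Sleep column with its mean, on a copy)
sleep2 = sleep
sleep2$Sleep[is.na(sleep2$Sleep)] = mean(sleep2$Sleep, na.rm=TRUE)
sum(is.na(sleep2$Sleep))  #0 : Sleep column imputed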


#use mice package
library(mice)
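#a minimal mice sketch (assumption: default pmm imputation, 5 imputed datasets)
imp = mice(sleep, m=5, printFlag=FALSE)
sleepComplete = complete(imp, 1)  #first completed dataset
colSums(is.na(sleepComplete))  #no missing values remain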


================================================
FILE: 03-wksp1/2d3-datapartition.R
================================================
#partition the data into train and test set
mtcars
nrow(mtcars)
#train-70%, test-30%
sample(x=1:32, size=.7 * 32)
index = sample(x=1:nrow(mtcars), size=.7 * nrow(mtcars), replace=F)
index
train= mtcars[index,]
test= mtcars[-index,]
nrow(train)
nrow(test)
nrow(train) + nrow(test)





#-------
data(mtcars)

## 75% of the sample size
smp_size <- floor(0.75 * nrow(mtcars))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(mtcars)), size = smp_size)

train <- mtcars[train_ind, ]
test <- mtcars[-train_ind, ]

# -----
require(caTools)
set.seed(101) 
sample = sample.split(mtcars$am, SplitRatio = .75)
sample
train = subset(mtcars, sample == TRUE)
test  = subset(mtcars, sample == FALSE)
train; test
table(train$am); table(test$am)

library(dplyr)  #for the %>% pipe and sample_frac
mtcars$id <- 1:nrow(mtcars)
train <- mtcars %>% dplyr::sample_frac(.75)
test  <- dplyr::anti_join(mtcars, train, by = 'id')

library(caret)
intrain<-createDataPartition(y=factor(mtcars$am),p=0.7,list=FALSE)
intrain
training<-mtcars[intrain,]
testing<-mtcars[-intrain,]
training  
testing
table(training$am)
table(testing$am)

================================================
FILE: 03-wksp1/2e1-logR-purchase.R
================================================
# Logistic Regression : Predict Purchase


# Import the dataset
df1 = read.csv('./data/logr2.csv')
head(df1)

url="https://docs.google.com/spreadsheets/d/1Md_ro2t3M7nA9JMH1DsE12jfeX7qq-UPw6p8WQd6A2Y/edit#gid=120271978"
library(gsheet)
df2 = as.data.frame(gsheet2tbl(url))
head(df2)

dataset=df2  #or df1 if data was imported from the CSV file
head(dataset)
str(dataset)
summary(dataset)
dim(dataset)
View(dataset)
dataset$gender = factor(dataset$gender)


# Split the dataset into the Training set and Test set
#install.packages('caTools')
library(caTools)
set.seed(2000)
split = sample.split(dataset$purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

dim(dataset); dim(training_set); dim(test_set)
names(dataset)


# Logisitic Model on Training Set
logitmodel1 = glm(purchased ~ gender + age + salary, family = binomial,  data = training_set)
summary(logitmodel1)

# gender not significant, dropped here
logitmodel2 = glm(purchased ~ age + salary, family = binomial, data = training_set)
summary(logitmodel2)

#summary(logitmodel2)$coefficient  # they are in log terms
head(training_set)
#predict on sample data
test_set2 = data.frame(age=c(40,65), gender=c('Male', 'Female'), salary=c(40000, 50000))
test_set2
(prob_pred2 = predict(logitmodel1, type = 'response', newdata = test_set2))
cbind(test_set2, prob_pred2)
#age=65 person likely to purchase

# Predicting the Test set results from testset
head(test_set)
prob_pred = predict(logitmodel1, type = 'response', newdata = test_set)
summary(prob_pred)
head(cbind(test_set,prob_pred ),10)

#if prob > 0.5 make it 1, else 0
y_pred = ifelse(prob_pred > 0.5, 1, 0)
head(cbind(test_set$purchased, y_pred),100)

# Making the Confusion Matrix
cm = table(test_set[,5], y_pred)
cm
library(caret)
caret::confusionMatrix(cm)

names(dataset)




================================================
FILE: 03-wksp1/2e2-LOGR-adult.R
================================================
#Logistic Regression : Binary Cls : 0 or 1

#Case Study :  predict if an individual will earn more than $50K using logistic regression based on demographic variables available in the adult data.

#Steps
# Import the data
# Check for class bias
# Create training and test samples
# Compute information value to find out important variables
# Build logit models and predict on test data
# Do model diagnostics

#Data Import ----
#from URL
inputData <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
head(inputData)
str(inputData)
names(inputData)
removeColumns = c('FNLWGT','EDUCATION')

data= inputData[,-which(names(inputData) %in% removeColumns)]
names(data)

data$ABOVE50K = factor(data$ABOVE50K)

#train and test sets
library(caret)
Index <- createDataPartition(y=data$ABOVE50K, p=0.70, list=FALSE)
head(Index)
nrow(data)
trainData = data[Index ,]
testData = data[-Index, ]
table(data$ABOVE50K); prop.table(table(data$ABOVE50K))
summary(trainData$ABOVE50K); summary(testData$ABOVE50K)
nrow(trainData) ; nrow(testData); nrow(trainData) + nrow(testData)
prop.table(table(trainData$ABOVE50K))
prop.table(table(testData$ABOVE50K))
str(testData)


#Logistic Regression on selected columns
names(data)
logitMod <- glm(ABOVE50K ~ RELATIONSHIP + AGE + CAPITALGAIN + OCCUPATION , data=trainData, family='binomial')
summary(logitMod)
AIC(logitMod)

#Check the probabilities predicted for test data - 2 methods
predicted <- plogis(predict(logitMod, testData))  
head(predicted)
predicted <- predict(logitMod, testData, type="response")
head(predicted)

#what should be the cutoff value between 0 and 1 to categorise them into 0 or 1, so that accuracy is high (correct splitting)

#cutoff value
library(InformationValue)
(optCutOff <- optimalCutoff(testData$ABOVE50K, predicted)[1] )

#Confusion Matrix
(cm1= confusionMatrix(testData$ABOVE50K, predicted, threshold = optCutOff))
#0 classified as 0, 1 classified as 1



#diagnostics

car::vif(logitMod)
#all X variables in the model should have VIF below 4

misClassError(testData$ABOVE50K, predicted, threshold = optCutOff)
#Misclassification error is the percentage mismatch of predicted vs actuals, irrespective of 1’s or 0’s. The lower the misclassification error, the better the model.
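#by-hand equivalent (a sketch; assumes 'predicted' and 'optCutOff' from above)
predClass = ifelse(predicted > optCutOff, 1, 0)
mean(as.character(predClass) != as.character(testData$ABOVE50K))  #fraction misclassified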




#ROC----
#The Receiver Operating Characteristics (ROC) curve traces the percentage of true positives accurately predicted by a given logit model as the prediction probability cutoff is lowered from 1 to 0.
#For a good model, as the cutoff is lowered, it should mark more of the actual 1's as positives and fewer of the actual 0's as 1's.
#So the curve should rise steeply, indicating that the TPR (Y-Axis) increases faster than the FPR (X-Axis) as the cutoff score decreases.
#The greater the area under the ROC curve, the better the predictive ability of the model.

plotROC(testData$ABOVE50K, predicted)
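#single-number summary of the curve (a sketch; assumption: AUROC from the InformationValue package)
AUROC(testData$ABOVE50K, predicted)  #closer to 1 is better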


#http://r-statistics.co/Logistic-Regression-With-R.html
#https://rpubs.com/H_Zhu/235617




================================================
FILE: 03-wksp1/2e3-LOGR-gre.R
================================================
#Logistic Regression : GRE
#https://stats.idre.ucla.edu/r/dae/logit-regression/
#A researcher is interested in how variables, such as GRE (Graduate Record Exam scores), GPA (grade point average) and prestige of the undergraduate institution, affect admission into graduate school. The response variable, admit/don't admit, is a binary variable.


inputData <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
## view the first few rows of the data
head(inputData)
dim(inputData)
inputData
summary(inputData)
sapply(inputData, sd)
str(inputData)
data= inputData  # make a copy for further analysis

data$rank = factor(data$rank)
data$admit = factor(data$admit)
str(data)
## 2way contingency table of cat outcome and predictors we want
## to make sure there are not 0 cells
table(data$rank, data$admit)
xtabs(~admit + rank, data = data)
xtabs(~ gear + cyl + am , data=mtcars)
#create Logistic Model
mylogit <- glm(admit ~ gre + gpa + rank, data = data, family = "binomial")

summary(mylogit)
#gre,gpa, rank are statistically significant, 
#For every one unit change in gre, the log odds of admission (versus non-admission) increases by 0.002.
#For a one unit increase in gpa, the log odds of being admitted to graduate school increases by 0.804.
#The indicator variables for rank have a slightly different interpretation. For example, having attended an undergraduate institution with rank of 2, versus an institution with a rank of 1, changes the log odds of admission by -0.675.

## odds ratios only
exp(coef(mylogit))
library(dplyr)
(ndata = sample_n(data, 3))
#Predict admit for input data
(prob=predict(mylogit,newdata=ndata, type=c("response")))
prob
cbind(ndata, prob)


#cutoff value
library(InformationValue)
(optCutOff <- optimalCutoff(data$admit, prob)[1] ) #.46
confusionMatrix(data$admit, prob, threshold = optCutOff)
(accuracy = (247+38)/ (sum(247+38+89+26))) # .715
confusionMatrix(data$admit, prob, threshold = .7)
(accuracy = (272+2)/ (sum(272+2+125+1))) #.685

confusionMatrix(data$admit, prob, threshold = .2)


## view data frame
library(dplyr)
sample_n(data,size=1)
(newdata1 = data.frame(gre=450, gpa=3.7, rank=factor(3) ))
(newdata1$admitPredicted <- predict(mylogit, newdata = newdata1, type = "response"))
(newdata1$admitClass = ifelse(newdata1$admitPredicted > .46,1,0))
newdata1  #0 = not admitted to the institute

#End of Logistic Regression
#also check for assumptions of residues, VIF, Multi-collinearity
#Parition the data into train and test



library(caret)
Index <- createDataPartition(y=data$admit, p=0.70, list=FALSE)
head(Index)
nrow(data)
trainData = data[Index ,]
testData = data[-Index, ]
table(data$admit); prop.table(table(data$admit))
summary(trainData$admit); summary(testData$admit)
nrow(trainData) ; nrow(testData); nrow(trainData) + nrow(testData)
prop.table(table(trainData$admit))
prop.table(table(testData$admit))
#same proportion of admit in test and train
str(testData)
#now construct a model with train and then test on testdata



================================================
FILE: 03-wksp1/3b1-DT-CART-carseats.R
================================================
# Decision Trees : - regression tree
#install and load this library
library(ISLR)
data(Carseats)
?Carseats
data = Carseats
head(data)

#Libraries for Decision Tree
library(rpart)
library(rpart.plot)

#Model
tree1 = rpart(Sales ~ . , data=data, method='anova' )
tree1
rpart.plot(tree1, cex=.8,nn=T)

#this is large tree, so prune it: check cp
printcp(tree1)
#cp value should be chosen such that xerror is least
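#a common idiom (sketch): pick the cp with the least cross-validated error (xerror)
(bestcp = tree1$cptable[which.min(tree1$cptable[,"xerror"]), "CP"])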
prunetree = prune(tree1, cp=0.03)
prunetree
#here we have selected a different value to simplify the tree
mean(data$Sales)
prunetree
rpart.plot(prunetree, nn=T)
#Interpretation
#if ShelveLoc=Good, and Price >= 109.5, sales predicted is 9.2

#improve the plot
rpart.plot(prunetree, nn=T, cex=.8, type=4)
#read this document to improve the plot
#https://cran.r-project.org/web/packages/rpart.plot/rpart.plot.pdf
#http://www.milbo.org/rpart-plot/prp.pdf
library(dplyr)
#Predict for test value
(testdata = sample_n(data,2))
(predictedSales=predict(prunetree, newdata=testdata, type='vector'))
cbind(testdata, predictedSales)
#next line will show an error: we must predict a numerical value instead of a class/category, so the type of response required is 'vector', not 'class'
(predict(prunetree, newdata=testdata, type='class'))
#ERROR - use vector

#see online help here
#https://www.datacamp.com/community/tutorials/decision-trees-R



================================================
FILE: 03-wksp1/3b2-DT-CART-R-sales.R
================================================
# CART Models - HH Case Study - Regression
library(rpart)
library(rpart.plot)
library(forecast)

library(gsheet)
url='https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=1941519952'
data = as.data.frame(gsheet2tbl(url))
str(data)

# Summarize the dataset
summary(data)
names(data)
# Random Sampling
set.seed(777) # To ensure reproducibility
Index = sample(x = 1:nrow(data), size = 0.7*nrow(data))
Index

# Create Train dataset
train= data[Index, ]
nrow(train)

# Create Test dataset
test = data[-Index, ]
nrow(test)
nrow(test) + nrow(train)

########################### Modeling #################################

trainModel = rpart(Annual_Sales ~ . , data = train[,-1], method = "anova")
trainModel
mean(train$Annual_Sales)

# Plot the Regression Tree
rpart.plot(trainModel, type = 4,fallen.leaves = T, cex = 1.0, nn=T)

#cp selection
printcp(trainModel)
trainModel_prune = prune(trainModel, cp=0.01)
rpart.plot(trainModel_prune)

#Predict and check accuracy
predictSales_test = predict(trainModel_prune, newdata = test, type = "vector")
predictSales_test  #vector to print values of sales predicted

library(forecast)
# Validate RMSE and MAPE calculation with a function in R
ModelAccuracy = accuracy(predictSales_test, test$Annual_Sales)
ModelAccuracy
#RMSE should be as low as possible
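#RMSE computed by hand (a sketch; same quantity as reported by accuracy())
sqrt(mean((test$Annual_Sales - predictSales_test)^2))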



================================================
FILE: 03-wksp1/3b3-DT-CART-titanic.R
================================================
# Decision Tree - Classification
#we want to predict, for a combination of input variables, whether a person is likely to survive or not

#import data from online site
path = 'https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/titanic_csv.csv'
titanic <- read.csv(path)
head(titanic)
names(titanic)
data = titanic[,c(2,3,5,6,7)]  #select few columns only
head(data)
dim(data)
#load libraries
library(rpart)
library(rpart.plot)
str(data)
#Decision Tree
names(data)
fit <- rpart(survived ~ ., data = data, method = 'class')
fit
rpart.plot(fit, extra = 104, cex=.8,nn=T)  #plot

printcp(fit) #select complexity parameter
prunetree2 = prune(fit, cp=.014)
rpart.plot(prunetree2, cex=.8,nn=T, extra=104)
prunetree2
nrow(data)
table(data$survived)
#Predict class category or probabilities
library(dplyr)  #for sample_n
(testdata = sample_n(data,2))
predict(prunetree2, newdata=testdata, type='class')
predict(prunetree2, newdata=testdata, type='prob')

#Use decision trees for predicting
#customer is likely to buy a product or not with probabilities
#customer is likely to default on payment or not with probabilities
#Student is likely to get selected, cricket team likely to win etc

#Imp steps
#select columns for prediction
#load libraries, create model
#prune the tree with cp value
#plot the graph
#predict for new cases

================================================
FILE: 03-wksp1/3b4-DT-CART-R-loan.R
================================================


#Decision Tree
#(https://rpubs.com/fabiorocha5150/decisiontreemodel)

url='https://raw.githubusercontent.com/fabiorcampos/Bank-Marketing/master/data/bank-full.csv'
data= read.table(url,sep=';', header = T )
head(data)
str(data)
names(data)
df1 = data
df1
str(df1)
hist(df1$age)
hist(df1$balance)
table(df1$marital)
barplot(table(df1$marital), col=1:3, beside=T)
barplot(table(df1$loan), col=1:3, beside=T)


================================================
FILE: 03-wksp1/3b5-DT-loanapproved1.R
================================================
# Decision Tree # loanapproved = age + job + house + credit
loanapproved = sample(x=c('Yes','No'), size=50, replace=T)
age = runif(50,30,60)
house = sample(x=c('Yes','No'), size=50, replace=T, prob=c(.4,.6))
job = sample(x=c('Yes','No'), size=50, replace=T, prob=c(.6,.4))
credit = ceiling(rnorm(50,100, 10))
loan = data.frame(loanapproved, age,job,house, credit)
head(loan)
library(rpart); library(rpart.plot)  #load tree libraries for this script
ctree = rpart(loanapproved ~ ., data=loan)
ctree
rpart.plot(ctree, cex=1)
ndata = data.frame(age=c(45,55), house=c("No",'Yes'), job=c("Yes",'No'), credit=c(90,100))
ndata
(p1=predict(ctree, newdata=ndata,type='class'))

(p2=predict(ctree, newdata=ndata,type='prob'))
(p3=predict(ctree, newdata=ndata,type='matrix'))
cbind(ndata, p1, p2, p3)




loan_tree = rpart(loanapproved ~ ., data=loan, method='class', control=rpart.control(minsplit=5, cp=-1))
loan_tree
rpart.plot(loan_tree)
rpart.plot(loan_tree,  type = 4,fallen.leaves = T, cex = 0.6)
rpart.plot(loan_tree,type=2, extra=104, cex=1, tweak=1, under=T, shadow=c('brown', 'green','red'), nn=T)

printcp(loan_tree)





plot(loan_tree)
text(loan_tree)

================================================
FILE: 03-wksp1/3b5-DT-rpart-iris.R
================================================
#CART : Classification & Regression Trees

#Load Libraries
library(rpart)  #does only binary splits; CART
library(rpart.plot)

#DataSet
str(iris)
head(iris)

table(iris$Species)

library(dplyr)
sample_n(iris, 3)
sample_frac(iris, .1)
# Classification Tree
summary(iris)
set.seed(1234)
#Predict Species
ctree = rpart(Species ~ ., method='class', data=iris)
ctree
rpart.plot(ctree, main='Classification Tree', nn=T, type=4, cex=1,extra=104)
printcp(ctree)
ctreeprune = prune(ctree, cp=0.44)
ctreeprune
rpart.plot(ctreeprune, main='Classification Tree', nn=T, type=4, extra=104)
# Regression Tree - Predict Continuous Value Length
#Predict Sepal.Length
rtree = rpart(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species, method="anova", data=iris )
rtree
rpart.plot(rtree, main='Regression Tree', nn=T, type=2, cex=1)

#another way to plot
plot(rtree, uniform=TRUE, main="Regression Tree for Sepal Length")
text(rtree, use.n=TRUE, cex = 1)
rtree

#CP
printcp(rtree)
rsq.rpart(rtree)  #r2 on anova only

rtreeprune = prune(rtree, cp=0.023)
rpart.plot(rtreeprune, main='Pruned Regression Tree', nn=T, type=2, cex=1)



#https://analytics4all.org/2016/11/23/r-decision-trees-regression/


#Predict : Class  : Category of Flower
str(iris)
testData1  = data.frame ( Sepal.Length = 5, Sepal.Width = 4, Petal.Length =1.2, Petal.Width=0.3)
predict(ctree, testData1, type = "class")  #setosa


#Predict : Continous Value : Length

testData2  = data.frame (Species = 'setosa', Sepal.Width = 4, Petal.Length =1.2, Petal.Width=0.3)
predict(rtree, testData2, type = "vector") #5.17



================================================
FILE: 03-wksp1/3d1-DT-CHAID-usvote.R
================================================
#CHAID - dataset USvote #multisplit
# require(rsample) # for dataset and splitting also loads broom and tidyr
#install.packages("CHAID", repos="http://R-Forge.R-project.org")
library(CHAID)  #library for performing CHAID decision tree

#Dataset
data(USvote)  #from lib CHAID
?USvote

#Quick CHAID analysis
set.seed(101)
sample1 = USvote[sample(1:nrow(USvote), 1000),]
head(sample1)
str(sample1)
chaidModel <- chaid(vote3 ~ ., data = sample1, control=chaid_control(minbucket = 10, minsplit=20, minprob=0))

?chaid
print(chaidModel)
plot(chaidModel)
library(dplyr)  #for the filter/tally pipeline below
sample1 %>% filter(marstat=='married') %>% tally()
#sample1 %>% filter(ager=='18-24') %>% tally
dim(sample1)
chaidModel2 <- chaid(vote3 ~ ., data = sample1, control=chaid_control(minbucket = 20, minsplit=20, minprob=0, maxheight=2))
plot(chaidModel2) 

#Summary
#More than 2 splits
#used in Marketing / Business 
#Net Promoter Score, Feedback, Where all variables are categories

================================================
FILE: 03-wksp1/3e1-clust-customer.R
================================================
# HH MA example  - customer

#install.packages("amap")
library(amap)
##Read the data in the file
url = 'https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=2073914016'
library(gsheet)
data = as.data.frame(gsheet2tbl(url))
str(data)
head(data)

names(data)
summary(data)
###Verify the data
colnames(data)
class(data$Age)
apply(data, 2, FUN= class)  #are all numeric
dim(data)
head(data)
summary(data)
###Run the kmeans algorithm to generate the clusters
#?amap::Kmeans
names(data)

k1 <- amap::Kmeans(data[,-1],centers=3, iter.max = 200,nstart = 1,  method = c("euclidean"))

k1$centers  # group means
###Fetch size/n of obs for the groups
k1$size
###Fetch sum of squared  for the groups
k1$withinss
###Fetch the cluster for each obs
#k1$cluster
k1$cluster
k1$centers
k1$cluster[9000:9800]
table(k1$cluster)
k1$size
data_clus_2 <- data[ k1$cluster == 2,]
(data_clus_2)
mean(data_clus_2$Age)
data_clus_2$Cust_id

# Write CSV
write.csv(data_clus_2[,1], file = "./data/data_clus_2.csv")


================================================
FILE: 03-wksp1/3e1-clustering.R
================================================
#Clustering
#sample data, iris, no of clusters
library(cluster)
library(fpc)
library(dplyr)

#sample Data
marks = data.frame(sub1=c(0,1,2,4,5,4,6,7),sub2=c(0,1,2,3,3,4,5,5))
marks
#with 1 column
marks[,1]
km12 = kmeans(marks[,1], center=3)
#analyse output
km12

km12$cluster  #which row is assigned to which cluster No
km12$size  #size of each cluster
km12$centers  #average of each variable for each cluster
km12$iter  #no of iterations taken to find the optimal clusters

marks %>% mutate(cluster = km12$cluster) %>% group_by(cluster) %>%  summarise(mean(sub1))

#Distances
km12$withinss
km12$tot.withinss  #sum of withinss
km12$betweenss
km12$tot.withinss + km12$betweenss
km12$totss
km12$iter

#now do it for both the columns
km23 = kmeans(marks, center=3)
km23
km23$centers

km23$betweenss
km23$tot.withinss
km23$withinss
km23$betweenss/ (km23$betweenss + km23$tot.withinss)
km23$betweenss; km23$betweenss/ (km23$totss) #same as below
km23  #within cluster sum of sq by cluster
# should be high for good clustering

#plots
plot(marks,col=km23$cluster,cex=1.5)
points(km23$center,col=1:2,pch=8,cex=2)  #center point
#see the change in columns : use PCA
plotcluster(marks, km23$cluster)
clusplot(marks, clus=km23$cluster, color=TRUE, shade=TRUE, labels=2, lines=1, plotchar=T, span=T,main="Cluster Plot")
?clusplot

#iris dataset

#Plot1----
library(cluster)
library(fpc)

data(iris)
head(iris)
data = iris[, -5] # without known classification 
# Kmeans cluster analysis
iriskm1 =  kmeans(data, centers=3)
plotcluster(data, iriskm1$cluster)

# More complex : PCA method: 2 dim
clusplot(data, iriskm1$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)
iris[c(16,19),]

# another method
with(iris, pairs(data, col=c(1:3)[iriskm1$cluster])) 


#Finding optimal No of Clusters
iris
table(iris$Species)
data = iris[-5]  #remove target column
head(data)

km1= kmeans(data,centers=1)
km1$tot.withinss; km1$withinss

km2= kmeans(data,centers=2)
km2$tot.withinss; km2$withinss

km3= kmeans(data,centers=3)
km3$tot.withinss ; km3$withinss


km4= kmeans(data,centers=4)
km4$tot.withinss; km4$withinss

km5= kmeans(data,centers=5)
km5$tot.withinss;km5$withinss

km1$tot.withinss; km2$tot.withinss ; km3$tot.withinss ; km4$tot.withinss ; km5$tot.withinss
data[km5$cluster==4,]

#Selecting the number of clusters
library(NbClust)
nc = NbClust(data, distance="euclidean",min.nc=2, max.nc=15, method="average")  
#avg -  distance Dij between two clusters Ci and Cj is the mean of the distances between the pair of points x and y

kiris = kmeans(data, centers=3)
kiris$centers
irisclusters = cbind(iris$Species, data, kiris$cluster)
head(irisclusters)

library(dplyr)
iris %>% group_by(Species) %>% summarise_all(mean)
kiris$centers
#setosa - Cluster1, etc
#grouping may not be perfect, but close to the original classification
#this way we can do Customer, Product Segmentation
#Other Clustering Techniques - Hierarchical etc





#Scaling in Clusters: marks of different max
set.seed(1234); marks50 = ceiling(runif(100, 1,50)) 
set.seed(1234); marks500 = ceiling(runif(100, 250,500)) 

students1= data.frame(marks50, marks500)
head(students1)

skm1 = kmeans(students1, centers=3)
skm1$centers
skm1$withinss

#scaling gives equal importance to all variables
#avoid obtaining clusters that are dominated by variables having the largest amount of variation

students2 = scale(students1)
head(students2)
skm2 = kmeans(students2, centers=3)
skm2$centers
skm2$withinss

par(mfrow=c(1,2))
plot(students1$marks50, students1$marks500, col=skm1$cluster)
plot(students1$marks50, students1$marks500, col=skm2$cluster)
#better cluster in 2nd case
par(mfrow=c(1,1))


================================================
FILE: 03-wksp1/3e2-clust-samplecase.R
================================================
# Clustering

set.seed(1234)
subject1 = trunc(rnorm(30, mean=60, sd=15))
range(subject1)
subject1
marks = data.frame(subject1)
head(marks)

k2 = kmeans(marks, centers=2)
k2
k2$size
length(marks[k2$cluster==1,])
marks[k2$cluster==2,]
mean(marks[k2$cluster==1,])
mean(marks[k2$cluster==2,])
k2$centers



k2a = kmeans(marks, centers=c(50,70))
k2a
k2a$centers


================================================
FILE: 03-wksp1/3e3-clust-segmentation.R
================================================
# HH MA example  - customer

#install.packages("amap")
library(amap)
##Read the data in the file
url = 'https://docs.google.com/spreadsheets/d/1PWWoMqE5o3ChwJbpexeeYkW6p4BHL9hubVb1fkKSBgA/edit#gid=2073914016'
library(gsheet)
data = as.data.frame(gsheet2tbl(url))
str(data)
head(data)
dim(data)
names(data)
summary(data)
str(data)
###Verify the data
colnames(data)
class(data$Age)
apply(data, 2, FUN= class)  #are all numeric
dim(data)
head(data)
summary(data)
###Run the kmeans algorithm to generate the clusters
#?amap::Kmeans
names(data)

k1 <- amap::Kmeans(data[,-1],centers=3, iter.max = 200)
?Kmeans
k1$centers  # group means
###Fetch size/n of obs for the groups
attributes(k1)
k1$size
###Fetch sum of squared  for the groups
k1$withinss
###Fetch the cluster for each obs
#k1$cluster
k1$cluster
k1$centers
k1$cluster[9000:9800]
table(k1$cluster)
k1$size
data_clus_2 <- data[ k1$cluster == 2,]
(data_clus_2)
mean(data_clus_2$Age)
data_clus_2$Cust_id

# Write CSV
write.csv(data_clus_2[,1], file = "./data/data_clus_2.csv")


================================================
FILE: 03-wksp1/3e4-clust-noOfclusters.R
================================================
#Optimal Number of Clusters in data
#Reduce total within ss
iris
head(iris)
table(iris$Species)

data = iris[-5]
head(data)

km1= kmeans(data,centers=1)
km1$withinss
km1$tot.withinss

km2= kmeans(data,centers=2)
km2$tot.withinss
km2$withinss

km3= kmeans(data,centers=3)
km3$tot.withinss

km4= kmeans(data,centers=4)
km4$tot.withinss

km5= kmeans(data,centers=5)
km5$tot.withinss

km1$tot.withinss; km2$tot.withinss ; km3$tot.withinss ; km4$tot.withinss ; km5$tot.withinss

#Selecting the number of clusters
library(NbClust)
nc = NbClust(data, distance="euclidean",min.nc=2, max.nc=15, method="average")
nc

?NbClust
kiris = kmeans(data, centers=3)
kiris$centers

cbind(iris$Species, data, kiris$cluster)
aggregate(cbind(Sepal.Length, Sepal.Width) ~ Species, data=iris, mean)

names(mtcars)
data2 = mtcars[c('mpg','disp','hp','wt')]
head(data2)
nc = NbClust(data2, distance="euclidean",min.nc=2, max.nc=15, method="average")
#det(as.matrix(mtcars))  #skipped: det() needs a square matrix; mtcars is 32 x 11
?na.action
km3= kmeans(data,centers=3)
km3$tot.withinss

cbind(km1$tot.withinss, km2$tot.withinss, km3$tot.withinss, km4$tot.withinss,km5$tot.withinss)

#we select no clusters at elbow point
#adding more clusters does not significantly reduce total withinss
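#a minimal elbow-plot sketch (assumption: km1..km5 as computed above)
wss = c(km1$tot.withinss, km2$tot.withinss, km3$tot.withinss, km4$tot.withinss, km5$tot.withinss)
plot(1:5, wss, type='b', xlab='Number of clusters k', ylab='Total within SS')  #look for the elbow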


================================================
FILE: 03-wksp1/4b1-AR-groceries.R
================================================
# Association Rules - Groceries data set ####

library(arules)  #install first
library(arulesViz) #install first
library(datasets)  # no need to install, just load it reqd for Groceries
data('Groceries')
Groceries

#Structure of Groceries
str(Groceries)
Groceries
arules::LIST(Groceries[1:6])  #another view
arules::inspect(Groceries[1:5])

#Find Frequent Itemset
frequentItems = eclat (Groceries, parameter = list(supp = 0.006, minlen= 4, maxlen = 5)) 
inspect(frequentItems[1:4])
frequentItems
inspect(frequentItems[10:200])
#inspect(frequentItems[100:122])
#Descending Sort frequent items by count : 1 to 25 itemsets
inspect(sort (frequentItems, by="count", decreasing=TRUE)[1:25])
inspect(sort (frequentItems, by="count", decreasing=F)[1:25])

#Support is : support(A&B) = n(A&B)/ N
#Plot the Frequency Plot
itemFrequencyPlot(Groceries,topN = 15,type="absolute")
itemFrequencyPlot(Groceries, topN = 10, type='relative')
abline(h=0.15)

# Create rules and the relationship between items
#parameters are min filter conditions 
rules = apriori(Groceries, parameter = list(supp = 0.005, conf = 0.5, minlen=2))
rules
inspect (rules[1:5])
#Sort Rules by confidence, lift and see the data
rulesc <- sort (rules, by="confidence", decreasing=TRUE)
inspect(rulesc[1:5])
rulesl <- sort (rules, by="lift", decreasing=TRUE)
inspect (rulesl[1:5])
#which items have strong confidence and lift 

#How To Control The Number Of Rules in Output ?
#maxlen, minlen, supp, conf
rules2 = apriori (Groceries, parameter = list (supp = 0.001, conf = 0.5, minlen=2, maxlen=3)) 
inspect(rules2[1:15])

#Find what factors influenced an event ‘X’
rules3 = apriori (data=Groceries, parameter=list (supp=0.002,conf = 0.8), appearance = list (default="lhs",rhs="whole milk"), control = list (verbose=F))
inspect(rules3[1:5])
inspect(rules3)

#Find out what events were influenced by a given event
subset1 = subset(rules2, subset=rhs %in% "whole milk")
inspect(subset1)
subset1 = subset(rules2, subset=rhs %in% 'bottled beer' )
inspect(subset1)
inspect(rules2)
subset2 = subset(rules2, subset=lhs %ain% c('baking powder','soda') )
inspect(subset2)
subset2a = subset(rules2, subset=lhs %in% c('baking powder','soda') )
inspect(subset2a)


subset3 = subset(rules2, subset=rhs %in% 'bottled beer' & confidence > .7, by = 'lift', decreasing = T)
inspect(subset3)
subset4 = subset(rules2, subset=lhs %in% 'bottled beer' & rhs %in% 'whole milk' )
inspect(subset4)

#Visualizing The Rules -----
plot(subset1) 
plot(subset1, measure=c("support", "lift"), shading="confidence")

#


rules4 = apriori (data=Groceries, parameter=list (supp=0.001,conf = 0.4), appearance = list (default="rhs",lhs=c('tropical fruit','herbs')), control = list (verbose=F))
inspect(rules4[1:5])
inspect(rules4)



================================================
FILE: 03-wksp1/4b2-AR-samplecase.R
================================================
# Association Rule - Simple Example Case
# read this pdf for help
#https://cran.r-project.org/web/packages/arules/arules.pdf

#libraries
library(arules)
library(arulesViz)

#Create Data

#Method3 Use: ----
#Data in the form of list
itemlist = list(c('I1','I2','I5'), c('I2','I4'), c('I2','I3'),c('I1','I2','I4'),c('I1','I3'),c('I2','I3'),c('I1','I3'),c('I1','I2','I3','I5'),c('I1','I2','I3'))
itemlist
length(itemlist)
## set transaction names
names(itemlist) <- paste("Tr",c(1:9), sep = "")
itemlist
## coerce into transactions
tdata3 <- as(itemlist, "transactions")
tdata3
summary(tdata3)

tdata=tdata3

#Data ready - Perform AR ----
## analyze transactions
summary(tdata)
itemlist
image(tdata)

#Analysis
freqitems = eclat(tdata) #default support=.1
freqitems = eclat(tdata, parameter = list(minlen=1, supp=.1, maxlen=2 ))

freqitems
inspect(freqitems)

support(items(freqitems[1:2]), transactions=tdata)
inspect(freqitems[1])
inspect(items(freqitems[1]))

itemFrequencyPlot(tdata,topN = 5,type="absolute")
itemFrequencyPlot(tdata,topN = 5,type="relative", horiz=T)
write.csv(as(freqitems, "data.frame"), 'freqitems1.csv')  #export frequent itemsets


#Construct the Rules
rules = apriori(tdata, parameter = list(supp = 0.2, conf = 0.5, minlen=2))
itemFrequencyPlot(items(rules))

inspect(rules[1:5])
inspect(rules)
write.csv(as(rules, "data.frame"), 'rules1.csv')  #export rules
#sort rules by support
rules_s = sort(rules, by="support", decreasing=TRUE )
inspect(rules_s)
inspect(rules_s[1:5])  #itemsset having high support

#sort rules by confidence
rules_c = sort(rules, by="confidence", decreasing=TRUE )
inspect(rules_c)
inspect(rules_c[1:5])  #itemsset having high confidence

#sort rules by lift
inspect(head(rules, n = 3, by ="lift"))
rules_l = sort(rules, by="lift", decreasing=TRUE )
inspect(rules_l)
inspect(rules_l[1:5])  #itemsset having high confidence

#Quality Data of Rules
quality(rules_c) 

#Redundant Rules
inspect(rules)
(redundant = which(is.redundant(rules)))
inspect(rules[c(8,9,10,11,12,14,14)])

inspect(rules[redundant])
inspect(rules)
write.csv(as(rules,"data.frame"), file='./data/rulesR.csv')

#Remove Redundant Rules
rulesNR <- rules[-redundant] 
is.redundant(rulesNR)
sum(is.redundant(rulesNR))  #ok now
inspect(rulesNR)

#Rules with LHS and RHS: single or combination
rules2= rulesNR
inspect(rules2)
rules2.lhs1 <- subset(rules2, lhs %in% c("I1", "I5"))
inspect(rules2.lhs1)

rules2.rhs1 <- subset(rules2, rhs %in% c("I3"))
inspect(rules2.rhs1)

rules2.lhsrhs1 = subset(rules2, lhs %in% c("I1") & rhs %in% c("I3"))
inspect(rules2.lhsrhs1)

rules2.lhsrhs2 = subset(rules2, lhs %in% c("I1") | rhs %in% c("I3"))
inspect(rules2.lhsrhs2)



# Rules as DF: original rules
rules_DF <- as(rules,"data.frame")
rules_DF
str(rules_DF)
write.csv(rules_DF, './data/myrules1.csv')

#Visualisation
plot(rules)


================================================
FILE: 03-wksp1/4b3-AR-groceries-subset.R
================================================
#AR - Groceries - Subset

#Subsetting rules and itemsets
rules <- apriori(Groceries, parameter = list(support=.001, confidence=.7,   maxlen=5, target='rules' ))
rules
#target='rules' means mine for rules

inspect(sort(rules, by="confidence", decreasing = T)[1:5])

#subset conditions
#rhs should be ‘bottled beer’
#confidence should be above .7
#results should be sorted by lift

#Subset1----
inspect(sort(subset(rules, subset=rhs %in% 'bottled beer' & confidence > .7),   by = 'lift', decreasing = T))

#people buying “liquor” and “red wine” are almost certain to buy “bottled beer” (9 times out of 10), but not “canned beer”

# Another rule with different parameters
#subset2----
canned_rules <- apriori(Groceries,parameter = list(support=.001,  confidence=.01, maxlen=5, target='rules' ))

inspect(subset(canned_rules, subset=lhs %ain% c("liquor", "red/blush wine") & rhs %in% 'canned beer' ))
#no rule found, or its support is insignificantly low: fewer than 1 in 100 people would do this

#legend to condition commands 
# lhs - means left hand side, or antecendent
# rhs - mean right hand side, or consequent
# items - items, that make up itemsets
# %in% - matches any
# %ain% - matches all
# %pin% - matches partially
# default - no restrictions applied
# & - additional restrictions on lift, confidence etc.

#either side lhs or rhs
#“whole milk” and “yogurt” must be present and rule’s confidence must be higher than .95
#subset3----
inspect(subset(rules, subset=items %ain% c("whole milk","yogurt") & confidence >.95))

#“whole milk” and “yogurt” must be present in lhs and the rule’s confidence must be higher than .9
inspect(subset(rules, subset=lhs %ain% c("whole milk","yogurt") & confidence >.9))

#Subset4----
#“Bread” must be present in lhs: any type of “bread” – “white bread”, “brown bread” – both qualify. “Whole milk” must be present in rhs “as is”. confidence of the rule must be higher than .9

inspect(subset(rules, subset= lhs %pin% "bread" & rhs %in% "whole milk" & confidence > .9))


#Subset5----
#what we can expect at rhs with confidence higher than .7 if we have both “flour” and “whole milk” on the lhs
inspect(subset(rules, subset= lhs %ain% c("flour","whole milk") & confidence>.7))



#Let’s consider case “Bottled beer Vs. Canned beer” and prove that people tend to buy either one or the other, and rarely do they buy both, qualifying these two as substitute products.
rules <- apriori(Groceries,  parameter = list(support=.001, conf = .01, minlen=2, maxlen=2, target='rules'))

#Let’s only look at the rules where “beer” is present at both left- and right-hand-side of the rule and add chiSquared p-value to prove statistical significance of our findings:

inspect(subset(rules, lhs %pin% 'beer' & rhs %pin% 'beer'))

crossTable(Groceries)['canned beer','bottled beer']

#the probability of a consecutive purchase (confidence) is pretty small: ~3%
#this is despite both bottled beer and canned beer being pretty popular purchases
eclat(Groceries, parameter=list(maxlen=1))  #single items only
singleitems = eclat (Groceries, parameter = list(supp = 0.05, maxlen = 2)) 
inspect(singleitems)
inspect(sort (singleitems, by="count", decreasing=TRUE)[1:25])

crossTable(Groceries)['canned beer','canned beer']
crossTable(Groceries)['bottled beer','bottled beer']

crossTable(Groceries)['bottled beer','canned beer']
crossTable(Groceries)['canned beer','bottled beer']
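#confidence by hand (a sketch): conf(canned -> bottled) = n(canned & bottled) / n(canned)
crossTable(Groceries)['canned beer','bottled beer'] / crossTable(Groceries)['canned beer','canned beer']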

quality(rules)$chi  <- interestMeasure(rules, measure='chiSquared', transactions=Groceries, significance=TRUE)
quality(rules)$chi[4785:4786]
#All these figures, combined with a statistically significant lift below 1 (chi-squared p-value ~ 1e-6), tell us that “bottled beer” and “canned beer” do behave as substitutes.



================================================
FILE: 03-wksp1/4b5-AR-finproducts.R
================================================
# AR data for Finance

library("arules")

set.seed(101)
transactionID = sample(1:500, 1000, replace=T)
transactionID

finproducts = c('Mutual Funds', 'NPS', 'Savings Account', 'PPF', 'FD', 'Bonds', 'Stocks', 'General Insurance', 'NRI Banking', 'Car Insurance', 'Debit Card', 'Credit Card', 'Mobile Banking')
length(finproducts)
item = sample(finproducts,1000, replace=T)
item
orders = data.frame(transactionID, item)
head(orders)

write.csv(orders, "./data/fintransactions.csv")

ordertrans <- arules::read.transactions(
  file = "./data/fintransactions.csv",  format = "single",
  sep = ",",  cols=c("transactionID","item"),  rm.duplicates = T
)
ordertrans
inspect(ordertrans[1:5])
ordertrans
summary(ordertrans)
str(ordertrans)
head(ordertrans)


#find frequent item set
frequentItems <- eclat (ordertrans, parameter = list(supp = 0.005, minlen= 2, maxlen = 5)) 
inspect(frequentItems[1:10])
inspect(sort (frequentItems, by="count", decreasing=TRUE)[1:15])
#support(A&B) = n(A&B)/ N

frequentItems

itemFrequencyPlot (ordertrans,topN = 15,type="absolute")
itemFrequencyPlot(ordertrans, topN = 10, type='relative')
abline(h=0.15)

#Create Rules
rules1 <- arules::apriori(ordertrans, parameter = list(supp = 0.005, conf = 0.5))
rules1
write.csv(as(rules1, "data.frame"), './data/rules.csv')
inspect(rules1[1:5])

rules1L = sort (rules1, by="lift", decreasing=TRUE)
inspect (rules1L[1:5])


#How To Control The Number Of Rules in Output ?
rules2 = apriori (ordertrans, parameter = list (supp = 0.005, conf = 0.5, minlen=3, maxlen=5)) # minlen = 3 & maxlen limits the elements in a rule to 3 & 5
inspect(rules2)

rules3 = apriori(ordertrans)  #default parameter values (supp=0.1, conf=0.8) yield no rules here


#Find what factors influenced an event ‘X’


rules4 <- apriori(data=ordertrans, parameter=list (supp=0.001,conf = 0.08), appearance = list (default="lhs",rhs="Credit Card"), control = list (verbose=F))  #verbose=F no log
inspect(rules4[1:15])

#Find out what events were influenced by a given event
rules5 <- apriori (data=ordertrans, parameter=list (supp=0.001,conf = 0.05,minlen=2), appearance = list (default="rhs",lhs="General Insurance"), control = list (verbose=F)) 
inspect(rules5)

inspect( subset( rules5, subset = rhs %pin% c("Bonds") ))
inspect( subset( rules5, subset = rhs %pin% c("FD") ))


#Control lhs and rhs
rules6 <- apriori(ordertrans, parameter=list(support =0.00001, confidence =0.05, minlen=2, maxlen=5), appearance = list(lhs=c("Debit Card"), rhs=c("Credit Card"), default="none"))
inspect(rules6)

# commands for subset 
# lhs - means left hand side, or antecendent
# rhs - mean right hand side, or consequent
# items - items, that make up itemsets
# %in% - matches any
# %ain% - matches all
# %pin% - matches partially
# default - no restrictions applied
# & - additional restrictions on lift, confidence etc.

inspect(subset(rules1, subset=items %ain% c("Mobile Banking") & confidence >.95))
inspect(subset(rules1, subset=lhs %ain% c("Mobile Banking") & confidence >.9))

inspect(subset(rules1, subset= lhs %pin% "Debit Card" & rhs %in% "Savings Account" & confidence > .7))

#Export Rules into a table
library(data.table)
rules_dt <- data.table( lhs = labels( lhs(rules2) ),      rhs = labels( rhs(rules2) ), 
                        quality(rules2) )[ order(-lift), ]
rules_dt
DT::datatable(rules_dt)  # wrapper for datatables
write.csv(rules_dt, './data/arulesfin.csv')


#Visualizing The Rules -----
library(arulesViz)
inspect(rules1)
plot(rules1, measure=c("support", "lift"), shading="confidence")
plot(rules1, measure=c("support", "confidence"), shading="lift")

plot(rules1,method="graph",engine='interactive', shading="confidence") 



================================================
FILE: 03-wksp1/4e1-twitter1.R
================================================
#Twitter 1 - Configure Tweets and Download them
#@dupadhyaya  #Working using my Keys
#Load libraries
library("curl")
library("twitteR")
library("ROAuth")
library("syuzhet") #library for sentiment analysis - comparison

download.file(url="http://curl.haxx.se/ca/cacert.pem",destfile="cacert.pem")

#https://apps.twitter.com/
#different for each account
consumerKey="uRDuync3BziwQnor1MZFBKp0x"
consumerSecret="t8QPLr7RKpAg4qa7vth1SBsDvoPKawwwdEhNRjdpY0mfMMdRnV"
AccessToken="14366551-Fga25zWM1YefkTb2TZYxsrx2LVVSsK0uSpF08sugW"
AccessTokenSecret="3ap8BZNVoBhE2GaMGLfuvuPF2OrHzM3MhGuPm96p3k6Cz"

#Common for all accounts except the keys

cred <- OAuthFactory$new(consumerKey=consumerKey, consumerSecret=consumerSecret, requestURL='https://api.twitter.com/oauth/request_token', accessURL='https://api.twitter.com/oauth/access_token', authURL='https://api.twitter.com/oauth/authorize')

cred$handshake(cainfo="cacert.pem") #takes you to the browser: authorise, copy the key and paste it at the RStudio console; once stored, it returns to the R prompt
save(cred, file="twitter authentication.Rdata") # store this to avoid asking again
#Load saved authentication cert
load("twitter authentication.Rdata")
#registerTwitterOAuth(cred)

setup_twitter_oauth(consumerKey, consumerSecret, AccessToken, AccessTokenSecret)

#type 1 : Yes 
search.string <- "#businessanalytics"
#search.string <- "#marketinganalytics"

no.of.tweets <- 100

tweets <- searchTwitter(search.string, n=no.of.tweets,lang="en")
tweets
tweets[1:10]
#Another Topics
search.string <- "#indvsaus"
#search.string <- "#asiacup"

no.of.tweets <- 100

tweets <- searchTwitter(search.string, n=no.of.tweets,lang="en")
tweets[1:5]

#My Tweets : will change if you use your own account
homeTimeline(n=15)  #my tweets
mentions(n=15)   # my tags
mentions(n=5)

#for user - 
(tweets = userTimeline("riddheishad", n=10))
userTimeline("PrabhanshRai", n=5)

#------------------------------------
?userTimeline
tweets = userTimeline("realDonaldTrump", n=100)
#english
tweets[1:5]
n.tweet <- length(tweets)
n.tweet
tweets.df = twListToDF(tweets) 
head(tweets.df)
summary(tweets.df)

#Remove hashtags & unnecessary characters
tweets.df2 <- gsub("http.*","",tweets.df$text)
tweets.df2 <- gsub("https.*","",tweets.df2)
tweets.df2 <- gsub("#.*","",tweets.df2)
tweets.df2 <- gsub("@.*","",tweets.df2)

head(tweets.df2)
#-----
library("syuzhet") #library for sentiment analysis - comparison
word.df <- as.vector(tweets.df2)
emotion.df <- get_nrc_sentiment(word.df)
emotion.df2 <- cbind(tweets.df2, emotion.df) 
head(emotion.df2)

#-----
sent.value <- get_sentiment(word.df)
most.positive <- word.df[sent.value == max(sent.value)]
most.positive
most.negative<- word.df[sent.value <= min(sent.value)] 
most.negative
sent.value

#-----
positive.tweets <- word.df[sent.value > 0]
head(positive.tweets)
negative.tweets <- word.df[sent.value < 0] 
head(negative.tweets)
neutral.tweets <- word.df[sent.value == 0]
head(neutral.tweets)
#----
# Alternate way to classify as Positive, Negative or Neutral tweets
category_senti <- ifelse(sent.value < 0, "Negative", ifelse(sent.value > 0, "Positive", "Neutral"))
head(category_senti)
category_senti2 <- cbind(tweets,category_senti,sent.value) 
head(category_senti2)

#----
table(category_senti)
tweets[13]


================================================
FILE: 03-wksp1/4e2-wordcloud.R
================================================
# Word Cloud

##http://dni-institute.in/blogs/colorful-word-cloud-using-r/
# tm for text mining
# SnowballC for text stemming
# wordcloud for generating word cloud images
# RCurl and XML packages to download and parse web pages
# RColorBrewer for color palettes

library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(RCurl)
library(XML)
library(tm)

source('http://www.sthda.com/upload/rquery_wordcloud.r')
#or
#source('./TM/rquery_wordcloud.R')
filePath <- "http://www.sthda.com/sthda/RDoc/example-files/martin-luther-king-i-have-a-dream-speech.txt"
#filePath2 <- "./data/martin-luther-king-i-have-a-dream-speech.txt"

res<-rquery.wordcloud(filePath, type ="file", lang = "english")
#res<-rquery.wordcloud(filePath2, type ="file", lang = "english")



================================================
FILE: 03-wksp1/4e3-worldcloud2.R
================================================
# Word Cloud 2
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(RCurl)
library(XML)
library(tm)

# Read the text file from file
#text = readLines(file.choose())
text = readLines("ximb.txt")
text
text[1]
# Load the data as a corpus
docs = Corpus(VectorSource(text))
docs

#Text transformation
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs
#Cleaning Text
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop word
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2")) 
docs
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming
docs <- tm_map(docs, stemDocument)


#Document Matrix
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)

#Generate Word Cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,    max.words=200, random.order=FALSE, rot.per=0.35,        colors=brewer.pal(8, "Dark2"))

findFreqTerms(dtm, lowfreq = 4)
findAssocs(dtm, terms = "freedom", corlimit = 0.3)
head(d, 10)

#Plot Freq
barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
       col ="lightblue", main ="Most frequent words",
       ylab = "Word frequencies")



================================================
FILE: 03-wksp1/4e5-wordcloud3.R
================================================
# Word Cloud
#http://stat.ethz.ch/R-manual/R-devel/library/base/html/strsplit.html
#https://stackoverflow.com/questions/4350440/split-a-column-of-a-data-frame-to-multiple-columns

library(stringr)
library(wordcloud)
library(RColorBrewer)
library(tm)
library(SnowballC)
library(RCurl)
library(XML)



#How to use strsplit
strsplit('IIT-Guwahati', "-")
strsplit('IIT Guwahati', " ")

library(gsheet)
url = 'https://docs.google.com/spreadsheets/d/1_GQ-h4bgdNlIxcAanwRp_ak1u3JoWI-Vx2HKYj4FstA/edit#gid=0'
#check for dashtype, they are not always same
df1 = as.data.frame(gsheet2tbl(url))
df1
head(df1)
df1$wordexplanation
base::strsplit(df1$wordexplanation,'-')  #output is a list; we need it in a DF

#Different Methods split and put in dataframe objects

#Stringr - str_split
stringr::str_split('IIT-Guwahati', "-")

#Method
(out1 = strsplit(as.character('IIT - Guwahati'),'-')) #if the word was not character
head(df1)
out = strsplit(df1$wordexplanation,'-') 
head(out)

t(sapply(out[1:5], '['))
df2=data.frame(t(sapply(out[1:200], '[')))

head(df2)
df= cbind(df1,df2)
head(df)
names(df)[c(2,3)] = c('word','explanation') #rename columns
head(df)

# create random frequencies for all words in a col
df$freq = floor(runif(100, 1,100))
head(df)

#Method for word cloud
library(wordcloud)
wc1 = wordcloud(df$word, df$freq, random.order=T)
wc2 = wordcloud(df$word, df$freq, random.order=F, colors = topo.colors(10))
wc3 = wordcloud(df$word, df$freq, scale=c(4,.1), min.freq=2, max.words=50,  random.order=F)
wc4 = wordcloud(df$word, df$freq, scale=c(4,.1), random.order=F)


#------

library(RColorBrewer)
library(tm)
pal <- brewer.pal(8,"Dark2")
wc5 = wordcloud(df$word, df$freq, scale=c(4,.1), random.order=F, colors=pal)


#load the function rquery.wordcloud
source('http://www.sthda.com/upload/rquery_wordcloud.r')

res <- rquery.wordcloud(filePath, type ="file", lang = "english",min.freq = 1,  max.words = 200)

# Reds color palette
res <- rquery.wordcloud(filePath, type ="file", lang = "english",colorPalette = "Reds")
# RdBu color palette
res <- rquery.wordcloud(filePath, type ="file", lang = "english", colorPalette = "RdBu")
# use unique color
res<-rquery.wordcloud(filePath, type ="file", lang = "english",  colorPalette = "black")


tdm <- res$tdm
freqTable <- res$freqTable
# Show the top10 words and their frequency
head(freqTable, 10)

# Bar plot of the frequency for the top10
barplot(freqTable[1:10,]$freq, las = 2, 
        names.arg = freqTable[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")

findFreqTerms(tdm, lowfreq = 4)

findAssocs(tdm, terms = "freedom", corlimit = 0.3)

url = "http://www.sthda.com/english/wiki/create-and-format-powerpoint-documents-from-r-software"
rquery.wordcloud(x=url, type="url")



================================================
FILE: 03-wksp1/4f2-quantmod1.R
================================================
#https://ntguardian.wordpress.com/2017/03/27/introduction-stock-market-data-r-1/
#Stock Analysis  

# Get quantmod
if (!require("quantmod")) {
  install.packages("quantmod")
  library(quantmod)
}

start <- as.Date("2017-01-01")
end <- as.Date("2018-10-01")

# Let's get Apple stock data; Apple's ticker symbol is AAPL. We use the quantmod function getSymbols, and pass a string as a first argument to identify the desired ticker symbol, pass 'yahoo' to src for Yahoo! Finance, and from and to specify date ranges
# The default behavior for getSymbols is to load data directly into the global environment, with the object being named after the loaded ticker symbol. This feature may become deprecated in the future, but we exploit it now.

getSymbols("AAPL", src = "yahoo", from = start, to = end)

# What is AAPL?
class(AAPL)
head(AAPL)
tail(AAPL)
plot(AAPL[, "AAPL.Close"], main = "AAPL")
candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white")


# Let's get data for Microsoft (MSFT) and Google (GOOG) (actually, Google is held by a holding company called Alphabet, Inc., which is the company traded on the exchange and uses the ticker symbol GOOG).
getSymbols(c("MSFT", "GOOG"), src = "yahoo", from = start, to = end)

# Create an xts object (xts is loaded with quantmod) that contains closing prices for AAPL, MSFT, and GOOG
stocks = as.xts(data.frame(AAPL = AAPL[, "AAPL.Close"], MSFT = MSFT[, "MSFT.Close"], GOOG = GOOG[, "GOOG.Close"]))
head(stocks)
tail(stocks)

# Create a plot showing all series as lines; must use as.zoo to use the zoo method for plot, which allows for multiple series to be plotted on same plot
plot(as.zoo(stocks), screens = 1, lty = 1:3, xlab = "Date", ylab = "Price")
legend("right", c("AAPL", "MSFT", "GOOG"), lty = 1:3, cex = 0.5)

plot(as.zoo(stocks[, c("AAPL.Close", "MSFT.Close")]), screens = 1, lty = 1:2,  xlab = "Date", ylab = "Price")
par(new = TRUE)
plot(as.zoo(stocks[, "GOOG.Close"]), screens = 1, lty = 3, xaxt = "n", yaxt = "n", xlab = "", ylab = "")
axis(4)
mtext("Price", side = 4, line = 3)
legend("topleft", c("AAPL (left)", "MSFT (left)", "GOOG"), lty = 1:3, cex = 0.5)


# Get pipe operator!
if (!require("magrittr")) {
  install.packages("magrittr")
  library(magrittr)
}
stock_return = apply(stocks, 1, function(x) {x / stocks[1,]}) %>%  t %>% as.xts

head(stock_return)

plot(as.zoo(stock_return), screens = 1, lty = 1:3, xlab = "Date", ylab = "Return")
legend("topleft", c("AAPL", "MSFT", "GOOG"), lty = 1:3, cex = 0.5)

stock_change = stocks %>% log %>% diff
head(stock_change)

plot(as.zoo(stock_change), screens = 1, lty = 1:3, xlab = "Date", ylab = "Log Difference")
legend("topleft", c("AAPL", "MSFT", "GOOG"), lty = 1:3, cex = 0.5)  


candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white")
addSMA(n = 20)


#------
start = as.Date("2010-01-01")
getSymbols(c("AAPL", "MSFT", "GOOG"), src = "yahoo", from = start, to = end)
# The subset argument allows specifying the date range to view in the chart.
# This uses xts style subsetting. Here, we are using the idiom
# 'YYYY-MM-DD/YYYY-MM-DD', where the date on the left-hand side of the / is
# the start date, and the date on the right-hand side is the end date. If
# either is left blank, either the earliest date or latest date in the series is used (as appropriate). This method can be used for any xts object, say, AAPL

candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white", subset = "2016-01-04/")
addSMA(n = 20)

candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white", subset = "2016-01-04/")
addSMA(n = c(20, 50, 200))



================================================
FILE: 03-wksp1/4f3-indianstocks.R
================================================
# Indian Stocks

library(quantmod)
start <- as.Date("2017-01-01")
end <- as.Date("2018-10-01")
getSymbols("SBIN.NS", src = "yahoo", from = start, to = end)

# What is SBI?
class(SBIN.NS)
head(SBIN.NS)
tail(SBIN.NS)
plot(SBIN.NS[, "SBIN.NS.Close"], main = "SBIN.NS")
candleChart(SBIN.NS, up.col = "black", dn.col = "red", theme = "white")


#ICICIBANK.NS
#TATAMOTORS.NS
getSymbols(c("ICICIBANK.NS", "TATAMOTORS.NS"), src = "yahoo", from = start, to = end)

stocks = as.xts(data.frame(SBIN = SBIN.NS[, "SBIN.NS.Close"]))
stocks
stocks = as.xts(data.frame(SBIN = SBIN.NS[, "SBIN.NS.Close"], ICICI = ICICIBANK.NS[, "ICICIBANK.NS.Close"], TATAMOTORS = TATAMOTORS.NS[, "TATAMOTORS.NS.Close"]))
head(stocks)

plot(as.zoo(stocks), screens = 1, lty = 1:3, xlab = "Date", ylab = "Price")
legend("right", c("SBIN", "ICICI", "TATATMOTORS"), lty = 1:3, cex = 0.5)



================================================
FILE: 03-wksp1/5-wordcloud2-New.R
================================================
#wordcloud2

#install.packages('wordcloud2')
library(wordcloud2)

?wordcloud2

df = data.frame(word=c('mdi','iim','imt'),freq=c(20,23,15))
df
wordcloud2(df)


head(demoFreq)
wordcloud2(demoFreq, size = 2, color = "random-light", backgroundColor = "grey")
names(demoFreq)
wordcloud2(demoFreq, size = 2, minRotation = -pi/2, maxRotation = -pi/2)
wordcloud2(demoFreq, size = 2, minRotation = -pi/6, maxRotation = -pi/6,   rotateRatio = 1)
wordcloud2(demoFreq, size = 2, minRotation = -pi/6, maxRotation = pi/6,    rotateRatio = 0.9)

wordcloud2(demoFreqC, size = 2,  color = "random-light", backgroundColor = "grey")
wordcloud2(demoFreqC, size = 2, minRotation = -pi/6, maxRotation = -pi/6,  rotateRatio = 1)

# Color Vector

colorVec = rep(c('red', 'skyblue'), length.out=nrow(demoFreq))
wordcloud2(demoFreq, color = colorVec, fontWeight = "bold")

wordcloud2(demoFreq,
           color = ifelse(demoFreq[, 2] > 20, 'red', 'skyblue'))


================================================
FILE: 03-wksp1/5b-LP-marketingspend.R
================================================
#LP in R : Marketing Spend
#https://analyticsprofile.com/business-analytics/how-to-optimise-digital-marketing-spend-using-linear-programming-in-r/
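#The script above is a stub; below is a minimal sketch of the idea using
#lpSolve (all numbers are hypothetical, not taken from the linked article):
#allocate a fixed budget across two channels to maximise total ROI
library(lpSolve)
roi  = c(0.10, 0.05)            #assumed return per rupee in channels 1 and 2
cons = rbind(c(1, 1),           #total spend across both channels
             c(1, 0))           #spend in channel 1 alone
dirs = c("<=", "<=")
rhs  = c(10000, 6000)           #assumed total budget and channel-1 cap
(sol = lp("max", roi, cons, dirs, rhs))
sol$solution                    #optimal spend per channel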
  

================================================
FILE: 03-wksp1/5c2-LP-marketingspend-case.R
================================================
# LP - Marketing Spend
## Code to solve LP

#install.packages("linprog")
library(linprog)

Max_ROI = c(0.07, 0.03, 0.15, 0.12, 0.05) #Objective Function

Constraint_Vector = c(5000, -500, 0, -200, 0, 0, -300, 900, -100, 2500) #Constraints

#Decision variables under constraints 
Decision_Var <- rbind(
  c(1,1,1,1,1), 
  c(-1, 0, 0, 0, 0), 
  c(-0.05, 0.95, -0.05, -0.05, -0.05), 
  c(-0, -1, 0, 0, 0), 
  c(0.5, 0.5, -0.5, -0.5, 0.5), 
  c(0, 0, 1, -2.5, 0), 
  c(0, 0, 0, -1, 0), 
  c(0, 0, 0, 1, 0), 
  c(0, 0, 0, 0, -1),
  c(2, 0.3, 1.8, 0.9, 2)
)
Decision_Var
solveLP(Max_ROI, Constraint_Vector, Decision_Var, TRUE)



================================================
FILE: 03-wksp1/5d-wordcloud2.R
================================================
##http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know

#https://www.r-graph-gallery.com/196-the-wordcloud2-library/

# library #install this
library(wordcloud2) 

# have a look at the example dataset
head(demoFreq)
wordcloud2(demoFreq, size=1)
?wordcloud2
#create your set of words and freq
df = data.frame(word=c('cbap','cmap','iim','imt','calcutta'),freq=c(20,23,15,10,13))
df
wordcloud2(df)



# Gives a proposed palette
wordcloud2(demoFreq, size=1.6, color='random-dark')

# or a vector of colors; the vector must be the same length as the input data
wordcloud2(demoFreq, size=1.6, color=rep_len( c("green","blue"), nrow(demoFreq) ) )

# Change the background color
wordcloud2(demoFreq, size=1.6, color='random-light', backgroundColor="black")

# Change the shape:
wordcloud2(demoFreq, size = 0.7, shape = 'star')
head(demoFreq)
?wordcloud2
# Change the shape using your image
wordcloud2(demoFreq, figPath = "india.jpg", size = 1.5, color = "skyblue", backgroundColor="black")


================================================
FILE: 03-wksp1/5d2-LP-tpt.R
================================================
# LP - Transportation Problem 
#https://docs.google.com/spreadsheets/d/1G6-iPDoD_i4THQAHwBeOLeiTfuqn7a6Q7MrOg9v1C5U/edit#gid=166724984
#https://cran.r-project.org/web/packages/lpSolve/lpSolve.pdf
library(lpSolve)

(costs <- matrix (c(3,1,5,2,5,4),ncol=2))
row.signs <- c("<", "<", "<")
row.rhs <- c(45, 60, 35)
col.signs <- rep (">", 2)
col.rhs <- c(50, 60)
#edit(costs)
# Run
tptproblem = lp.transport (costs, "min", row.signs, row.rhs, col.signs, col.rhs)
## Success: the objective function is 200 (the 7790 in the package docs belongs to a different example)
tptproblem$solution
#lp.transport (costs, "min", row.signs, row.rhs, col.signs, col.rhs)$solution

#Eg2
(cost <- matrix(c(3,1,5,2,5,4),ncol=2))
lp.transport(cost, "min" , rep("<",3) , c(45, 60, 35) , rep(">=",2) , c(50, 60) )$solution
lp.transport(cost, "min" , rep("<",3) , c(45, 60, 35) , rep(">=",2) , c(50, 60) )
?lp.transport

library(lpSolve)
x <- matrix(c(.91,.32,.86,.14,.59,.36,.67,.34,.87,.56,.10,.09),ncol=3,byrow=T)
x
lp.transport(x, "min" , rep("==",4) , rep(1,4) , rep(">=",3) , rep(1,3)) 
lp.transport(x, "min" , rep("==",4) , rep(1,4) , rep(">=",3) , rep(1,3) )$solution
lp.transport(x, "min" , rep("==",4) , rep(1,4) , rep(">=",3) , rep(1,3) )$object


================================================
FILE: 03-wksp1/5e2-LP-machassign.R
================================================
#----------------------------------------------#
#Another Method
#https://cran.r-project.org/web/packages/lpSolveAPI/lpSolveAPI.pdf
#https://ecreee.wikischolars.columbia.edu/file/view/lpSolveAPI+Tutorial.Rmd

library(lpSolveAPI)

#First we create an empty model with 0 constraints and 2 decision variables.
?make.lp
#max: x + y - 50
lprec <- make.lp(0, 2)
lprec
set.objfn(lprec, c(1, 1))
lprec
#maximise it
lp.control(lprec, sense="max")
lprec

#set.type(lprec, c(1,2), type = c("integer"))
lprec
#50x + 24y <= 2400
add.constraint(lprec, c(50,24), "<=", 2400)
lprec
#30x + 33y <= 2100
add.constraint(lprec, c(30,33), "<=", 2100)
lprec
#x >= 45
#add.constraint(lprec, c(1,0), ">=", 45)
#lprec
#y >= 5
#add.constraint(lprec, c(0,1), ">=", 5)
#lprec
#x + y >= 50
add.constraint(lprec, c(1,1), ">=", 50)
lprec

set.bounds(lprec, lower = c(45, 5), columns = c(1, 2))
lprec

#set.bounds(lprec, upper = 48.98, columns = 4)
#RowNames <- c("MachineA", "MachineB","InitalA", "InitalB","TotalInitial")
RowNames <- c("MachineA", "MachineB","TotalInitial")
ColNames <- c("ProductX", "ProductY")
dimnames(lprec) <- list(RowNames, ColNames)
lprec
solve(lprec)   #[1] 0  ok
#get.dual.solution(lprec)
get.objective(lprec)
get.variables(lprec)
get.constraints(lprec)
plot(lprec)
print(lprec)


================================================
FILE: 03-wksp1/5e5-LP-farmer1.R
================================================
#Farmer Problem in LP
#A farmer plans to plant two crops, A and B. The cost of cultivating crop A is $40/acre, whereas the cost of crop B is $60/acre. The farmer has a maximum of $7400 available for land cultivation. Each acre of crop A requires 20 labor-hours and each acre of crop B requires 25 labor-hours. The farmer has a maximum of 3300 labor-hours available. If she expects to make a profit of $150/acre on crop A and $200/acre on crop B, how many acres of each crop should she plant in order to maximize her profit?
  
library(lpSolveAPI)

#First we create an empty model lprecF1 with 0 constraints and 2 decision variables.
?make.lp
#two variables ie. crops A & B: find which crop to be grown how many acres to max profit
lprecF1 <- make.lp(0, 2)
lprecF1
#Profit :: 150A + 200B
set.objfn(lprecF1, c(150, 200))
lprecF1
#Change from min to max problem
lp.control(lprecF1, sense="max")
lprecF1

#answer required in integer or real no for A & B: default Real
lprecF1
#1st Constraint : Budget Avl
#40x + 60y <= 7400
add.constraint(lprecF1, c(40,60), "<=", 7400)
lprecF1
#2nd constraint : Labour Hours Avl
#20x + 25y <= 3300
add.constraint(lprecF1, c(20,25), "<=", 3300)
lprecF1
#set lower limits : A & B > 0
set.bounds(lprecF1, lower = c(0, 0), columns = c(1, 2))
lprecF1
#upper bounds can also be set only for 1 or more columns
#set.bounds(lprec, upper = c(200), columns = 2)
ColNames <- c("CropA", "CropB")
RowNames <- c("Budget", "Labor")
dimnames(lprecF1) <- list(RowNames, ColNames)
lprecF1
solve(lprecF1)   #if 0  then solution found
#get.dual.solution(lprec)
get.objective(lprecF1) # profit achieved
get.variables(lprecF1)  #how much of each crop A & B
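#manual check of the optimum found above: A = 65 acres, B = 80 acres gives profit 25750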
150* 65 + 200 * 80
get.constraints(lprecF1) #constraints of budget & labor used
plot(lprecF1)  # print graphical output : only when type is real
#if type is integer, the plot will not work
print(lprecF1)  #see the model


#add more constraints, e.g. water availability
#5x + 10y <= 1000
add.constraint(lprecF1, c(5,10), "<=", 1000)
lprecF1
delete.constraint(lprecF1, 3)
solve(lprecF1)   #if 0  then solution found
get.objective(lprecF1) # profit achieved
get.variables(lprecF1)  #how much of each crop A & B

#setting integer value
set.type(lprecF1, c(1,2), type = c("integer"))
lprecF1
solve(lprecF1)   #if 0  then solution found
get.objective(lprecF1) # profit achieved
get.variables(lprecF1)  #how much of each crop A & B


#http://lpsolve.sourceforge.net/5.5/R.htm
?lp
?lp.assign
?lp.object

?lp.transport
?print.lp


================================================
FILE: 03-wksp1/6b1-dates.R
================================================
#Date Formats in R
#When we import data into R, dates and times are usually stored as character or factor by default due to symbols such as “-”, “:” and “/”

cdate1='15/August/1947'
cdate1
cdate2 = c('15-Aug-1947', "26-Jan-1950", "01-Oct-2018")
cdate2

class(cdate1) ; class(cdate2)
#convert to Dates
#Default Format of Date in R
?as.Date()

#See/ Convert Date format
as.Date('2018-12-01')
#Default Format : Year-Month-Date : %Y-%m-%d
(date3= as.Date('2018-12-01',format='%Y-%m-%d'))
class(date3)

#Format Types ----
#%Y: 4-digit year (1982),%y: 2-digit year (82)
#%m: 2-digit month (01)
#%B: month (January), %b: abbreviated month (Jan) 
#%d: 2-digit day of the month (13)
#%A: weekday (Wednesday), %a: abbreviated weekday (Wed)

#convert cdate1 cdate2 to date formats
cdate1 #"15/August/1947
date1 = as.Date(cdate1, format='%d/%B/%Y')
date1
class(date1)
date1 - 1:15
#
cdate2 #"15-Aug-1947" "26-Jan-1950"
date2 = as.Date(cdate2, format='%d-%b-%Y')
date2
#

#one more practise on date format
date3 = as.Date('30Apr18', format="%d%b%y")
class(date3)

#Date to Characters
class(date3)
(cdate2 = as.character(date3))
class(cdate2)

#excel date columns
#01 Jan 2018, 01/Jan/18, 01-Jan 2018, 2018-Jan-01, 2018 01 jan
#these mixed formats cannot all be parsed with one format string - each needs its own
#The “Date” class means dates are stored as the number of days since January 1, 1970, with negative values for earlier dates. We can use the as.numeric function to view the raw values.

date1
class(date1)
as.numeric(date1)
date2
as.numeric(date2)
as.Date('1970-01-01') 
as.numeric(Sys.Date())
Sys.Date() - as.Date('1970-01-01') 

#Sequence of Dates
#Create Sequence of Dates

#Next day after certain date
as.Date('2018-12-01') + 1
#Sequence of Dates
as.Date('2018-12-01') + 0:14
as.Date('2018-12-01') + 0:33

#alternative Dates
seq(1,30,2)
as.Date('2018-12-09') + seq(0,29,7)


#system date
Sys.Date()
#next 15 days after todays date
Sys.Date() + 1:15

##increment/ decrement dates 
(startdt = as.Date("2018-12-01", format="%Y-%m-%d"))
(next20days = startdt + 1:20) # start : 20 days course
startdt - 1
(prev20days = startdt - c(0:19)) # before : 20 days course
rev(prev20days)



#Date Arithmetic
# date Arithmetic and subsets

#start course on YYYY-MM-DD for 30 days
(course1 = as.Date("2018-12-01") + 0:29)

#Difference in dates
course1
length(course1)  #no of days
min(course1)  # first date
max(course1)  # last date
range(course1) # start to end
mean(course1)  # center of the course period
median(course1) # middle date

#sd(course1) #no meaning

#subset / select dates
course1
#1st and 5th dates
course1[c(1,5)] # 1st & 5th date 
#1st and 5th dates
course1[1] ; course1[5]

#duration from 8th date to first date
(duration1 = course1[8] - course1[1])
#total duration
(duration2 = max(course1) - min(course1)+ 1)

#duration since independence
(independencedays = Sys.Date() - as.Date('15-08-1947', '%d-%m-%Y'))
independencedays/365  #years
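#slightly more accurate, allowing for leap years
as.numeric(independencedays)/365.25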

as.character(Sys.Date(), format="%Y--%m--/%d %A")
course1
as.character(course1, format="%A")
paste(course1 , as.character(course1, format="%A"), sep=":: ")

#day on which you were born
dob = "07/03/1996"
dob_date = as.Date(dob,format='%d/%m/%Y')
dob_date
as.character(dob_date, format="%A")



================================================
FILE: 03-wksp1/6b1-ts-data.R
================================================
# create a time series data

#first create a vector of numerical values
# 36 observations 
set.seed(1234)
(sales = round(runif(36, 0,100)))
length(sales)

#This data can be daily, weekly, monthly, quarter, yearly data
#create yearly time series : start year 1980

#Yearly----
(ysales = ts(sales, frequency = 1))

(yearlysales = ts(sales, start=c(1980), frequency=1))
plot(yearlysales)
(yearlysales1 = ts(sales, start=c(1980,3), frequency=1)) 
# 3rd yr from 1980
plot(yearlysales1)

#find the year when sales was > 50
yearlysales1[ yearlysales1 > 50]
class(yearlysales1)
methods(class=ts)
yearlysales1
(w1= window(yearlysales1, start=1983, end=1990))
plot(w1)



#Quarterly -----
12/4  # freq=4
(qtrsales = ts(sales, start=c(1980), frequency=4))
plot(qtrsales)
#list data from Qtr3 1980 to  1985
window(qtrsales, start=c(1980, 3), end=c(1985, 2))


#Monthly -----
12/12  # freq=12 start month=Apr/ 1990
(monsales = ts(sales, start=c(1990,4), frequency=12))
plot(monsales)
window(monsales, start=c(1991, 3))

#create data from Feb 2000 to Mar 2003
(monsales1 = ts(sales, start=c(2000,2), end=c(2003,3), frequency=12)) #recycling of elements beyond given sales value
monsales1
str(monsales1)
length(monsales1)

#see subset of sales data : May 2000 to Aug 2001
window(monsales1, start=c(2000, 5), end=c(2001, 8))

#Daily TS
sales2 = ceiling(rnorm(365, mean=100, sd=10))
sales2
#YYYY,day
(dailysales = ts(sales2, start=c(2017,10), frequency=365))
window(dailysales, start=c(2017,50), end=c(2017,100))
mean(window(dailysales, start=c(2017,50), end=c(2017,100)))
head(sales2)
plot(dailysales)
class(dailysales)

#quarterly
sales3 = floor(rnorm(16, mean=200, sd = 12))
(qtrsales = ts(sales3, start = c(2018,1), frequency = 4))
plot(qtrsales)

#weekly
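#a minimal sketch (hypothetical values): weekly data uses frequency = 52
sales4 = round(runif(52, 50, 100))
(weeklysales = ts(sales4, start = c(2018, 1), frequency = 52))
plot(weeklysales)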



================================================
FILE: 03-wksp1/6c2-dates-lubridate.R
================================================
#Package Lubridate https://data.library.virginia.edu/working-with-dates-and-time-in-r-using-the-lubridate-package/
# Date Functions - Deal with Date & Time

#lubridate provides a series of functions that are a permutation of the letters “m”, “d” and “y” to represent the ordering of month, day and year. For example, if our data has a column of dates such as May 11, 1996, our dates are ordered month-day-year. Therefore we would use the mdy function to transform the column to a date object. If our dates were in the order of, say, year-month-day, we would use the ymd function. lubridate provides functions for every permutation of “m”, “d”, “y”.
  
  
#Eg1
library(lubridate)
(date8a = lubridate::ymd("20110604"))
(date8 = ymd("20110604"))
class(date8)
date8 + 1:10
#date in different format seq
mdy("06-04-2011") #mon-date-year
dmy("04/06/2011") #date-mon-year

#Parsing functions automatically handle a wide variety of formats and separators, which simplifies the parsing process.

begin = c("May 11, 1996", "September 12, 2001", "July 1, 1988")
end = c("7/8/97","10/23/02","1/4/91")
class(begin)  ## [1] "character"
class(end) ## [1] "character"

(begin = mdy(begin))
## [1] "1996-05-11" "2001-09-12" "1988-07-01"
class(begin)

(end = mdy(end))
## [1] "1997-07-08" "2002-10-23" "1991-01-04"

class(begin) ; class(end)## [1] "Date"


#If your date includes time information, add h, m, and/or s to the name of the function. ymd_hms() is probably the most common date time format. To read the dates in with a certain time zone, supply the official name of that time zone in the tz argument.

begin1 = c("May 11, 1996 12:05", "September 12, 2001 1:00", "July 1, 1988 3:32")
end1 = c("7/8/97 8:00","10/23/02: 12:00","1/4/91 2:05")
(begin1a = mdy_hm(begin1))
(end1a = mdy_hm(end1))

class(begin1a) ; class(end1a) ## [1] "POSIXct" "POSIXt"
begin1a ; as.numeric(begin1a)
end1a ; as.numeric(end1a)

#class is now “POSIXct”. “POSIXct” represents the number of seconds since the beginning of 1970. If a date is before 1970, the number of seconds is negative. 

#Notice also that the letters "UTC" have been appended to the date-times. UTC is short for Universal Coordinated Time. It's basically the time standard by which the world regulates clocks. If we prefer, we can specify a time zone when formatting dates by using the tz argument. Here's how we can specify the Eastern Time Zone in the United States when formatting our dates.

(begin1b = mdy_hm(begin1, tz = "US/Eastern"))
(begin1c = mdy_hm(begin1, tz = "Asia/Calcutta"))

#use the OlsonNames function to see a character vector of all time zone names
OlsonNames()
Sys.timezone()

#read in times without dates using the functions ms, hm, or hms, where again “h”, “m”, and “s” stand for “hours”, “minutes”, and “seconds”

time1 = c("1:13", "0:58", "1:01")
time2 = c("12:23:11", "09:45:31", "12:05:22")
time3 = c("2:14", "2:16", "3:35")

(time1a <- ms(time1)) ## [1] "1M 13S" "58S"    "1M 1S"
(time2a <- hms(time2)) ## [1] "12H 23M 11S" "9H 45M 31S"  "12H 5M 22S"
(time3a <- hm(time3)) ## [1] "2H 14M 0S" "2H 16M 0S" "3H 35M 0S"



arrive <- ymd_hms("2018-04-27 18:40:15", tz = "Asia/Calcutta")
arrive
leave <- ymd_hms("2018-04-29 22:00:00", tz = "Asia/Calcutta")
leave

leave - arrive  # Time Difference

#Setting and Extracting information
#Functions

#Eg
second(arrive)
second(arrive) = 25  #change
second(arrive)
arrive
minute(arrive)
hour(arrive)
day(arrive)
wday(arrive)
wday(arrive)
wday(arrive, label = TRUE)

week(arrive)

month(arrive)
year(arrive)
tz(arrive)

#Time Intervals
#save an interval of time as an Interval class object
(student1 <- interval(arrive, leave))
(student2 <- arrive %--% leave)
(student3 = interval(ymd(20180320, tz = "Asia/Calcutta"),
                     ymd(20180327, tz = "Asia/Calcutta")))

(courseperiod = interval(ymd(20180421, tz = "Asia/Calcutta"),
                         ymd(20180425, tz = "Asia/Calcutta")))

int_overlaps(student1, courseperiod)
#TRUE
int_overlaps(student3, courseperiod)

setdiff(student2, courseperiod)
setdiff(student3, courseperiod)

?setdiff
#2017-07-05 IST--2017-07-15 IST

#Other Functions
#int_start, int_end, int_flip, int_shift, int_aligns, union, intersect, and %within%.
int_start(courseperiod)
int_end(courseperiod)
int_flip(courseperiod)
int_aligns(student1, courseperiod)  #share end point

dates = now() + days(1:10)
dates
int_diff(dates)

# Years Betw ----------
ref_date <- as.Date('20/04/08',format='%d/%m/%y')
today <- Sys.Date()  #Sys.Date() is already a Date; no parsing needed
year(arrive) = 2015
arrive
arrive2 = as.Date(arrive, format='%d/%m/%y')
(yrsbetw = year(today)-year(ref_date))
(yrsbetw = year(today)-year(arrive2))
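#lubridate alternative (a sketch): fractional years between two dates
time_length(interval(ref_date, today), "years")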



================================================
FILE: 03-wksp1/6d-TS-airpassengers.R
================================================
# Time Series Case Study - Decomposition

#https://rpubs.com/emb90/137525
# Data Set - AirPassengers
x=c(9.23221232,5.3430000)
x
options(digits=3)
x

?AirPassengers
head(AirPassengers)
AirPassengers
str(AirPassengers)
class(AirPassengers)

#The decomposition of time series is a statistical task that deconstructs a time series into several components, each representing one of the underlying categories of patterns
# TS data components : Trend + Seasonal + Irregular

#stl(x, s.window, t.window = ) # command to do decomp
stl(AirPassengers, s.window = 'periodic') # seasons to be considered periodic ie not changing over time
# save it in an object

plot(AirPassengers) # Pattern of data : see increasing seasonal values suggesting multiplicative Model
#no cyclic here - only seasonal, trend, irregular
#s.window - specifies seasonal effects to be identical across years
#stl handles only additive models

stl1 = stl(AirPassengers, s.window = 'periodic')
plot(stl1) # actual data, seasonal, long term trends, remainder/ irregular

class(stl1)

stl1$time.series
#(df = stl1$time.series)
#df = as.data.frame(df)
#write.csv(df, './data/airpsng.csv')


#Additive Model Y = Trend + Seasonal + Irregular
#sales increase by 300 qty in month of Nov
#Multiplicative Model Y = Trend * Seasonal * Irregular
#sales increase by 10% in month of Nov
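#tiny numeric illustration (made-up numbers): with a trend of 1000,
#additive Nov effect +300 -> 1000 + 300 = 1300
#multiplicative Nov effect 1.10 -> 1000 * 1.10 = 1100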


#dataset
AirPassengers
class(AirPassengers)

# Plot
plot(AirPassengers)
#variability increases with level. at low values of passengers variations are less, at later years seasonal variations seem to be more -> Multiplicative model suggested

#stabilise the plot
LogAirPassengers = log(AirPassengers)  # make it additive because stl handles only additive models

# YA = T + S + I  : 
#YM= T * S * I  : take log of this
# log(YM) = log(T) + log(S) +log(I)

plot(LogAirPassengers)  #stabilises variation due to multiplication
#looks like additive : no increase of seasonal component now over years
plot(AirPassengers)
(m1 = matrix(1:2, nrow=1, byrow = F))
layout(m1)
plot(AirPassengers); plot(LogAirPassengers)  # see again the change

#STL
?stl
fit = stl(LogAirPassengers, s.window = 'periodic' )
#Seasonal components constrained to be the same across years : periodic

plot(fit)
fit$time.series  #decompose the data into S, T, R/I 

#the seasonal component repeats exactly: e.g. every December gets the same seasonal value
#this was after taking log : so take antilog
#toprow = actual data with all series
exp(fit$time.series)

head(exp(fit$time.series),n=20)  # first 20 values see them

# df= exp(fit$time.series)
# names(df) = c('S','T','I')
# head(AirPassengers)
# head(cbind(AirPassengers, df))

#Various Plots - Monthwise, quarter, 
layout(matrix(1,nrow=1))
#Avg of each month
AirPassengers
stl1 = stl(AirPassengers, s.window = 'periodic')
monthplot(AirPassengers) #max traffic in Jun/ Jul across years
monthplot(LogAirPassengers)
monthplot(stl1, choice='seasonal') # less in winters, more in summers
monthplot(stl1, choice='trend')  #slight increase from Jan to Dec
#trend increasing for each month, highest passengers in Jul
monthplot(stl1, choice='remainder') # irregular components
?monthplot

# see combined plots
(m2 = matrix(1:3, nrow=3, byrow = T))
layout(m2)  # change layout of plots
monthplot(stl1, choice='seasonal')
monthplot(stl1, choice='trend')
monthplot(stl1, choice='remainder')


# Practise with different methods - Self Practise

#Decompose another way
AP.decompM = decompose(AirPassengers, type = "multiplicative")
plot(AP.decompM)

library(forecast) #install the library
# Forecast # adjust for multiplicative model
?ets
fit2b = ets(AirPassengers, model='MAM')
fit2b
tail(AirPassengers)
(f2b=forecast(fit2b, 12))
head(f2b)$mean

plot(f2b)


================================================
FILE: 03-wksp1/6d-ts-components-airp.R
================================================
# Time Series Case Study - Decomposition

#https://rpubs.com/emb90/137525
# Data Set - AirPassengers
x=c(9.23221232,5.3430000)
x
options(digits=2)
x

?AirPassengers
head(AirPassengers)
AirPassengers
str(AirPassengers)
class(AirPassengers)

#The decomposition of time series is a statistical task that deconstructs a time series into several components, each representing one of the underlying categories of patterns
# TS data components : Trend + Seasonal + Irregular

#stl(x, s.window, t.window = ) # command to do decomp
stl(AirPassengers, s.window = 'periodic') # seasons to be considered periodic ie not changing over time
# save it in an object

plot(AirPassengers) # Pattern of data : see increasing seasonal values suggesting multiplicative Model
#no cyclic here - only seasonal, trend, irregular
#s.window - specifies seasonal effects to be identical across years
#stl handles only additive models

stl1 = stl(AirPassengers, s.window = 'periodic')
plot(stl1) # actual data, seasonal, long term trends, remainder/ irregular

class(stl1)

stl1$time.series
#(df = stl1$time.series)
#df = as.data.frame(df)
#write.csv(df, './data/airpsng.csv')


#Additive Model Y = Trend + Seasonal + Irregular
#sales increase by 300 qty in month of Nov
#Multiplicative Model Y = Trend * Seasonal * Irregular
#sales increase by 10% in month of Nov


#dataset
AirPassengers
class(AirPassengers)

# Plot
plot(AirPassengers)
#variability increases with level. at low values of passengers variations are less, at later years seasonal variations seem to be more -> Multiplicative model suggested

#stabilise the plot
LogAirPassengers = log(AirPassengers)  # make it additive because stl handles only additive models

# YA = T + S + I  : 
#YM= T * S * I  : take log of this
# log(YM) = log(T) + log(S) +log(I)

plot(LogAirPassengers)  #stabilises variation due to multiplication
#looks like additive : no increase of seasonal component now over years

(m1 = matrix(1:2, nrow=1, byrow = F))
layout(m1)
plot(AirPassengers); plot(LogAirPassengers)  # see again the change


#STL
fit = stl(LogAirPassengers, s.window = 'periodic' )
#Seasonal components constrained to be the same across years : periodic

plot(fit)
fit$time.series  #decompose the data into S, T, R/I 

#the seasonal component repeats exactly: e.g. every December gets the same seasonal value
#this was after taking log : so take antilog
#toprow = actual data with all series
exp(fit$time.series)

head(exp(fit$time.series),n=20)  # first 20 values see them

# df= exp(fit$time.series)
# names(df) = c('S','T','I')
# head(AirPassengers)
# head(cbind(AirPassengers, df))

#Various Plots - Monthwise, quarter, 
layout(matrix(1,nrow=1))
#Avg of each month
monthplot(AirPassengers) #max traffic in Jun/ Jul across years
monthplot(fit, choice='seasonal') # less in winters, more in summers
monthplot(fit, choice='trend')  #slight increase from Jan to Dec
#trend increasing for each month, highest passengers in Jul
monthplot(fit, choice='remainder') # irregular components

# see combined plots
(m2 = matrix(1:3, nrow=3, byrow = T))
layout(m2)  # change layout of plots
monthplot(fit, choice='seasonal')
monthplot(fit, choice='trend')
monthplot(fit, choice='remainder')


# Practise with different methods - Self Practise

#Decompose another way
AP.decompM = decompose(AirPassengers, type = "multiplicative")
plot(AP.decompM)

library(forecast)
# Forecast # adjust for multiplicative model
fit2b = ets(AirPassengers, model='MAM')
fit2b
(f2b=forecast(fit2b, 12))
head(f2b)$mean



================================================
FILE: 03-wksp1/6d-ts-johnson.R
================================================
#Johnson Case - TS
#time series analysis
#plot, decompose, forecast, 
JohnsonJohnson
?JohnsonJohnson

monthplot(JohnsonJohnson)
quarters(Sys.Date())

stl_jj = stl(JohnsonJohnson, s.window = 'periodic')
plot(stl_jj)
plot(JohnsonJohnson)
monthplot(stl_jj, choice='seasonal')
monthplot(stl_jj, choice='trend')
monthplot(stl_jj, choice='remainder')

library(forecast)
ets_jj = ets(JohnsonJohnson, model = "MAM")
forecast(ets_jj,h=3)
plot(forecast(ets_jj,h=3))


#---
require(stats); require(graphics)
JJ <- log10(JohnsonJohnson)
plot(JohnsonJohnson)
plot(JJ)

## This example gives a possible-non-convergence warning on some
## platforms, but does seem to converge on x86 Linux and Windows.
(fit <- StructTS(JJ, type = "BSM"))
tsdiag(fit)
sm <- tsSmooth(fit)
plot(cbind(JJ, sm[, 1], sm[, 3]-0.5), plot.type = "single",
     col = c("black", "green", "blue"))
abline(h = -0.5, col = "grey60")

monthplot(fit)

================================================
FILE: 03-wksp1/6d-ts-xts-data.R
================================================
# xts  - create object and export data

library(xts)

#create matrix : 1 col for 1 share
(stockprices = matrix(c(100,103, 105, 205, 210, 207, 530, 500, 535), ncol=3, byrow = F))
stockprices

(cdtindex = c('25-09-2018', '27-09-2018', '28-09-2018'))

#convert to date format
(dtindex = as.Date(cdtindex, format="%d-%m-%Y"))
#now we have index + matrix for xts object

(ts_xts = xts(x=stockprices, order.by=dtindex))
#this series need not be continuous - it can have missing dates

ts_xts
colnames(ts_xts) = c("SBI", "ICICI", 'HDFC')
ts_xts
#-----
#matrix data extract
coredata(ts_xts)
#extract dates
index(ts_xts)


#write to csv file
write.zoo(ts_xts, "./data/zoodata.csv")

================================================
FILE: 03-wksp1/6e-TS-auto-arima-johnson.R
================================================
#Time Series Analysis
# does the price of Johnson & Johnson shares change over time?
# are there quarterly effects, with share prices rising & falling in a regular fashion throughout the year?
# can you forecast what future share prices will be, and to what degree of accuracy?

#dataset - Johnson
#Quarterly earnings per Johnson & Johnson share
#Steps - Plot, Describe, Decompose, Forecast - Simple MA, Exp, ARIMA

JohnsonJohnson

library(forecast)

#ets automatically selects the best prediction model
?ets
#smoothing time series data using the exponential window function. Whereas in the simple moving average the past observations are weighted equally, exponential functions are used to assign exponentially decreasing weights over time

fit1 = ets(JohnsonJohnson)
fit1
#alpha - level smoothing
#beta  - trend smoothing
#gamma - seasonal smoothing

JohnsonJohnson
head(JohnsonJohnson)
tail(JohnsonJohnson)

(f1= forecast(fit1,h=10))  # 
?forecast.ets
plot(f1, main='Johnson Shares', ylab='Quarterly Earnings', xlab='Time', flty = 3)  # linetype for forecast area
#shaded portion is confidence intervals area

par(mfrow=c(1,1))

# ARIMA Forecasting : 
#popular and widely used statistical method for time series forecasting is the ARIMA model. ARIMA is an acronym that stands for AutoRegressive Integrated Moving Average.

#http://slideplayer.com/5259056/16/images/98/Seasonal+Components--Model+Selection.jpg

f2 = auto.arima(JohnsonJohnson)
summary(f2)
tail(JohnsonJohnson)  # last few values
forecast(f2,h=5)
plot(forecast(f2,h=5))

#ARIMA Forecasting  : compare two datasets
library(tseries)
plot(JohnsonJohnson)
ndiffs(JohnsonJohnson)
plot(diff(JohnsonJohnson))

plot(Nile)
plot(diff(Nile))
ndiffs(Nile)

#-----
djj = diff(JohnsonJohnson)
plot(djj)

dnile = diff(Nile)
plot(dnile)

#----
adf.test(djj)
#if p-value < 0.05, reject H0 of non-stationarity : the differenced series is stationary

#Model Selection 
#parameters p, d , q
# d = no of diffs applied to make the series stationary
#https://people.duke.edu/~rnau/arimrule.htm
Acf(dnile)
#ACF/PACF rules of thumb for identifying p and q:
#ACF cuts off after lag q, PACF tails off to zero  -> MA(q)
#ACF tails off to zero, PACF cuts off after lag p  -> AR(p)
#both ACF and PACF tail off to zero                -> ARMA(p,q)
#Nile - one large autocorrelation at lag 1 in the ACF
#Nile - the PACF tails off to zero as the lag grows
?arima
Pacf(dnile)

fit3 = arima(Nile, order=c(0,1,1)) # p,d,q
fit3

(fit3b = arima(Nile, order=c(1,1,1)))


#Model Test
qqnorm(fit3$residuals) # residuals should be normally distributed
qqline(fit3$residuals)

#auto correl = 0 : check
Box.test(fit3$residuals, type='Ljung-Box')
#Ljung-Box H0: residual autocorrelations are zero; a large p-value means residuals look like white noise

#Forecast
forecast(fit3,4)

Nile
#Auto ARIMA
plot(Nile)

library(forecast)
#forecast::auto.arima()
fit4 = auto.arima(Nile)
fit4

forecast(fit4,5)
plot(forecast(fit4,5))



================================================
FILE: 03-wksp1/6g-ts-TTR-ma.R
================================================
#Time Series - SMA
library(TTR)

library(forecast)
#MA
#https://www.rdocumentation.org/packages/forecast/versions/8.4/topics/ma
head(wineind)
?wineind
class(wineind)
str(wineind)
plot(wineind)
sm <- ma(wineind,order=3)
?ma
sm
head(wineind,n=3)
head(sm, n=3)
sum(head(wineind,n=3)) / 3

lines(sm,col="red")
smF <- ma(wineind,order=3, centre=F)
head(smF)


#centered
x = ts(runif(25,50,80))
x
forecast::ma(x, order=2, centre=T)

plot(wineind)
csm <- ma(wineind,order=12, centre=T)
lines(csm,col="red")



#method1
#https://rpubs.com/ajaydecis/ts5
kings <- scan("http://robjhyndman.com/tsdldata/misc/kings.dat",skip=3)
kings
kingstimeseries <- ts(kings)
kingstimeseries
plot.ts(kingstimeseries)
#install.packages("TTR")
library("TTR")
(kingstimeseriesSMA8 <- SMA(kingstimeseries,n=8))
?SMA
plot.ts(kingstimeseriesSMA8)

library(forecast)
fit <- HoltWinters(kingstimeseries, beta=FALSE, gamma=FALSE)
fit1<- ets(kingstimeseries)
?ets
forecast(fit, 3)
plot(forecast(fit, 3)) 

forecast(fit1, 3)
plot(forecast(fit1, 3)) 




#Method2
#https://cran.r-project.org/web/packages/smooth/vignettes/sma.html
require(smooth)
require(Mcomp)
sma(M3$N2457$x, h=18, silent=FALSE)
sma(M3$N2568$x, h=18)


================================================
FILE: 03-wksp1/8-fa-quandl.R
================================================
# Finance Stock Analysis
#Stock Download

library(Quandl)
#https://www.quandl.com/account/api 4D8hkYAV4WEkcTmD9LMW

Quandl.api_key("4D8hkYAV4WEkcTmD9LMW")

## Download the data Set
ICICI = Quandl("NSE/ICICIBANK",collapse="daily",start_date="2017-09-01",type="raw")
ICICI
HDFC = Quandl("NSE/HDFCBANK",collapse="daily",start_date="2017-09-01",type="raw")
HDFC
PNB= Quandl("NSE/PNB",collapse="daily",start_date="2017-09-01",type="raw")
SBI=Quandl("NSE/SBIN",collapse="daily",start_date="2017-09-01",type="raw")

## Add another ("Stock") column to each dataset
head(ICICI)
## Paste the stock name in stock column

ICICI$Stock = "ICICI"
PNB$Stock = "PNB"
SBI$Stock = "SBI"
SBI

allstocks = rbind(ICICI, PNB, SBI)
str(allstocks)
allstocks$Stock = factor(allstocks$Stock)
str(allstocks)
names(allstocks)
names(allstocks)[c(7,8)] = c('Qty','Turnover')
names(allstocks)


#Using Aggregations
names(allstocks)
dim(allstocks)
aggregate(allstocks$Close, by=list(allstocks$Stock), mean)
aggregate(cbind(Close,  Open) ~ Stock , data= allstocks, mean)
#https://www.statmethods.net/input/dates.html
aggregate(allstocks['Close'], by=list(allstocks$Date), mean)
aggregate(allstocks['Close'], by=list(format(allstocks$Date,"%d")), mean)
aggregate(allstocks['Close'], by=list(format(allstocks$Date,"%b")), mean)
aggregate(allstocks[c(3,7)], by=list(format(allstocks$Date,"%m")), mean)
aggregate(allstocks[c('Close','Open')], by=list(format(allstocks$Date,"%Y")), mean)

#Library to store data in xlsx files
Sys.setenv(JAVA_HOME="C:\\Program Files\\Java\\jre1.8.0_191")
library(xlsx) #needs rJava

#using Dplyr Package to do Data Manipulation
library(dplyr)
names(allstocks)
#select Columns
(df1 <- allstocks %>% select(Stock, Open, Close))
df1 = as.data.frame(df1)
write.xlsx(df1, './data/iitgfa.xlsx', sheetName = "IITG12", append = T)


#Groupby
allstocks %>% group_by(Stock)  #nothing summarised

#summarise
allstocks %>% summarise(mean(Open), max(High))

allstocks %>% group_by(Stock) %>% summarise_all(mean)
#store this data into DF
(df2 <- allstocks %>% group_by(Stock, format(Date,'%b')) %>% summarise_all(mean) )
write.csv(df2, './data/iitgfa.csv')

#http://www.sthda.com/english/wiki/writing-data-from-r-to-excel-files-xls-xlsx#using-xlsx-package


#write.xlsx(USArrests, file = "myworkbook.xlsx", #check with this file sheetName = "USA-ARRESTS", append = FALSE)
df2 = as.data.frame(df2)
write.xlsx(df2, './data/iitgfa.xlsx', sheetName = "IITG2", append = T)


options(dplyr.print_max = 1e9)  #print all rows
allstocks %>% group_by(Stock, format(Date,'%b')) %>% summarise_all(mean)

allstocks %>% group_by(Stock, format(Date,'%b')) %>% summarise_all(funs(mean, max))

#sample using dplyr
allstocks %>% sample_n(10)

allstocks %>% sample_frac(.05)
allstocks %>% group_by(Stock) %>%   sample_n(2)

allstocks %>% group_by(Stock) %>% tally(Qty) %>%  top_n(1)
allstocks %>% group_by(Stock, format(Date,'%Y')) %>% top_n(2, Turnover)


# Linear Modeling
names(ICICI)
df= cbind(ICICI[2],SBI[2] )
names(df) = c('icici','sbi')
head(df)
plot(df)
fit = lm(icici ~ sbi, data=df)
summary(fit)
new1 = data.frame(sbi=200)
(p1=predict(fit,new=new1, interval='confidence' ))
cbind(new1, p1)
plot(x=df$sbi, y=residuals(fit))  #Linearity
qqnorm(residuals(fit))
qqline(residuals(fit))

library(car)
#Multiple LM
df1 = cbind(ICICI[2],SBI[2],PNB[2] )
names(df1) = c('icici','sbi','pnb')
head(df1)
#No Plots
pairs(df1)
fit1 = lm(icici ~ sbi + pnb, data=df1)
summary(fit1)
summary(fit1)$r.squared
summary(fit1)$adj.r.squared
new2= data.frame(sbi=c(200,300), pnb=c(250,350))
p2=predict(fit1, new=new2, interval='confidence')
cbind(new2, p2)

crPlots(fit1)
vif(fit1)# variance inflation factors 
sqrt(vif(fit1)) > 2 # problem? Use only 1 variable
durbinWatsonTest(fit1)
car::outlierTest(fit1) # Bonferonni p-value for most extreme obs
car::qqPlot(fit1, main="QQ Plot")
library(gvlma)
gvmodel <- gvlma(fit1) 
summary(gvmodel)

car::avPlots(fit1)
?car::avPlots
car::influencePlot(fit1, id.method="identify", main="Influence Plot", sub="Circle size is proportional to Cook's Distance" )
car::ncvTest(fit1)
car::spreadLevelPlot(fit1)


================================================
FILE: 03-wksp1/8-fa-quandl2.R
================================================
# Finance Stock Analysis

#Install Packages 
#pckgs<-c("Quandl","Sweep","tidyverse")
#install.packages(pckgs,dependencies = TRUE)

library(Quandl)
# library(tidyverse)
# library(ggplot2)
# library(stringr)
# library(plyr)
# library(stringr)
# library(gridExtra)

#Quandl(code="NSE/---", collapse="---", start_date="----", type="...")
#ICICI SBI PNB  price band of Rs 200 to Rs 500. 
#https://www.quandl.com/account/api 4D8hkYAV4WEkcTmD9LMW

Quandl.api_key("4D8hkYAV4WEkcTmD9LMW")

## Download the data Set
ICICI = Quandl("NSE/ICICIBANK",collapse="daily",start_date="2017-09-01",type="raw")
PNB= Quandl("NSE/PNB",collapse="daily",start_date="2017-09-01",type="raw")
SBI=Quandl("NSE/SBIN",collapse="daily",start_date="2017-09-01",type="raw")

## Add another ("Stock") column to each dataset
head(ICICI)
## Paste the stock name in stock column

ICICI$Stock = "ICICI"
PNB$Stock = "PNB"
SBI$Stock = "SBI"
SBI
#combine them
allstocks = rbind(ICICI, PNB, SBI)
str(allstocks)
allstocks$Stock = factor(allstocks$Stock)
str(allstocks)
names(allstocks)
names(allstocks)[c(7,8)] = c('Qty','Turnover')
names(allstocks)


#Using Aggregations
names(allstocks)
aggregate(allstocks$Close, by=list(allstocks$Stock), mean)
aggregate(cbind(Close,  Open) ~ Stock , data= allstocks, mean)

#https://www.statmethods.net/input/dates.html
aggregate(allstocks['Close'], by=list(allstocks$Date), mean)
aggregate(allstocks['Close'], by=list(format(allstocks$Date,"%d")), mean)
aggregate(allstocks['Close'], by=list(format(allstocks$Date,"%b")), mean)
aggregate(allstocks[c(3,7)], by=list(format(allstocks$Date,"%m")), mean)
aggregate(allstocks[c('Close','Open')], by=list(format(allstocks$Date,"%Y")), mean)

#using Dplyr Package to do Data Manipulation
library(dplyr)
names(allstocks)
#select Columns
allstocks %>% select(Stock, Open, Close)

#Groupby
allstocks %>% group_by(Stock)  #nothing summarised

#summarise
allstocks %>% summarise(mean(Open), max(High))

allstocks %>% group_by(Stock) %>% summarise_all(mean)
allstocks %>% group_by(Stock, format(Date,'%b')) %>% summarise_all(mean)
options(dplyr.print_max = 1e9)
allstocks %>% group_by(Stock, format(Date,'%b')) %>% summarise_all(mean)

allstocks %>% group_by(Stock, format(Date,'%b')) %>% summarise_all(funs(mean, max))

#sample using dplyr
allstocks %>% sample_n(10)
allstocks %>% sample_frac(.05)
allstocks %>% group_by(Stock) %>%   sample_n(2)

allstocks %>% group_by(Stock) %>% tally(Qty) %>%  top_n(1)
allstocks %>% group_by(Stock, format(Date,'%Y')) %>% top_n(2, Turnover)


# Linear Modeling
names(ICICI)
df= cbind(ICICI[2],SBI[2] )
names(df) = c('icici','sbi')
head(df)
plot(df)
fit = lm(icici ~ sbi, data=df)
summary(fit)
new1 = data.frame(sbi=200)
(p1=predict(fit,new=new1, interval='confidence' ))
cbind(new1, p1)
plot(x=df$sbi, y=residuals(fit))  #Linearity
qqnorm(residuals(fit))
qqline(residuals(fit))

library(car)
#Multiple LM
df1 = cbind(ICICI[2],SBI[2],PNB[2] )
names(df1) = c('icici','sbi','pnb')
head(df1)
#No Plots
pairs(df1)
fit1 = lm(icici ~ sbi + pnb, data=df1)
summary(fit1)
summary(fit1)$r.squared
summary(fit1)$adj.r.squared
new2= data.frame(sbi=c(200,300), pnb=c(250,350))
p2=predict(fit1, new=new2, interval='confidence')
cbind(new2, p2)

crPlots(fit1)
vif(fit1)# variance inflation factors 
sqrt(vif(fit1)) > 2 # problem? Use only 1 variable
durbinWatsonTest(fit1)
car::outlierTest(fit1) # Bonferonni p-value for most extreme obs
car::qqPlot(fit1, main="QQ Plot")
library(gvlma)
gvmodel <- gvlma(fit1) 
summary(gvmodel)

car::avPlots(fit1)
?car::avPlots
car::influencePlot(fit1, id.method="identify", main="Influence Plot", sub="Circle size is proportional to Cook's Distance" )
car::ncvTest(fit1)
car::spreadLevelPlot(fit1)


================================================
FILE: 03-wksp1/8-fa-quantmod.R
================================================
#https://ntguardian.wordpress.com/2017/03/27/introduction-stock-market-data-r-1/
#Stock Analysis  

# Get quantmod
if (!require("quantmod")) {
  install.packages("quantmod")
  library(quantmod)
}

start <- as.Date("2018-01-01")
end <- as.Date("2018-12-09")

# Let's get Apple stock data; Apple's ticker symbol is AAPL. We use the
# quantmod function getSymbols, and pass a string as a first argument to
# identify the desired ticker symbol, pass 'yahoo' to src for Yahoo!
# Finance, and from and to specify date ranges

# The default behavior for getSymbols is to load data directly into the
# global environment, with the object being named after the loaded ticker
# symbol. This feature may become deprecated in the future, but we exploit
# it now.

getSymbols("AAPL", src = "yahoo", from = start, to = end)

# What is AAPL?
class(AAPL)
head(AAPL)
tail(AAPL)
plot(AAPL[, "AAPL.Close"], main = "AAPL")
candleChart(AAPL[1:10,], up.col = "black", dn.col = "red", theme = "white")
candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white")
#30-Nov-2018	180.29	180.33	177.03	178.58	178.58	3,94,83,800

# Let's get data for Microsoft (MSFT) and Google (GOOG) (actually, Google is
# held by a holding company called Alphabet, Inc., which is the company
# traded on the exchange and uses the ticker symbol GOOG).
getSymbols(c("MSFT", "GOOG"), src = "yahoo", from = start, to = end)
MSFT
# Create an xts object (xts is loaded with quantmod) that contains closing
# prices for AAPL, MSFT, and GOOG
stocks = as.xts(data.frame(AAPL = AAPL[, "AAPL.Close"], MSFT = MSFT[, "MSFT.Close"], GOOG = GOOG[, "GOOG.Close"]))
head(stocks)
class(stocks)

# Create a plot showing all series as lines; must use as.zoo to use the zoo
# method for plot, which allows for multiple series to be plotted on same
# plot
plot(as.zoo(stocks), screens = 1, lty = 1:3, xlab = "Date", ylab = "Price")
legend("right", c("AAPL", "MSFT", "GOOG"), lty = 1:3, cex = 0.5)

plot(as.zoo(stocks[, c("AAPL.Close", "MSFT.Close")]), screens = 1, lty = 1:2,  xlab = "Date", ylab = "Price")
par(new = TRUE)
plot(as.zoo(stocks[, "GOOG.Close"]), screens = 1, lty = 3, xaxt = "n", yaxt = "n", xlab = "", ylab = "")
axis(4)
mtext("Price", side = 4, line = 3)
legend("topleft", c("AAPL (left)", "MSFT (left)", "GOOG"), lty = 1:3, cex = 0.5)


# Get pipe operator!
if (!require("magrittr")) {
  install.packages("magrittr")
  library(magrittr)
}
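# each day's price divided by the first day's price (growth relative to day 1)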
stock_return = apply(stocks, 1, function(x) {x / stocks[1,]}) %>% 
  t %>% as.xts

head(stock_return)

plot(as.zoo(stock_return), screens = 1, lty = 1:3, xlab = "Date", ylab = "Return")
legend("topleft", c("AAPL", "MSFT", "GOOG"), lty = 1:3, cex = 0.5)

# daily log returns: first difference of the log prices
stock_change = stocks %>% log %>% diff
head(stock_change)

plot(as.zoo(stock_change), screens = 1, lty = 1:3, xlab = "Date", ylab = "Log Difference")
legend("topleft", c("AAPL", "MSFT", "GOOG"), lty = 1:3, cex = 0.5)  


candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white")
addSMA(n = 20)



start = as.Date("2010-01-01")
getSymbols(c("AAPL", "MSFT", "GOOG"), src = "yahoo", from = start, to = end)
# The subset argument allows specifying the date range to view in the chart.
# This uses xts style subsetting. Here, I'm using the idiom
# 'YYYY-MM-DD/YYYY-MM-DD', where the date on the left-hand side of the / is
# the start date, and the date on the right-hand side is the end date. If
# either is left blank, either the earliest date or latest date in the
# series is used (as appropriate). This method can be used for any xts
# object, say, AAPL
candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white", subset = "2016-01-04/")
addSMA(n = 20)

candleChart(AAPL, up.col = "black", dn.col = "red", theme = "white", subset = "2016-01-04/")
addSMA(n = c(20, 50, 200))


================================================
FILE: 03-wksp1/8-quantmod-I-stocks.R
================================================
#Indian Stocks

# Indian Stocks
#stocks2

# Get quantmod
if (!require("quantmod")) {
  install.packages("quantmod")
  library(quantmod)
}

start <- as.Date("2018-01-01")
end <- as.Date("2018-12-09")
getSymbols("SBIN.NS", src = "yahoo", from = start, to = end)
# What is SBI?
class(SBIN.NS)
head(SBIN.NS)
tail(SBIN.NS)
plot(SBIN.NS[, "SBIN.NS.Close"], main = "SBIN.NS")
candleChart(SBIN.NS, up.col = "black", dn.col = "red", theme = "white")


#ICICIBANK.NS
#TATAMOTORS.NS
getSymbols(c("ICICIBANK.NS", "TATAMOTORS.NS"), src = "yahoo", from = start, to = end)

stocks = as.xts(data.frame(SBIN = SBIN.NS[, "SBIN.NS.Close"]))
stocks
stocks = as.xts(data.frame(SBIN = SBIN.NS[, "SBIN.NS.Close"], ICICI = ICICIBANK.NS[, "ICICIBANK.NS.Close"], TATAMOTORS = TATAMOTORS.NS[, "TATAMOTORS.NS.Close"]))
head(stocks)

plot(as.zoo(stocks), screens = 1, lty = 1:3, xlab = "Date", ylab = "Price")
legend("right", c("SBIN", "ICICI", "TATATMOTORS"), lty = 1:3, cex = 0.5)



================================================
FILE: 03-wksp1/zz-practise.R
================================================
# Practise Exercise - XIMB

#Create a 100 row DF with following Variables
#gender, spl, age, experience, grade, placement
#head(students); str(students)  # run these after students is created below
#


(gender = sample(c('M','F'), size=100, replace=T, prob=c(.6,.4)))
(spl = sample(c('Marketing','Finance','HR'), size=100, replace=T, prob=c(.3,.4,.3)))
(age = round(runif(100, 21, 30),2))
(experience = round(rnorm(100, 4, 1),2))
(grade = sample(c('A','B','C','D'), size=100, replace=T, prob=c(.4,.3,.2,.1)))
(placement = sample(c('yes','no'), size=100, replace=T, prob=c(.7,.3)))

students = data.frame(gender, spl, age, experience, grade, placement)
str(students)
summary(students)

#summarise the data in various ways using dplyr
library(dplyr)
names(students)
students %>% group_by(placement, gender) %>% summarise(mean(experience), max(experience), mean(age))

students %>% filter(spl=='Marketing') %>% summarise(mean(age), mean(experience))
students %>% filter(spl=='Marketing') %>% group_by(spl) %>% summarise(mean(age), mean(experience))
students %>% filter(spl!='Marketing') %>% group_by(spl) %>% summarise(mean(age), mean(experience))
students %>% filter(spl=='Marketing'& gender=='F' & grade=='A') %>% group_by(spl) %>% summarise(mean(age), mean(experience))


#draw few graphs to understand the distribution of data

hist(students$age)
t1=table(students$placement)
barplot(t1, col=1:2)
boxplot(students$age)
pairs(students[,c('age','experience')])
pie(t1)

par(mfrow=c(2,2))
pie(table(students$gender))
pie(table(students$placement))
pie(table(students$grade))
pie(table(students$spl))
par(mfrow=c(1,1))
table(students$gender, students$placement, students$spl)



#find students having largest experience in each spl for each gender

students %>% group_by(spl,gender) %>% arrange(spl, gender,experience) %>% top_n(1, experience) 



#how many were placed : draw pie plot
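#one possible answer (reuses the students data frame created above)
(t_placed = table(students$placement))
pie(t_placed, main='Placement')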



write.csv(students, './data/ximb.csv')

students2 = read.csv('./data/ximb.csv')
head(students2)


#clustering
km3 = kmeans(students[,c('age','experience')], centers=3)
km3
km3$centers
plot(students[,c('age','experience')], col=km3$cluster)

#decision tree
library(rpart)
library(rpart.plot)

tree = rpart(placement ~ . , data=students)
tree
rpart.plot(tree, nn=T, cex=1)
printcp(tree)
prune(tree, cp=.03)

ndata = sample_n(students, 5)
ndata
predict(tree, newdata= ndata, type='class')
predict(tree, newdata= ndata, type='prob')


#logistic Regression
logitmodel1 = glm(placement ~ . , data =students, family='binomial')
summary(logitmodel1)
logitmodel1a = glm(placement ~ age , data =students, family='binomial')
summary(logitmodel1a)
logitmodel2 = glm(placement ~ age + gender , data =students, family='binomial')
summary(logitmodel2)

#linear regression
linear1 = lm(age ~ . , data=students)
summary(linear1)


#iimkpv

#vectors
x = 1:100
class(x)
x = c(1,3,5.5)
class(x)
x = LETTERS[1:10]
class(x)
x = c(T,F,T,F,F)
class(x)
#matrix
m = matrix(1:24, ncol=6)
m
class(m)
#dataframes
head(mtcars)
class(mtcars)

#datastructures
#filter, sort, delete, missingvalues, stats operations
colMeans(mtcars)
colMeans(m)
?colMeans

#manipulation / summarisation
library(dplyr)
names(mtcars)
mtcars %>% group_by(cyl) %>% summarise(mean(mpg), max(wt), n())

plot(y=mtcars$mpg, x=mtcars$wt)
abline(lm(mpg ~ wt, data=mtcars))
#visualisation

#Modeling
fit = lm(mpg ~ wt + hp, data=mtcars)
summary(fit)
#linear regression, logistic regression, decision tree, clustering, market basket analysis

#Time Series 
#stock data, sales data
#Financial Analytics, Business 

#Dates are handled in the R
#create TS objects in R
#manipulate, Plot, Forecasting
#decomp

# Decision Trees

#types - Regression  & Classification
iris
head(iris)

library(rpart)
library(rpart.plot)
#regression tree
#DV - Sepal.Length
rtree = rpart(Sepal.Length ~ . , data= iris, method='anova')
rtree
rpart.plot(rtree,  nn=T, cex=.8, fallen.leaves = T)
#classification tree
#DV - Species
ctree = rpart(Species ~ . ,data = iris, method='class')
ctree
rpart.plot(ctree, cex=.8, nn=T, extra=104)
printcp(ctree)
ctreeprune1 = prune(ctree, cp=.44)
ctreeprune1
rpart.plot(ctreeprune1, cex=.8, nn=T, extra=104)
(ndata = sample_n(iris,3))
predict(ctree,newdata=ndata, type='prob')


library(gsheet)
url= "https://docs.google.com/spreadsheets/d/1FTlwpywOynI9dXbgewKCf4WhBc2PwoQmHzlqnAEisuA"
df = as.data.frame(gsheet2tbl(url))
head(df)

url2= "https://docs.google.com/spreadsheets/d/1FTlwpywOynI9dXbgewKCf4WhBc2PwoQmHzlqnAEisuA/edit#gid=1123010023"
df2 = as.data.frame(gsheet2tbl(url2))
head(df2)


df3 = read.csv('./data/iimc1.csv')
head(df3)



================================================
FILE: 04-wksp2/Graph-matrixplots.R
================================================
# Matrix Plots
# Multiple Series on one Graph

table(iris$Species) # is data.frame with 'Species' factor
iS <- iris$Species == "setosa"
iV <- iris$Species == "versicolor"
iS;iV
op <- par(bg = "bisque")
matplot(c(1, 8), c(0, 4.5), type =  "n", xlab = "Length", 
        ylab = "Width",  main = "Petal and Sepal Dimensions in Iris Blossoms")
matpoints(iris[iS,c(1,3)], iris[iS,c(2,4)], pch = "sS", 
          col = c(2,4))
matpoints(iris[iV,c(1,3)], iris[iV,c(2,4)], pch = "vV", 
          col = c(2,4))
legend(1, 4, c("    Setosa Petals", "    Setosa Sepals",
               "Versicolor Petals", "Versicolor Sepals"),
       pch = "sSvV", col = rep(c(2,4), 2))

# Example 2

a <- rnorm(100)
b <- 2*a + 3
c <- 3*a + 2
a;b;c
matplot(a, cbind(b,c), pch=1:2, col=c(2,4))
legend("bottomright", inset=.05, legend=c("b", "c"), 
       pch=1:2, col=c(2,4), horiz=F)



# Example 3
x = 1:10
A = c(15, 36, 54, 60, 68, 71, 73, 75, 78, 78)
B = c(20, 49, 58, 69, 75, 80, 83, 86, 88, 89)
C = c(24, 58, 68, 75, 83, 90, 93, 93, 95, 96)
Performance = data.frame(A,B,C)
matplot(x,Performance, type="o", pch=c(1,2,3), col=c("red","green","blue"))
legend("bottomright", inset=.05, legend=c("A", "B",'C'),
      pch=1:3, col=c("red","green","blue"), horiz=F)


================================================
FILE: 04-wksp2/LMtrainTest.R
================================================
#partition the data into train and test set
mtcars
library(caret)
nrow(mtcars)
index = sample(x=1:nrow(mtcars), size=.7 * nrow(mtcars))
index
train= mtcars[index,]
test= mtcars[-index,]
nrow(train)
nrow(test)
nrow(train) + nrow(test)

library(olsrr)
fit = lm(mpg ~ disp + hp + wt + qsec, data = train)
k = ols_step_all_possible(fit)
plot(k)
k
summary(lm(mpg ~ wt, data= train))
summary(lm(mpg ~ wt + hp, data= train))

finalmodel = lm(mpg ~ wt + hp, data= train)

library(gvlma)
gvmodel = gvlma(finalmodel)  #check regression assumptions on the final model
gvmodel

(predictedvalues = predict(finalmodel, newdata=test))
cbind(test$mpg, predictedvalues)
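#a quick hold-out accuracy check (RMSE is one common choice)
sqrt(mean((test$mpg - predictedvalues)^2))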


================================================
FILE: 04-wksp2/Links_DAR
================================================

Classification
http://dataaspirant.com/2017/01/30/how-decision-tree-algorithm-works/

#Clustering
https://www.statmethods.net/advstats/cluster.html

http://www.sthda.com/english/articles/25-cluster-analysis-in-r-practical-guide/111-types-of-clustering-methods-overview-and-quick-start-r-code/
  
https://rstudio-pubs-static.s3.amazonaws.com/33876_1d7794d9a86647ca90c4f182df93f0e8.html


================================================
FILE: 04-wksp2/Links_DAR.R
================================================


================================================
FILE: 04-wksp2/TS-arima-johnson.R
================================================
#Time Series Analysis
# does the price of Johnson & Johnson shares change over time?
# are there quarterly effects, with share prices rising & falling in a regular fashion throughout the year?
# can you forecast what future share prices will be, and to what degree of accuracy?

#dataset - Johnson
#Quarterly earnings per Johnson & Johnson share
#Steps - Plot, Describe, Decompose, Forecast - Simple MA, Exp, ARIMA

JohnsonJohnson

library(forecast)

#ets automatically selects the best prediction model
?ets
#smoothing time series data using the exponential window function. Whereas in the simple moving average the past observations are weighted equally, exponential functions are used to assign exponentially decreasing weights over time

fit1 = ets(JohnsonJohnson)
fit1
#alpha - level smoothing
#beta  - trend smoothing
#gamma - seasonal smoothing

JohnsonJohnson
head(JohnsonJohnson)
tail(JohnsonJohnson)

(f1= forecast(fit1,h=10))  # 
?forecast.ets
plot(f1, main='Johnson Shares', ylab='Quarterly Earnings', xlab='Time', flty = 3)  # linetype for forecast area
#shaded portion is confidence intervals area

par(mfrow=c(1,1))

# ARIMA Forecasting : 
#popular and widely used statistical method for time series forecasting is the ARIMA model. ARIMA is an acronym that stands for AutoRegressive Integrated Moving Average.

#http://slideplayer.com/5259056/16/images/98/Seasonal+Components--Model+Selection.jpg

f2 = auto.arima(JohnsonJohnson)
summary(f2)
tail(JohnsonJohnson)  # last few values
forecast(f2,h=5)


#ARIMA Forecasting  : compare two datasets
library(tseries)
plot(JohnsonJohnson)
ndiffs(JohnsonJohnson)
plot(diff(JohnsonJohnson))

plot(Nile)
plot(diff(Nile))
ndiffs(Nile)

#-----
djj = diff(JohnsonJohnson)
plot(djj)

dnile = diff(Nile)
plot(dnile)

#----
adf.test(djj)
#if p-value < 0.05, reject H0 of non-stationarity : the differenced series is stationary

#Model Selection 
#parameters p, d , q
# d = no of diffs applied to make the series stationary
#https://people.duke.edu/~rnau/arimrule.htm
Acf(dnile)
#ACF/PACF rules of thumb for identifying p and q:
#ACF cuts off after lag q, PACF tails off to zero  -> MA(q)
#ACF tails off to zero, PACF cuts off after lag p  -> AR(p)
#both ACF and PACF tail off to zero                -> ARMA(p,q)
#Nile - one large autocorrelation at lag 1 in the ACF
#Nile - the PACF tails off to zero as the lag grows
?arima
Pacf(dnile)

fit3 = arima(Nile, order=c(0,1,1)) # p,d,q
fit3

(fit3b = arima(Nile, order=c(1,1,1)))


#Model Test
qqnorm(fit3$residuals) # residuals should be normally distributed
qqline(fit3$residuals)

#auto correl = 0 : check
Box.test(fit3$residuals, type='Ljung-Box')
#Ljung-Box H0: residual autocorrelations are zero; a large p-value means residuals look like white noise

#Forecast
forecast(fit3,4)

Nile
#Auto ARIMA
library(forecast)
#forecast::auto.arima()
fit4 = auto.arima(Nile)
fit4

forecast(fit4,5)
plot(forecast(fit4,5))



================================================
FILE: 04-wksp2/TS-components-airpassengers.R
================================================
# Time Series Case Study - Decomposition

#https://rpubs.com/emb90/137525
# Data Set - AirPassengers
#demo: options(digits=) controls how many significant digits R prints
x = c(9.23221232, 5.3430000)
x
options(digits=2)
x  # now printed with 2 significant digits

?AirPassengers
head(AirPassengers)
AirPassengers
str(AirPassengers)
class(AirPassengers)

#The decomposition of a time series is a statistical task that deconstructs the series into several components, each representing one of the underlying categories of patterns
# TS data components : Trend + Seasonal + Irregular

#stl(x, s.window, t.window) # command to do the decomposition
stl(AirPassengers, s.window = 'periodic') # seasons to be considered periodic ie not changing over time
# save it in an object
AirPassengers[1]
plot(AirPassengers) # pattern of data : increasing seasonal swings suggest a multiplicative model
#no cyclic component here - only seasonal, trend, irregular
#s.window = 'periodic' specifies seasonal effects to be identical across years
#stl can handle only additive models

stl1 = stl(AirPassengers, s.window = 'periodic')
plot(stl1) # actual data, seasonal, long term trends, remainder/ irregular

class(stl1)

stl1$time.series
#(df = stl1$time.series)
#df = as.data.frame(df)
#write.csv(df, './data/airpsng.csv')


#Additive Model Y = Trend + Seasonal + Irregular
#sales increase by 300 qty in month of Nov
#Multiplicative Model Y = Trend * Seasonal * Irregular
#sales increase by 10% in month of Nov
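
#toy sketch (made-up numbers, not from the original script) contrasting the two models
trend = 100 + 1:36
seas_add  = rep(c(0, 0, 20), 12)    # +20 units every 3rd period
seas_mult = rep(c(1, 1, 1.2), 12)   # +20% every 3rd period
plot(ts(trend + seas_add,  frequency = 3))  # additive: bumps stay the same size
plot(ts(trend * seas_mult, frequency = 3))  # multiplicative: bumps grow with the level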


#dataset
AirPassengers
class(AirPassengers)

# Plot
plot(AirPassengers)
#variability increases with the level: at low passenger counts the variations are small, in later years the seasonal variations are larger -> multiplicative model suggested

#stabilise the plot
LogAirPassengers = log(AirPassengers)  # make it additive because stl handles only additive models

#Additive:       Y_A = T + S + I
#Multiplicative: Y_M = T * S * I  -> take the log of this
#log(Y_M) = log(T) + log(S) + log(I)  -> additive on the log scale

plot(LogAirPassengers)  # log stabilises the variation caused by the multiplicative seasonality
#looks additive now : no growth in the seasonal component over the years

(m1 = matrix(1:2, nrow=1, byrow = F))
layout(m1)
plot(AirPassengers); plot(LogAirPassengers)  # see again the change


#STL
fit = stl(LogAirPassengers, s.window = 'periodic' )
#Seasonal components constrained to be the same across years : periodic

plot(fit)
fit$time.series  #decompose the data into S, T, R/I 
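
#sanity check (sketch): for an additive stl decomposition the three components
#should add back exactly to the (log) series
all.equal(rowSums(fit$time.series), as.numeric(LogAirPassengers))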

#for a given month (e.g. December) the seasonal value is the same across all years
#these components are on the log scale : take the antilog to get back to the original scale
#top row of the plot = actual data with all components combined
exp(fit$time.series)

head(exp(fit$time.series), n=20)  # see the first 20 values

# df= exp(fit$time.series)
# names(df) = c('S','T','I')
# head(AirPassengers)
# head(cbind(AirPassengers, df))

#Various Plots - monthwise, quarterwise
layout(matrix(1,nrow=1))
#Avg of each month
monthplot(AirPassengers) #max traffic in Jun/ Jul across years

monthplot(fit, choice='seasonal') # less in winters, more in summers
monthplot(fit, choice='trend')  #slight increase from Jan to Dec
#trend increasing for each month, highest passengers in Jul
  },
  {
    "path": "10a-setup/21a-floorceiling1.R",
    "chars": 690,
    "preview": "# R Tips1\n\n#round----\nround(14.5378, digits=2) # 14.54\nround(14.5378, digits=1) # 14.5\nround(14.5378) #15\n\n#Significant-"
  },
  {
    "path": "10a-setup/21b-options.R",
    "chars": 1203,
    "preview": "#Options\n#Allow the user to set and examine a variety of global options which affect the way in which R computes and dis"
  },
  {
    "path": "10a-setup/24a-github.R",
    "chars": 8,
    "preview": "# github"
  },
  {
    "path": "10a-setup/25a-help.R",
    "chars": 1579,
    "preview": "# Misc Commands\n#https://www.r-project.org/help.html\n\n#library for help-----\nlibrary(swirl)  # for online help\nswirl()\n\n"
  },
  {
    "path": "10a-setup/51c-deletefiles.R",
    "chars": 393,
    "preview": "# Delete Files from command\n\nwrite.csv(mtcars, 'mcars.csv')\nfile.exists('mcars.csv')\nunlink('mcars.csv')\nfile.exists('mc"
  },
  {
    "path": "10a-setup/help.R",
    "chars": 128,
    "preview": "# Misc Commands\n\nlibrary(swirl)  # for online help\nswirl()\n\ndata()  # to see in built data sets\n\nmean(mtcars$mgp)\n?mean\n"
  },
  {
    "path": "10a-setup/pathconfig.R",
    "chars": 283,
    "preview": "# Lib paths\n\n.libPaths()\nfile.exists(\"~/.Rprofile\")\nfile.edit(\"~/.Rprofile\")\n# Add these lines to Rprofile\n# .First = fu"
  },
  {
    "path": "10d-excel/student1.R",
    "chars": 1330,
    "preview": "# R to Excel \n# Data Analysis in Excel\n\n\nrollno = paste('S',1:30,sep='-')\nname = paste0('Student',1:30, ' - surname')\nna"
  },
  {
    "path": "10e-impexp/14a-readcsv.R",
    "chars": 408,
    "preview": "# Read csv\n\n# How to read CSV File\n\n#read.csv(file, header = TRUE, sep = \",\", quote = \"\\\"\", dec = \".\", fill = TRUE, comm"
  },
  {
    "path": "10e-impexp/14b-readcsv.R",
    "chars": 202,
    "preview": "# reading from file\n\n#used generally when u don't want to path to the file\n#or location is different from Project Folder"
  },
  {
    "path": "10e-impexp/14c-importweb.R",
    "chars": 478,
    "preview": "# read from Internet \n\n#read.csv(url(\"http://some.where.net/data/foo.csv\"))\n#url is optional\n\ndf2 = read.csv('http://www"
  },
  {
    "path": "10e-impexp/14d-importweb.R",
    "chars": 800,
    "preview": "#Loading the rvest package\nlibrary('rvest')\n\n#Specifying the url for desired website to be scrapped\nlego_movie <- read_h"
  },
  {
    "path": "10e-impexp/14e-readothers.R",
    "chars": 57,
    "preview": "# reading data from other formats \n# txt, \n# spss, sas\n\n\n"
  },
  {
    "path": "10e-impexp/15b-datawrangling.R",
    "chars": 2561,
    "preview": "drinks = data.frame(ingredient=c('amulya', 'pepsi',\n                                'maaza','vodka'), sweetness=c(7,9,4,"
  },
  {
    "path": "10e-impexp/20a-importgg.R",
    "chars": 694,
    "preview": "#Importing from ggsheet\n#install.packages('gsheet')\nlibrary(gsheet)\n\nregr1 = \"https://docs.google.com/spreadsheets/d/1Qo"
  },
  {
    "path": "10e-impexp/21b-googlesheet1.R",
    "chars": 2292,
    "preview": "#R and Google Sheets\n#https://cran.r-project.org/web/packages/googlesheets/vignettes/basic-usage.html\n\nlibrary(googleshe"
  },
  {
    "path": "10e-impexp/22b-ggsheets2.R",
    "chars": 558,
    "preview": "#https://googlesheets4.tidyverse.org/\n\n\n#install.packages(\"devtools\")\nlibrary(devtools)\ndevtools::install_github(\"tidyve"
  },
  {
    "path": "10e-impexp/31a-export.R",
    "chars": 390,
    "preview": "#export data\n\nmtcars\nwrite.csv(mtcars, file='./data/mtcars.csv')\n\n#list files\ndir('./data')\nlist.files('./data')\n\nfile.e"
  },
  {
    "path": "10e-impexp/32c-writecsv.R",
    "chars": 21,
    "preview": "# Write to csv file\n\n"
  },
  {
    "path": "12a-packages1/21b-installpackages.R",
    "chars": 696,
    "preview": "#Install packages for Data Analytics Course\n\n\npackages1 = c('rJava','xlsx','dplyr')\npackages2 = c('plyr', 'psych', 'tm',"
  },
  {
    "path": "12a-packages1/21e-installFmGit.R",
    "chars": 692,
    "preview": "#install packages from Git Hub\n\n#lubripack\ninstall.packages('lubripack')  #NA for some versions\n#install older version o"
  },
  {
    "path": "12a-packages1/21g-packages1.R",
    "chars": 2864,
    "preview": "# Packages installation\n#https://www.rstudio.com/products/rpackages/\n\n#List avl packages\nlibrary()\n\n#Total Avl Packages\n"
  },
  {
    "path": "12a-packages1/31b-datasets.R",
    "chars": 1126,
    "preview": "# Data Sets\n#https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html\n\n#built in datasets in base\nmtcars"
  },
  {
    "path": "12a-packages1/31c-datasets.R",
    "chars": 448,
    "preview": "#Datasets\n\n?datasets                     # Using R's built in data sets\ndata()\n\nlibrary(help=datasets)\n\ndata(mtcars)    "
  },
  {
    "path": "12a-packages1/41-purrr1.R",
    "chars": 672,
    "preview": "#purrr\n#https://www.weirdfishes.blog/blog/practical-purrr/\n\ndf = mtcars\n\nlibrary(purrr)\nlibrary(dplyr)\n\n#functions\n\nmu ="
  },
  {
    "path": "12a-packages1/42-purr2.R",
    "chars": 960,
    "preview": "#purrr\n\ndf = mtcars\n\nlibrary(purrr)\nlibrary(dplyr)\n\ndf %>% split(.$cyl)\n\ndf %>% split(.$cyl)  %>% walk(print)\n\n\ndf %>% s"
  },
  {
    "path": "12a-packages1/43-purrr3.R",
    "chars": 1156,
    "preview": "#purrr3\n\nlibrary(purrr)\nlibrary(dplyr)\n\ncar_data <- transform(aggregate(. ~ cyl,  data = subset(mtcars, hp > 100), FUN ="
  },
  {
    "path": "12a-packages1/44-purrr4.R",
    "chars": 1832,
    "preview": "#purrr4\n\n#purrr is designed to help with “functional programming”, which you can take broadly as trying to use functions"
  },
  {
    "path": "12a-packages1/45-purrr5.R",
    "chars": 1765,
    "preview": "#purrr5\n#https://adv-r.hadley.nz/functionals.html\n\nlibrary(purrr)\nlibrary(dplyr)\n\ntriple <- function(x) x * 3\nmap(1:3, t"
  },
  {
    "path": "12a-packages1/51-plyr1.R",
    "chars": 1525,
    "preview": "# split - apply - combine\n\n#https://vita.had.co.nz/papers/plyr.pdf\na*ply(.data, .margins, .fun, ..., .progress = \"none\")"
  },
  {
    "path": "12a-packages1/61-splitapplycombine1.R",
    "chars": 440,
    "preview": "#plyr, dplyr, \n\n#http://krlmlr.github.io/pdlyr/vignettes/pdlyr.html\n\n#https://coolbutuseless.bitbucket.io/2018/03/03/spl"
  },
  {
    "path": "12a-packages1/62-splitapplycombine2.R",
    "chars": 457,
    "preview": "#split apply combine\n\ndf = mtcars\nbaseball\nlibrary(plyr)\nbaseball.1 <- ddply(baseball, .(id), transform, cyear = year - "
  },
  {
    "path": "12a-packages1/71-broom1.R",
    "chars": 1082,
    "preview": "\nlibrary(tidyr)\nlibrary(dplyr)\nlibrary(broom)\nmdply(expand.grid(mean = 1:5, sd = 1:5), as.data.frame(rnorm), n = 10)\nmtc"
  },
  {
    "path": "12a-packages1/packages1.R",
    "chars": 3162,
    "preview": "# Packages installation\n\n#List avl packages\nlibrary()\n\n\n#Total Avl Packages\nnrow(available.packages())\n\n#Install Package"
  },
  {
    "path": "13a-Packages2/10a-fBasics.R",
    "chars": 990,
    "preview": "#Package fBasics\n#http://math.furman.edu/~dcs/courses/math47/R/library/fBasics/html/015A-BasicStatistics.html\n\nlibrary(f"
  },
  {
    "path": "13a-Packages2/11a-pysch.R",
    "chars": 65,
    "preview": "#Descriptive Values of data set\n\nlibrary(psych)\ndescribe(mtcars)\n"
  },
  {
    "path": "15a-DS/0FileList.R",
    "chars": 90,
    "preview": "# List of Files in this folder\n\n# Data Structures in R\n# Use Help\n# Packages\n# PathConfig\n"
  },
  {
    "path": "15a-DS/10a-TOC",
    "chars": 88,
    "preview": "#Files in this folder related\n\n# Data Structures in R\n# Use Help\n# Packages\n# PathConfig"
  },
  {
    "path": "15a-DS/13b-ds-blank.R",
    "chars": 118,
    "preview": "# Data Structures in R\n\n#Vectors----\n\n\n\n\n#Matrix----\n\n\n\n\n\n#Arrays----\n\n\n\n\n\n\n\n#List----\n\n#Factors----\n\n#DataFrame----\n\n"
  },
  {
    "path": "15a-DS/14b-Basic_R_v01.R",
    "chars": 14461,
    "preview": "\n\n#BASICS ABOUT R =================================\n\nx<-10+20                  # Simple Math\nx\n\n\n10+20 -> x             "
  },
  {
    "path": "15a-DS/14b-objectsmethods.R",
    "chars": 394,
    "preview": "\n#Methods to handle objects\nx = 1:5\nmethods(class=class(x))\n\nmethods(class=class(mtcars))\n\n\n#seq\n1:5\nrev(x)\nmatch(5, x)\n"
  },
  {
    "path": "15a-DS/14c-ds1.R",
    "chars": 3299,
    "preview": "# Data Structures in R\n\n#Vectors----\nv1 = c('A', 'B','C')   #create a vector\nv1   #print the vector\nclass(v1)  #print th"
  },
  {
    "path": "15a-DS/15a-objects.R",
    "chars": 1018,
    "preview": "# Objects\n# \n\nm1 = matrix(c(10:1, rep(5,10), rep(c(5,6),5),seq_len(length.out=10)), byrow=F, ncol =4)\ncolnames(m1) = c('"
  },
  {
    "path": "15a-DS/16b-datatypes.R",
    "chars": 4349,
    "preview": "# R Programming\n?mean\nx <- c(0:10, 50)\nx\nx[c(seq(1,12,2))]\n?c\nseq(2,100,2)\n\n?seq\nxm <- mean(x)\nxm\nmean(x, trim = 0.10)\n\n"
  },
  {
    "path": "15a-DS/16c-basicDT.R",
    "chars": 2164,
    "preview": "#Basic Data Types\n\n#Numeric ----\n#Decimal values are called numerics in R. It is the default computational data type. If"
  },
  {
    "path": "15a-DS/16d-ds1.R",
    "chars": 3259,
    "preview": "# Data Structures in R\n\n#Vectors----\nv1 = c('A', 'B','C')   #create a vector\nv1   #print the vector\nclass(v1)  #print th"
  },
  {
    "path": "15a-DS/20a-vectors.R",
    "chars": 2945,
    "preview": "# Vectors\n\n#sequence of data elements of the same basic type\n\n#Scalar\nx1 = 1\nx2 <- 2\nx1\nx2\n(x3 = 3) #assign and print\n\n#"
  },
  {
    "path": "15a-DS/20b-vectors2.R",
    "chars": 84,
    "preview": "# Vectors 2\n\nx <- vector()\n# with a length and type\nvector(\"character\", length = 10)"
  },
  {
    "path": "15a-DS/20c-vectors.R",
    "chars": 1282,
    "preview": "#Datatypes:  string- name, location, department\n# Number--- age, salary, weight, height\n# Boolean/Logical---- TRUE/FALSE"
  },
  {
    "path": "15a-DS/20d-vectorfunctions.R",
    "chars": 287,
    "preview": "# Functions for Vectors\n\nx = 1:10\n\n#Confirmation Functions\nis.numeric(x)\nis.character(x)\nis.complex(x)\nis.integer(x)\nis."
  },
  {
    "path": "15a-DS/20f-vectors.R",
    "chars": 2536,
    "preview": "# Vectors\n\n#sequence of data elements of the same basic type\n\n#Scalar\n\n#Vector----\nx = c(1, 5, 4, 9, 0)\ntypeof(x)  #[1] "
  },
  {
    "path": "15a-DS/20g-valuegenerate.R",
    "chars": 78,
    "preview": "# Value Generation\n\n#seq\n\n\n#rep\n\n#rnorm\n\n#runif\n\n#sample\n\n#months\n\n#alphabets\n"
  },
  {
    "path": "15a-DS/20h-vectors2.R",
    "chars": 84,
    "preview": "# Vectors 2\n\nx <- vector()\n# with a length and type\nvector(\"character\", length = 10)"
  },
  {
    "path": "15a-DS/25a-matrices.R",
    "chars": 1087,
    "preview": "# Matrices\nix(1:12, ncol=3, byrow=T))\n\nclass(m1)\n#[1] \"matrix\"\nattributes(m1)\n#$dim\n#[1] 4 3\ndim(m1)\n#[1] 4 3\n\n#names of"
  },
  {
    "path": "15a-DS/25c-matrices.R",
    "chars": 163,
    "preview": "#matrix\n\nlibrary(matlib)\n\nA <- matrix( c(5, 1, 0,\n               3,-1, 2,\n               4, 0,-1), nrow=3, byrow=TRUE)\nA"
  },
  {
    "path": "15a-DS/25d-matrices.R",
    "chars": 664,
    "preview": "# Matrix\n#col or rownames, sweep, addmargins\n(m1 = matrix(1:12, nrow=4))\n\nclass(m1)\nattributes(m1)\n\n#names of cols and r"
  },
  {
    "path": "15a-DS/25e-matrices.R",
    "chars": 10,
    "preview": "# Matrices"
  },
  {
    "path": "15a-DS/27a-arrays.R",
    "chars": 1273,
    "preview": "# Arrays\n# 2 states ; Each State has 3 districts : Each District has 4 cities\nstate = c('state1', 'state2')\ndistrict = c"
  },
  {
    "path": "15a-DS/27b-arrays.R",
    "chars": 808,
    "preview": "# Arrays\n# 2 states ; Each State has 3 districts : Each District has 4 cities\nstate = c('state1', 'state2')\ndistrict = c"
  },
  {
    "path": "15a-DS/27d-arrays.R",
    "chars": 1271,
    "preview": "# Arrays\n# 2 states ; Each State has 3 districts : Each District has 4 cities\nstate = c('state1', 'state2')\ndistrict = c"
  },
  {
    "path": "15a-DS/30c-basicdatatypes.R",
    "chars": 2164,
    "preview": "#Basic Data Types\n\n#Numeric ----\n#Decimal values are called numerics in R. It is the default computational data type. If"
  },
  {
    "path": "15a-DS/30d-ds1.R",
    "chars": 2701,
    "preview": "# data strucutres \n\n#Vectors----\nv1 = c(1,2,3,4,5,4)\nclass(v1)\nv1\nv1[v1==4]=7\nv1\n?class\nv2= c('A','B','C','D','e')\nv2\ncl"
  },
  {
    "path": "15a-DS/30e-datatypes.R",
    "chars": 4349,
    "preview": "# R Programming\n?mean\nx <- c(0:10, 50)\nx\nx[c(seq(1,12,2))]\n?c\nseq(2,100,2)\n\n?seq\nxm <- mean(x)\nxm\nmean(x, trim = 0.10)\n\n"
  },
  {
    "path": "15a-DS/33b-df.R",
    "chars": 1117,
    "preview": "# Data Frames\n#data frame\n#(rollno= c('MBA-01', 'MBA-02'))\n(rollno= paste('MBA',1:60,sep='-'))\n\n#(name = c('Student1', '"
  },
  {
    "path": "15a-DS/33c-df.R",
    "chars": 13,
    "preview": "# Data Frames"
  },
  {
    "path": "15a-DS/35a-lists.R",
    "chars": 196,
    "preview": "# Lists\n# \ng =\"My First List\"\nh = c(25, 26,18,39)\nj = matrix(1:10,nrow=2)\nk = c('one','two','three')\nmylist = list(title"
  },
  {
    "path": "15a-DS/35b-lists.R",
    "chars": 195,
    "preview": "# Lists\n\ng =\"My First List\"\nh = c(25, 26,18,39)\nj = matrix(1:10,nrow=2)\nk = c('one','two','three')\nmylist = list(title=g"
  },
  {
    "path": "15a-DS/35e-lists.R",
    "chars": 196,
    "preview": "# Lists\n# \ng =\"My First List\"\nh = c(25, 26,18,39)\nj = matrix(1:10,nrow=2)\nk = c('one','two','three')\nmylist = list(title"
  }
]

// ... and 1058 more files (download for full content)
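
Representative code patterns

The previews above are truncated, but most of the workshop scripts repeat a handful of standard R patterns. The short sketches below reconstruct those patterns from the previews; they are illustrative stand-ins, not the repository's exact code, and anything not visible in a preview is an assumption.

Association rules (04-wksp2/assocrule1.R to assocrule3.R): the arules workflow on the built-in Groceries transactions. A minimal sketch; the support and confidence thresholds mirror the values visible in the assocrule2.R preview.

# Mine association rules from the Groceries market-basket data
library(arules)       # apriori() and the Groceries dataset
library(arulesViz)    # optional: plot(rules)
data("Groceries")
rules <- apriori(Groceries,
                 parameter = list(support = 0.001, confidence = 0.8))
inspect(head(sort(rules, by = "lift"), 5))   # five strongest rules by lift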
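Decision trees (04-wksp2/decisiontree1.R to decisiontree5CHAID.R): CART models via rpart and rpart.plot. The repo's scripts use Carseats, Titanic, loan and vote data; iris is used here only so the sketch stays self-contained.

# Fit and plot a CART classification tree
library(rpart)        # fits CART models
library(rpart.plot)   # draws the fitted tree
fit <- rpart(Species ~ ., data = iris, method = "class")
rpart.plot(fit)                            # visualise the splits
predict(fit, head(iris), type = "class")   # class predictions for a few rows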
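Frequency tables (04-wksp2/freqtable.R): bin a numeric vector with cut() and tabulate the bins. The breaks match the half-integer sequence shown in the preview.

# Frequency distribution of Old Faithful eruption durations
duration <- faithful$eruptions
range(duration)                        # roughly 1.6 to 5.1 minutes
breaks <- seq(1.5, 5.5, by = 0.5)      # half-integer bins, as in the script
bins <- cut(duration, breaks)
table(bins)                            # counts per bin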
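Simple linear regression (04-wksp2/lm-women-simple.R and lm.R): weight modelled on height using the built-in women data; the two prediction heights come straight from the preview comment.

# Predict weight from height on the built-in women data
fit <- lm(weight ~ height, data = women)
summary(fit)                           # slope, intercept, R-squared
predict(fit, newdata = data.frame(height = c(60.5, 75.5)))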
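Logistic regression (04-wksp2/logr-gre.R): probability of admission from the UCLA binary.csv file whose URL appears in the preview. Treating rank as a factor is an assumption based on the standard UCLA example.

# Model admission probability with glm(..., family = binomial)
df <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
df$rank <- factor(df$rank)             # rank is categorical (assumption)
m <- glm(admit ~ gre + gpa + rank, data = df, family = binomial)
summary(m)
head(predict(m, type = "response"))    # fitted admission probabilities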
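Bulk package installation (04-wksp2/packages2.R, 12a-packages1/21b-installpackages.R): the truncated "list.of..." in the preview is most likely the usual install-only-if-missing idiom; the package vector below is taken from the visible part of the preview.

# Install only the packages that are not already present
pkgs <- c("rJava", "xlsx", "dplyr")
missing <- pkgs[!(pkgs %in% installed.packages()[, "Package"])]
if (length(missing) > 0) install.packages(missing)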
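Word clouds (04-wksp2/wordcloud1.R and wordcloud2.R): build a tm corpus from a text file, clean it, then plot term frequencies. Stemming via SnowballC is skipped here for brevity.

# Word cloud from a user-chosen text file
library(tm)            # corpus handling
library(wordcloud)     # the plot itself
library(RColorBrewer)  # colour palettes
text <- readLines(file.choose())       # pick any plain-text file
docs <- Corpus(VectorSource(text))
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))
freq <- sort(rowSums(as.matrix(TermDocumentMatrix(docs))), decreasing = TRUE)
wordcloud(names(freq), freq, min.freq = 2, colors = brewer.pal(8, "Dark2"))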
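Split-apply-combine (12a-packages1/41-purrr1.R to 45-purrr5.R and the plyr scripts): split mtcars by cylinder count, fit a model per group, extract one number per model. The native pipe and \(x) lambdas need R >= 4.1; the repo's own scripts use magrittr's %>% instead.

# Per-group models with purrr
library(purrr)
mtcars |>
  split(mtcars$cyl) |>                     # one data frame per cyl value
  map(\(d) lm(mpg ~ wt, data = d)) |>      # fit a model in each group
  map_dbl(\(m) summary(m)$r.squared)       # one R-squared per group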

About this extraction

This page contains the full source code of the dupadhyaya/analytics GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction covers 1258 files (18.8 MB), approximately 5.0M tokens. Use it with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input; the full output can be downloaded as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.
