Repository: christophergandrud/Rep-Res-Book Branch: master Commit: 716e545e0764 Files: 90 Total size: 1.4 MB Directory structure: gitextract_fuj2c_ui/ ├── .gitignore ├── Old/ │ ├── BookMake.R │ ├── CoverGraphics/ │ │ └── 2ndEditionCover_v1/ │ │ ├── index.html │ │ └── main.css │ ├── EarlyOutline.md │ ├── README.md │ ├── Source-v2/ │ │ ├── .gitignore │ │ ├── Children/ │ │ │ ├── Chapter1/ │ │ │ │ ├── chapter1.Rnw │ │ │ │ └── chapter1.md │ │ │ ├── Chapter10/ │ │ │ │ └── chapter10.Rnw │ │ │ ├── Chapter11/ │ │ │ │ └── chapter11.Rnw │ │ │ ├── Chapter12/ │ │ │ │ └── chapter12.Rnw │ │ │ ├── Chapter13/ │ │ │ │ └── chapter13.Rnw │ │ │ ├── Chapter14/ │ │ │ │ └── chapter14.Rnw │ │ │ ├── Chapter2/ │ │ │ │ └── chapter2.Rnw │ │ │ ├── Chapter3/ │ │ │ │ └── chapter3.Rnw │ │ │ ├── Chapter4/ │ │ │ │ └── chapter4.Rnw │ │ │ ├── Chapter5/ │ │ │ │ └── chapter5.Rnw │ │ │ ├── Chapter6/ │ │ │ │ └── chapter6.Rnw │ │ │ ├── Chapter7/ │ │ │ │ └── chapter7.Rnw │ │ │ ├── Chapter8/ │ │ │ │ └── chapter8.Rnw │ │ │ ├── Chapter9/ │ │ │ │ └── chapter9.Rnw │ │ │ └── FrontMatter/ │ │ │ ├── AdditionalResources/ │ │ │ │ └── AdditionalResources.Rnw │ │ │ ├── Packages.Rnw │ │ │ ├── Preface.Rnw │ │ │ ├── StylisticConventions.md │ │ │ └── rep-res-PackagesCited.bib │ │ ├── Rep-Res-Parent.Rnw │ │ ├── Rep-Res-Parent.toc │ │ ├── krantz.cls │ │ └── rep-res-book.bib │ ├── SourceOld/ │ │ ├── Chapter1/ │ │ │ └── chapter1.Rmd │ │ ├── Chapter10/ │ │ │ └── chapter10.Rmd │ │ ├── Chapter11/ │ │ │ └── chapter11.Rmd │ │ ├── Chapter12/ │ │ │ └── chapter12.Rmd │ │ ├── Chapter13/ │ │ │ └── chapter13.Rmd │ │ ├── Chapter14/ │ │ │ └── chapter14.Rmd │ │ ├── Chapter2/ │ │ │ └── chapter2.Rmd │ │ ├── Chapter3/ │ │ │ └── chapter3.Rmd │ │ ├── Chapter4/ │ │ │ └── chapter4.Rmd │ │ ├── Chapter5/ │ │ │ └── chapter5.Rmd │ │ ├── Chapter6/ │ │ │ └── chapter6.Rmd │ │ ├── Chapter7/ │ │ │ └── chapter7.Rmd │ │ ├── Chapter8/ │ │ │ └── chapter8.Rmd │ │ └── Chapter9/ │ │ └── chapter9.Rmd │ └── Writing_Setup/ │ ├── Early_Book_Origins.md │ ├── 
HeaderFooter/ │ │ ├── IndvChapterFoot.tex │ │ └── IndvChapterHead.tex │ ├── IndvChapter.sh │ ├── IndvChapter1.Rnw │ ├── OldScripts/ │ │ ├── ConvertRmdtoRnw.sh │ │ └── Rmd_Book.sh │ ├── ProductionNotes.md │ ├── Rnw_Book.sh │ └── TableofContentPDF/ │ ├── GandrudRep-Res-Book-TOC.fdb_latexmk │ ├── GandrudRep-Res-Book-TOC.tex │ └── krantz.cls ├── README.Rmd ├── README.md └── rep-res-3rd-edition/ ├── .gitignore ├── 01-author.Rmd ├── 01-stylistic-conventions.Rmd ├── 02-additional-resources.Rmd ├── 03-introduction.Rmd ├── 04-getting-started.Rmd ├── 05-start-R.Rmd ├── 06-file-management.Rmd ├── 07-storage.Rmd ├── 08-gather.Rmd ├── 09-clean.Rmd ├── 10-modeling.Rmd ├── 11-tables.Rmd ├── 12-figures.Rmd ├── 13-latex.Rmd ├── 14-web.Rmd ├── 16-conclusion.Rmd ├── 99-references.Rmd ├── LICENSE ├── README.md ├── _bookdown.yml ├── _output.yml ├── book.bib ├── css/ │ └── style.css ├── index.Rmd ├── krantz.cls ├── latex/ │ ├── after_body.tex │ ├── before_body.tex │ └── preamble.tex ├── packages.bib └── rep-res-3rd-edition.Rproj ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Ignore the following files from Git version control tracking # ################################################################ .DS_Store .Rhistory *.RData *.aux *.latexmk *.log *.gz .Rproj.user package-lock.json _publish.R _book _bookdown_files rsconnect ================================================ FILE: Old/BookMake.R ================================================ ################# # Make file for the book Reproducible Research with R and RStudio # Christopher Gandrud # Updated: 30 March 2015 ################# # This R source code compiles the manuscript for the book Reproducible Research # with R and RStudio. # It also updates the main README file. 
## Installing required packages # Note: To install the R packages used to compile the book open the # Source/Children/FrontMatter/Packages.Rnw. # Find: doInstall <- FALSE in the code chunk labeled "FrontPackageCitations". # Change the value `FALSE` to `TRUE` and run the code chunk. # Load knitr package library(knitr) #### Specify working directories. Change as needed. #### # Rep-Res-Parent.Rnw ParentDirectory <- "/git_repositories/Rep-Res-Book/Source/" # README.Rmd SetupDirectory <- "/git_repositories/Rep-Res-Book/" ##### Create PDF Book Manuscript #### # Compile the book's parent document setwd(ParentDirectory) knitr::knit2pdf(input = "Rep-Res-Parent.Rnw") # Embed fonts # This is largely for complete replication purposes only and is not necessary. ## If using Windows please see extrafont set up instructions at # https://GitHub.com/wch/extrafont # extrafont::embed_fonts("Rep-Res-Parent.pdf") #### README #### setwd(SetupDirectory) knitr::knit(input = "README.Rmd", output = "README.md") ================================================ FILE: Old/CoverGraphics/2ndEditionCover_v1/index.html ================================================ ================================================ FILE: Old/CoverGraphics/2ndEditionCover_v1/main.css ================================================ .book-logo { width: 100%; height: 80%; padding: 20%; margin: 5%; position: relative; } .book-logo:before { content: ""; position: absolute; top: 0; left: 0; width: 1em; height: 1em; color: orange; box-shadow: 7em 1em, 8em 1em, 9em 1em, 10em 1em, 11em 1em, 12em 1em, 13em 1em, 7em 2em, 13em 2em, 7em 3em, 8em 3em, 9em 3em, 10em 3em, 11em 3em, 12em 3em, 13em 3em, 10em 4em, 10em 5em, 10em 6em, 2em 7em, 3em 7em, 4em 7em, 5em 7em, 6em 7em, 7em 7em, 8em 7em, 9em 7em, 10em 7em, 11em 7em, 12em 7em, 13em 7em, 14em 7em, 15em 7em, 16em 7em, 17em 7em, 18em 7em, 2em 8em, 12em 8em, 18em 8em, 2em 9em, 12em 9em, 18em 9em, 2em 10em, 12em 10em, 13em 10em, 14em 10em, 15em 10em, 18em 10em, 19em 10em, 20em 
10em, 21em 10em, 0em 11em, 1em 11em, 2em 11em, 3em 11em, 4em 11em, 12em 11em, 14em 11em, 15em 11em, 20em 11em, 21em 11em, 0em 12em, 4em 12em, 5em 12em, 6em 12em, 12em 12em, 0em 13em, 1em 13em, 2em 13em, 3em 13em, 4em 13em, 6em 13em, 12em 13em, 13em 13em, 14em 13em, 15em 13em, 1em 14em, 6em 14em, 7em 14em, 8em 14em, 9em 14em, 12em 14em, 14em 14em, 15em 14em, 1em 15em, 6em 15em, 8em 15em, 9em 15em, 12em 15em, 1em 16em, 2em 16em, 3em 16em, 4em 16em, 6em 16em, 12em 16em, 13em 16em, 14em 16em, 15em 16em, 1em 17em, 3em 17em, 4em 17em, 6em 17em, 7em 17em, 8em 17em, 9em 17em, 14em 17em, 15em 17em, 1em 18em, 8em 18em, 9em 18em, 1em 19em, 2em 19em, 3em 19em, 4em 19em, 1em 20em, 3em 20em, 4em 20em, 1em 21em, 1em 22em, 2em 22em, 3em 22em, 4em 22em, 3em 23em, 4em 23em; } ================================================ FILE: Old/EarlyOutline.md ================================================ # Reproducible Research with R and RStudio: A workflow for data gathering, analysis, and document creation ## Updated Chapter Outline ### Christopher Gandrud ### 23 June 2012 --- ## Part I: Getting Started ### ### 1 Introducing Reproducible Research ### 1.1 What is reproducible research? 1.2 Why should research be reproducible? > 1.2.1 Benefits for the research community 1.2.2 Benefits for individual researchers 1.3 Who should read this book? > 1.3.1 Students 1.3.2 Researchers 1.3.3 Industry specialists 1.3.4 Background skills 1.4 Why use **R**/**RStudio** for reproducible research? 
> 1.4.1 The advantages of **R** and `knitr` 1.4.2 The advantages of **RStudio** 1.5 Book overview > 1.5.1 How to read this book 1.5.2 Contents overview ### 2 Getting Started with Reproducible Research ### 2.1 The Big Picture: A workflow for reproducible research > 2.1.1 Data gathering 2.1.2 Data analysis 2.1.3 Data presentation 2.2 Practical tips for reproducible research > 2.2.1 Document everything 2.2.2 Everything is a (text) file 2.2.3 All files should tell you what they are 2.2.4 Research projects are many files tied together 2.2.5 Have a plan to organize, store, and make your text files available 2.3 Introduction to the tools of reproducible research covered in this book > 2.3.1 **R**/**RStudio** 2.3.2 `knitr` 2.3.3 Cloud storage & versioning 2.3.4 The command-line 2.3.5 Markup languages: \\( LaTeX \\) & **Markdown**/HTML ### 3 Getting Started with R/RStudio ### 3.1 Installing **R** and **RStudio** 3.2 Using **R** > 3.2.1 Objects 3.2.2 Functions, Commands, and Arguments 3.2.3 More resources 3.3 Using **RStudio** ### 4 Getting Started with File Management ### 4.1 Locating and organizing files > 4.1.1 Working directories 4.1.2 File management with **RStudio** Projects 4.2 Formatting and Commenting: writing **R** code to aid reproducibility > 4.2.1 Why use a style guide to format **R** code? 4.2.2 Why comment on your code? 
4.3 Introduction to `knitr` > 4.3.1 Code chunks 4.3.2 Global options 4.3.3 `knitr` for the web: **Markdown**/HTML 4.3.4 `knitr` for PDF: \\( LaTeX \\) 4.4 Text editors and **R**/**RStudio** ## Part II: Reproducible Data Gathering & Storage ## ### 5 Gathering Data with R ### 5.1 Importing locally stored data sets > 5.1.1 Single files 5.1.2 Looping through multiple files 5.2 Importing data sets from the internet > 5.2.1 Data from non-secure (`http`) URLs 5.2.2 Data from secure (`https`) URLs 5.2.3 Data APIs & feeds 5.3 Basic web scraping > 5.3.1 Scraping Tables 5.3.2 Gathering and Parsing Text ### 6 Storing, Versioning, Collaborating, and Accessing Files ### 6.1 Saving data in reproducible formats 6.2 Storing data in the cloud 6.3 **Dropbox** > 6.3.1 Storing 6.3.2 Versioning 6.3.3 Collaborating 6.3.4 Accessing 6.4 **Dataverse** > 6.4.1 Storing 6.4.2 Versioning 6.4.3 Collaborating 6.4.4 Accessing 6.5 **GitHub** > 6.5.1 Storing 6.5.2 Versioning 6.5.3 Collaborating 6.5.4 Accessing 6.6 Citing data stored in the cloud ### 7 Preparing Data for Analysis ### 7.1 Cleaning data for merging 7.2 Sorting data 7.3 Merging data sets 7.4 Subsetting data ## Part III: Analysis and Results ## ### 8 Statistical Modelling and `knitr` ### 8.1 Incorporating analyses into the markup > 8.1.1 Full code in the main document - \\( LaTeX \\) - **Markdown** 8.1.2 Sourcing R code from another file - \\( LaTeX \\) - **Markdown** 8.2 Saving output objects for future use 8.3 Including highlighted syntax in the output > - \\( LaTeX \\) - **Markdown** 8.4 Debugging ### 9 Showing Results with Tables ### 9.1 Table basics > 9.1.1 Tables in \\( LaTeX \\) 9.1.2 Tables in **Markdown** and HTML 9.1.3 Basic `knitr` syntax for tables 9.2 Creating tables from **R** objects > 9.2.1 `xtable` basics with supported class objects - `xtable` for \\( LaTeX \\) - `xtable` for **Markdown** 9.2.2 `xtable` for non-supported class objects 9.3 Tables with `apsrtable` ### 10 Showing Results with Figures ### 10.1 Basic `knitr` 
syntax for figures 10.2 Plots with `plot` and `ggplot2` 10.3 Animations 10.4 Motion charts and basic maps with `googleVis` 10.5 Advanced maps with `ggmap` ## Part IV: Final Documents ## ### 11 Presenting with \\( LaTeX \\) ### 11.1 The Basics > 11.1.1 The Header 11.1.2 Headings 11.1.3 Footnotes and bibliographies 11.1.4 Math 11.1.5 Drawing figures with Ti*k*Z 11.2 Articles > 11.2.1 Planning reproducible articles 11.2.2 Compiling \\( LaTeX \\) articles in **R** and **RStudio** 11.3 Presentations with Beamer ### 12 Large \\( LaTeX \\) Documents: Theses, books and batch reports ### 12.1 Planning large documents > 12.1.1 Planning reproducible theses and books 12.1.2 Planning reproducible batch reports 12.2 Combining chapters > 12.2.1 Presenting in parts 12.2.2 Parent documents 12.2.3 Child documents - In line output with `\Sexpr{}` - Custom title pages in \\( LaTeX \\) 12.3 Batch reports with `knitr` and the command-line > 12.3.1 The data 12.3.2 The markup 12.3.3 The makefile ### 13 Presenting on the Web and Beyond with Markdown/HTML ### 13.1 The Basics > 13.1.1 Headings 13.1.2 Footnotes and bibliographies with **MultiMarkdown** 13.1.3 Math 13.1.4 Drawing figures with CSS 13.2 Simple webpages > 13.2.1 RPubs 13.2.2 Hosting webpages with Dropbox 13.3 Presentations with `Slidify` 13.4 Reproducible Websites > 13.4.1 Blogging with Tumblr 13.4.2 Jekyll-Bootstrap and GitHub 13.4.3 Jekyll and GitHub 13.5 Using **Markdown** for non-HTML output with **Pandoc** ### 14 Other Resources for Reproducible Research ### ================================================ FILE: Old/README.md ================================================ # The Old Directory This folder contains obsolete files that were used in earlier versions of the book. 
================================================ FILE: Old/Source-v2/.gitignore ================================================ # Ignore LaTeX compile byproduct files # ######################################## *.aux *.bbl *.blg cache/* .DS_Store figure/* *.idx *.ilg *.ind *.lof *.log *.lot *.pdf *.RData .Rhistory *.tex # Don't ignore these files !Children/FrontMatter/AdditionalResources/imagesExamp/ExampDiagram.tex !Children/FrontMatter/AdditionalResources/imagesExamp/FileTree.tex !Children/Chapter2/images2/WorkFlowLinks.tex !Children/Chapter3/images3/KnitrProcess.tex !Children/Chapter4/images4/ExampleFilePath.tex ================================================ FILE: Old/Source-v2/Children/Chapter1/chapter1.Rnw ================================================ % Chapter 1 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 5 May 2015 \chapter{Introducing Reproducible Research}\label{Intro} Research is often presented in very selective containers: slideshows, journal articles, books, or maybe even websites. These presentation documents announce a project's findings and try to convince us that the results are correct \cite[]{Mesirov2010}. It's important to remember that these documents are not the research. Especially in the computational and statistical sciences, these documents are the ``advertising''. The research is the ``full software environment, code, and data that produced the results'' \cite[385]{Buckheit1995,Donoho2010}. When we separate the research from its advertisement we are making it difficult for others to verify the findings by reproducing them. This book gives you the tools to dynamically combine your research with the presentation of your findings. The first tool is a workflow for reproducible research that weaves the principles of reproducibility throughout your entire research project, from data gathering to the statistical analysis, and the presentation of results.
You will also learn how to use a number of computer tools that make this workflow possible. These tools include: \begin{itemize} \item the \textbf{R} statistical language that will allow you to gather data and analyze it; \item the \textbf{LaTeX} and \textbf{Markdown} markup languages that you can use to create documents--slideshows, articles, books, and webpages--for presenting your findings; \item the {\emph{knitr}} and \emph{rmarkdown} \textbf{packages} for R and other tools, including \textbf{command-line shell programs} like GNU Make and Git version control, for dynamically tying your data gathering, analysis, and presentation documents together so that they can be easily reproduced; \item \textbf{RStudio}, a program that brings all of these tools together in one place. \end{itemize} %%%%%%%%%%%%%% What is reproducible research? %%%%%%%%%%%%% \section{What Is Reproducible Research?} \index{reproducible research|(} \index{replication|(} Though there is some debate over what are the necessary and sufficient conditions for a replication \cite[2]{Makel2014}, research results are generally considered \emph{replicable} if there is sufficient information available for independent researchers to make the same findings using the same procedures with new data.\footnote{This is close to what \cite{Lykken1968} calls ``operational replication''.} For research that relies on experiments, this can mean a researcher not involved in the original research being able to rerun the experiment, including sampling, and validate that the new results are comparable to the original ones. In computational and quantitative empirical sciences, results are replicable if independent researchers can recreate findings by following the procedures originally used to gather the data and run the computer code. 
Of course, it is sometimes difficult to replicate the original data set because of issues such as limited resources to gather new data or because the original study already sampled the full universe of cases. \index{replication|)} So as a next-best standard we can aim for ``\emph{really reproducible research}'' \cite[1226]{Peng2011}.\footnote{The idea of really reproducible computational research was originally thought of and implemented by Jon Claerbout\index{Jon Claerbout} and the Stanford Exploration Project beginning in the 1980s and early 1990s \cite[]{Fomel2009,Donoho2009}. Further seminal advances were made by Jonathan B. Buckheit and David L. Donoho who created the Wavelab library of MATLAB\index{MATLAB} routines for their research on wavelets in the mid-1990s \cite[]{Buckheit1995}.} In computational sciences\footnote{Reproducibility is important for both quantitative and qualitative research \cite[]{King1994}. Nonetheless, we will focus mainly on methods for reproducibility in quantitative computational research.} this means: \begin{quote} the data and code used to make a finding are available and they are sufficient for an independent researcher to recreate the finding. \end{quote} In practice, research needs to be {\emph{easy}} for independent researchers to reproduce \cite[]{Ball2012}. If a study is difficult to reproduce it's more likely that no one will reproduce it. If someone does attempt to reproduce this research, it will be difficult for them to tell if any errors they find were in the original research or problems they introduced during the reproduction. In this book you will learn how to avoid these problems. In particular you will learn tools for dynamically ``{\emph{knitting}}''\index{knit}\footnote{Much of the reproducible computational research and literate programming literatures have traditionally used the term ``weave''\index{weave} to describe the process of combining source code and presentation documents \cite[see][101]{Knuth1992}. 
In the R community weave is usually used to describe the combination of source code and LaTeX documents. The term ``knit'' reflects the vocabulary of the {\emph{knitr}} R package\index{knitr} (knit + R). It is used more generally to describe weaving with a variety of markup languages. The term is used by RStudio if you are using the \emph{rmarkdown}\index{rmarkdown} package, which is similar to \emph{knitr}. We also cover the \emph{rmarkdown} package in this book. Because of this, I use the term knit rather than weave in this book.} the data and the source code together with your presentation documents. Combined with well-organized source files and clearly and completely commented code, independent researchers will be able to understand how you obtained your results. This will make your computational research easily reproducible. %%%%%%%%%%%%%% Why should research be reproducible? %%%%%%%%%%%%% \section{Why Should Research Be Reproducible?} Reproducible research is one of the main components of science. If that's not enough reason for you to make your research reproducible, consider that the tools of reproducible research also have direct benefits for you as a researcher. \subsection{For science} Replicability has been a key part of scientific inquiry from perhaps the 1200s \cite[]{Bacon1267,Nosek2012}. It has even been called the ``demarcation between science and non-science'' \cite[2]{Braude1979}. Why is replication so important for scientific inquiry? \paragraph{Standard to judge scientific claims} \emph{Replication} opens claims to scrutiny, allowing us to keep what works and discard what doesn't. Science, according to the American Physical Society, ``is the systematic enterprise of gathering knowledge \ldots organizing and condensing that knowledge into testable laws and theories''. The ``ultimate standard'' for evaluating scientific claims is whether or not the claims can be replicated \cite[]{Peng2011,Kelly2006}. 
Research findings cannot even really be considered ``genuine contribution[s] to human knowledge'' until they have been verified through replication \cite[38]{Stodden2009}. Replication ``requires the complete and open exchange of data, procedures, and materials''. Scientific conclusions that are not replicable should be abandoned or modified ``when confronted with more complete or reliable \ldots evidence''.\footnote{See the American Physical Society's website at \url{http://www.aps.org/policy/statements/99_6.cfm}. See also \cite{Fomel2009}.} \emph{Reproducibility enhances replicability}. If other researchers are able to clearly understand how a finding was originally made, then they will be better able to conduct comparable research in meaningful attempts to replicate the original findings. Sometimes strict replicability is not feasible, for example, when it is only possible to gather one data set on a population of interest. In these cases reproducibility is a ``minimum standard'' for judging scientific claims \citep{Peng2011}. It is important to note that though reproducibility is a minimum standard for judging scientific claims, ``a study can be reproducible and still be wrong'' \citep{Peng2014}. For example, a statistically significant finding in one study may remain statistically significant when reproduced using the original data/code, but when researchers try to replicate it using new data and even methods, they are unable to find a similar result. The original finding could simply have been noise, even though it is fully reproducible. \paragraph{Avoiding effort duplication \& encouraging cumulative knowledge development} Not only is reproducibility important for evaluating scientific claims, it can also contribute to the cumulative growth of scientific knowledge \citep{Kelly2006,King1995}. Reproducible research cuts down on the amount of time scientists have to spend gathering data or developing procedures that have already been collected or figured out. 
Because researchers do not have to discover on their own things that have already been done, they can more quickly build on established findings and develop new knowledge. \subsection{For you} Working to make your research reproducible does require extra upfront effort. For example, you need to put effort into learning the tools of reproducible research by doing things such as reading this book. But beyond the clear benefits for science, why should you make this effort? Using reproducible research tools can make your research process more effective and (hopefully) ultimately easier. \paragraph{Better work habits} Making a project reproducible from the start encourages you to use better work habits. It can spur you to more effectively plan and organize your research. It should push you to bring your data and source code up to a higher level of quality than you might if you ``thought `no one was looking'\thinspace'' \cite[386]{Donoho2010}. This forces you to root out errors--a ubiquitous part of computational research--earlier in the research process \cite[385]{Donoho2010}. Clear documentation also makes it easier to find errors.\footnote{Of course, it's important to keep in mind that reproducibility is ``neither necessary nor sufficient to prevent mistakes'' \cite[]{Stodden2009b}.} Reproducible research needs to be stored so that other researchers can actually access the data and source code. By taking steps to make your research accessible for others you are also making it easier for yourself to find your data and methods when you revise your work or begin a new project. You are avoiding personal effort duplication, allowing you to cumulatively build on your own work more effectively. \paragraph{Better teamwork} The steps you take to make sure an independent researcher can figure out what you have done also make it easier for your collaborators to understand your work and build on it. This applies not only to current collaborators, but also future collaborators.
Bringing new members of a research team up to speed on a cumulatively growing research project is faster if they can easily understand what has been done already \cite[386]{Donoho2010}. \paragraph{Changes are easier} A third person may or may not actually reproduce your research even if you make it easy for them to do so. But, {\emph{you will almost certainly reproduce parts or even all of your own research}}. No actual research process is completely linear. You almost never gather data, run analyses, and present your results without going backwards to add variables, make changes to your statistical models, create new graphs, alter results tables in light of new findings, and so on. You will probably try to make these changes long after you last worked on the project and long after you have forgotten the details of how you did it. Whether your changes are because of journal reviewers' and conference participants' comments or you discover that new and better data has been made available since beginning the project, designing your research to be reproducible from the start makes it much easier to change things later on. Dynamic reproducible documents in particular can make changing things much easier. Changes made to one part of a research project have a way of cascading through the other parts. For example, adding a new variable to a largely completed analysis requires gathering new data and merging it with existing data sets. If you used data imputation or matching methods you may need to rerun these models. You then have to update your main statistical analyses, and recreate the tables and graphs you used to present the results. Adding a new variable essentially forces you to reproduce large portions of your research. If, when you started the project, you used tools that make it easier for others to reproduce your research, you also made it easier to reproduce the work yourself.
You will have taken steps to have a ``better relationship with [your] future [self]'' \cite[2]{Bowers2011}. \paragraph{Higher research impact}\index{impact, research} Reproducible research is more likely to be useful for other researchers than non-reproducible research. Useful research is cited more frequently \cite[]{Donoho2002,Piwowar2007,Vandewalle2012}. Research that is fully reproducible contains more information, i.e. more reasons to use and cite it, than presentation documents merely showing findings. Independent researchers may use the reproducible data or code to look at other, often unanticipated, questions. When they use your work for a new purpose they will (should) cite your work. Because of this, Vandewalle et al. even argue that ``the goal of reproducible research is to have more impact with our research'' \citeyearpar[1253]{Vandewalle2007}. A reason researchers often avoid making their research fully reproducible is that they are afraid other people will use their data and code to compete with them. I'll let Donoho et al. address this one: \begin{quote} \emph{True. But competition means that strangers will read your papers, try to learn from them, cite them, and try to do even better. If you prefer obscurity, why are you publishing?} \citeyearpar[16]{Donoho2009} \end{quote} \index{reproducible research|)} \section{Who Should Read This Book?} This book is intended primarily for researchers who want to use a systematic workflow that encourages reproducibility as well as practical state-of-the-art computational tools to put this workflow into practice. These people include professional researchers as well as upper-level undergraduate and graduate students working on computational data-driven projects. Hopefully, editors at academic publishers will also find the book useful for improving their ability to evaluate and edit reproducible research. The more researchers that use the tools of reproducibility the better.
So I include enough information in the book for people who have very limited experience with these tools, including limited experience with R, LaTeX, and Markdown. They will be able to start incorporating reproducible research tools into their workflow right away. The book will also be helpful for people who already have general experience using technologies such as R and LaTeX, but would like to know how to tie them together for reproducible research. \subsection{Academic researchers} Hopefully so far in this chapter I've convinced you that reproducible research has benefits for you as a member of the scientific community and personally as a computational researcher. This book is intended to be a practical guide for how to actually make your research reproducible. Even if you already use tools such as R and LaTeX you may not be leveraging their full potential. This book will teach you useful ways to get the most out of them as part of a reproducible research workflow. \subsection{Students} Upper-level undergraduate and graduate students conducting original computational research should make their research reproducible for the same reasons that professional researchers should. Forcing yourself to clearly document the steps you took will also encourage you to think more clearly about what you are doing and reinforce what you are learning. It will hopefully give you a greater appreciation of research accountability and integrity early in your career \cite[183]{Barr2012,Ball2012}. Even if you don't have extensive experience with computer languages, this book will teach you specific habits and tools that you can use throughout your student research and hopefully your careers. Learning these things earlier will save you considerable time and effort later. 
\subsection{Instructors} When instructors incorporate the tools of reproducible research into their assignments they not only build students' understanding of research best practice, but are also better able to evaluate and provide meaningful feedback on students' work \cite[183]{Ball2012}. This book provides a resource that you can use with students to put reproducibility into practice. If you are teaching computational courses, you may also benefit from making your lecture material dynamically reproducible. Your slides will be easier to update for the same reasons that it is easier to update research. Making the methods you used to create the material available to students will give them more information. Clearly documenting how you created lecture material can also pass information on to future instructors. \subsection{Editors} Beyond a lack of reproducible research skills among researchers, an impediment to actually creating reproducible research is a lack of infrastructure to publish it \cite[]{Peng2011}. Hopefully, this book will be useful for editors at academic publishers who want to be better at evaluating reproducible research, editing it, and developing systems to make it more widely available. The journal {\emph{Biostatistics}} is a good example of a publication that is encouraging (actually requiring) reproducible research. From 2009 the journal has had an editor for reproducibility that ensures replication files are available and that results can be replicated using these files \cite[]{Peng2009}. The more editors there are with the skills to work with reproducible research the more likely it is that researchers will do it. \subsection{Private sector researchers} Researchers in the private sector may or may not want to make their work easily reproducible outside of their organization. However, that does not mean that significant benefits cannot be gained from using the methods of reproducible research. 
First, even if public reproducibility is ruled out to guard proprietary information,\footnote{There are ways to enable some public reproducibility without revealing confidential information. See \cite{Vandewalle2007} for a discussion of one approach.} making your research reproducible to members of your organization can spread valuable information about how analyses were done and data was collected. This will help build your organization's knowledge and avoid effort duplication. Just as a lack of reproducibility hinders the spread of information in the scientific community, it can hinder it inside of a private organization. Using the sort of dynamic automated processes run with clearly documented source code we will learn in this book can also help create robust data analysis methods that help your organization avoid errors that may come from cutting-and-pasting data across spreadsheets.\footnote{See this post by David Smith about how the J.P. Morgan\index{JP Morgan} ``London Whale''\index{London Whale} problem may have been prevented with the type of processes covered in this book: \url{http://blog.revolutionanalytics.com/2013/02/did-an-excel-error-bring-down-the-london-whale.html} (posted 11 February 2013).} Also, the tools of reproducible research covered in this book enable you to create professional standardized reports that can be easily updated or changed when new information is available. In particular, you will learn how to create batch reports based on quantitative data. %%%%%%%%%%%%%%%%% The Tools of Reproducible Research %%%%%%%%%%%%%%% \section{The Tools of Reproducible Research} This book will teach you the tools you need to make your research highly reproducible. Reproducible research involves two broad sets of tools. 
The first is a {\bf{reproducible research environment}}\index{reproducible research!environment} that includes the statistical tools you need to run your analyses as well as ``the ability to automatically track the provenance of data, analyses, and results and to package them (or pointers to persistent versions of them) for redistribution''. The second set of tools is a {\bf{reproducible research publisher}}\index{reproducible research!publisher}, which prepares dynamic documents for presenting results and is easily linked to the reproducible research environment \cite[415]{Mesirov2010}. In this book we will focus on learning how to use the widely available and highly flexible reproducible research environment--R/RStudio \cite[]{RLanguage,RStudioCite}.\footnote{The book was created with R version \Sexpr{paste0(version$major, '.', version$minor)} and developer builds of RStudio version 0.99.370.} R/RStudio can be linked to numerous reproducible research publishers such as LaTeX and Markdown with Yihui Xie's {\emph{knitr}} package \citeyearpar{R-knitr} or the related \emph{rmarkdown} package \citep{R-rmarkdown}. The main tools covered in this book include: \begin{itemize} \item {\bf{R}}: a programming language primarily for statistics and graphics. It can also be useful for data gathering and creating presentation documents. \item {\bf{{\emph{knitr} and {\emph{rmarkdown}}}}}: related R packages for literate programming\index{literate programming}. They allow you to combine your statistical analysis and the presentation of the results into one document. They work with R and a number of other languages such as Bash, Python, and Ruby. \item {\bf{Markup languages}}: instructions for how to format a presentation document. In this book we cover LaTeX, Markdown, and a little HTML. \item {\bf{RStudio}}: an integrated developer environment (IDE)\index{integrated developer environment} for R that tightly combines R, {\emph{knitr}}, \emph{rmarkdown}, and markup languages. 
\item {\bf{Cloud storage \& versioning}}: Services such as Dropbox and Git/GitHub that can store data, code, and presentation files, save previous versions of these files, and make this information widely available. \item {\bf{Unix-like shell programs}}\index{Unix-like shell program}: These tools are useful for working with large research projects.\footnote{In this book I cover the Bash shell for Linux\index{Linux} and Mac as well as Windows PowerShell\index{Windows PowerShell}.} They also allow us to use command-line tools including GNU Make for compiling projects and Pandoc, a program useful for converting documents from one markup language to another. \end{itemize} %%%%%%%%%%%%%%%%%%% Why use R, knitr, and RStudio for reproducible research? %%%%%%%%%%%%%% \section{Why Use R, \emph{knitr}/\emph{rmarkdown}, and RStudio for Reproducible Research?} \paragraph{Why R?} Why use a statistical programming language like R for reproducible research? R has a very active development community that is constantly expanding what it is capable of. As we will see in this book, R enables researchers across a wide range of disciplines to gather data and run statistical analyses. Using the {\emph{knitr}} or \emph{rmarkdown} package, you can connect your R-based analyses to presentation documents created with markup languages\index{markup language} such as LaTeX and Markdown. This allows you to dynamically and reproducibly present results in articles, slideshows, and webpages. The way you interact with R has benefits for reproducible research. In general you interact with R (or any other programming and markup language) by explicitly writing down your steps as source code\index{source code}. 
This promotes reproducibility more than your typical interactions with Graphical User Interface (GUI)\index{Graphical User Interface}\index{GUI} programs like\index{SPSS} SPSS\footnote{I know you can write scripts in statistical programs like SPSS, but doing so is not encouraged by the program's interface and you often have to learn multiple languages for writing scripts that run analyses, create graphics, and deal with matrices.} and Microsoft Word\index{Microsoft Word}. When you write R code and embed it in presentation documents created using markup languages, you are forced to explicitly state the steps you took to do your research. When you do research by clicking through drop-down menus in GUI programs, your steps are lost, or at least documenting them requires considerable extra effort. Also it is generally more difficult to dynamically embed your analysis in presentation documents created by GUI word processing programs in a way that will be accessible to other researchers both now and in the future. I'll come back to these points in Chapter \ref{GettingStartedRR}. \paragraph{Why {\normalfont{knitr}} and {\normalfont{rmarkdown}}?} Literate programming\index{literate programming} is a crucial part of reproducible quantitative research.\footnote{Donald Knuth\index{Donald Knuth} coined the term literate programming in the 1970s to refer to a source file that could be both run by a computer and ``woven'' with a formatted presentation document \cite[]{Knuth1992}.} Being able to directly link your analyses, your results, and the code you used to produce the results makes tracing your steps much easier. 
There are many different literate programming tools for a number of different programming languages.\footnote{A very interesting tool that is worth taking a look at for the Python\index{Python} programming language is HTML Notebooks\index{HTML Notebook} created with IPython.\index{IPython} For more details see \url{http://ipython.org/ipython-doc/dev/notebook/index.html}.} Previously, one of the most common tools for researchers using R and the LaTeX markup language was \emph{Sweave} \cite[]{Leisch2002}.\index{Sweave} The packages I am going to focus on in this book are newer and have more capabilities. They are called {\emph{knitr}}\index{knitr} and \emph{rmarkdown}. Why are we going to use these tools in this book and not \emph{Sweave} or some other tool? The simple answer is that they are more capable than \emph{Sweave}. Both \emph{knitr} and \emph{rmarkdown} can work with markup languages other than LaTeX including Markdown and HTML. \emph{rmarkdown} can even output Microsoft Word documents.\index{Microsoft Word} They can work with programming languages other than R. They highlight R code\index{syntax highlighting} in presentation documents making it easier for your readers to follow.\footnote{Syntax highlighting uses different colors and fonts to distinguish different types of text.} They give you better control over the inclusion of graphics and can cache code chunks, i.e. save the output for later.\index{cache} \emph{knitr} has the ability to understand \emph{Sweave}-like syntax, so it will be easy to convert backwards to \emph{Sweave} if you want to.\footnote{Note that the Sweave-style syntax is not identical to actual \emph{Sweave} syntax. See Yihui Xie's discussion of the differences between the two at: \url{http://yihui.name/knitr/demo/sweave/}. 
\emph{knitr} has a function (\texttt{Sweave2knitr})\index{Sweave2knitr} for converting \emph{Sweave} to \emph{knitr} syntax.} You also have the choice to use much simpler and more straightforward syntax with {\emph{knitr}} and \emph{rmarkdown}. \emph{knitr} and \emph{rmarkdown} have broadly similar capabilities and syntax. They both are literate programming tools that can produce presentation documents from multiple markup languages. They have almost identical syntax when used in Markdown.\index{Markdown} Their main difference is that they take different approaches to creating presentation documents. \emph{knitr} documents must be written using the markup language associated with the desired output. For example, with \emph{knitr}, LaTeX must be used to create PDF output documents and Markdown or HTML must be used to create webpages. \emph{rmarkdown} builds directly on \emph{knitr}, the key difference being that it uses the straightforward Markdown markup language to generate PDF, HTML, and MS Word documents.\footnote{It does this by relying on a tool called Pandoc \citep{Pandoc2014}.\index{Pandoc}} Because you write with the simple Markdown syntax, \emph{rmarkdown} is generally easier to use. It has the advantage of being able to take the same markup document and output multiple types of presentation documents. Nonetheless, for complex documents like books and long articles or work that requires custom formatting, \emph{knitr} LaTeX is often preferable and extremely flexible, though the syntax is more complicated. \paragraph{Why RStudio?} \index{RStudio}Why use the RStudio integrated development environment for reproducible research? R by itself has the capabilities necessary to gather data, analyze it, and, with a little help from {\emph{knitr}}/\emph{rmarkdown} and markup languages, present results in a way that is highly reproducible. RStudio allows you to do all of these things, but simplifies many of them and allows you to navigate through them more easily. 
It also is a happy medium between R's text-based interface and a pure GUI. Not only does RStudio do many of the things that R can do but more easily, it is also a very good standalone editor for writing documents with LaTeX and Markdown. For LaTeX documents it can, for example, insert frequently used commands like \texttt{\textbackslash{}section\{\}} for numbered sections (see Chapter \ref{LatexChapter}).\footnote{If you are more comfortable with a what-you-see-is-what-you-get (WYSIWYG)\index{WYSIWYG} word processor like Microsoft Word, you might be interested in exploring Lyx\index{Lyx}. It is a WYSIWYG-like LaTeX editor that works with {\emph{knitr}}. It doesn't work with the other markup languages covered in this book. For more information see: \url{http://www.lyx.org/}. I give some brief information on using Lyx with \emph{knitr} in Chapter 3's Appendix.} There are many LaTeX editors available, both open source and paid. But RStudio is currently the best program for creating reproducible LaTeX and Markdown documents. It has full syntax highlighting\index{syntax highlighting}. Its syntax highlighting can even distinguish between R code and markup commands in the same document. It can spell check LaTeX and Markdown documents. It handles {\emph{knitr}}/\emph{rmarkdown} code chunks\index{code chunk} beautifully (see Chapter \ref{GettingStartedRKnitr}). Finally, RStudio not only has tight integration with various markup languages, it also has capabilities for using other tools such as C++, CSS, JavaScript, and a few other programming languages. It is closely integrated with the version control programs Git\index{Git} and SVN\index{SVN}. Both of these programs allow you to keep track of the changes you make to your documents (see Chapter \ref{Storing}). This is important for reproducible research since version control programs can document many of your research steps. It also has a built-in ability to make HTML slideshows from \emph{knitr}/\emph{rmarkdown} documents. 
Basically, RStudio makes it easy to create and navigate through complex reproducible research documents. \subsection{Installing the main software}\label{InstallR} Before you read this book you should install the main software. All of the software programs covered in this book are open source and can be easily downloaded for free. They are available for Windows\index{Windows}, Mac\index{Mac}, and Linux operating systems\index{Linux}. They should run well on most modern computers. You should install R before installing RStudio. You can download the programs from the following websites: \begin{itemize} \item {\bf{R}}: \url{http://www.r-project.org/}, \item {\bf{RStudio Desktop (Open Source License)}}: \url{http://www.rstudio.com/products/rstudio/download/}. \end{itemize} \noindent The download webpages for these programs have comprehensive information on how to install them, so please refer to those pages for more information. After installing R and RStudio you will probably also want to install a number of user-written packages that are covered in this book. To install all of these user-written packages, please see page \pageref{ReqPackages}. \paragraph{Installing markup languages}\label{InstallMarkup} If you are planning to create LaTeX documents you need to install a TeX distribution\index{TeX distribution}.\footnote{LaTeX is really a set of macros for the TeX typesetting system.\index{TeX} It is included in all major TeX distributions.} They are available for Windows, Mac, and Linux systems. They can be found at: \url{http://www.latex-project.org/ftp.html}. Please refer to that site for more installation information. If you want to create Markdown documents you can separately install the {\emph{markdown}} package\index{R package!markdown} in R. 
You can do this the same way that you install any package in R, with the {\tt{install.packages}} command.\footnote{The exact command is: {\tt{install.packages("markdown")}}.} \paragraph{GNU Make} If you are using a Linux computer you already have GNU Make\label{InstallMake}\index{GNU Make} installed.\footnote{To verify this, open the Terminal\index{Terminal} and type: \texttt{make --version} (I used version 3.81 for this book). This should output details about the current version of Make installed on your computer.} Mac users will need to install the command-line developer tools.\index{command-line developer tools} There are two ways to do this. One is to go to the App Store\index{Apple App Store} and download Xcode (it's free).\index{Xcode} Once Xcode is installed, install command-line tools, which you will find by opening Xcode then clicking on \texttt{Preference} \textrightarrow \: \texttt{Downloads}. However, Xcode is a very large download and you only need the command-line tools for Make. To install just the command-line tools, open the Terminal\index{Terminal} and try to run Make by typing \texttt{make} and hitting return. A box should appear asking you if you want to install the command-line developer tools. Click \texttt{Install}. Windows users will have Make installed if they have already installed Rtools\index{Rtools} (see page \pageref{RtoolsDownload}). Mac and Windows users will need to install this software not only so that GNU Make runs properly, but also so that other command-line tools work well. \paragraph{Other Tools} We will discuss other tools such as Git that can be a useful part of a reproducible research workflow. Installation instructions for these tools will be discussed below. %%%%%%%%%%%%%% Book Overview %%%%%%%%%%%%%% \section{Book Overview} The purpose of this book is to give you the tools that you will need to do reproducible research with R and RStudio. 
This book describes a workflow for reproducible research primarily using R and RStudio. It is designed to give you the necessary tools to use this workflow for your own research. It is not designed to be a complete reference for R, RStudio, {\emph{knitr}}/\emph{rmarkdown}, Git, or any other program that is a part of this workflow. Instead it shows you how these tools can fit together to make your research more reproducible. To get the most out of these individual programs I will along the way point you to other resources that cover these programs in more detail. To that end, I can recommend a number of resources that cover more of the nitty-gritty:\label{OtherBooks} \begin{itemize} \item Michael J. Crawley's \citeyearpar{Crawley2013} encyclopaedic R book, appropriately titled \emph{\textbf{The R Book}}, published by Wiley. \item Hadley Wickham \citeyearpar{Whickham2014book} has a great new book out from Chapman and Hall on \emph{\textbf{Advanced R}}. \item Yihui Xie's \citeyearpar{Xie2013} book \emph{\textbf{Dynamic Documents with R and knitr}}, published by Chapman and Hall, provides a comprehensive look at how to create documents with \emph{knitr}. It's a good complement to this book's generally more research project--level focus. \item Norman Matloff's \citeyearpar{Matloff2011} tour through the programming language aspects of R called \emph{\textbf{The Art of R Programming: A Tour of Statistical Software Design}}, published by No Starch Press. \item Cathy O'Neil and Rachel Schutt \citeyearpar{ONeil2013} give a great introduction to the field of data science generally in \emph{\textbf{Doing Data Science}}, published by O'Reilly Media Inc. \item For an excellent introduction to the command-line\index{command-line} in Linux and Mac, see William E. Shotts Jr.'s \citeyearpar{ShottsJr2012} book \emph{\textbf{The Linux Command-line: A Complete Introduction}} also published by No Starch Press. 
It is also helpful for Windows users running PowerShell (see Chapter \ref{DirectoriesChapter}). \item The RStudio website (\url{http://www.rstudio.com/ide/docs/}) has a number of useful tutorials on how to use {\emph{knitr}} with LaTeX and Markdown. They also have very good documentation for \emph{rmarkdown} at \url{http://rmarkdown.rstudio.com/}. \end{itemize} That being said, my goal is for this book to be {\emph{self-sufficient}}. A reader without a detailed understanding of these programs will be able to understand and use the commands and procedures I cover in this book. While learning how to use R and the other programs I personally often encountered illustrative examples that included commands, variables, and other things that were not well explained in the texts that I was reading. This caused me to waste many hours trying to figure out, for example, what the \texttt{\$} is used for (preview: it's the component selector, see Section \ref{ComponentSelect}). I hope to save you from this wasted time by either providing a brief explanation of possibly frustrating and mysterious things and/or pointing you in the direction of good explanations. \subsection{How to read this book} This book gives you a workflow. It has a beginning, middle, and end. So, unlike a reference book, it can and should be read linearly as it takes you through an empirical research process from an empty folder to a completed set of documents that reproducibly showcase your findings. That being said, readers with more experience using tools like R or LaTeX may want to skip over the nitty-gritty parts of the book that describe how to manipulate data frames or compile LaTeX documents into PDFs. Please feel free to skip these sections. \paragraph{More-experienced R users} If you are an experienced R user you may want to skip over the first section of Chapter \ref{GettingStartedRKnitr}: Getting Started with R, RStudio, and \emph{knitr}/\emph{rmarkdown}. But don't skip over the whole chapter. 
The latter parts contain important information on the {\emph{knitr}}/\emph{rmarkdown} packages. If you are experienced with R data manipulation you may also want to skip all of Chapter \ref{DataClean}. \paragraph{More-experienced LaTeX users} If you are familiar with LaTeX you might want to skip the first part of Chapter \ref{LatexChapter}. The second part may be useful as it includes information on how to dynamically create BibTeX bibliographies with \emph{knitr} and how to include \emph{knitr} output in a Beamer slideshow. \paragraph{Less-experienced LaTeX/Markdown users} If you do not have experience with LaTeX or Markdown you may benefit from reading, or at least skimming, the introductory chapters on these topics (chapters \ref{LatexChapter} and \ref{MarkdownChapter}) before reading Part III. \subsection{Reproduce this book} This book practices what it preaches. It can be reproduced. I wrote the book using the programs and methods that I describe. Full documentation and source files can be found at the book's GitHub\index{GitHub} repository. Feel free to read and even use (within reason and with attribution, of course) the book's source code. You can find it at: \url{https://GitHub.com/christophergandrud/Rep-Res-Book}. This is especially useful if you want to know how to do something in the book that I don't directly cover in the text. If you notice any errors or places where the book can be improved please report them on the book's GitHub Issues page: \url{https://GitHub.com/christophergandrud/Rep-Res-Book/issues}. Corrections will be posted at: \url{http://christophergandrud.GitHub.io/RepResR-RStudio/errata.htm}. \subsection{Contents overview} The book is broken into four parts. The first part (chapters \ref{GettingStartedRR}, \ref{GettingStartedRKnitr}, and \ref{DirectoriesChapter}) gives an overview of the reproducible research workflow as well as the general computer skills that you'll need to use this workflow. 
Each of the next three parts of the book guides you through the specific skills you will need for each part of the reproducible research process. Part two (chapters \ref{Storing}, \ref{DataGather}, and \ref{DataClean}) covers the data gathering and file storage process. The third part (chapters \ref{StatsModel}, \ref{TablesChapter}, and \ref{FiguresChapter}) teaches you how to dynamically incorporate your statistical analysis, results figures, and tables into your presentation documents. The final part (chapters \ref{LatexChapter}, \ref{LargeDocs}, and \ref{MarkdownChapter}) covers how to create reproducible presentation documents including LaTeX articles, books, slideshows, and batch reports as well as Markdown webpages and slideshows. ================================================ FILE: Old/Source-v2/Children/Chapter1/chapter1.md ================================================ Introducing Reproducible Research {#Intro} ================================= Research is often presented in very selective containers: slideshows, journal articles, books, or maybe even websites. These presentation documents announce a project's findings and try to convince us that the results are correct [@Mesirov2010]. It's important to remember that these documents are not the research. Especially in the computational and statistical sciences, these documents are the "advertising". The research is the "full software environment, code, and data that produced the results" [@Buckheit1995; @Donoho2010 385]. When we separate the research from its advertisement we are making it difficult for others to verify the findings by reproducing them. This book gives you the tools to dynamically combine your research with the presentation of your findings. The first tool is a workflow for reproducible research that weaves the principles of reproducibility throughout your entire research project, from data gathering to the statistical analysis, and the presentation of results. 
You will also learn how to use a number of computer tools that make this workflow possible. These tools include: - the **R** statistical language that will allow you to gather data and analyze it; - the **LaTeX** and **Markdown** markup languages that you can use to create documents--slideshows, articles, books, and webpages--for presenting your findings; - the *knitr* and *rmarkdown* **packages** for R and other tools, including **command-line shell programs** like GNU Make and Git version control, for dynamically tying your data gathering, analysis, and presentation documents together so that they can be easily reproduced; - **RStudio**, a program that brings all of these tools together in one place. What Is Reproducible Research? ------------------------------ Though there is some debate over what are the necessary and sufficient conditions for a replication [@Makel2014 2], research results are generally considered *replicable* if there is sufficient information available for independent researchers to make the same findings using the same procedures with new data.[^1] For research that relies on experiments, this can mean a researcher not involved in the original research being able to rerun the experiment, including sampling, and validate that the new results are comparable to the original ones. In computational and quantitative empirical sciences, results are replicable if independent researchers can recreate findings by following the procedures originally used to gather the data and run the computer code. Of course, it is sometimes difficult to replicate the original data set because of issues such as limited resources to gather new data or because the original study already sampled the full universe of cases. 
So as a next-best standard we can aim for "*really reproducible research*" [@Peng2011 1226].[^2] In computational sciences[^3] this means: > the data and code used to make a finding are available and they are > sufficient for an independent researcher to recreate the finding. In practice, research needs to be *easy* for independent researchers to reproduce [@Ball2012]. If a study is difficult to reproduce it's more likely that no one will reproduce it. If someone does attempt to reproduce this research, it will be difficult for them to tell if any errors they find were in the original research or problems they introduced during the reproduction. In this book you will learn how to avoid these problems. In particular you will learn tools for dynamically "*knitting*"[^4] the data and the source code together with your presentation documents. Combined with well-organized source files and clearly and completely commented code, independent researchers will be able to understand how you obtained your results. This will make your computational research easily reproducible. Why Should Research Be Reproducible? ------------------------------------ Reproducible research is one of the main components of science. If that's not enough reason for you to make your research reproducible, consider that the tools of reproducible research also have direct benefits for you as a researcher. ### For science Replicability has been a key part of scientific inquiry from perhaps the 1200s [@Bacon1267; @Nosek2012]. It has even been called the "demarcation between science and non-science" [@Braude1979 2]. Why is replication so important for scientific inquiry? ##### Standard to judge scientific claims *Replication* opens claims to scrutiny, allowing us to keep what works and discard what doesn't. Science, according to the American Physical Society, "is the systematic enterprise of gathering knowledge ...organizing and condensing that knowledge into testable laws and theories". 
The "ultimate standard" for evaluating scientific claims is whether or not the claims can be replicated [@Peng2011; @Kelly2006]. Research findings cannot even really be considered "genuine contribution\[s\] to human knowledge" until they have been verified through replication [@Stodden2009 38]. Replication "requires the complete and open exchange of data, procedures, and materials". Scientific conclusions that are not replicable should be abandoned or modified "when confronted with more complete or reliable ...evidence".[^5] *Reproducibility enhances replicability*. If other researchers are able to clearly understand how a finding was originally made, then they will be better able to conduct comparable research in meaningful attempts to replicate the original findings. Sometimes strict replicability is not feasible, for example, when it is only possible to gather one data set on a population of interest. In these cases reproducibility is a "minimum standard" for judging scientific claims [@Peng2011]. It is important to note that though reproducibility is a minimum standard for judging scientific claims, "a study can be reproducible and still be wrong" [@Peng2014]. For example, a statistically significant finding in one study may remain statistically significant when reproduced using the original data/code, but when researchers try to replicate it using new data and even methods, they are unable to find a similar result. The original finding could simply have been noise, even though it is fully reproducible. ##### Avoiding effort duplication & encouraging cumulative knowledge development Not only is reproducibility important for evaluating scientific claims, it can also contribute to the cumulative growth of scientific knowledge [@Kelly2006; @King1995]. Reproducible research cuts down on the amount of time scientists have to spend gathering data or developing procedures that have already been collected or figured out. 
Because researchers do not have to discover on their own things that have already been done, they can more quickly build on established findings and develop new knowledge. ### For you Working to make your research reproducible does require extra upfront effort. For example, you need to put effort into learning the tools of reproducible research by doing things such as reading this book. But beyond the clear benefits for science, why should you make this effort? Using reproducible research tools can make your research process more effective and (hopefully) ultimately easier. ##### Better work habits Making a project reproducible from the start encourages you to use better work habits. It can spur you to more effectively plan and organize your research. It should push you to bring your data and source code up to a higher level of quality than you might if you "thought 'no one was looking'" [@Donoho2010 386]. This forces you to root out errors--a ubiquitous part of computational research--earlier in the research process [@Donoho2010 385]. Clear documentation also makes it easier to find errors.[^6] Reproducible research needs to be stored so that other researchers can actually access the data and source code. By taking steps to make your research accessible for others you are also making it easier for yourself to find your data and methods when you revise your work or begin a new project. You are avoiding personal effort duplication, allowing you to cumulatively build on your own work more effectively. ##### Better teamwork The steps you take to make sure an independent researcher can figure out what you have done also make it easier for your collaborators to understand your work and build on it. This applies not only to current collaborators, but also future collaborators. Bringing new members of a research team up to speed on a cumulatively growing research project is faster if they can easily understand what has been done already [@Donoho2010 386]. 
##### Changes are easier A third person may or may not actually reproduce your research even if you make it easy for them to do so. But, *you will almost certainly reproduce parts or even all of your own research*. No actual research process is completely linear. You almost never gather data, run analyses, and present your results without going backwards to add variables, make changes to your statistical models, create new graphs, alter results tables in light of new findings, and so on. You will probably try to make these changes long after you last worked on the project and long since you remembered the details of how you did it. Whether your changes are because of journal reviewers' and conference participants' comments or you discover that new and better data has been made available since beginning the project, designing your research to be reproducible from the start makes it much easier to change things later on. Dynamic reproducible documents in particular can make changing things much easier. Changes made to one part of a research project have a way of cascading through the other parts. For example, adding a new variable to a largely completed analysis requires gathering new data and merging it with existing data sets. If you used data imputation or matching methods you may need to rerun these models. You then have to update your main statistical analyses, and recreate the tables and graphs you used to present the results. Adding a new variable essentially forces you to reproduce large portions of your research. If when you started the project you used tools that make it easier for others to reproduce your research, you also made it easier to reproduce the work yourself. You will have taken steps to have a "better relationship with \[your\] future \[self\]" [@Bowers2011 2]. ##### Higher research impact Reproducible research is more likely to be useful for other researchers than non-reproducible research. 
Useful research is cited more frequently [@Donoho2002; @Piwowar2007; @Vandewalle2012]. Research that is fully reproducible contains more information, i.e. more reasons to use and cite it, than presentation documents merely showing findings. Independent researchers may use the reproducible data or code to look at other, often unanticipated, questions. When they use your work for a new purpose they will (should) cite your work. Because of this, Vandewalle et al. even argue that "the goal of reproducible research is to have more impact with our research" [-@Vandewalle2007 1253]. A reason researchers often avoid making their research fully reproducible is that they are afraid other people will use their data and code to compete with them. I'll let Donoho et al. address this one: > *True. But competition means that strangers will read your papers, try > to learn from them, cite them, and try to do even better. If you > prefer obscurity, why are you publishing?* [-@Donoho2009 16] Who Should Read This Book? -------------------------- This book is intended primarily for researchers who want to use a systematic workflow that encourages reproducibility as well as practical state-of-the-art computational tools to put this workflow into practice. These people include professional researchers, upper-level undergraduate, and graduate students working on computational data-driven projects. Hopefully, editors at academic publishers will also find the book useful for improving their ability to evaluate and edit reproducible research. The more researchers that use the tools of reproducibility the better. So I include enough information in the book for people who have very limited experience with these tools, including limited experience with R, LaTeX, and Markdown. They will be able to start incorporating reproducible research tools into their workflow right away. 
The book will also be helpful for people who already have general experience using technologies such as R and LaTeX, but would like to know how to tie them together for reproducible research. ### Academic researchers Hopefully so far in this chapter I've convinced you that reproducible research has benefits for you as a member of the scientific community and personally as a computational researcher. This book is intended to be a practical guide for how to actually make your research reproducible. Even if you already use tools such as R and LaTeX you may not be leveraging their full potential. This book will teach you useful ways to get the most out of them as part of a reproducible research workflow. ### Students Upper-level undergraduate and graduate students conducting original computational research should make their research reproducible for the same reasons that professional researchers should. Forcing yourself to clearly document the steps you took will also encourage you to think more clearly about what you are doing and reinforce what you are learning. It will hopefully give you a greater appreciation of research accountability and integrity early in your career [@Barr2012; @Ball2012 183]. Even if you don't have extensive experience with computer languages, this book will teach you specific habits and tools that you can use throughout your student research and hopefully your careers. Learning these things earlier will save you considerable time and effort later. ### Instructors When instructors incorporate the tools of reproducible research into their assignments they not only build students' understanding of research best practice, but are also better able to evaluate and provide meaningful feedback on students' work [@Ball2012 183]. This book provides a resource that you can use with students to put reproducibility into practice. If you are teaching computational courses, you may also benefit from making your lecture material dynamically reproducible. 
Your slides will be easier to update for the same reasons that it is easier to update research. Making the methods you used to create the material available to students will give them more information. Clearly documenting how you created lecture material can also pass information on to future instructors. ### Editors Beyond a lack of reproducible research skills among researchers, an impediment to actually creating reproducible research is a lack of infrastructure to publish it [@Peng2011]. Hopefully, this book will be useful for editors at academic publishers who want to be better at evaluating reproducible research, editing it, and developing systems to make it more widely available. The journal *Biostatistics* is a good example of a publication that is encouraging (actually requiring) reproducible research. From 2009 the journal has had an editor for reproducibility that ensures replication files are available and that results can be replicated using these files [@Peng2009]. The more editors there are with the skills to work with reproducible research the more likely it is that researchers will do it. ### Private sector researchers Researchers in the private sector may or may not want to make their work easily reproducible outside of their organization. However, that does not mean that significant benefits cannot be gained from using the methods of reproducible research. First, even if public reproducibility is ruled out to guard proprietary information,[^7] making your research reproducible to members of your organization can spread valuable information about how analyses were done and data was collected. This will help build your organization's knowledge and avoid effort duplication. Just as a lack of reproducibility hinders the spread of information in the scientific community, it can hinder it inside of a private organization. 
Using the sort of dynamic automated processes run with clearly documented source code we will learn in this book can also help create robust data analysis methods that help your organization avoid errors that may come from cutting-and-pasting data across spreadsheets.[^8] Also, the tools of reproducible research covered in this book enable you to create professional standardized reports that can be easily updated or changed when new information is available. In particular, you will learn how to create batch reports based on quantitative data. The Tools of Reproducible Research ---------------------------------- This book will teach you the tools you need to make your research highly reproducible. Reproducible research involves two broad sets of tools. The first is a **reproducible research environment** that includes the statistical tools you need to run your analyses as well as "the ability to automatically track the provenance of data, analyses, and results and to package them (or pointers to persistent versions of them) for redistribution". The second set of tools is a **reproducible research publisher**, which prepares dynamic documents for presenting results and is easily linked to the reproducible research environment [@Mesirov2010 415]. In this book we will focus on learning how to use the widely available and highly flexible reproducible research environment--R/RStudio [@RLanguage; @RStudioCite].[^9] R/RStudio can be linked to numerous reproducible research publishers such as LaTeX and Markdown with Yihui Xie's *knitr* package [-@R-knitr] or the related *rmarkdown* package [@R-rmarkdown]. The main tools covered in this book include: - **R**: a programming language primarily for statistics and graphics. It can also be useful for data gathering and creating presentation documents. - ***knitr* and *rmarkdown***: related R packages for literate programming. 
They allow you to combine your statistical analysis and the presentation of the results into one document. They work with R and a number of other languages such as Bash, Python, and Ruby. - **Markup languages**: instructions for how to format a presentation document. In this book we cover LaTeX, Markdown, and a little HTML. - **RStudio**: an integrated developer environment (IDE) for R that tightly combines R, *knitr*, *rmarkdown*, and markup languages. - **Cloud storage & versioning**: Services such as Dropbox and Git/GitHub that can store data, code, and presentation files, save previous versions of these files, and make this information widely available. - **Unix-like shell programs**: These tools are useful for working with large research projects.[^10] They also allow us to use command-line tools including GNU Make for compiling projects and Pandoc, a program useful for converting documents from one markup language to another. Why Use R, *knitr*/*rmarkdown*, and RStudio for Reproducible Research? ---------------------------------------------------------------------- ##### Why R? Why use a statistical programming language like R for reproducible research? R has a very active development community that is constantly expanding what it is capable of. As we will see in this book, R enables researchers across a wide range of disciplines to gather data and run statistical analyses. Using the *knitr* or *rmarkdown* package, you can connect your R-based analyses to presentation documents created with markup languages such as LaTeX and Markdown. This allows you to dynamically and reproducibly present results in articles, slideshows, and webpages. The way you interact with R has benefits for reproducible research. In general you interact with R (or any other programming and markup language) by explicitly writing down your steps as source code. 
This promotes reproducibility more than your typical interactions with Graphical User Interface (GUI) programs like SPSS[^11] and Microsoft Word. When you write R code and embed it in presentation documents created using markup languages, you are forced to explicitly state the steps you took to do your research. When you do research by clicking through drop-down menus in GUI programs, your steps are lost, or at least documenting them requires considerable extra effort. Also it is generally more difficult to dynamically embed your analysis in presentation documents created by GUI word processing programs in a way that will be accessible to other researchers both now and in the future. I'll come back to these points in Chapter [\[GettingStartedRR\]](#GettingStartedRR){reference-type="ref" reference="GettingStartedRR"}. ##### Why *knitr* and *rmarkdown*? Literate programming is a crucial part of reproducible quantitative research.[^12] Being able to directly link your analyses, your results, and the code you used to produce the results makes tracing your steps much easier. There are many different literate programming tools for a number of different programming languages.[^13] Previously, one of the most common tools for researchers using R and the LaTeX markup language was *Sweave* [@Leisch2002]. The packages I am going to focus on in this book are newer and have more capabilities. They are called *knitr* and *rmarkdown*. Why are we going to use these tools in this book and not *Sweave* or some other tool? The simple answer is that they are more capable than *Sweave*. Both *knitr* and *rmarkdown* can work with markup languages other than LaTeX including Markdown and HTML. *rmarkdown* can even output Microsoft Word documents. They can work with programming languages other than R. They highlight R code in presentation documents making it easier for your readers to follow.[^14] They give you better control over the inclusion of graphics and can cache code chunks, i.e.
save the output for later. *knitr* has the ability to understand *Sweave*-like syntax, so it will be easy to convert backwards to *Sweave* if you want to.[^15] You also have the choice to use much simpler and more straightforward syntax with *knitr* and *rmarkdown*. *knitr* and *rmarkdown* have broadly similar capabilities and syntax. They both are literate programming tools that can produce presentation documents from multiple markup languages. They have almost identical syntax when used in Markdown. Their main difference is that they take different approaches to creating presentation documents. *knitr* documents must be written using the markup language associated with the desired output. For example, with *knitr*, LaTeX must be used to create PDF output documents and Markdown or HTML must be used to create webpages. *rmarkdown* builds directly on *knitr*, the key difference being that it uses the straightforward Markdown markup language to generate PDF, HTML, and MS Word documents.[^16] Because you write with the simple Markdown syntax, *rmarkdown* is generally easier to use. It has the advantage of being able to take the same markup document and output multiple types of presentation documents. Nonetheless, for complex documents like books and long articles or work that requires custom formatting, *knitr* LaTeX is often preferable and extremely flexible, though the syntax is more complicated. ##### Why RStudio? Why use the RStudio integrated development environment for reproducible research? R by itself has the capabilities necessary to gather data, analyze it, and, with a little help from *knitr*/*rmarkdown* and markup languages, present results in a way that is highly reproducible. RStudio allows you to do all of these things, but simplifies many of them and allows you to navigate through them more easily. It also is a happy medium between R's text-based interface and a pure GUI. 
Not only does RStudio do many of the things that R can do but more easily, it is also a very good standalone editor for writing documents with LaTeX and Markdown. For LaTeX documents it can, for example, insert frequently used commands like `\section{}` for numbered sections (see Chapter [\[LatexChapter\]](#LatexChapter){reference-type="ref" reference="LatexChapter"}).[^17] There are many LaTeX editors available, both open source and paid. But RStudio is currently the best program for creating reproducible LaTeX and Markdown documents. It has full syntax highlighting. Its syntax highlighting can even distinguish between R code and markup commands in the same document. It can spell check LaTeX and Markdown documents. It handles *knitr*/*rmarkdown* code chunks beautifully (see Chapter [\[GettingStartedRKnitr\]](#GettingStartedRKnitr){reference-type="ref" reference="GettingStartedRKnitr"}). Finally, RStudio not only has tight integration with various markup languages, it also has capabilities for using other tools such as C++, CSS, JavaScript, and a few other programming languages. It is closely integrated with the version control programs Git and SVN. Both of these programs allow you to keep track of the changes you make to your documents (see Chapter [\[Storing\]](#Storing){reference-type="ref" reference="Storing"}). This is important for reproducible research since version control programs can document many of your research steps. It also has a built-in ability to make HTML slideshows from *knitr*/*rmarkdown* documents. Basically, RStudio makes it easy to create and navigate through complex reproducible research documents. ### Installing the main software {#InstallR} Before you read this book you should install the main software. All of the software programs covered in this book are open source and can be easily downloaded for free. They are available for Windows, Mac, and Linux operating systems. They should run well on most modern computers. 
You should install R before installing RStudio. You can download the programs from the following websites: - **R**: , - **RStudio Desktop (Open Source License)**: . The download webpages for these programs have comprehensive information on how to install them, so please refer to those pages for more information. After installing R and RStudio you will probably also want to install a number of user-written packages that are covered in this book. To install all of these user-written packages, please see page . ##### Installing markup languages {#InstallMarkup} If you are planning to create LaTeX documents you need to install a TeX distribution.[^18] They are available for Windows, Mac, and Linux systems. They can be found at: . Please refer to that site for more installation information. If you want to create Markdown documents you can separately install the *markdown* package in R. You can do this the same way that you install any package in R, with the `install.packages` command.[^19] ##### GNU Make If you are using a Linux computer you already have GNU Make[\[InstallMake\]]{#InstallMake label="InstallMake"} installed.[^20] Mac users will need to install the command-line developer tools. There are two ways to do this. One is to go to the App Store and download Xcode (it's free). Once Xcode is installed, install command-line tools, which you will find by opening Xcode then clicking on `Preference` → `Downloads`. However, Xcode is a very large download and you only need the command-line tools for Make. To install just the command-line tools, open the Terminal and try to run Make by typing `make` and hitting return. A box should appear asking you if you want to install the command-line developer tools. Click `Install`. Windows users will have Make installed if they have already installed Rtools (see page ). Mac and Windows users will need to install this software not only so that GNU Make runs properly, but also so that other command-line tools work well.
##### Other Tools We will discuss other tools such as Git that can be a useful part of a reproducible research workflow. Installation instructions for these tools will be discussed below. Book Overview ------------- The purpose of this book is to give you the tools that you will need to do reproducible research with R and RStudio. This book describes a workflow for reproducible research primarily using R and RStudio. It is designed to give you the necessary tools to use this workflow for your own research. It is not designed to be a complete reference for R, RStudio, *knitr*/*rmarkdown*, Git, or any other program that is a part of this workflow. Instead it shows you how these tools can fit together to make your research more reproducible. To get the most out of these individual programs I will along the way point you to other resources that cover these programs in more detail. To that end, I can recommend a number of resources that cover more of the nitty-gritty:[\[OtherBooks\]]{#OtherBooks label="OtherBooks"} - Michael J. Crawley's [-@Crawley2013] encyclopaedic R book, appropriately titled ***The R Book***, published by Wiley. - Hadley Wickham [-@Whickham2014book] has a great new book out from Chapman and Hall on ***Advanced R***. - Yihui Xie's [-@Xie2013] book ***Dynamic Documents with R and knitr***, published by Chapman and Hall, provides a comprehensive look at how to create documents with *knitr*. It's a good complement to this book's generally more research project--level focus. - Norman Matloff's [-@Matloff2011] tour through the programming language aspects of R called ***The Art of R Programming: A Tour of Statistical Design Software***, published by No Starch Press. - Cathy O'Neil and Rachel Schutt [-@ONeil2013] give a great introduction to the field of data science generally in ***Doing Data Science***, published by O'Reilly Media Inc. - For an excellent introduction to the command-line in Linux and Mac, see William E.
Shotts Jr.'s [-@ShottsJr2012] book ***The Linux Command-line: A Complete Introduction*** also published by No Starch Press. It is also helpful for Windows users running PowerShell (see Chapter [\[DirectoriesChapter\]](#DirectoriesChapter){reference-type="ref" reference="DirectoriesChapter"}). - The RStudio website () has a number of useful tutorials on how to use *knitr* with LaTeX and Markdown. They also have very good documentation for *rmarkdown* at . That being said, my goal is for this book to be *self-sufficient*. A reader without a detailed understanding of these programs will be able to understand and use the commands and procedures I cover in this book. While learning how to use R and the other programs I personally often encountered illustrative examples that included commands, variables, and other things that were not well explained in the texts that I was reading. This caused me to waste many hours trying to figure out, for example, what the `$` is used for (preview: it's the component selector, see Section [\[ComponentSelect\]](#ComponentSelect){reference-type="ref" reference="ComponentSelect"}). I hope to save you from this wasted time by either providing a brief explanation of possibly frustrating and mysterious things and/or pointing you in the direction of good explanations. ### How to read this book This book gives you a workflow. It has a beginning, middle, and end. So, unlike a reference book, it can and should be read linearly as it takes you through an empirical research process from an empty folder to a completed set of documents that reproducibly showcase your findings. That being said, readers with more experience using tools like R or LaTeX may want to skip over the nitty-gritty parts of the book that describe how to manipulate data frames or compile LaTeX documents into PDFs. Please feel free to skip these sections.
##### More-experienced R users If you are an experienced R user you may want to skip over the first section of Chapter [\[GettingStartedRKnitr\]](#GettingStartedRKnitr){reference-type="ref" reference="GettingStartedRKnitr"}: Getting Started with R, RStudio, and *knitr*/*rmarkdown*. But don't skip over the whole chapter. The latter parts contain important information on the *knitr*/*rmarkdown* packages. If you are experienced with R data manipulation you may also want to skip all of Chapter [\[DataClean\]](#DataClean){reference-type="ref" reference="DataClean"}. ##### More-experienced LaTeX users If you are familiar with LaTeX you might want to skip the first part of Chapter [\[LatexChapter\]](#LatexChapter){reference-type="ref" reference="LatexChapter"}. The second part may be useful as it includes information on how to dynamically create BibTeX bibliographies with *knitr* and how to include *knitr* output in a Beamer slideshow. ##### Less-experienced LaTeX/Markdown users If you do not have experience with LaTeX or Markdown you may benefit from reading, or at least skimming, the introductory chapters on these topics (chapters [\[LatexChapter\]](#LatexChapter){reference-type="ref" reference="LatexChapter"} and [\[MarkdownChapter\]](#MarkdownChapter){reference-type="ref" reference="MarkdownChapter"}) before reading Part III. ### Reproduce this book This book practices what it preaches. It can be reproduced. I wrote the book using the programs and methods that I describe. Full documentation and source files can be found at the book's GitHub repository. Feel free to read and even use (within reason and with attribution, of course) the book's source code. You can find it at: . This is especially useful if you want to know how to do something in the book that I don't directly cover in the text. If you notice any errors or places where the book can be improved please report them on the book's GitHub Issues page: . Corrections will be posted at: .
### Contents overview The book is broken into four parts. The first part (chapters [\[GettingStartedRR\]](#GettingStartedRR){reference-type="ref" reference="GettingStartedRR"}, [\[GettingStartedRKnitr\]](#GettingStartedRKnitr){reference-type="ref" reference="GettingStartedRKnitr"}, and [\[DirectoriesChapter\]](#DirectoriesChapter){reference-type="ref" reference="DirectoriesChapter"}) gives an overview of the reproducible research workflow as well as the general computer skills that you'll need to use this workflow. Each of the next three parts of the book guides you through the specific skills you will need for each part of the reproducible research process. Part two (chapters [\[Storing\]](#Storing){reference-type="ref" reference="Storing"}, [\[DataGather\]](#DataGather){reference-type="ref" reference="DataGather"}, and [\[DataClean\]](#DataClean){reference-type="ref" reference="DataClean"}) covers the data gathering and file storage process. The third part (chapters [\[StatsModel\]](#StatsModel){reference-type="ref" reference="StatsModel"}, [\[TablesChapter\]](#TablesChapter){reference-type="ref" reference="TablesChapter"}, and [\[FiguresChapter\]](#FiguresChapter){reference-type="ref" reference="FiguresChapter"}) teaches you how to dynamically incorporate your statistical analysis, results figures, and tables into your presentation documents. The final part (chapters [\[LatexChapter\]](#LatexChapter){reference-type="ref" reference="LatexChapter"}, [\[LargeDocs\]](#LargeDocs){reference-type="ref" reference="LargeDocs"}, and [\[MarkdownChapter\]](#MarkdownChapter){reference-type="ref" reference="MarkdownChapter"}) covers how to create reproducible presentation documents including LaTeX articles, books, slideshows, and batch reports as well as Markdown webpages and slideshows. [^1]: This is close to what [@Lykken1968] calls "operational replication". 
[^2]: The idea of really reproducible computational research was originally thought of and implemented by Jon Claerbout and the Stanford Exploration Project beginning in the 1980s and early 1990s [@Fomel2009; @Donoho2009]. Further seminal advances were made by Jonathan B. Buckheit and David L. Donoho who created the Wavelab library of MATLAB routines for their research on wavelets in the mid-1990s [@Buckheit1995]. [^3]: Reproducibility is important for both quantitative and qualitative research [@King1994]. Nonetheless, we will focus mainly on methods for reproducibility in quantitative computational research. [^4]: Much of the reproducible computational research and literate programming literatures have traditionally used the term "weave" to describe the process of combining source code and presentation documents [see @Knuth1992 101]. In the R community weave is usually used to describe the combination of source code and LaTeX documents. The term "knit" reflects the vocabulary of the *knitr* R package (knit + R). It is used more generally to describe weaving with a variety of markup languages. The term is used by RStudio if you are using the *rmarkdown* package, which is similar to *knitr*. We also cover the *rmarkdown* package in this book. Because of this, I use the term knit rather than weave in this book. [^5]: See the American Physical Society's website at . See also [@Fomel2009]. [^6]: Of course, it's important to keep in mind that reproducibility is "neither necessary nor sufficient to prevent mistakes" [@Stodden2009b]. [^7]: There are ways to enable some public reproducibility without revealing confidential information. See [@Vandewalle2007] for a discussion of one approach. [^8]: See this post by David Smith about how the J.P. Morgan "London Whale" problem may have been prevented with the type of processes covered in this book: (posted 11 February 2013). [^9]: The book was created with R version and developer builds of RStudio version 0.99.370.
[^10]: In this book I cover the Bash shell for Linux and Mac as well as Windows PowerShell. [^11]: I know you can write scripts in statistical programs like SPSS, but doing so is not encouraged by the program's interface and you often have to learn multiple languages for writing scripts that run analyses, create graphics, and deal with matrices. [^12]: Donald Knuth coined the term literate programming in the 1970s to refer to a source file that could be both run by a computer and "woven" with a formatted presentation document [@Knuth1992]. [^13]: A very interesting tool that is worth taking a look at for the Python programming language is HTML Notebooks created with IPython. For more details see . [^14]: Syntax highlighting uses different colors and fonts to distinguish different types of text. [^15]: Note that the Sweave-style syntax is not identical to actual *Sweave* syntax. See Yihui Xie's discussion of the differences between the two at: . *knitr* has a function (`Sweave2knitr`) for converting *Sweave* to *knitr* syntax. [^16]: It does this by relying on a tool called Pandoc [@Pandoc2014]. [^17]: If you are more comfortable with a what-you-see-is-what-you-get (WYSIWYG) word processor like Microsoft Word, you might be interested in exploring Lyx. It is a WYSIWYG-like LaTeX editor that works with *knitr*. It doesn't work with the other markup languages covered in this book. For more information see: . I give some brief information on using Lyx with *knitr* in Chapter 3's Appendix. [^18]: LaTeX is really a set of macros for the TeX typesetting system. It is included in all major TeX distributions. [^19]: The exact command is: `install.packages("markdown")`. [^20]: To verify this, open the Terminal and type: `make --version` (I used version 3.81 for this book). This should output details about the current version of Make installed on your computer.
================================================ FILE: Old/Source-v2/Children/Chapter10/chapter10.Rnw ================================================ % Chapter Chapter 10 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 2 April 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Showing Results with Figures}\label{FiguresChapter} One of the main reasons that many people use R is to take advantage of its comprehensive and powerful set of data visualization tools. Visually displaying information with graphics is often a much more effective way of presenting both descriptive statistics and analysis results than the tables we covered in the last chapter.\footnote{There are, of course, a number of exceptions to this rule of thumb. \citeauthor{vanBelle2008} \citeyearpar[][Ch. 9]{vanBelle2008} argues that a few numbers should be listed in a sentence, many numbers shown in tables, and relationships between numbers are best shown with graphs. Similarly, \cite{Tufte2001} argues that tables tend to outperform graphics for displaying 20 or fewer numbers. Graphics often outperform tables for showing larger data sets and relationships within the data.} Nonetheless, dynamically incorporating figures with \emph{knitr}/\emph{rmarkdown} has many of the same benefits as dynamically including tables, especially the ability to have data set or analysis changes automatically cascade into your presentation documents. The basic process for including figures in knitted presentation documents is also very similar to including tables, though there are some important extra considerations we need to make to properly size the figures and be able to include interactive visualizations in our presentation documents. In this chapter we will first briefly learn how to include non-knitted graphics in LaTeX and Markdown documents before turning to look at how to dynamically knit R graphics into presentation documents. 
In the remainder of the chapter we will look at how to actually create graphics with R including some of the fundamentals of R's default graphics package, as well as the \emph{ggplot2} \citep{R-ggplot2}\index{ggplot2} and \emph{googleVis} \citep{R-googleVis}\index{googleVis} packages. In each case we will focus on how to include the figures created by these packages in knitted presentation documents. \section{Including Non-knitted Graphics} Understanding how \emph{knitr}/\emph{rmarkdown} dynamically include figures is easier if you understand how figures are normally included in LaTeX and Markdown. Unlike a word processing program like Microsoft Word\index{Microsoft Word}, in LaTeX, Markdown, HTML, and other markup languages you don't copy and paste figures into your document. Instead you link to an image file outside of your markup document. Typically these image files are in formats such as \emph{PDF}, \emph{PNG}, and \emph{JPEG}.\index{PDF}\index{PNG}\index{JPEG}\footnote{PDF: Portable Document Format, PNG: Portable Network Graphic, JPEG: Joint Photographic Experts Group. \\ A quick note about file formats: By default \emph{knitr} creates PDF formatted figure files when knitting R LaTeX documents. These figures, generally built with vector graphics,\index{vector graphics} allow you to zoom in on them by any amount without them becoming pixelated. This means that your images will be crisp in PDF presentation documents. For Markdown documents,\index{Markdown!figure formats} \emph{knitr} creates PNG images. PNG images are usually relatively high quality and can be rendered directly on websites, unlike PDFs. JPEG formatted files usually take up less disk space than PDF and PNG files. However, their quality is also worse and can often look very pixelated. 
For more information, Wikipedia has a comprehensive comparison of graphics file formats at: \url{http://en.wikipedia.org/wiki/Comparison_of_graphics_file_formats}.} There are three advantages to this method of including graphics over cut and paste. The first is that whenever the image files are changed, the changes are updated in the final presentation document when it is compiled, no recopying and pasting. The second advantage is that the images are sized and placed with the markup code rather than pointing and clicking. This is tedious at first, but saves considerable time and frustration when a document becomes larger. It also makes it easy to consistently format multiple images in a document. Finally, because the image is not actually loaded in the markup file, you won't notice any sluggishness while editing the markup document that you get in a traditional word processor if there are many images. If the image files are in the same directory as the markup document, we don't need to specify the image's file path, only its name. If they are in another directory, we need to include additional file path information.\index{file path} Remember to use relative paths when possible.\index{file path!relative} In this section we will learn how to include graphics files in documents created with LaTeX and Markdown. \subsection{Including graphics in LaTeX}\index{LaTeX!graphics} The main way to include graphics (graphs, photos, and so on) in LaTeX documents is to use the \texttt{includegraphics}\index{LaTeX command!includegraphics} command to link to image files. To have the full range of features for \texttt{includegraphics}, make sure to load the \emph{graphicx}\index{LaTeX package!graphicx} package in your document's preamble. 
Imagine that we wanted to include an image of butterflies\index{butterfly} stored in a file called \emph{HeliconiusMimicry.png} in a LaTeX-produced document.\footnote{The image used here is from \cite{Meyer2006}.} We type: <>= \includegraphics[scale=0.8]{HeliconiusMimicry.png} @ \noindent In the square brackets you'll notice \verb|scale=0.8|. This formats the image to be included at 80 percent of its actual size. You can use other options such as \texttt{height} to specify the height, \texttt{width} to specify the width, and \texttt{angle} to specify the angle at which to rotate the image. You can add more than one option if they are separated by commas. Rather than hard coding the width in exact centimeters, you can determine its width as a proportion of the text width using \verb|\textwidth|\index{LaTeX!textwidth}.\footnote{Note there are a number of other ways to set the size of a figure relative to a page element. See: LaTeX Wiki Book for more details: \url{http://en.wikibooks.org/wiki/LaTeX/Page_Layout}.} For example, to set our image at 80 percent of the text width we can type: <>= \includegraphics[width=0.8\textwidth]{HeliconiusMimicry.png} @ \paragraph{{\tt{figure}} float environment}\index{LaTeX environment!figure} Most often you will want to include LaTeX figures in a \texttt{figure} float environment. The \emph{figure} environment works almost exactly the same way as the \texttt{table} environment we saw in the last chapter. It allows you to separate the figure from the text, add a caption, and label the figure. We begin the environment with \verb|\begin{figure}[POSITION_SPEC]|. \verb|POSITION_SPEC| can have the same values as we saw earlier with tables (page \pageref{POSITIONSPEC}). We can then include a \texttt{caption}\index{LaTeX!caption} and \texttt{label} command.\index{LaTeX command!label} The environment is closed with \verb|\end{figure}|. 
For example, to create Figure \ref{ExampleLaTeXFigure}, I used the following code:\footnote{For simplicity, this code does not include the full image's actual file path.} <>= \begin{figure}[ht] \caption{An Example Figure in LaTeX} \label{ExampleLaTeXFigure} \begin{center} \includegraphics[scale=0.8]{HeliconiusMimicry.png} \end{center} {\scriptsize{Source: \cite{Meyer2006}}} \end{figure} @ \noindent Notice that after the call to end the \texttt{center} environment we include \verb|{\scriptsize{Source: \cite{Meyer2006}}}|. This simply includes a note in the figure environment giving the image's source. The note moves with the figure and is separate from the text. The \texttt{scriptsize}\index{LaTeX command!scriptsize} command transforms the text to smaller than normal size font. See Chapter \ref{LatexChapter} (Section \ref{FontSize}) for more details on LaTeX font sizes. The command \verb|\cite{Meyer2006}| inserts a citation from the bibliography for \cite{Meyer2006}. We will discuss bibliographies in more detail in the next chapter (Section \ref{BibTeXBib}). \begin{figure}[ht] \caption{An Example Figure in LaTeX} \label{ExampleLaTeXFigure} \begin{center} \includegraphics[scale=0.8]{Children/Chapter10/images10/HeliconiusMimicry.png} \end{center} {\scriptsize{Source: \cite{Meyer2006}}} \end{figure} \subsection{Including graphics in Markdown/HTML}\index{Markdown!graphics} Markdown has a similar command as LaTeX's \texttt{includegraphics}. It goes like this: \verb|![ALT_TEXT](FILE_PATH)|.\index{![]()} This syntax may seem strange now, but it will hopefully make more sense when we cover Markdown hyperlinks in Chapter \ref{MarkdownChapter} (Section \ref{MarkdownLinks}) as this is what it is intended to imitate. \verb|ALT_TEXT| refers to HTML's \texttt{alt} (alternative text)\index{HTML!alt} attribute. This should be a very short description of the image that will appear if it fails to load in a web browser. 
\verb|FILE_PATH| specifies the image's file path.\footnote{You can also include a title in quotation marks after the file path. This specifies the HTML \texttt{title} attribute. However, this attribute does not create a title for the image in the way that \texttt{caption} does for LaTeX float figures. Instead it creates a tooltip\index{tooltip}, a small box that appears when you place your cursor over the image. Specifying descriptive alt text is very useful for screen readers that help visually impaired people access web content.} Here is an example using the image we worked with before.\label{TitleAttribute} <>= ![ButterflyImage](HeliconiusMimicry.png) @ \noindent Note that the file path can be a URL. You may, for example, store an image in the Dropbox Public folder or on GitHub and use its URL to link to it in the Markdown document.\footnote{For images stored on GitHub\index{GitHub} use the URL for the raw version of the file.} Markdown does not include ways to resize or re-position an image, so that the syntax would stay simple. If you want to resize or position your image you will have to use HTML\index{HTML} markup. Probably the simplest way to include images with HTML is by using the \texttt{img} (image) element tag.\index{HTML element!img} To create the equivalent of what we just did in Markdown with HTML we type: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} <img src="HeliconiusMimicry.png" alt="ButterflyImage" /> \end{alltt} \end{kframe} \end{knitrout} \noindent The \texttt{src} (source)\index{HTML attribute!src} attribute specifies the file path. To change the width and height of the image we can use the \texttt{width}\index{HTML attribute!width} and \texttt{height}\index{HTML attribute!height} attributes.
For example: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} <img src="HeliconiusMimicry.png" alt="ButterflyImage" width="100px" height="100px" /> \end{alltt} \end{kframe} \end{knitrout} \noindent creates an image that is 100 pixels (\texttt{px}) wide by 100 pixels\index{pixel}\index{px} high.\footnote{A pixel is the smallest discrete part of images displayed on a screen. See the ``pixel'' Wikipedia page for more details: \url{http://en.wikipedia.org/wiki/Pixel}.} It is also possible to specify the alignment of figures in Markdown with a custom CSS style file. I don't cover how to do that here. \section{Basic \emph{knitr}/\emph{rmarkdown} Figure Options} So far we have looked at how to include images that have already been created into our LaTeX and Markdown documents. \emph{knitr}, and by extension \emph{rmarkdown}, allow us to combine a figure's creation by R with its inclusion in a presentation document. They are tied together and update together. We use \emph{knitr} chunk options to specify how the figure will look in the presentation document and where it will be saved. Let's learn some of the more important chunk options for figures. \subsection{Chunk options} \paragraph{{\tt{fig.path}}}\index{knitr option!fig.path} When you use \emph{knitr} to create and include figures in your presentation documents it (1) runs the code you give it to create the figure, (2) automatically saves it into a particular directory,\footnote{If a code chunk creates more than one figure, \emph{knitr} automatically saves each into its own file in the same directory.} and (3) includes the necessary LaTeX or Markdown code to include the figure in the final presentation document. By default \emph{knitr} saves images into a folder (it creates) called \emph{figure} located in the working directory.\footnote{File names are based on the code chunk label where they were created.} You can tell \emph{knitr} where to save the images with the \texttt{fig.path} option.
Simply use the file path naming conventions suitable for your system and include the new path in quotation marks. \paragraph{{\tt{out.height}}}\index{knitr option!out.height} To set the height that a figure will be in the final presentation document use the \texttt{out.height} option. In R LaTeX documents you can set the height using centimeters, inches, or as a proportion of a page element. In R Markdown documents you use pixels to set the height. For example, to set a figure's height in an R Markdown document to 200 pixels use \verb|out.height='200px'|. \paragraph{{\tt{out.width}}}\index{knitr option!out.width} Similarly, we can set the width of a \emph{knitr} created figure using the \texttt{out.width} option. The same rules apply as with \texttt{out.height}. For example, to have a figure show up at 80 percent of the text width in an R LaTeX document use: \verb|out.width='0.8\\textwidth'|. Notice that there are two backslashes before \texttt{textwidth}.\index{LaTeX!textwidth} As we saw earlier, the LaTeX command only has one. However, all \emph{knitr} code chunk options must be written as they would be in R. We need to escape the backslash with the backslash escape character, i.e. use two backslashes. \paragraph{{\tt{fig.align}}}\index{knitr option!fig.align} You can set a knitted figure's alignment using \texttt{fig.align}. The option can be set to \texttt{left}, \texttt{center}, or \texttt{right}. To center a figure, add \verb|fig.align='center'|. \paragraph{Other figure chunk options} The previous options are probably the most commonly used ways of adjusting figures with \emph{knitr}. However, \emph{knitr} has many other chunk options to help you adjust your figures so that they are incorporated into your presentation documents the way that you want.
The option \texttt{fig.cap}\index{knitr option!fig.cap} allows you to set a figure's LaTeX caption and \texttt{fig.lp}\index{knitr option!fig.lp} allows you to set the prefix of its LaTeX label.\footnote{In this chapter we will set these options in the markup rather than the code chunk. I prefer doing this because \emph{knitr} options need to be on the same line and so can sometimes result in very long lists of options that are difficult to read.} As we will see below (page \pageref{DevTalk}), you can use the \verb|dev| option to choose the figure's output file format, e.g. PDF, PNG, JPEG. Please see the official \emph{knitr} code chunk options webpage for more information on figure chunk options: \url{http://yihui.name/knitr/options#chunk_options}. \subsection{Global options} If you want all of your figures to share the same options--e.g. same height and alignment--you can set global figure options at the beginning of your document with \verb|opts_chunk$set|.\index{knitr!global chunk options}\index{knitr!opts\_chunk} Imagine that we are making an R LaTeX Sweave-style document and want all of our figures to be center aligned and 80 percent of the text width. We type: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{\textless}include=FALSE\textgreater{\textgreater}= opts_chunk$\hlkwd{set}(fig.align = \hlstr{"center"}, out.width = \hlstr{"0.8\textbackslash{}\textbackslash{}textwidth"}) @ \end{alltt} \end{kframe} \end{knitrout} \noindent You can also set some global figure options, such as \texttt{fig\_height} and \texttt{fig\_width}, in your \emph{rmarkdown} YAML header.\index{rmarkdown!header} \section{Knitting R's Default Graphics} R's \emph{graphics} package\index{graphics, R package}--loaded by default--includes commands to create numerous plot types.
These include \texttt{hist}\index{hist} for histograms, \texttt{pairs}\index{R function!pairs} for scatterplot matrices, \texttt{boxplot}\index{boxplot} for creating boxplots, and the versatile \texttt{plot}\index{R function!plot} for creating x-y plots--including scatterplots\index{scatterplot} and bar charts\index{bar chart} depending on the data's type. There are many useful resources for learning how to fully utilize R's default graphics capabilities. These include Paul Murrell's \citeyearpar{Murrell2011} very comprehensive \emph{R Graphics} book. The Cookbook for R\footnote{\url{http://www.cookbook-r.com/Graphs/}} and Quick-R\footnote{\url{http://www.statmethods.net/advgraphs/}} websites are also very helpful. Winston Chang, the maintainer of the Cookbook for R, also has a full book devoted to creating R graphics \citeyearpar{Chang2012}. In this section we are going to see how to include R's default graphics in our LaTeX and Markdown presentation documents. We will also see an example of how to source the creation of a graph from a segmented analysis file. Most of R's default graphics capabilities create static graphics. They are not animations or interactive. The discussion in this section is exclusively about using static graphics with \emph{knitr}/\emph{rmarkdown}. Later in the chapter we will discuss how to knit interactive graphics. Let's look at an example we first saw at the end of Chapter \ref{StatsModel} (Section \ref{SourceCarsGraph}). Remember that we accessed an R source code file stored on GitHub to create a simple scatterplot of cars' speed and stopping distances using R's \emph{cars} data set, which is loaded by default. We haven't yet seen the code in the R source file that created the plot. The variable \textbf{speed} contains the stopping speed and \textbf{dist} contains the stopping distances. 
Here is the code to create the plot: <>= # Create simple scatterplot of cars' speed and stopping distance plot(x = cars$speed, y = cars$dist, xlab = "Speed (mph)", ylab = "Stopping Distance (ft)", cex.lab = 1.5) @ \noindent We select the variables from \emph{cars} to plot on the $x$ and $y$ axes of our graph with the component selector (\verb|$|). Then we use the \texttt{xlab}\index{xlab} and \texttt{ylab}\index{ylab} arguments to specify the $x$ and $y$ axis labels. We could have added a title for the plot using the \texttt{main}\index{main} argument. We didn't do this because we will give the plot a title in the LaTeX \texttt{figure} environment. The \texttt{cex.lab}\index{cex.lab} argument increased the labels' font size. The argument specifically determines how to scale the labels relative to the default size. 1.5 means 50 percent larger than the default. Now let's see how to create this plot with \emph{knitr} and include it in a LaTeX \texttt{figure} environment.\index{LaTeX environment!figure} {\small \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textbackslash{}begin\{figure\}[ht] \textbackslash{}caption\{Example Simple Scatter Plot Using \textbackslash{}texttt\{plot\}\} \textbackslash{}label\{BasicFigureExample\} \textless{\textless}echo=FALSE, fig.align='center', out.width='8cm', out.height='8cm'\textgreater{\textgreater}= \hlkwd{plot}(x = cars$speed, y = cars$dist, xlab = \hlstr{"Speed (mph)"}, ylab = \hlstr{"Stopping Distance (ft)"}, cex.lab = 1.5) @ \textbackslash{}end\{figure\} \end{alltt} \end{kframe} \end{knitrout} } % Actually create simple scatterplot \begin{figure} \caption{Example Simple Scatter Plot Using \texttt{plot}} \label{BasicFigureExample} <>= plot(x = cars$speed, y = cars$dist, xlab = "Speed (mph)", ylab = "Stopping Distance (ft)", cex.lab = 1.5) @ \end{figure} \noindent This code produces Figure \ref{BasicFigureExample}.\footnote{Note that I did not specify the center environment. 
This is because it is specified in a \emph{knitr} global chunk option.} If you are familiar with R graphics you will notice that we did not need to tell \emph{knitr} to save the file in a particular format. Instead, behind the scenes it automatically saves the plot as a PDF file in a folder called \emph{figure} that is a child of the current working directory. You can choose the figure file's format with the \texttt{dev} (graphical device) chunk option.\index{knitr option!dev}\label{DevTalk} For example, to save the figure in a PNG formatted file simply add the chunk option \verb|dev='png'|. You can choose any graphical device format supported by R. For a full list of R's graphical devices type \verb|?Devices| into your console.\index{R!graphical device} One reason you might want to change the format is to reduce your presentation document's file size. Using a bitmap format like PNG will create smaller files than PDFs, though with lower-quality images. We could, of course, simply link to the original R source code file stored on GitHub\index{GitHub} with the \verb|source_url|\index{R function!source\_url} command. Let's look at an example of this with a different source code file. Remember in Chapter \ref{DataGather} we used a makefile to gather data from three different sources on the internet.
The CSV is called \emph{MainData.csv} and is stored on GitHub at: \url{http://bit.ly/V0ldsf}.\footnote{The full version of the URL is: \url{https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/DataGather_Merge/MainData.csv}} We can download this data into R and make a scatterplot matrix with this code:\index{scatterplot matrix} <>= # Download data MainData <- repmis::source_data("http://bit.ly/V0ldsf") # Subset MainData so that it only includes the year 2003 SubData <- subset(MainData, year == 2003) # Remove iso2c, country, year variables # Keep reg_4state, disproportionality, FertilizerConsumption SubData <- SubData[, c("reg_4state", "disproportionality", "FertilizerConsumption")] # Create a scatterplot matrix pairs(x = SubData) @ \noindent This\index{R function!source\_data} is a lot of code, but you should be familiar with most of it. You will notice that after downloading the data we cleaned it up in preparation for plotting with the \texttt{pairs} command\index{R function!pairs} by removing data from all years other than 2003 and all of the country-year identifying variables. Finally, we created the scatterplot matrix with \texttt{pairs}. To dynamically include the plot in our final document, we don't need to include all of this code in a code chunk in our markup document. A file containing the code is available on GitHub.\footnote{See: \url{https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/ScatterPlotMatrix.R}.} So we only need to use \verb|source_url| to link to it. I've shortened the raw source code file's URL to: \url{http://bit.ly/TE0gTc}. 
Let's look at the syntax for knitting this into an R Markdown file: {\scriptsize <>= ### Scatterplot Matrix Created from MainData.csv ```{r, echo=FALSE, warning=FALSE, message=FALSE, out.width='500px', out.height='500px'} # Create scatterplot matrix from MainData.csv devtools::source_url("http://bit.ly/TE0gTc") ``` @ } \noindent This code creates the plot that we see in Figure \ref{MarkdownScatterMatrix}. Because we have linked all the way back to the original data set \emph{MainData.csv}, any time it is updated by the makefile, the update will automatically cascade all the way through to our final presentation document the next time we knit it. \begin{figure} \caption{Example of a Scatterplot Matrix in a Markdown Document} \label{MarkdownScatterMatrix} \begin{center} \includegraphics[width=0.7\textwidth]{Children/Chapter10/images10/MarkdownScatterMatrix.png} \end{center} \end{figure} \section{Including \emph{ggplot2} Graphics}\index{ggplot2|(} The \emph{ggplot2} package\footnote{``GG'' stands for grammar of graphics and ``2'' indicates that it is the second major version of the package.} \citep{R-ggplot2}\index{ggplot2} is probably one of the most popular recent developments in R graphics. It greatly expands the aesthetic and substantive tools R has for displaying quantitative information. Figures created with \emph{ggplot2} are (generally) static,\footnote{It is possible to combine a series of figures created with \emph{ggplot2} into an animation. For a nice example of an animation using \emph{ggplot2} see Jerzy Wieczorek's animation of 2012 US presidential campaigning:\index{presidential campaigning} \url{http://bit.ly/UUVKka}.} so they are included in knitted documents the same way as most of R's default graphics. There are a number of very good resources for learning how to use \emph{ggplot2}. These include Hadley Wickham's \emph{ggplot2} book \citeyearpar{Whickham2009book} and article \citeyearpar{Whickham2010journal}. 
The official \emph{ggplot2} website\footnote{\url{http://docs.ggplot2.org/current/}} has up-to-date information. I've also found the Cookbook for R website helpful.\footnote{\url{http://www.cookbook-r.com/Graphs/}} Given that there is already extensive good documentation on \emph{ggplot2} we are not going to learn the full details of how to use the package here. Instead, let's look at some examples of how to manipulate a data frame and a regression results object so that they can be graphed with \emph{ggplot2}. First we will create a multi-line time series plot. Then we will create a caterpillar plot of regression results. Along with giving you a general sense of how \emph{ggplot2} works, the examples illuminate how \emph{ggplot2} can be made part of a fully reproducible research workflow.\footnote{Note that everything we do here with \emph{ggplot2} can also be done with R's default graphics, though the appearance will be different.} Sometimes we may want to show how multiple variables change together over time. For example, imagine we have data on inflation\index{inflation} in the United States along with inflation forecasts made by the US Federal Reserve\index{US Federal Reserve} two quarters beforehand. The data is stored on GitHub at: \url{https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/InflationData.csv}.\footnote{This data is from \cite{GandrudGrafstrom2012}. The example here partially recreates Figure 1 from that paper.} I've loaded the data into R and put it into an object called \emph{InflationData}. It looks like this: % Load inflation data <>= # Create URL object InflationUrl <- "https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/InflationData.csv" # Load data InflationData <- repmis::source_data(InflationUrl) @ {\small <>= names(InflationData) @ } We want to create a plot with \textbf{Quarter} as the $x$ axis, inflation as the $y$ axis, and two lines.
One line will represent \textbf{ActualInflation} and the other \textbf{EstimatedInflation}. To do this we need to reshape\index{reshape data} our data so that the inflation variables are in long format\index{long formatted data} like this: \vspace{0.5cm} \begin{tabular}{l l l} \hline Quarter & Variable & Value \\[0.25cm] \hline\hline 1969.1 & ActualInflation & \\ 1969.1 & EstimatedInflation & \\ 1969.2 & ActualInflation & \\ 1969.2 & EstimatedInflation & \\ \ldots & & \\ \hline \end{tabular} \vspace{0.5cm} \noindent We can use the \texttt{gather} command from \emph{tidyr}\index{R function!gather}\index{tidyr} that we first saw in Chapter \ref{DataClean} (Section \ref{GatherReshape}) to reshape the data. The variable identifying the observations in this case is \texttt{Quarter}. The \textbf{ActualInflation} and \textbf{EstimatedInflation} variables (in columns two and three) are the variables that we want to gather. So let's gather the data: <>= # Load tidyr library(tidyr) # Gather InflationData GatheredInflation <- gather(InflationData, variable, value, 2:3) # Show GatheredInflation variables head(GatheredInflation) @ \noindent Now we have a data set we can use to create our line graph with \emph{ggplot2}. Let's cover a few basic \emph{ggplot2}\index{ggplot2} ideas that will help us understand the following code better. First, plots are composed of layers\index{ggplot2!layers} including the coordinate system, points, labels, and so on. Each layer has aesthetics, including the variables plotted on the $x$ and $y$ axes, label sizes, colors, and shapes. Aesthetic elements are defined by the \texttt{aes}\index{ggplot2!aes}\index{ggplot2!aesthetics} argument. Finally, the main layer types are called geometrics,\index{ggplot2!geometrics} including lines, points, bars, and text. 
Commands that set geometrics usually begin with \texttt{geom}.\index{ggplot2!geom} For example, the geometric to create lines is \verb|geom_line|.\index{ggplot2!geom\_line} {\footnotesize <>= # Load ggplot2 library(ggplot2) # Create plot LinePlot <- ggplot(data = GatheredInflation, aes(x = Quarter, y = value, color = variable, linetype = variable)) + geom_line() + scale_color_discrete(name = "", labels = c("Actual", "Estimated")) + scale_linetype(name = "", labels = c("Actual", "Estimated")) + xlab("\n Quarter") + ylab("Inflation\n") + theme_bw(base_size = 15) # Print plot print(LinePlot) @ } \noindent You can see we set the $x$ and $y$ axes using the \textbf{Quarter} and \textbf{value} variables. We told \emph{ggplot} that elements in the geometric layer should have lines with different colors and line types (dashed, dotted, and so on) based on the value of \textbf{variable} that they represent. \verb|geom_line| specifies that we want to add a line geometric layer.\footnote{Remember from Chapter \ref{GettingStartedRKnitr} that commands must be followed by parentheses. These layers are commands so they need to be followed by parentheses.} \verb|scale_color_discrete| and \verb|scale_linetype|\index{ggplot2!scale\_color\_discrete}\index{ggplot2!scale\_linetype} are used here to hide the plot's legend title with \verb|name = ""| and customize the legend's labels with \verb|labels = . . .|.\index{ggplot2!labels} You can also use them to determine the specific colors and line types you would like to use. \texttt{xlab} and \texttt{ylab} set the axes' labels. You can add a title with \texttt{ggtitle}.\index{ggplot2!ggtitle} Finally, I added \verb|theme_bw|\index{ggplot2!theme\_bw} so that the plot would use a simple black-and-white theme. 
We added the argument \verb|base_size = 15| to increase the plot's font size.\index{ggplot2!base\_size} All of the code required to create this graph is on GitHub at: \url{http://bit.ly/VEvGJG}.\footnote{The full URL is: \url{https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/InflationLineGraph.R}.} So to knit the graph like Figure \ref{ggplot2Line} into an R Sweave-style LaTeX document we type: {\scriptsize \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textbackslash{}begin\{figure\}[ht] \textbackslash{}caption\{Example Multi-line Time Series Plot Created with \textbackslash{}emph\{ggplot2\}\} \textbackslash{}label\{ggplot2Line\} \textbackslash{}begin\{center\} \textless{\textless}echo=FALSE, message=FALSE, warning=FALSE, out.width='10cm', out.height='8cm'\textgreater{\textgreater}= \hlcom{# Create plot} devtools::\hlkwd{source_url}(\hlstr{"http://bit.ly/VEvGJG"}) @ \textbackslash{}end\{center\} \textbackslash{}end\{figure\} \end{alltt} \end{kframe} \end{knitrout} } % Actually add graph \begin{figure} \caption{Example Multi-line Time Series Plot Created with \emph{ggplot2}} \label{ggplot2Line} <>= # Create plot devtools::source_url("http://bit.ly/VEvGJG") @ \end{figure} \noindent The syntax for including this and other \emph{ggplot2} figures in an R Markdown document is the same as we saw for default R graphics. \subsection{Showing regression results with caterpillar plots} Many packages that estimate statistical models from data in R have built-in plotting capabilities. For example, the \emph{survival} package \citep{R-survival} has a \texttt{plot.survfit}\index{plot.survfit} command for plotting survival curves created using event history analysis.\index{event history analysis} These plots can, of course, be knitted into presentation documents like the plots we have seen already.
However, sometimes either a package doesn't have built-in commands for plotting model results the way you want to and/or you want to use \emph{ggplot2} to improve the aesthetic quality of the plots they do create by default. In either case you can almost always create the plot that you want by first breaking into the model results object, extracting what you want, then plotting it with \emph{ggplot2}. The process is very similar to what we did in Chapter \ref{TablesChapter} to create custom tables (see Section \ref{NonSupportedClasses}). To illustrate how this can work, let's create a caterpillar plot, like Figure \ref{CatPlot},\index{caterpillar plot}\index{coefficient} showing the mean coefficient estimates and the uncertainty\index{uncertainty} surrounding them from a Bayesian normal linear regression model\index{Bayesian normal linear regression} using the \emph{swiss} data frame. Here is our model:\index{Zelig}\index{R function!zelig} <>= # Load Zelig package library(Zelig) # Estimate model NBModel2 <- zelig(Examination ~ Education + Agriculture + Catholic + Infant.Mortality, model = "normal.bayes", data = swiss, cite = FALSE) @ \noindent Remember from Chapter \ref{TablesChapter} that we can create an object summarizing\index{R function!summary} our estimation results like this: <>= # Create summary object NBModel2Sum <- summary(NBModel2) # Create summary data frame NBSum2DF <- data.frame(NBModel2Sum$summary) # Show data frame NBSum2DF @ \noindent We want to use \emph{ggplot2} to create credibility intervals\index{credibility interval} for each variable with \textbf{X2.5.} as the minimum value and \textbf{X97.5.} as the maximum value. These are the lower and upper bounds of the middle 95 percent of the estimates' marginal posterior distributions, i.e. 
the 95 percent credibility intervals.\footnote{The procedures used here are also generally applicable for graphing frequentist\index{frequentist} confidence intervals once you have calculated the confidence intervals. One useful command for doing this is \texttt{confint}.\index{R function!confint}\index{confidence interval}} We will also create a point at the \textbf{Mean} of each estimate. To do this we will use \emph{ggplot2}'s \verb|geom_pointrange| command. First we need to do a little tidying up.\label{RowNamesTidy} <>= # Convert row.names to normal variable NBSum2DF$Variable <- row.names(NBSum2DF) # Keep only coefficient estimates ## This allows for a more interpretable scale NBSum2DF <- subset(NBSum2DF, Variable != "(Intercept)") NBSum2DF <- subset(NBSum2DF, Variable != "sigma2") @ \noindent The first line of executable code creates a proper variable out of the data frame's row.names\index{row.names} attribute. In this case row.names contains the names of the variables included in the regression. The second and third executable lines remove the estimates \emph{(Intercept)} and \emph{sigma2}. This allows the variables' coefficient estimates to be plotted on a scale that enables easier interpretation. Now we can create our caterpillar plot. <>= # Load ggplot2 library(ggplot2) # Make caterpillar plot ggplot(data = NBSum2DF, aes(x = reorder(Variable, X2.5.), y = Mean, ymin = X2.5., ymax = X97.5.)) + geom_pointrange(size = 1.4) + geom_hline(yintercept = 0, linetype = "dotted") + xlab("Variable\n") + ylab("\n Coefficient Estimate") + coord_flip() + theme_bw(base_size = 20) @ \noindent There are some new pieces of code in here, so let's take a look. First, the data frame is reordered from the highest to lowest value of \textbf{X2.5.} using the \texttt{reorder} command.\index{R function!reorder} This makes the plot easier to read.
The middle point of the point range is set with \texttt{y} and the lower and upper bounds with \texttt{ymin}\index{ggplot2!ymin} and \texttt{ymax}.\index{ggplot2!ymax} The \verb|geom_hline|\index{ggplot2!geom\_hline} command used here creates a dotted horizontal line at 0, i.e. no effect. \verb|coord_flip|\index{ggplot2!coord\_flip} flips the plot's coordinates so that the variable names are on the $y$ axis. We can include this plot in a knitted document the same way as before. \begin{figure} \caption{An Example Caterpillar Plot Created with \emph{ggplot2}} \label{CatPlot} % Actually include plot <>= # Create plot devtools::source_url("https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/CaterpillarPlot.R") @ \end{figure} \index{ggplot2|)} %%%%%%%%%%%%% googleVis \section{JavaScript Graphs with \emph{googleVis}}\index{googleVis} Markus Gesmann and Diego de Castillo's \emph{googleVis}\index{googleVis} package \citeyearpar{R-googleVis} allows us to use Google's Visualization API\index{API} from within R to create interactive tables, plots, and maps with Google Chart Tools.\index{Google Chart Tools} Because the visualizations are written in JavaScript\index{JavaScript} they can be included in HTML presentation documents created by R Markdown. Unfortunately, they cannot be directly\footnote{The example in this chapter is from a screenshot.} included in LaTeX-produced PDFs. The \emph{animation}\index{animation} package \citep{R-animation} does have some limited features for including interactive visualizations in PDFs (as well as HTML documents) and is worth investigating if you want to do this.
\paragraph{Basic googleVis figures} Let's briefly look at how to make one type of figure with \emph{googleVis}: a choropleth map.\index{choropleth map} This is created with the \texttt{gvisGeoChart} function.\index{gvisGeoChart}\index{R function!gvisGeoChart} We will use this example to illustrate how to incorporate \emph{googleVis} figures into R Markdown.\footnote{For demonstrations of the full range of plotting functions available, visit the \emph{googleVis} website: \url{http://code.google.com/p/google-motion-charts-with-r/wiki/GadgetExamples#googleVis_Examples}.} Imagine that we want to map global fertilizer\index{fertilizer} consumption in 2003 using the World Bank\index{World Bank} data we gathered in Chapter \ref{DataGather}. Remember that the data was highly right skewed, so we will actually map the natural logarithm\index{logarithm}\index{R function!log} of the \textbf{FertilizerConsumption} variable.\footnote{You'll notice in the code below that we remove all values of \textbf{FertilizerConsumption} less-than 0.1. This is so that we can calculate integer values with the natural logarithm. See Section \ref{Infinity} for more details.} Assuming that we have already loaded the \emph{MainData.csv} data set, here is the code: <>= # Load googleVis library(googleVis) # Subset MainData so that it only includes 2003 SubData <- subset(MainData, year == 2003) # Keep values of FertilizerConsumption greater-than 0.1 SubData <- subset(SubData, FertilizerConsumption > 0.1) # Find the natural logarithm of FertilizerConsumption. ## Round the results to one decimal digit. 
SubData$LogConsumption <- round(log(SubData$FertilizerConsumption), digits = 1) # Make a map of Fertilizer Consumption FCMap <- gvisGeoChart(data = SubData, locationvar = "iso2c", colorvar = "LogConsumption", options = list( colors = "['#ECE7F2', '#A6BDDB', '#2B8CBE']", width = "780px", height = "500px")) @ \noindent The \texttt{locationvar} argument specifies the variable with information on each observation's location. Google Chart Tools can use ISO\index{ISO} two-letter country codes to determine each country's location. \texttt{colorvar} specifies the variable with the values to map for each country. We can determine other options by creating a list-type\index{R!list} object with arguments specifying characteristics such as the map's width, height, and colors. The colors here are written using hexadecimal values.\index{hexadecimal} This is a commonly used format for specifying colors on websites.\footnote{You can also use hexadecimal values in \emph{ggplot2}. The Color Brewer 2\index{Color Brewer} website (\url{http://colorbrewer2.org/}) is very helpful for picking hexadecimal color palettes,\index{color palettes} among others.} To view the figure on your computer simply use \emph{googleVis}'s \texttt{plot} command. For example, to view our map we type: <>= plot(FCMap) @ \noindent Note that you need to be connected to the internet to view figures created by \emph{googleVis}, otherwise your image will not be able to access the required JavaScript\index{JavaScript} files from the Google Visualization API.\index{API} \begin{figure} \caption{Screenshot of a \emph{googleVis} Geo Chart} \label{GeoMapImage} \begin{center} \includegraphics[width=\textwidth]{Children/Chapter10/images10/GeoChartScreenShot.png} \end{center} \end{figure} \paragraph{Including \emph{googleVis} in knitted documents} Typing \verb|print(FCMap, tag = "chart")|\index{R function!print} in a knittable document would print the entire JavaScript code needed to create the map. 
Much like we saw with tables produced with \emph{xtable} and \emph{texreg} in Chapter \ref{TablesChapter}, we need to change the code chunk \texttt{results} option to include the map as a map rather than as JavaScript markup. To have the visualization show up in your HTML output, rather than the code block, simply set the code chunk option\index{knitr option!results} to \verb|results='asis'|.\footnote{You can use \texttt{results='asis'} to include almost any type of JavaScript graphics. For an example using the D3 JavaScript library\index{D3 JavaScript library}\index{JavaScript} and \emph{knitr} see this page by Yihui Xie: \url{http://yihui.name/knitr/demo/javascript/}.} For example, the full code needed to create and print \emph{FCMap} is available at: \url{http://bit.ly/VNnZxS}.\footnote{The full URL is: \url{https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/GoogleVisMap.R}.} To knit the map into an R Markdown document we type: <>= ```{r, echo=FALSE, message=FALSE, results='asis'} # Create and print geo map devtools::source_url("http://bit.ly/VNnZxS") ``` @ \paragraph{Note for Motion Charts} You may notice that Google motion charts\footnote{You can use the \texttt{gvisMotionChart}\index{R function!gvisMotionChart} command to make these.}\index{motion chart} do not show up in the RStudio \textbf{Preview HTML}\index{RStudio!Preview HTML} window or even in your web browser when you open the knitted HTML version of the file. You just see a big blank space where you had hoped the chart would be. It will show up, however, if you use the \verb|plot| command on a \verb|gvis| motion chart object in the console. Motion charts can only be displayed when they are hosted on a web server or located in a directory `trusted' by Flash Player.\footnote{This is because motion charts and annotated time line charts rely on Flash,\index{Flash} unlike the other Google visualizations. 
For more information see Markus Gesmann's blog post at: \url{http://www.magesblog.com/2012/05/interactive-reports-in-r-with-knitr-and.html}.}\index{Flash Player} The \verb|plot| command opens a local server, but simply opening the HTML file and the RStudio \textbf{Preview HTML} window do not. An easy way to solve this problem is to save the HTML file in your Dropbox\index{Dropbox} \emph{Public} folder\index{Dropbox!Public folder} and access it through the associated public URL link (see Chapter \ref{Storing}). Publishing a motion chart on GitHub Pages\index{GitHub!Pages} also works well (see Chapter \ref{MarkdownChapter}). For information on how to set a directory as `trusted' by Flash Player\index{Flash Player} see: \url{http://www.macromedia.com/support/documentation/en/flashplayer/help/settings_manager04.html}. \subsection{JavaScript Graphs with \emph{htmlwidgets}-based packages} The number of tools for creating JavaScript graphs from R that can be knitted into HTML files is growing rapidly. The \emph{htmlwidgets}\index{htmlwidgets} \citep{R-htmlwidgets} framework is especially making the development of these tools easier. As of this writing there are tools built on \emph{htmlwidgets} for creating maps, network graphs, time series graphs, and interactive tables, among others. Though the syntax of each of these tools differs, they can all easily be included into R Markdown documents. Often you simply run their core functions in a code chunk, without needing to use an additional call to \texttt{print} or \texttt{plot}. \subsection*{Chapter summary} In this chapter we have learned how to take results from our statistical analyses and other information from our data and dynamically present them in figures. In the next three chapters we will learn the details of how to create the LaTeX and Markdown presentation documents we use to present the tables we created in Chapter \ref{TablesChapter} and the figures we created in this chapter. 
================================================ FILE: Old/Source-v2/Children/Chapter11/chapter11.Rnw ================================================ % Chapter Chapter 11 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 17 April 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Presenting with \emph{knitr}/LaTeX}\label{LatexChapter} We have already begun to see how LaTeX works for presenting research results. This chapter gives you a more detailed and comprehensive introduction to basic LaTeX document structures and commands. It is not a complete introduction to all that LaTeX is capable of, but we will cover enough that you will be able to create an entire well-formatted article and slideshow with LaTeX that you can use to dynamically present your results. In the next chapter (Chapter \ref{LargeDocs}) we will build on these skills by learning how to use {\emph{knitr}} to create more complex LaTeX documents. For basic LaTeX documents, such as short articles or simple presentations, it may often be quicker and simpler to write the markup using an R Markdown document and compile it to PDF with the \emph{rmarkdown} package.\index{rmarkdown} As we will see in Chapter \ref{MarkdownChapter}, Markdown syntax is much simpler than normal LaTeX. However, there are at least two reasons why it is useful to become familiar with LaTeX syntax. First, understanding LaTeX syntax will help you debug issues you might encounter when using \emph{rmarkdown} with LaTeX that would otherwise be mysterious if you were only familiar with Markdown. Second, R Markdown has limited capabilities for creating more complex documents such as books and documents with highly customizable formatting needs. Using \emph{knitr} and LaTeX can be useful in these situations.
In this chapter we will learn about basic LaTeX document structures and syntax as well as how to dynamically create LaTeX bibliographies with BibTeX, R, and \emph{knitr}. Finally, we will look at how to create PDF beamer slideshows with LaTeX and \emph{knitr}. \textbf{Note:} Chapter \ref{LatexChapter} and the following chapter are unusual for this book in that they do not refer to both \emph{knitr} and \emph{rmarkdown}. Instead they focus on capabilities largely exclusive to \emph{knitr}. \section{The Basics} In this section we will look at how to create a LaTeX article including what editor programs to use, the basic structure of a LaTeX document, including preamble and body, LaTeX syntax for creating headings, paragraphs, lines, text formatting, math, lists, footnotes, and cross-references. I will assume that you already have a fully functioning TeX distribution\index{TeX distribution} installed on your computer. See Section \ref{InstallMarkup} for information on how to install TeX. \subsection{Getting started with LaTeX editors} As I mentioned earlier, RStudio\index{RStudio!LaTeX editor} is a fully functional LaTeX editor in addition to being an integrated development environment for R. If you want to create a new LaTeX document you can click {\tt{File}} in the menu bar then {\tt{New}} \textrightarrow{} {\tt{R Sweave}}. \begin{wrapfigure}{r}{0.3\textwidth} \caption{RStudio TeX Format Options} \label{TeXFormat} \begin{center} \includegraphics[scale=0.6]{Children/Chapter11/images11/TeXFormat.png} \end{center} \end{wrapfigure} Remember from Chapter \ref{GettingStartedRKnitr} that R Sweave\index{R Sweave} files are basically LaTeX files that can include {\emph{knitr}} code chunks. You can use RStudio to knit and compile a document with the click of one button: \textbf{Compile PDF}\index{RStudio!Compile PDF button} (\includegraphics[scale=0.5]{Children/Chapter11/images11/CompilePDF.png}). 
You can use this button to compile R Sweave files like regular LaTeX files in RStudio even if they do not have code chunks. If you use another program to compile them you might need to change the file extension from {\tt{.Rnw}} to {\tt{.tex}}. You can also insert many of the items we will cover in this section into your documents with RStudio's LaTeX \texttt{TeX Format} button.\index{RStudio!TeX format button} See Figure \ref{TeXFormat}. There are many other LaTeX editors\index{LaTeX!editors}\footnote{Wikipedia has collated a table that comprehensively compares many of these editors: \url{http://en.wikipedia.org/wiki/List_of_text_editors}.} and many text editors that can be modified to compile LaTeX documents. For example, alongside writing this book in RStudio, I typed much of the LaTeX markup in the Sublime Text\footnote{\url{http://www.sublimetext.com/}} text editor.\index{Sublime Text} None of these options have RStudio's high-level integration with \emph{knitr}, however.\footnote{Andrew Heiss has created a Sublime Text plugin called \emph{KnitrSublime}. It enables some R LaTeX integration. For more details see: \url{https://GitHub.com/andrewheiss/KnitrSublime}.} If you are new to LaTeX you may be more comfortable using Lyx.\index{Lyx} Lyx has a Microsoft Word-type interface, but creates actual LaTeX documents. It also has \emph{knitr} integration. See Chapter \ref{GettingStartedRKnitr}'s Appendix for how to set up and use \emph{knitr} and Lyx. \subsection{Basic LaTeX command syntax}\index{LaTeX!basic command syntax} As you probably noticed in Part III's examples, LaTeX commands start with a backslash (\texttt{\textbackslash{}}).
For example, to create a section heading you use the \verb|\section| command.\index{LaTeX command!section} The arguments for LaTeX commands are written inside of curly braces (\verb|{}|) like this: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} \section{My Section Name} \end{verbatim} \end{kframe} \end{knitrout} \noindent Probably one of the biggest sources of errors that occur when compiling a LaTeX document to PDF\index{PDF}\index{LaTeX!error} are caused by curly brackets that aren't closed, i.e. an open bracket (\verb|{|) is not matched with a subsequent closed bracket (\verb|}|). Watch out for this and use an editor (like RStudio) that highlights brackets' matching pairs. As we will see, unlike in R with parentheses, if your LaTeX command does not have an argument you do not need to include the curly brackets at all. There are a number of places to find comprehensive lists of LaTeX commands. The Netherlands TeX users group\index{Netherlands TeX users} has compiled one: \url{http://www.ntg.nl/doc/biemesderfer/ltxcrib.pdf}. \subsection{The LaTeX preamble \& body}\label{LaTeXPreamble} \index{LaTeX!preamble|(} All LaTeX documents require a preamble. The preamble goes at the very beginning of the document. The preamble usually starts with the \texttt{documentclass}\index{LaTeX command!documentclass} command. This specifies what type of presentation document you are creating--e.g. an article, a book, a slideshow,\footnote{``Slideshow'' is not a valid class. One slideshow class that we discuss later is called ``beamer''.} and so on. LaTeX refers to these as classes.\index{LaTeX!class} Classes specify a document's formatting. You can add options to \texttt{documentclass} to change the format of the entire document. 
For example, if we wanted to create an article class document with two columns we would type: <>= \documentclass[twocolumn]{article} @ In the preamble you can also specify other style options and load any extra packages\index{LaTeX!packages} you may want to use.\footnote{The command to load a package in LaTeX is \texttt{\textbackslash{}usepackage}.\index{LaTeX command!usepackage} For example, if you include \texttt{\textbackslash{}usepackage\{url\}} in the preamble of your document you will be able to specify URL links in the body with the command \texttt{\textbackslash{}url\{SOMEURL\}}.\index{LaTeX package!url}} The preamble is often followed by the body of your document. It is specified with the \texttt{document} environment.\index{LaTeX environment!document} See Chapter \ref{TablesChapter} (Section \ref{LaTeXEnviron}) for more details about LaTeX environments. You tell LaTeX where the body\index{LaTeX!begin document} of your document starts by typing \verb|\begin{document}|. The very last line of your document is usually \verb|\end{document}|, indicating that your document has ended. When you open a new R Sweave file in RStudio it creates an article class\index{LaTeX!article} document with a very simple preamble and body like this: <>= \documentclass{article} \begin{document} \end{document} @ \noindent This is all you need to get a very basic article class document working. If you want the document to be of another class, simply change \texttt{article} to something else, a \texttt{book} for example. Let's begin to modify the markup. First we will include in the preamble the \texttt{hyperref}\index{LaTeX package!hyperref} package for clickable hyperlinks and \texttt{natbib}\index{LaTeX package!natbib} for bibliography formatting.\index{bibliography} We will discuss \texttt{natbib} in more detail below. Note that in general, and unlike in R, almost all of the LaTeX packages you will use were installed on your computer when you installed the TeX distribution.
\index{LaTeX!preamble|)} Next, it's often a good idea to include \emph{knitr} code chunks that specify features of the document as a whole. These can include global chunk options\index{knitr!global chunk options} as well as loading data and packages used throughout the document. Then it's a good idea to specify title information just after the \texttt{document} environment begins.\index{LaTeX environment!document} Use the \texttt{title}\index{LaTeX command!title} command to add a title, the \texttt{author}\index{LaTeX command!author} command to add author information, and \texttt{date}\index{LaTeX command!date} to specify the date.\footnote{In some document classes the current date will automatically be included if you don't specify the date.} Then include the \texttt{maketitle} command.\index{LaTeX command!maketitle} This will place your title and author information in the body of the document. If you are writing an article you may also want to follow \texttt{maketitle} with an abstract. Unsurprisingly, you can use the \texttt{abstract}\index{LaTeX environment!abstract} environment to include this.
Here is a full LaTeX article class document with all of these changes added: {\scriptsize \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} %%%%%%%%%%%%%% Article Preamble %%%%%%%%%%%%%% \textbackslash{}documentclass\{article\} %%%% Load LaTeX packages \textbackslash{}usepackage\{hyperref\} \textbackslash{}usepackage[authoryear]\{natbib\} %%%% Set knitr global options and gather data \textless{}\textless{}Global, include=FALSE\textgreater{}\textgreater{}= \hlcom{#### Set chunk options ####} opts_chunk$\hlkwd{set}(fig.align=\hlstr{'center'}) \hlcom{#### Load and cite R packages ####} \hlcom{# Create list of packages} PackagesUsed <- c(\hlstr{"knitr", "ggplot2", "repmis"}) \hlcom{# Load PackagesUsed and create .bib BibTeX file} \hlcom{# Note must have repmis package installed.} repmis::LoadandCite(PackagesUsed, file = \hlstr{"Packages.bib"}, install = FALSE) \hlcom{#### Gather Democracy data from Pemstein et al. (2010) ####} \hlcom{# For simplicity, store the URL in an object called 'url'.} url <- \hlstr{"http://www.unified-democracy-scores.org/files/20140312/z/uds_summary.csv.gz"} \hlcom{# Create a temporary file called 'temp' to put the zip file into.} temp <- \hlkwd{tempfile}() \hlcom{# Download the compressed file into the temporary file.} \hlkwd{download.file}(url, temp) \hlcom{# Decompress the file and convert it into a data frame} \hlcom{# class object called 'data'.} UDSData <- \hlkwd{read.csv}(\hlkwd{gzfile}(temp, \hlstr{"uds_summary.csv"})) \hlcom{# Delete the temporary file.} \hlkwd{unlink}(temp) @ %%%% Start document body \textbackslash{}begin\{document\} %%%%%%%%%%%%% Create title %%%%%%%%%%%%%%%%% \textbackslash{}title\{An Example knitr LaTeX Article\} \textbackslash{}author\{Christopher Gandrud \textbackslash{}\textbackslash{} Hertie School of Governance\textbackslash{}thanks\{Email: \textbackslash{}href\{mailto:gandrud@hertie-school.org\} \{gandrud@hertie-school.org\}\}\} \textbackslash{}date\{January 
2015\} \textbackslash{}maketitle %%%%%%%%%%%%% Abstract %%%%%%%%%%%%%%%%%%%% \textbackslash{}begin\{abstract\} Here is an example of a knittable article class LaTeX document. \textbackslash{}end\{abstract\} %%%%%%%%%%% Article Main Text %%%%%%%%%%%%% \textbackslash{}section\{The Graph\} I gathered data from \textbackslash{}cite\{Pemstein2010\} on countries' democracy level. They call their democracy measure the Unified Democracy Score (UDS). Figure \textbackslash{}ref\{DemPlot\} shows the mean UDS scores over time for all of the countries in their sample. \textbackslash{}begin\{figure\} \textbackslash{}caption\{Mean UDS Scores\} \textbackslash{}label\{DemPlot\} \textless{}\textless{}echo=FALSE, message=FALSE, warning=FALSE, out.width='7cm', out.height='7cm'\textgreater{}\textgreater{}= \hlcom{# Graph UDS scores} \hlkwd{ggplot}(UDSData, \hlkwd{aes}(x = year, y = mean)) + \hlkwd{geom_point}(alpha = I(0.1)) + \hlkwd{stat_smooth}(size = 2) + \hlkwd{ylab}(\hlstr{"Democracy Score"}) + \hlkwd{xlab}(\hlstr{""}) + \hlkwd{theme\_bw}() @ \textbackslash{}end\{figure\} %%%%%%%%%%% Reproducing the Document %%%%% \textbackslash{}section\*\{Appendix: Reproducing the Document\} This document was created using R version \textbackslash{}Sexpr\{\hlkwd{paste0}(version$major, ".", version$minor)\} and the R package \textbackslash{}emph\{knitr\} \textbackslash{}citep\{R-knitr\}. It also relied on the R packages \textbackslash{}emph\{ggplot2\} \textbackslash{}citep\{R-ggplot2\} and \textbackslash{}emph\{repmis\} \textbackslash{}citep\{R-repmis\}. The document can be completely reproduced from source files available on GitHub at: \textbackslash{}url\{https://GitHub.com/christophergandrud/Rep-Res-Examples\}. 
%%%%%%%%% Bibliography %%%%%%%%%%%%%%%%%%%% \textbackslash{}bibliographystyle\{apa\} \textbackslash{}bibliography\{Main.bib,Packages.bib\} \textbackslash{}end\{document\} \end{alltt} \end{kframe} \end{knitrout} } \noindent The \emph{knitr} code chunk\index{knitr!code chunk} syntax should be familiar to you from previous chapters, so let's unpack the LaTeX syntax from just after the first code chunk, including the ``Create Title'' and ``Abstract'' parts. New syntax shown in later parts of this example is discussed in the remainder of this section and the next section on bibliographies. First, remember that the percent sign (\%) is LaTeX's comment character. Using it to comment your markup can make it easier to read. Second, as we saw in Chapter \ref{TablesChapter} (Section \ref{LaTeXTables}), double backslashes (\verb|\\|),\index{LaTeX!\textbackslash{}\textbackslash{}} like those after the author's name, force a new line in LaTeX. We will discuss the \texttt{emph} command in a moment. Third, using the \texttt{thanks}\index{LaTeX command!thanks} command allows us to create a footnote for author contact information\footnote{Frequently it also includes thank-yous to people who have helped the research.} that is not numbered like the other footnotes (see below). Finally, you'll notice \verb|\href{mailto: . . . .org}}|.\index{LaTeX command!href}\index{LaTeX command!mailto} This creates an email address in the final document that will open the reader's default email program\index{LaTeX!email program} when clicked. Finally, you may have noticed the following line: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} \textbackslash{}Sexpr\{paste0(version$major, ".", version$minor)\} \end{alltt} \end{kframe} \end{knitrout} \noindent This code finds the current version of R being used and prints the version number into the presentation document. 
\subsection{Headings}\index{LaTeX!headings} Earlier in the chapter we briefly saw how to create section-level headings with \texttt{section}.\index{LaTeX command!section} There are a number of other sub-section-level headings including \texttt{subsection}, \texttt{subsubsection}, \texttt{paragraph}, and \texttt{subparagraph}.\index{LaTeX command!subsection}\index{LaTeX command!subsubsection}\index{LaTeX command!paragraph}\index{LaTeX command!subparagraph} Headers are numbered automatically by LaTeX.\footnote{The \texttt{paragraph} level does not have numbers.} To have an unnumbered section,\index{LaTeX!unnumbered section} place an asterisk in it like this: \verb|\section*{Unnumbered Section}|. In book class documents you can also use \texttt{chapter}\index{LaTeX command!chapter} to create new chapters and \texttt{part} for collections of chapters.\index{LaTeX command!part} \subsection{Paragraphs \& spacing}\index{LaTeX!paragraph}\index{LaTeX!spacing} In LaTeX, paragraphs are simply created by adding a blank line between lines. It will format all of the tabs for the beginning of paragraphs based on the document's class rules. As we discussed before, writing tabs in the markup version of your document does nothing in the compiled document. They are generally used just to make the markup easier for people to read.\index{LaTeX!tabs} Note that adding more blank lines between paragraphs will not add extra space between the paragraphs in the final document. To specify the space following paragraphs (or almost any line) use the \texttt{vspace} (vertical space) command.\index{LaTeX command!vspace} For example, to add three centimeters of vertical space on a page type: \verb|\vspace{3cm}|. This gives us the following space: \vspace{3cm} Similarly, adding extra spaces between words in your LaTeX markup won't create extra spaces between words in the compiled document. 
To add horizontal space use the \texttt{hspace}\index{LaTeX command!hspace} command in the same way as \texttt{vspace}. \subsection{Horizontal lines}\index{LaTeX command!hline}\index{LaTeX command!hrulefill}\index{LaTeX!lines} Use the \texttt{hrulefill} command to create horizontal lines in the text of your document. For example, \verb|\hrulefill| creates: \vspace{0.2cm} \hrulefill \noindent Inside of a \verb|tabular| environment,\index{LaTeX environment!tabular} use the \verb|hline| command rather than \verb|hrulefill|. \subsection{Text formatting} Let's briefly look at how to do some of the more common types of text formatting in LaTeX and how to create some commonly used diacritics and special characters. \paragraph{Italics \& Bold}\index{LaTeX!italics}\index{LaTeX!emphasis}\index{LaTeX!bold} To italicize a word in LaTeX use the \texttt{emph} (emphasis) command.\index{LaTeX command!emph} For bold use \texttt{textbf}.\index{LaTeX command!textbf} You can nest commands inside of one another to combine their effect. For example, to \emph{\textbf{italicize and bold}} a word use: \verb|\emph{\textbf{italicize and bold}}|. \paragraph{Font size}\label{FontSize}\index{LaTeX!font size} You can specify the base font size of an entire document with a \texttt{documentclass} option. For example, to create an article with 12-point font use: \texttt{\textbackslash{}documentclass[12pt]\{article\}}. There are a number of commands to set the size of specific pieces of text relative to the base size. See Table \ref{LaTeXFontSize} for the full list. Usually a slightly different syntax is used for these commands that goes like this: \verb|{\SIZE_COMMAND . . . }|. For example, to use the {\tiny{tiny size}} in your text use: \verb|{\tiny{tiny size}}|. You can change the size of code chunks that \emph{knitr} places in presentation documents using these commands. Just place the code chunk inside of \verb|{\SIZE_COMMAND . . . }|.
This is similar to using the \verb|size| code chunk option.\index{knitr option!size} \begin{table} \caption{LaTeX Font Size Commands} \label{LaTeXFontSize} \begin{center} \vspace{0.2cm} \begin{tabular}{c} {\Huge \texttt{Huge}} \\ {\huge \texttt{huge}} \\ {\LARGE \texttt{LARGE}} \\ {\Large \texttt{Large}} \\ {\large \texttt{large}} \\ {\normalsize \texttt{normalsize}} \\ {\small \texttt{small}} \\ {\footnotesize \texttt{footnotesize}} \\ {\scriptsize \texttt{scriptsize}} \\ {\tiny \texttt{tiny}} \vspace{0.2cm} \end{tabular} \end{center} \end{table} \paragraph{Diacritics}\index{LaTeX!diacritics} You cannot directly enter letters with diacritics--e.g. accent mark--into LaTeX. For example, to create a letter c with a cedilla (\c{c}) you need to type \verb|\c{c}|. To create an `a' with an acute accent (\'{a}) type: \verb|\'{a}|. There are obviously many types of diacritics and commands to include them within LaTeX-produced documents. For a comprehensive discussion of the issue and a list of commands see the LaTeX Wikibook page on the topic: \url{http://en.wikibooks.org/wiki/LaTeX/Special_Characters}. If you regularly use non-English alphabets you might also be interested in reading the LaTeX Wikibook page on internationalization: \url{http://en.wikibooks.org/wiki/LaTeX/Internationalization}.\index{LaTeX!internationalization}\index{LaTeX!non-English characters} \paragraph{Quotation marks}\index{LaTeX!quotation marks} To specify double left quotation marks (``) use two back ticks (\verb|``|). For double right quotes ('') use two apostrophes (\verb|''|). Single quotes follow the same format (\verb|`'|). \subsection{Math}\index{LaTeX!math}\label{MathLaTeX} LaTeX is particularly popular among quantitative researchers and mathematicians because it is very good at rendering mathematics. 
A complete listing of every math command would take up quite a bit of space.\footnote{See the Netherlands TeX user group list mentioned earlier for an extensive compilation of math commands.} I am briefly going to discuss how to include math in a LaTeX document. This discussion includes a few math syntax examples. To include math inline with your text, place the math syntax in between backslashes and parentheses, i.e. \verb|\( . . . \)|. For example, \verb|\( s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \)| produces \( s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \) in our final document.\footnote{Instead of backslashes and parentheses you can also use a pair of dollar signs (\texttt{\$\ldots \$})\index{LaTeX!\$}.} We can display math separately from the text by placing the math commands inside of backslashes and square brackets: \verb|\[ . . . \]|.\footnote{Equivalently, use two pairs of dollar signs (\texttt{\$\$\ldots \$\$}) or the \texttt{displaymath} environment.\index{LaTeX environment!displaymath} Though it will still work in most cases, the double dollar sign math syntax may cause errors. You can also number display equations using the \texttt{equation} environment.\index{LaTeX environment!equation}} For example, <>= \[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \] @ \noindent gives us: \[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \] \subsection{Lists}\index{LaTeX!lists} To create bullet lists\index{LaTeX!bullet lists} in LaTeX use the \texttt{itemize}\index{LaTeX environment!itemize} environment. Each list item is delimited with the \texttt{item}\index{LaTeX command!item} command. For example: <>= \begin{itemize} \item The first item. \item The second item. \item The third item. \end{itemize} @ \noindent gives us: \begin{itemize} \item The first item. \item The second item. \item The third item. \end{itemize} \noindent To create a numbered list use the \texttt{enumerate}\index{LaTeX environment!enumerate} environment instead of \texttt{itemize}.
You can create sublists\index{LaTeX!sublists} simply by nesting lists inside of lists like this: <>= \begin{itemize} \item The first item. \item The second item. \begin{itemize} \item A sublist item \end{itemize} \item The third item. \end{itemize} @ \noindent which gives us: \begin{itemize} \item The first item. \item The second item. \begin{itemize} \item A sublist item \end{itemize} \item The third item. \end{itemize} \subsection{Footnotes}\index{LaTeX!footnotes} Plain, non-bibliographic footnotes are easy to create in LaTeX. Simply place \texttt{\textbackslash{}footnote\{} where you would like the footnote number to appear in the text. Then type the footnote's text. Of course, remember to close the footnote with a \texttt{\}}. LaTeX does the rest, including formatting and numbering. \subsection{Cross-references}\index{LaTeX!cross-references} LaTeX will also automatically format cross-references. We were already partially introduced to cross-references in chapters \ref{TablesChapter} and \ref{FiguresChapter}. At the place where you would like to reference, add a \texttt{label} such as \verb|\label{ACrossRefLabel}|.\index{LaTeX command!label} It doesn't really matter what label you choose, though make sure they are not duplicated in the document. Also, it can be a good idea to use the same conventions that we learned for labeling R objects (see Section \ref{ObjectNames}). Then place a \texttt{ref}\index{LaTeX command!ref} command (e.g. \verb|\ref{ACrossRefLabel}|) at the place in the text where you want the cross-reference to be. If you place the \texttt{label} on the same line as a heading command, \texttt{ref} will place the heading number. If \texttt{label} is in a \texttt{table} or \texttt{figure} environment you will get the table or figure number. You can also use \texttt{pageref} instead of \texttt{ref} to include the page number. Finally, loading the \emph{hyperref}\index{LaTeX package!hyperref} package makes cross-references (or footnotes) clickable.
Clicking on them will take you to the items they refer to. \section{Bibliographies with BibTeX}\label{BibTeXBib}\index{BibTeX|(}\index{LaTeX!bibliographies|(}\index{bibliography|(} LaTeX can take advantage of very comprehensive bibliography-making capabilities. All major TeX distributions come with BibTeX. BibTeX is basically a tool for creating databases of citation information. In this section, we are going to see how to incorporate a BibTeX bibliography into your LaTeX documents. Then we will learn how use R to automatically generate a bibliography of packages used to create a knitted document. For more information on BibTeX syntax see the LaTeX Wikibook page on Bibliography management: \url{http://en.wikibooks.org/wiki/LaTeX/Bibliography_Management}. \subsection{The \emph{.bib} file} BibTeX bibliographies are stored in plain-text files with the extension \texttt{.bib}. These files are databases of citations.\footnote{The order of the citations does not matter.} The syntax for each citation goes like this: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} @DOCUMENT_TYPE\{CITE_KEY, title = \{TITLE\}, author = \{AUTHOR\}, . . . = \{. . .\} \} \end{alltt} \end{kframe} \end{knitrout} \noindent \verb|DOCUMENT_TYPE| specifies what type of document--article, book, webpage, and so on--the citation is for. This determines what items the citation can and needs to include. Then we have the \verb|CITE_KEY|.\index{BibTeX!citation keys} This is the reference's label that you will use to include the citation in your presentation documents. We'll look more at this later in the section. Each citation must have a unique \verb|CITE_KEY|. A common way to write these keys is to use the author's surname and the publication year, e.g. \verb|Donoho2009|. The cite key is followed by the other citation attributes such as \texttt{author}, \texttt{title}, and \texttt{year}. These attributes all follow the same syntax: \verb|ATTRIBUTE = {. . 
.}|. It's worth taking a moment to discuss the syntax for the BibTeX author attribute. First, multiple author names are separated by \texttt{and}. Second, BibTeX assumes that the last word for each author is their surname. If you would like multiple words to be taken as the ``surname'' then enclose these words in curly brackets. If we wanted to cite the World Bank\index{World Bank, citing} as an author we write \verb|{World Bank}|; otherwise it will be formatted ``Bank, World'' in the presentation document. Here is a complete BibTeX entry for \cite{Donoho2009}: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} @article\{Donoho2009, author = \{David L Donoho and Arian Maleki and Morteza Shahram and Inam Ur Rahman and Victoria Stodden\}, title = \{Reproducible research in computational harmonic analysis\}, journal = \{Computing in Science \& Engineering\}, year = \{2009\}, volume = \{11\}, number = \{1\}, pages = \{8--18\} \} \end{alltt} \end{kframe} \end{knitrout} \noindent Each item of the entry must end in a comma, except the last one.\footnote{This is very similar to how we create vectors in R, though in BibTeX you can actually have a comma after the last attribute.} \subsection{Including citations in LaTeX documents} When you want to include citations from a BibTeX file in your LaTeX document you first use the \texttt{bibliography}\index{LaTeX command!bibliography} command. For example, if the BibTeX file is called \emph{Main.bib} and it is in the same directory as your markup document, then type: \verb|\bibliography{Main.bib}|. You can use a bibliography stored in another directory; just include the appropriate file path information. Usually \texttt{bibliography} is placed right before \verb|\end{document}| so that it appears at the end of the compiled presentation document. 
You can also specify how you would like the references to be formatted using the \texttt{bibliographystyle}\index{LaTeX command!bibliographystyle} command. For example, this book uses the American Psychological Association (APA)\index{APA} style for references. To set this I included \verb|\bibliographystyle{apa}| directly before \texttt{bibliography}. The default style\footnote{It is referred to in LaTeX as the plain style.} is to number citations (e.g. [1]) rather than include author-year information\footnote{This is sometimes referred to as the ``Harvard'' style.} used by the APA. You will need to include the LaTeX package \emph{natbib}\index{LaTeX package!Natbib} in your preamble to be able to use author-year citation styles. This book includes \verb|\usepackage[authoryear]{natbib}| in its preamble.\index{author-year citations}\index{Harvard style citations} Place the \texttt{cite}\index{LaTeX command!cite} command in your document's text where you want to place a reference. You include the \verb|CITE_KEY| for the reference in this command, e.g. \verb|\cite{Donoho2009}|. You can include multiple citations in \texttt{cite}, just separate the \verb|CITE_KEY|s with commas. You can add options such as the page numbers or other text to a citation using square brackets ([]). For example, if we wanted to cite the tenth page of \cite{Donoho2009} we type: \verb|\cite[10]{Donoho2009}|. The author-year style in-text citation that this produces looks like this: \cite[10]{Donoho2009}. You can add text at the beginning of a citation with another set of square brackets. Typing \verb|\cite[see][10]{Donoho2009}| gives us: \cite[see][10]{Donoho2009}. If you are using an author-year style you can use a variety of \emph{natbib} commands to change what information is included in the parentheses. For a selection of these commands and examples, see Table \ref{NatbibTable}. 
\begin{table} \caption{A Selection of \emph{natbib} In-text Citation Style Commands} \label{NatbibTable} \begin{center} \begin{tabular}{l r} \hline Command Example & Output \\[0.25cm] \hline\hline \verb|\cite{Donoho2009}| & \cite{Donoho2009} \\ \verb|\citep{Donoho2009}| & \citep{Donoho2009} \\ \verb|\citeauthor{Donoho2009}| & \citeauthor{Donoho2009} \\ \verb|\citeyear{Donoho2009}| & \citeyear{Donoho2009} \\ \verb|\citeyearpar{Donoho2009}| & \citeyearpar{Donoho2009} \\ \hline \end{tabular} \end{center} \end{table} \subsection{Generating a BibTeX file of R package citations}\index{BibTeX!automatic generation} Researchers are pretty good about citing others' articles and data. However, citations of R packages used in analyses are very inconsistent. This is unfortunate not only because correct attribution is not being given to those who worked to create the packages, but also because it makes reproducibility harder. Not citing packages obscures important steps that were taken in the research process, primarily which package versions were used. Fortunately, there are R tools for quickly and dynamically generating package BibTeX files, including the versions of the packages you are using. They will automatically update the citations each time you compile your document to reflect any changes made to the packages. You can automatically create citations for R packages using the \texttt{citation}\index{R function!citation} command inside of a code chunk. For example, if you want the citation information for the \texttt{xtable}\index{xtable} package you simply type: {\small <>= citation("xtable") @ } \noindent This gives you both the plain citation as well as the BibTeX version. If you only want the BibTeX version of the citation you can use the \texttt{toBibtex} command.\index{R function!toBibtex} <>= toBibtex(citation("xtable")) @ The {\emph{knitr}} package creates BibTeX bibliographies for R packages with the \verb|write_bib|\index{R function!write\_bib} command.
Let's make a BibTeX file called \emph{Packages.bib} containing citation information for the \emph{xtable} package. <>= # Create package BibTeX file knitr::write_bib("xtable", file = "Packages.bib") @ \noindent \verb|write_bib| automatically assigns each entry a cite key using the format \verb|R-PACKAGE_NAME|, e.g. \verb|R-xtable|. \textbf{Warning:} \emph{knitr}'s \verb|write_bib| command currently does not have the ability to append package citations to an existing file, but instead writes them to a new file. If there is already a file with the same name, it will overwrite the file. So, be very careful using this command to avoid accidental deletions. It is a good idea to have \verb|write_bib| always write to a file specifically for automatically generated package citations. You can include more than one bibliography in LaTeX's \texttt{bibliography} command. All you need to do is separate them with a comma. <>= \bibliography{Main.bib,Packages.bib} @ We can use these techniques to automatically create a BibTeX file with citation information for all of the packages used in a research project. Simply make a character vector of the names of packages that you would like to include in your bibliography. Then run this through \verb|write_bib|. You can make sure you are citing all of the key packages used in a knitted document by (a) creating a vector of all of the packages and then (b) using this in the following code to both load the packages and write the bibliography: <>= # Package list PackagesUsed <- c("ggplot2", "knitr", "xtable", "Zelig") # Load packages lapply(PackagesUsed, library, character.only = TRUE) # Create package BibTeX file knitr::write_bib(PackagesUsed, file = "Packages.bib") @ \noindent In the first executable line we just create our list of packages to load and cite. The next command is \texttt{lapply}\index{R function!lapply} (list apply). This applies the function \texttt{library} to all of the items in \emph{PackagesUsed}. 
\texttt{character.only = TRUE} is a \texttt{library}\index{R function!library} argument that allows us to use character string versions of the package names as R sees them in the \emph{PackagesUsed} vector, rather than as objects (how we have used \texttt{library} up until now). If you include these commands in a code chunk at the beginning of your knitted document, then you can be sure that you will have a BibTeX file with all of your packages. The full LaTeX document example I showed you earlier uses the \texttt{LoadandCite} command\index{R function!LoadandCite} from the \emph{repmis} package. This simplifies the process of loading and citing R packages.\index{repmis}\footnote{It can also install the packages if the option \texttt{install = TRUE}. You can have it install specific package versions by entering the version numbers with the \texttt{versions} argument. This is very useful for enabling the replication of analyses that rely on specific package versions.} \index{BibTeX|)}\index{LaTeX!bibliographies|)}\index{bibliography|)} \section{Presentations with LaTeX Beamer}\label{latexBeamer} \index{beamer|(}\index{LaTeXbeamer|(} You can make slideshow presentations with LaTeX. Creating a presentation with a markup language can take a bit more effort than using a WYSIWYG program like Microsoft PowerPoint\index{Microsoft PowerPoint} or Apple's Keynote.\index{Apple Keynote} However, combining LaTeX and \emph{knitr} can make fully reproducible presentations that dynamically create and present results. I have found this particularly useful in my teaching as dynamically produced presentations allow me to provide my students with fully replicable examples of how I created a figure on a slide, for example. \emph{knitr} also makes it easy to beautifully present code examples. One of the most popular LaTeX tools for slideshows is the beamer class.
When you compile a beamer class document, a PDF will be created where every page is a different slide (see Figure \ref{BeamerExample}). All major PDF viewer programs have some sort of ``View Full Screen'' option to view beamer PDFs as full screen slideshows. Usually you can navigate through the slides with the forward and back arrows on the keyboard. In this section we will take a brief look at the basics of creating slideshows with beamer, highlighting special considerations that need to be made when working with beamer and \emph{knitr}. A full example of a knittable beamer presentation with illustrations of many of the points discussed here is printed at the end of the chapter. \begin{figure} \caption{Knitted Beamer PDF Example} \label{BeamerExample} \begin{center} \includegraphics[scale=0.5]{Children/Chapter11/images11/BeamerExample.png} \end{center} {\scriptsize The presentation in this example was created using a custom beamer theme available at: \url{https://GitHub.com/christophergandrud/Make-Projects/tree/master/Rnw_Lecture}.} \end{figure} \subsection{Beamer basics} {\emph{knitr}} largely works the same way in LaTeX slideshows as it does in article or book class documents. There are a few differences to look out for. \paragraph{The Beamer preamble} You use \texttt{documentclass}\index{LaTeX command!documentclass} to set a LaTeX document as a \texttt{beamer} slideshow. You can also include global style information in the preamble by using the commands \texttt{usetheme},\index{LaTeX command!usetheme}\index{LaTeX command!usecolortheme}\index{LaTeX command!useinnertheme}\index{LaTeX command!useoutertheme} \texttt{usecolortheme}, \texttt{useinnertheme}, \texttt{useoutertheme}. For a fairly comprehensive compilation of beamer themes see the Hartwork's Beamer theme matrix: \url{http://www.hartwork.org/beamer-theme-matrix/}.
\paragraph{Slide frames}\index{LaTeX!beamer slides} After the preamble, you start your document as usual by beginning the \texttt{document} environment.\index{LaTeX environment!document} Then you need to start creating slides. Individual beamer slides are created using the \texttt{frame}\index{LaTeX command!frame}\index{LaTeX environment!frame} environments. Create a frame title using \texttt{frametitle}.\index{LaTeX command!frametitle} \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} \frame{ \frametitle{An example frame} } \end{verbatim} \end{kframe} \end{knitrout} \noindent Note that you can also use the usual \verb|\begin{frame} . . \end{frame}| syntax. Unlike in a WYSIWYG slide show program, you will not be able to tell if you have tried to put more information on one slide than it can handle until after you compile the document.\footnote{One way to deal with frames that span multiple slides is to use the \texttt{allowframebreaks} command, i.e. \texttt{\textbackslash{}begin\{frame\}[allowframebreaks].\index{LaTeX command!allowframebreaks}}} \paragraph{Title frames}\index{LaTeX!beamer title frames} One important difference from a regular LaTeX article is that instead of using \texttt{maketitle} to place your title information, in beamer you place the \texttt{titlepage}\index{LaTeX command!titlepage} inside of a frame by itself. \paragraph{Sections \& outlines} We can use section\index{LaTeX command!section} commands in much the same way as we do in other types of LaTeX documents. Section commands do not need to be placed inside of frames. After the title slide, many slideshows have a presentation outline. You can automatically create one from your section headings using the \texttt{tableofcontents}\index{LaTeX command!tableofcontents} command. 
Like the \texttt{titlepage} command,\index{LaTeX command!titlepage} \texttt{tableofcontents} can go on its own frame, i.e.\index{LaTeX!table of contents}\index{LaTeX!outlines} \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} %%% Title slide \frame{ \titlepage } %% Table of contents slide \frame{ \frametitle{Outline} \tableofcontents } \end{verbatim} \end{kframe} \end{knitrout} \paragraph{Make list items appear}\index{LaTeX!list appear} Lists work the same way in beamer as they do in other LaTeX document classes. They do have an added feature in that you can have each item appear as you progress through the slide show. After \verb|\item|, place the number of the order in which the item should appear. Enclose the number in \verb|< ->|. For example, <>= \begin{itemize} \item<1-> The first item. \item<2-> The second item. \item<2-> The third item. \end{itemize} @ \noindent In this example the first item will appear before the next two. These two will appear at the same time. \subsection{\emph{knitr} with LaTeX slideshows} \emph{knitr} code chunks have the same syntax in LaTeX slideshows as in other LaTeX documents. You do need to make one change to the \texttt{frame} options, however, to include highlighted {\emph{knitr}} code chunks on your slides. 
You should add the \texttt{fragile} option to the \texttt{frame} command.\footnote{For a detailed discussion of why you need to use the \texttt{fragile} option with the \texttt{verbatim} environment\index{LaTeX environment!verbatim} that {\emph{knitr}} uses to display highlighted text in LaTeX documents see this blog post by Pieter Belmans: \url{http://pbelmans.wordpress.com/2011/02/20/why-latex-beamer-needs-fragile-when-using-verbatim/} (posted 20 February 2011).} Here is an example: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} \begin{frame}[fragile] \frametitle{An example fragile frame.} \end{frame} \end{verbatim} \end{kframe} \end{knitrout} \noindent Here is a complete knittable beamer example: {\scriptsize \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} \textbackslash{}documentclass\{beamer\} \textbackslash{}begin\{document\} %% Title page inforamtion \textbackslash{}title\{Example Beamer/\textbackslash{}emph\{knitr\} Slideshow\} \textbackslash{}author\{\textbackslash{}href\{mailto:gandrud@hertie-school.org\}\{Christopher Gandrud\}\} %%% Title slide \textbackslash{}frame\{ \textbackslash{}titlepage \} %% Table of contents slide \textbackslash{}frame\{ \textbackslash{}frametitle\{Outline\} \textbackslash{}tableofcontents \} %%% The code \textbackslash{}section\{Access the code\} \textbackslash{}begin\{frame\}[fragile] \textbackslash{}frametitle\{Access the code\} The code to create the following figure is available online. 
To access it we can type: \textless{}\textless{}eval=FALSE\textgreater{}\textgreater{}= \hlcom{# Access and run the code to create a caterpillar plot} devtools::source\_url(\hlstr{"http://bit.ly/VRKphr"}) @ \textbackslash{}end\{frame\} %%% The figure \textbackslash{}section\{The Figure\} \textbackslash{}begin\{frame\}[fragile] \textbackslash{}frametitle\{The resulting figure\} \textless{}\textless{}echo=FALSE, message=FALSE, out.width='\textbackslash{}\textbackslash{}textwidth', out.height='0.8\textbackslash{}\textbackslash{}textheight'\textgreater{}\textgreater{}= \hlcom{# Access and run the figure code} devtools::source\_url(\hlstr{"http://bit.ly/VRKphr"}) @ \textbackslash{}end\{frame\} \textbackslash{}end\{document\} \end{alltt} \end{kframe} \end{knitrout} } In Chapter \ref{MarkdownChapter} we will see how to use the \emph{rmarkdown} package to create beamer presentations with the much simpler Markdown syntax. \index{beamer|)}\index{LaTeXbeamer|)} \subsection*{Chapter summary} In this chapter we have learned the nitty-gritty of how to create simple LaTeX documents--articles and slideshows--that we can embed our reproducible research in using \emph{knitr}. In the next chapter we look at how to create more complex LaTeX documents, including theses, books, and batch reports. ================================================ FILE: Old/Source-v2/Children/Chapter12/chapter12.Rnw ================================================ % Chapter Chapter 12 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 5 May 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Large \emph{knitr}/LaTeX Documents: Theses, Books, and Batch Reports}\label{LargeDocs} In the previous chapter we learned the basics of how to make LaTeX documents to create and present research findings. So far we have only learned how to create short documents, like articles and slideshows. 
For longer and more complex documents, such as theses and books, a single LaTeX markup file can become very unwieldy very quickly, especially when it includes \emph{knitr} code chunks as well. Ideally we would segment the markup file into individual chapter files and then bring them all together when we compile the whole document. This would allow us to benefit from a modular file structure while producing one presentation document with continuous section and page numbering. To do this we can take advantage of LaTeX and \emph{knitr} to separate markup files into manageable pieces. Like directories, these pieces are called \textbf{child} files, which are combined using a \textbf{parent} document. Many of these tools can also be used to create batch reports\index{batch reports}: documents that present results for a selected part of a data set. For example, a researcher may want to create individual reports of answers to survey questions from interviewees with a specific age. In the latter part of this chapter we will rely on {\emph{knitr}} and the \emph{brew} package \citep{R-brew} to create batch reports. In this chapter we will first briefly discuss how to plan a large document's file structure. We will then look at three methods for including child documents into parent documents. The first is very simple and uses the LaTeX command \texttt{input}\index{LaTeX command!input}. The second uses \emph{knitr} to include knittable child documents. The final method is a special case of the \emph{knitr} method that uses the command-line program Pandoc \index{Pandoc} to convert child documents written in non-LaTeX markup languages and include them into a LaTeX parent. After this we will look at how to create batch reports. \section{Planning Large Documents} Before discussing the specifics of each of these methods, it's worth taking a moment to carefully plan the structure of our child and parent documents. Books and theses have a natural parent-child structure, i.e. 
they are single documents comprised of multiple chapters. They often include other child-like features such as title pages, bibliographies, figures, and appendices. You could include most of these features directly into one markup file. But this file would become very large and unwieldy. It would be difficult to find the one part or section that you want to edit. If your presentation markup files are difficult to navigate, they are difficult to reproduce. Instead of one long markup file, you can break the document at natural division points, like chapters, into multiple child documents.\index{child files}\index{parent document} These can then be combined with a parent document. The parent document acts like the skeleton that organizes the children in a specific order. The parent document can be compiled and all of the children will be in the right place. In LaTeX, a parent document will include the preamble where the document class (\texttt{book} for example\index{LaTeXbook}) is set and all of the necessary LaTeX packages are loaded. It also includes \emph{knitr} global options, the \texttt{maketitle}, \verb|\begin{document}| and \verb|\end{document}|, and the \texttt{bibliography.} When you compile the parent document you will compile the entire document. Notice that if the parent document contains the preamble and so on, that the children cannot contain this information as well. This can create some issues if you only want to compile one chapter rather than the whole document. We will see how to overcome this problem with \emph{knitr} later in the chapter. To make your many child and parent documents manageable, it is a good idea to store your child files in a subdirectory of the folder storing the parent file. 
This book was created using a knittable parent and child structure, so please see the markup files on GitHub for a complete example of how to use \emph{knitr} with large documents.\footnote{See: \url{https://github.com/christophergandrud/Rep-Res-Book/tree/master/Source}.} When segmenting your presentation documents into parents and children, the remainder of your research project structure can stay largely the same as we have seen so far. \section{Large Documents with Traditional LaTeX} Imagine that we are writing a book with three chapters. No part of the document includes \emph{knitr} code chunks. We can split the book into three child documents and place them in a subdirectory of the parent document's folder called \emph{Children}. The child documents should not contain a preamble, \verb|\begin{document}|, or \verb|\end{document}|. Because they are chapters, we will begin the documents simply with the \texttt{chapter} heading.\index{LaTeX command!chapter} For example, the chapter in this book has: <>= \chapter{Large \emph{knitr}/LaTeX Documents: Theses, Books, \& Batch Reports}\label{LargeDocs} @ \noindent As we saw earlier, the \texttt{label}\index{LaTeX command!label} command is used for cross-referencing. \subsection{Inputting/including children} Now in the parent document we can place the \texttt{input}\index{LaTeX command!input} command where we would like the child to show up in the final document. If we want there to be a clear page on either side of the included document we should use the \texttt{include}\index{LaTeX command!include} command instead. In the \texttt{input} or \texttt{include} command, we simply place the child document's file path. 
Here is an example parent document with three child documents (\emph{Chapter1.tex}, \emph{Chapter2.tex}, and \emph{Chapter3.tex}) all located in a subdirectory of the parent document called \emph{Children}: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} %%%%%%%%%%%%%% Article Preamble %%%%%%%%%%%%%% \textbackslash{}documentclass\{book\} %%%% Load LaTeX packages \textbackslash{}usepackage\{hyperref\} \textbackslash{}usepackage\{makeidx\} \textbackslash{}usepackage[authoryear]\{natbib\} %%%% Start document body \textbackslash{}begin\{document\} %%%%%%%%%%%%% Create title %%%%%%%%%%%%%%%%% \textbackslash{}title\{An Example LaTeX Book\} \textbackslash{}author\{Christopher Gandrud\} \textbackslash{}maketitle %%%%%%%%%%%% Frontmatter %%%%%%%%%%%%%%%%%%% \textbackslash{}tableofcontents \textbackslash{}listoffigures \textbackslash{}listoftables %%%% Start index \textbackslash{}makeindex %%%%%%%%%%% Input child documents %%%%%%%%% %%%% Chapter 1 \textbackslash{}input\{Children/Chapter1.tex\} %%%% Chapter 2 \textbackslash{}input\{Children/Chapter2.tex\} %%%% Chapter 3 \textbackslash{}input\{Children/Chapter3.tex\} %%%%%%%%% Bibliography %%%%%%%%%%%%%%%%%%%% \textbackslash{}bibliographystyle\{apa\} \textbackslash{}bibliography\{Main.bib,Packages.bib\} %%%%%%%%% Index %%%%%%%%%%%%%%%%%%%%%%%%%% \textbackslash{}clearpage \textbackslash{}printindex \textbackslash{}end\{document\} \end{alltt} \end{kframe} \end{knitrout} \subsection{Other common features of large documents} There are some other commands in this example parent document that we have not seen before. These commands create the book's front matter\index{front matter}--tables of contents, lists of figures and tables--as well as blank pages and the book's index. \paragraph{Table of contents}\index{LaTeX!table of contents} If you are using LaTeX's section headings (e.g. 
\texttt{chapter}, \texttt{section})\index{LaTeX command!chapter}\index{LaTeX command!section} you can automatically generate a table of contents with the \texttt{tableofcontents}\index{LaTeX command!tableofcontents} command. We saw an example earlier when we created a beamer slideshow. Simply place this command where you want the table of contents to appear. Usually this is after the \texttt{maketitle} command near the beginning of the document. \paragraph{Lists of figures and tables}\index{LaTeX!list of tables/figures} It is also common for large documents to include lists of its figures and tables. Usually these are placed after the table of contents. LaTeX will automatically create these lists from the \texttt{caption}s\index{LaTeX command!caption} you place in \texttt{table} and \texttt{figure} environments. To create these lists, use the \texttt{listoffigures} and \texttt{listoftables} commands.\index{LaTeX command!listoftables}\index{LaTeX command!listoffigures} \paragraph{Blank Pages} Sometimes we want to make sure that an index, a bibliography, or some other item begins on a new page. To do this, simply place the \texttt{clearpage}\index{LaTeX command!clearpage} command directly before the item. \paragraph{Index}\index{LaTeX!indices} You can automatically create an index with the \emph{makeidx} (make index) LaTeX package.\index{LaTeX package!makeidx} To set up this package, include it in your preamble. Then, near the beginning of your document, enable the index by placing \verb|\makeindex|. You will probably want the actual index to be printed near the end of the document. To do this, place \verb|\printindex| after the bibliography or somewhere else before \verb|\end{document}|. Throughout the child documents, you can use \verb|\index{INDEX_KEY}| at places that you would like the index to refer to. For example, if we wanted to create an index entry for this spot in this book with the \verb|INDEX_KEY| ``indices'' we type: \verb|\index{indices}|. 
%%%%%%%%%%%%% Knitted Child Documents %%%%%%%%%%%%%% \section{\emph{knitr} and Large Documents}\index{knitr!large documents} LaTeX's own parent-child functions are very useful if you are creating plain, non-knittable documents. For knittable documents we need to use \emph{knitr}'s parent-child options. Not only do these allow us to include knittable children in parent documents, they also allow us to \texttt{knit} each child document separately. This can be very useful when working on document drafts as we don't need to compile the whole document every time we want to look at changes made in one chapter. \subsection{The parent document} Like regular LaTeX parent documents, knittable parent documents include commands to create the preamble, front matter, bibliography, and so on. {\emph{knitr}} global chunk options\index{knitr!global chunk options} and package/data loading should also be set at the beginning of the parent document if you want them to apply to the entire thing. Rather than using the \texttt{input} or \texttt{include} commands, we use the \texttt{child}\index{knitr!child option} code chunk option to include child documents with \emph{knitr}. The \texttt{child} option simply takes as its value the child document's file path. For example: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} \textless{}\textless{}SetChild, child='Children/Chapter1.Rnw', include=FALSE\textgreater{}\textgreater{}= @ \end{alltt} \end{kframe} \end{knitrout} \noindent We can include the other child documents either in their own code chunks or all in one chunk as a character vector. You can also use \verb|Sexpr|\index{knitr!Sexpr} with the option \verb|knit_child|.\index{knitr!knit\_child} <>= \Sexpr{knit_child('Children/Chapter1.Rnw')} @ \noindent This is the same thing as using the \texttt{child} option in a code chunk.
Note also that you can continue to use \texttt{input}, \texttt{include}, and code chunks with the \texttt{child} option in the same document if you like. When you have your child code chunks set up in your parent document, just \texttt{knit} the parent like you would any other knittable file. The knittable children will be knit and included every time you knit the parent document. \subsection{Knitting child documents} You can use \emph{knitr} to compile individual child documents. To do this, place a code chunk at the beginning of the child document. In the code chunk (not as an option) use the \verb|set_parent| command to specify where the parent file is. Here is an example child file with a parent located at \emph{/ExampleProject/Parent.Rnw}: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} %%%%%%%%% Set parent %%%%%%%%% \textless{}\textless{}SetParent, include=FALSE\textgreater{}\textgreater{}= set_parent('/ExampleProject/Parent.Rnw') @ %%%%%%%%% Chapter heading %%%% \textbackslash{}chapter\{The first chapter\} This chapter is very short \end{alltt} \end{kframe} \end{knitrout} \noindent You can also use \verb|set_parent| with \verb|Sexpr|. When you have set the parent document you can \texttt{knit} the child document by itself. In addition to knitting the code chunks, \emph{knitr} will include all of the preamble information from the parent document as well as \verb|\begin{document}| and \verb|\end{document}|.\footnote{If you are using custom LaTeX style files (they have the file extension \texttt{.sty}) then \emph{knitr} won't include these in the knitted document unless you include a copy of the style file in the child document's directory.} \paragraph{Other markup languages} We can use \emph{knitr}'s parent-child functions in any of the markup languages it supports. For example, we can \texttt{knit} R Markdown children into R Markdown parent documents. We don't look at specific examples in this book.
The \emph{knitr} options syntax is the same, but as usual, syntax for opening and closing the code chunks is specific to the markup language. %%%%%%%%%%% Child documents in another markup language %%%%% \section{Child Documents in a Different Markup Language} \index{Pandoc|(} Because {\emph{knitr}} is able to run not only R code but also command-line programs,\index{command-line} you can use the Pandoc program to convert child documents written in a different markup language into the primary markup language you are using for your document. If you have Pandoc installed on your computer,\footnote{Pandoc installation instructions can be found at: \url{http://johnmacfarlane.net/pandoc/installing.html}.} you can call it directly from your parent document by including the Pandoc commands in a code chunk with the \texttt{engine} option set to either \verb|'bash'| or \verb|'sh'|.\footnote{Alternatively, you can run Pandoc in an R code chunk using the {\tt{system}} command.\index{R function!system} For example: \texttt{system("pandoc Children/FrontMatter/StylisticConventions.md -f markdown -t latex -o StyleTemp.tex")}. \emph{knitr} also has a \texttt{pandoc} command\index{knitr!pandoc} that is a wrapper for converting Markdown documents to other formats with Pandoc.} For example, the Stylistic Conventions (page \pageref{StylisticConventions}) part of this book is written in Markdown. The source file is called {\emph{StylisticConventions.md}} and is in a subdirectory of the parent's directory called: \emph{Children/FrontMatter}. It was faster to write the list of conventions using the simpler Markdown syntax than LaTeX, which as we saw has a more complicated way of creating lists. However, I wanted to include this file in the LaTeX-produced book. Pandoc can convert the Markdown document\index{Markdown} into a LaTeX file.
This file can then be input into the main document with the LaTeX command \texttt{input}.\index{LaTeX command!input} In the parent document I added a code chunk with the following command to convert the Markdown syntax in {\emph{StylisticConventions.md}} to LaTeX and save it in a file called {\emph{StyleTemp.tex}}.\label{PandoctoLaTeXExample} \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{}\textless{}StyleConventions, include=FALSE, engine='sh'\textgreater{}\textgreater{}= # Use pandoc to convert MD to TEX pandoc Children/FrontMatter/StylisticConventions.md -f markdown \textbackslash{} -t latex -o StyleTemp.tex @ % Input converted StyleTemp document \textbackslash{}input\{StyleTemp.tex\} \end{alltt} \end{kframe} \end{knitrout} \noindent The options {\tt{-f markdown}} and {\tt{-t latex}} tell Pandoc to convert {\emph{StylisticConventions.md}} from Markdown to LaTeX syntax. {\tt{-o StyleTemp.tex}} instructs Pandoc to save the resulting LaTeX markup to a new file called {\emph{StyleTemp.tex}}. I only needed to include a backslash (\textbackslash{}) at the end of the first line because I wanted to split the code over two lines. The code wouldn't fit on this page otherwise. The backslash tells the shell not to treat the following line as a different line. Unlike in R, the shell only recognizes a command's arguments if they are on the same line as the command. You'll notice that after the code chunk we use \verb|input| to include the new \emph{StyleTemp.tex} document. Note that using this method to include a child document that needs to be knit will require extra steps not covered in this book.
\begin{figure} \caption{The \emph{brew} + \emph{knitr} Process} \label{BrewFig} \vspace{0.2cm} \input{Children/Chapter12/images12/BrewProcess.tex} \end{figure} \index{Pandoc|)} \section{Creating Batch Reports}\index{batch reports|(} When we create batch reports we want to somehow subset a data set into multiple pieces and use these pieces as the input for \emph{knitr} code chunks in different presentation documents for each subset of the data set. The \emph{brew} package \citep{R-brew}\index{brew} is maybe the most popular tool for creating batch reports in R. Using \emph{brew} with multiple subsets of a data set adds two steps to the process of creating \emph{knitr} presentation documents (see Figure \ref{BrewFig}): \begin{enumerate} \item Create a \emph{brew} template document. \item Create a function to subset the data, brew, and knit each file. \end{enumerate} \emph{knitr}'s \verb|knit_expand| command can also be used to create batch reports.\index{R function!knit\_expand}\index{knitr!expand}\index{knitr!batch reports} Because \emph{brew} is the dominant way to create batch reports in R and currently has more capabilities than \verb|knit_expand| we will cover \emph{brew} rather than \verb|knit_expand| in detail. Imagine that we are using the \emph{MainData} data set discussed in the previous chapters and we want to create a LaTeX document for each country displaying its average fertilizer consumption (\emph{FertilizerConsumption}).\index{fertilizer}\footnote{The files needed to create this example are available at: \url{http://bit.ly/XJbyCK}.} First, let's create a \emph{brew} template document. This document will include all of our markup and the code chunks we want in our \emph{knitr} document. There is one small difference from regular knittable documents: it will use \emph{brew} syntax to include information from the subsetted data. 
Text in a \emph{brew} template document is printed `as is' when we \texttt{brew} it unless it is between \emph{brew}'s delimiters.\index{brew!delimiters} The delimiters are:\footnote{Note that the spaces between the delimiter and its contents are important.} \begin{itemize} \item \texttt{\textless{}\%\# . . . \%\textgreater{}}: Comment delimiter, i.e. contents are thrown away when brewed. \item \texttt{\textless{}\% . . . \%\textgreater{}}: R functions inside the delimiters are run, but the results aren't printed. \item \texttt{\textless{}\%= . . . \%\textgreater{}}: Contents are printed. \end{itemize} \noindent In the following example we use the latter two. Here is our \emph{brew} template: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textbackslash{}documentclass\{article\} \textbackslash{}begin\{document\} % Create numeric vector \textless{}% NewFC <- FC %\textgreater{} \{\textbackslash{}LARGE \textless{}%= Name %\textgreater{}\} \vspace{1cm} The mean fertilizer consumption for \textless{}%= Name %\textgreater{} is \textbackslash{}Sexpr\{round(mean(NewFC, na.rm = TRUE), digits = 1)\} kilograms per hectare of arable land. \textbackslash{}end\{document\} \end{alltt} \end{kframe} \end{knitrout} \noindent There are a few things to note. The line \verb|<% NewFC <- FC %>| will create a vector called \emph{NewFC} from the object \emph{FC}. As we will see when we create the \emph{brew} function, \emph{FC} contains the values of \emph{FertilizerConsumption} for each country. We need to put \emph{FC} into a new object because if we typed \verb|<%= FC %>| \emph{brew} would print the numbers literally, not in a numeric vector like we need later for the \texttt{mean} command. \verb|<%= Name %>| prints the country name in the subsetted data. We'll see how to create \emph{Name} in the \emph{brew} function below. We save this template in \emph{BatchReports/Template}, i.e.
in a subdirectory of \emph{BatchReports} called \emph{Template}. Let's give it the file name \emph{BrewTemplate.Rnw}. Now let's create the R code to subset the data, \texttt{brew}, and \texttt{knit} the reports: <>= # Set working directory setwd("/BatchReports") #### Download Data #### # Load repmis library(repmis) # Download data MainData <- source_data("http://bit.ly/V0ldsf") # Create vector of country names COUNTRY <- as.character(unique(MainData$country)) #### Create BatchReports Function #### BatchReports <- function(Name){ # Create file names for individual reports ## Remove white space in country names CountryNoWhite <- gsub(" ", "", x = Name) KnitFile <- paste(CountryNoWhite, ".Rnw", sep = "") # Subset data SubData <- subset(MainData, country == Name) # Create vector of the country's fertilizer consumption FC <- SubData$FertilizerConsumption # Brew and Knit brew::brew("Template/BrewTemplate.Rnw", KnitFile) knitr::knit2pdf(KnitFile) } #### Run function and cleanup #### lapply(COUNTRY, BatchReports) # Keep only pdf reports unlink(c("*.aux", "*.log", "*.Rnw", "*.tex")) @ \noindent Ok, this is a lot of new code. Let's go through it step by step: \begin{enumerate} \item Set the working directory to \emph{/BatchReports}. \item Download \emph{MainData.csv} using the \texttt{source\_data}\index{R function!source\_data} function, as we've done before. \item Create a vector for the country names in the data. We will use this vector to subset the data. \item Create a \texttt{function}\index{R function!function}\index{R!functions} called \emph{BatchReports} for subsetting\index{R!subset} the data, brewing it, and knitting it. 
\begin{itemize} \item The \texttt{function} command allows us to create a new function.\footnote{User-created functions are just like most other R functions.} Arguments are specified in parentheses (these are also called the formals\index{R functions!formals}) and R expressions are put in the curly brackets that denote the function's body.\index{R functions!body} The expressions do things with the arguments. Our argument here is \texttt{Name} and the contents of the curly brackets subset, brew, and knit the data according to \emph{Name}'s value.\footnote{For more information on functions see Hadley Wickham's page on the topic: \url{http://adv-r.had.co.nz/Functions.html}.} \item An important step in the \texttt{BatchReports} function is creating a new name to give our brewed and knit files. Some country names like ``United Arab Emirates'' have white spaces in them. We cannot run LaTeX on a file with a name containing white spaces. We remove the white spaces with the \texttt{gsub} command\index{R function!gsub}, i.e. we substitute a space with no white space. We then use the \texttt{paste}\index{R function!paste} command to create a name that will be used for the brewed file. \emph{knitr} will automatically create a name for the final PDFs. \end{itemize} \item \texttt{lapply}\index{R function!lapply} allows us to run our \texttt{BatchReports} function separately for every value of the \emph{COUNTRY} vector. \texttt{BatchReports}'s \emph{Name} argument takes the value \emph{COUNTRY}. Note: it is important to end the \emph{BrewTemplate.Rnw} with a blank line for \texttt{lapply} to work correctly. \item Finally, we use the \texttt{unlink}\index{R function!unlink} command to delete all of the ancillary files used to create the final batch report PDFs. Always be careful with the \texttt{unlink} command as it permanently deletes files.
Because we used the asterisk wildcard (see Section \ref{AsteriskWildcard}),\index{R!wildcard}\index{wildcard} \texttt{unlink} will delete all files in the working directory with the extensions \texttt{.aux}, \texttt{.log}, \texttt{.Rnw} and \texttt{.tex}. \end{enumerate} Figure \ref{BrewExample} shows you a sample of what the final PDF created by this \emph{brew}/\emph{knitr} process for Afghanistan looks like. This was a very simple example illustrating the basic process for combining \emph{brew} and \emph{knitr} to create batch reports. The process can be used to create much more complex documents and with other markup languages. \begin{figure} \caption{Snippet of an Example PDF Document Created with \emph{brew} + \emph{knitr}} \label{BrewExample} \vspace{0.5cm} \includegraphics[width=0.8\textwidth]{Children/Chapter12/images12/BrewExample.png} \vspace{1cm} \end{figure} \index{batch reports|)} \subsection*{Chapter summary} In this chapter we have learned how to create more complex LaTeX documents to present our reproducible research. In particular we learned how to take advantage of parent and child document structures using both basic LaTeX and \emph{knitr} tools. These allow us to more easily work with very large presentation documents. We saw how Pandoc can be combined with these tools so that we can create our documents using multiple markup languages. We also learned how to create \emph{brew} templates that can be used to create multiple documents presenting information from subsets of our data. In the next chapter we will learn how to create documents for presenting reproducible research on the web with Markdown. We will also see how to use \emph{rmarkdown} to easily create documents in other formats as well. 
================================================ FILE: Old/Source-v2/Children/Chapter13/chapter13.Rnw ================================================ % Chapter 13 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 17 April 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Presenting on the Web and Other Formats with R Markdown}\label{MarkdownChapter} LaTeX is the standard markup language for creating academic-quality articles and books. If we want to present research findings via the internet, our best option is HTML.\index{HTML} HTML syntax can be tedious to write, as we saw in Chapter \ref{TablesChapter}. Luckily the Markdown language was created as a simplified way of writing HTML documents. As we have seen, Markdown can be fully integrated with \emph{knitr}/\emph{rmarkdown} for creating reproducible research HTML presentation documents. In addition, the \emph{rmarkdown} package allows us to write documents in Markdown and render them to PDF and MS Word. In this chapter we will learn about Markdown editors and the basic Markdown syntax for creating simple reproducible documents, including many of the things we covered for \emph{knitr}/LaTeX documents such as headings and text formatting. Please refer back to previous chapters for syntax used to display code and code chunks (Chapter \ref{StatsModel}), tables (Chapter \ref{TablesChapter}), and figures (Chapter \ref{FiguresChapter}) with R Markdown documents. In this chapter we will also briefly look at some more advanced features for including math with MathJax, footnotes and bibliographies with Pandoc, and customizing styles with CSS. Then we will learn how to create HTML slideshows. We'll finish up the chapter by looking at options for publishing Markdown-created HTML documents, including locally on your computer, Dropbox, and GitHub Pages.
\section{The Basics} \index{Markdown|(} Markdown was created specifically to make it easy to write HTML (or XHTML\index{XHTML}\footnote{Extensible HyperText Markup Language}) using a syntax that is human readable and possibly publishable without compiling. For example, compare the Markdown table syntax in Chapter \ref{TablesChapter} to the HTML syntax for virtually the same table.\footnote{For more information see John Gruber's website: \url{http://daringfireball.net/projects/markdown/}.} That being said, to make Markdown simple, it does not have as many capabilities as HTML. To get around this problem you can still use HTML in Markdown, though note that Markdown syntax cannot be used between HTML element tags. Pandoc and \emph{rmarkdown} have extended Markdown so that it can be used to create reproducible PDF and MS Word documents. \textbf{Note:} if you are using \emph{rmarkdown} to compile a document to PDF or Word, using raw HTML syntax will often not work as intended, if at all. As a rule, syntax specific to LaTeX or HTML that is included in an R Markdown document can only be properly compiled to a PDF or HTML document, respectively. Similarly, you are only able to include graphics that are of types supported by the output format. You are not able to include a JavaScript\index{JavaScript} plot directly in a PDF. \subsection{Getting started with Markdown editors}\index{Markdown!editors} \begin{wrapfigure}{r}{0.3\textwidth} \caption{R Markdown Compile Dropdown Menu} \label{DropdowMarkdown} \begin{center} \includegraphics[scale=0.5]{Children/Chapter13/images13/rmarkdownOutputOptions.png} \end{center} \end{wrapfigure} Like for R LaTeX, RStudio\index{RStudio} functions as a very good editor for R Markdown documents and regular non-knittable Markdown documents as well. To create a new R Markdown document in RStudio, click \texttt{File} in the menu bar then \texttt{New} \textrightarrow{} \texttt{R Markdown}.
You will then be able to select what output format you would like. RStudio has full syntax highlighting for code chunks and can compile \texttt{.Rmd} files into \texttt{.md}, then render them in \emph{.html}, for example, with one click of the \texttt{Knit HTML} button (\includegraphics[scale=0.5]{Children/Chapter13/images13/KnitHTML.png}). As we saw in Chapter \ref{GettingStartedRKnitr} (Figure \ref{NotebookExample}), when you knit a Markdown document in RStudio, it will preview the HTML document for you. You can always view HTML documents by opening them with your web browser. You can do this directly from RStudio's \textbf{Preview HTML}\index{RStudio!Preview HTML} window by clicking the \texttt{Open in Browser} button (\includegraphics[scale=0.45]{Children/Chapter10/images10/ShowInBrowser.png}). If you click on the downward arrow next to \texttt{Knit HTML} you will see the drop-down menu in Figure \ref{DropdowMarkdown}. This allows you to also compile the document to PDF or MS Word, regardless of which format you originally chose when you created the document. As with HTML you will be given a preview of the PDF or Word document when it is compiled. In Figure \ref{DropdowMarkdown} you'll also notice the question mark button. Click this for a quick guide to the Markdown syntax used in RStudio. Being plain-text, you can also use any other text editor to modify Markdown documents, though they will lack the level of integration with \emph{knitr}/\emph{rmarkdown} that RStudio has. \subsection{Preamble and document structure} That was kind of a trick subsection title. Unlike LaTeX documents, plain Markdown documents do not have a preamble. \emph{rmarkdown} documents can have a header, basically another name for a preamble, but we will get to that later. There is also no need to start a body environment or anything like that. HTML head elements\index{HTML element!head} (HTML's preamble equivalent) are added automatically when you render Markdown documents into HTML.
So with Markdown, you can just start typing. Here is an example of an R Markdown document that creates the map we saw in Chapter \ref{FiguresChapter}:\footnote{This code is available on GitHub at: \url{https://GitHub.com/christophergandrud/Rep-Res-Examples/blob/master/RMarkdownExamples/ExampleKnitrDocument/ExampleKnitrMarkdown.Rmd}.} We'll go through all of the code below. <>= # Example R Markdown File ## from "Reproducible Research with R and RStudio" ### Christopher Gandrud ### 15 January 2015 ----------- ```{r LoadPackages, include=FALSE} # Load required packages library(devtools) ``` We can use the [googleVis](http://code.google.com/p/google-motion-charts-with-r/) package to create interactive JavaScript tables, charts, and maps. Here is an example of how to create a map with *googleVis*'s `gvisGeoChart` function. Let's first download some data from [GitHub](https://GitHub.com/). See chapters 6 and 7 for details about this data as well as the [variable description page](https://GitHub.com/christophergandrud/ Rep-Res-Examples/blob/master/DataGather_Merge/ MainData_VariableDescriptions.md). ----------- ## Fertilizer Consumption (kilograms per hectare of arable land) in 2003 ### Data from [World Bank](http://data.worldbank.org/indicator/AG.CON.FERT.ZS) ```{r CreategvisGeoChart, echo=FALSE, message=FALSE, results='asis'} # Create geo map of global fertilizer consumption for 2003 # The data is loaded from GitHub (http://bit.ly/V0ldsf) ## The data gathering process used to create this data set ## is completely reproducible. For more information see: ## http://bit.ly/YnMKBG source_url("http://bit.ly/VNnZxS") ``` ----------- @ \noindent When knitted in RStudio and viewed in the Google Chrome web browser,\index{Google Chrome} the final presentation document looks like Figure \ref{MarkdownExampleFig}. 
\begin{figure} \caption{Example Rendered R Markdown Document} \label{MarkdownExampleFig} \begin{center} \includegraphics[width=0.8\textwidth]{Children/Chapter13/images13/MarkdownExampleMap.png} \end{center} \end{figure} \subsection{Headings}\index{Markdown!headings |(} Headings\label{MarkdownHeader} in Markdown are extremely simple. Note that Markdown headings and R Markdown headers are not the same thing. The latter gives instructions for how to render the document, the former are section titles in the text. To create a line in the topmost heading style--maybe a title--just place one hash mark (\verb|#|) at the beginning of the line. The second-tier heading gets two hashes (\verb|##|) and so on. You can also put the hash mark(s) at the end of the heading, but this is not necessary. Here is an example of the three headings: <>= # A level one heading ## A level two heading ### A level three heading @ \noindent There are six heading levels in Markdown. You can also create a level-one heading by following a line of text with equal signs. Level-two headings can be created by following a line of text with dashes: <>= A level one heading =================== A level two heading ------------------- @ \index{Markdown!headings |)} \subsection{Horizontal lines}\index{Markdown!lines} If you would like to create horizontal lines that run the width of the page in Markdown, simply place three or more equal signs or dashes separated by text from above by one blank line: <>= Create a horizontal line. ========= @ \subsection{Paragraphs and new lines}\index{Markdown!paragraphs} Just like in LaTeX, new paragraphs are created by putting text on a new line separated from previous text with a blank line. For example: <>= This is the first paragraph. This is the second paragraph. @ \noindent You might have noticed that in the headers example we did not need to separate the header with a blank line. 
Separating lines with a blank line places\index{Markdown!new line} a blank line in the final document. End a line with two or more white spaces ( ) to create a new line that is not separated by a blank line. \subsection{Italics and bold}\index{Markdown!italics}\index{Markdown!bold} To \emph{italicize} a word in Markdown, simply place it between two asterisks, e.g. \verb|*Italicize these words*|. To make words \textbf{bold}, place them between four asterisks, two on either side: \verb|**Make these words bold**|. \subsection{Links}\label{MarkdownLinks}\index{Markdown!hyper-links} To create hyper-links in Markdown, use the \verb|[LINK_TEXT](URL)| syntax.\footnote{You can also include a \texttt{title} attribute after the URL, though this is generally not very useful. See Section \ref{TitleAttribute} for a discussion.} \verb|LINK_TEXT| is the text that you would like to show up as the hyper-link text. When you click on this text it will take you to the linked site specified by \texttt{URL}. If you want to show only a URL as the text, type it in both the square brackets and parentheses. This is a little tedious, so in RStudio you can just type the URL and it will be hyper-linked. In regular Markdown place the URL between less-than and greater-than signs (\verb|<URL>|). \subsection{Special characters and font customization}\index{Markdown!special characters} Unlike LaTeX rendered with pdfLaTeX\index{pdfLaTeX}, Markdown can include almost any letters and characters included in your system. The main exceptions are characters used by Markdown syntax (e.g. \verb|*|, \verb|#|, \verb|\| and so on). You will have to escape these (see below). Font sizes and typefaces cannot be set directly with Markdown syntax. You need to set these with HTML or CSS, which I don't cover here, though below we will look at how to use a custom CSS file.
\subsection{Lists}\index{Markdown!lists} To create itemized lists in Markdown, simply place the items after one dash: <>= - Item 1 - Another item - Item 3 @ \noindent To create a numbered list, use numbers and periods rather than dashes. <>= 1. Item 1 2. Another item 3. Item 3 @ \subsection{Escape characters}\index{Markdown!escape character} Markdown, like LaTeX and R, uses a backslash (\verb|\|) as an escape character. For example, if you want to have an asterisk in the text of your document (rather than start to italicize your text, e.g. \verb|*some italicized text*|), type: \verb|\*|. Two characters--ampersand (\verb|&|)\index{Markdown!ampersand} and the less-than sign (\verb|<|)--\index{Markdown!less-than sign}have special meanings in HTML.\footnote{Ampersands declare the beginning of a special HTML character. Less-than signs begin HTML tags.} So, to have them printed literally in your text you have to use the HTML code for the characters. Ampersands are created with \verb|&amp;|. Less-than signs are created with \verb|&lt;|. \index{Markdown|)} \subsection{Math with MathJax} \index{MathJax|(} Markdown by itself can't format mathematical equations. We can create LaTeX-style equations by adding on the MathJax JavaScript\index{JavaScript} engine. MathJax syntax is the same as LaTeX syntax (see Section \ref{MathLaTeX}), especially when used from RStudio or when rendered with \emph{rmarkdown}. Markdown documents rendered in RStudio automatically link to the MathJax engine online.\footnote{You will not be able to render equations when you are not online.} If you want to use another program to render Markdown documents with MathJax equations, you may need to take extra steps to link to MathJax. For more details see: \url{http://docs.mathjax.org/en/latest/start.html#mathjax-cdn}. Because backslashes are Markdown escape characters, in many Markdown editors you will have to use two backslashes to create math environments with MathJax.
For example, in LaTeX and RStudio's\index{RStudio!Markdown math} Markdown you can create a display equation like this: \[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \] \noindent by typing:\footnote{In RStudio you can also use dollar signs to delimit MathJax equations as in LaTeX. See the footnotes in Section \ref{MathLaTeX} for more information.} <>= \[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \] @ \noindent But, in other Markdown programs you may have to use: <>= \\[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \\] @ To make inline equations, use parentheses instead of square brackets as in LaTeX, e.g. \verb|\( s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \)|. \index{MathJax|)} \section{Further Customizability with \emph{rmarkdown}}\index{rmarkdown|(} Markdown is simple and easy to use. But being simple means that it lacks important functionality for presenting research results, such as footnotes and bibliographies, and custom formatting. In this section we will learn how to overcome these limitations with Pandoc and CSS via \emph{rmarkdown}. \subsection{More on \emph{rmarkdown} Headers} In Chapter \ref{GettingStartedRKnitr} (page \pageref{rmardownHeader}) we first saw an \emph{rmarkdown} header written in YAML\index{YAML}. Just as a refresher, here is the basic header we looked at: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "30 November 2015" output: pdf_document: toc: true --- \end{alltt} \end{kframe} \end{knitrout} This header provides instructions for what to do when the document is rendered, gives instructions to render the document as a PDF (via LaTeX), and inserts a title, author, date, and table of contents at the beginning. We also have the option to include other formatting options, many of which we would include in a \emph{knitr} LaTeX document's preamble.\index{LaTeX!preamble} You include these at the top level, i.e. 
without being tabbed. \emph{rmarkdown} refers to these options as ``metadata''.\index{rmarkdown!metadata} For example, to change the font size to 11-point we could use: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "30 November 2015" output: pdf_document: toc: true fontsize: 11pt --- \end{alltt} \end{kframe} \end{knitrout} \noindent We could double-space the PDF document with a similar top-level entry: \texttt{linestretch: 2}.\footnote{1 would be for single space and 1.5 would be for one and a half spacing.} To find more options for PDF documents, type \texttt{?pdf\_document} into your R console. Note that these options will only affect your PDF document, not a rendered HTML file. Remember from Chapter \ref{GettingStartedRKnitr} (page \pageref{rmarkdownRender}) that we can specify rendering instructions for multiple output formats in the same header. Here is a longer header, building on what we just saw. We'll go through it in detail: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} --- title: "An Example rmarkdown Article" author: "Christopher Gandrud" date: "15 January 2015" output: pdf_document: latex_engine: xelatex number_sections: yes toc: yes html_document: toc: no theme: "flatly" linestretch: 2 fontsize: 11pt bibliography: - Main.bib - Packages.bib --- \end{alltt} \end{kframe} \end{knitrout} \noindent Ok, let's go through this in detail. We have already seen the \texttt{title}, \texttt{author}, \texttt{date}, \texttt{linestretch}, and \texttt{fontsize} options. Notice that we used \texttt{latex\_engine} to set the LaTeX engine to XeLaTeX, which is useful for documents that include non-standard English characters.\index{XeLaTeX} We also specified with \texttt{number\_sections} that the PDF document should have numbered section headings. 
For the HTML version of the document we do not want a table of contents as we set \texttt{toc: no}. We specified a CSS\index{CSS} theme called Flatly for our HTML document using \verb|theme: "flatly"|. As of this writing, \emph{rmarkdown} has a built-in ability to use a range of themes from Bootswatch (\url{http://bootswatch.com/}). Alternatively, you can link to a custom CSS file with the \texttt{css} option. Type \texttt{?html\_document} into your R console to see other options. Notice that we can use \texttt{no} and \texttt{yes} instead of \texttt{false} and \texttt{true}, respectively. We linked to two BibTeX files with the \texttt{bibliography} option. Using Pandoc syntax, the references will apply to both the PDF and HTML documents. If you want to also enable the creation of a Microsoft Word document, include \verb|output: word_document| in the header. \index{Pandoc!bibliographies}\index{Markdown!bibliographies} \begin{table} \caption{A Selection of Pandoc In-text Citations} \begin{center} \label{PandocCitations} \begin{tabular}{l l} \hline Markup & Result \\[0.25cm] \hline\hline \texttt{[@Donoho2009]} & (Donoho 2009) \\[0.25cm] \texttt{[-@Donoho2009]} & (2009) \\[0.25cm] \texttt{[see @Donoho2009]} & (see Donoho 2009) \\[0.25cm] \texttt{[see @Donoho2009, 10-11]} & (see Donoho 2009, 10--11) \\[0.25cm] \texttt{[@Donoho2009; @Box1973]} & (Donoho 2009; Box 1973) \\[0.25cm] \texttt{@Donoho2009 [10-11]} & Donoho (2009, 10--11) \\[0.25cm] \hline \end{tabular} \end{center} \end{table} \paragraph{Bibliographies with Pandoc}\index{bibliography |(} Pandoc via \emph{rmarkdown} allows us to insert citations from normal BibTeX files (see Chapter \ref{LatexChapter}) specified in the header with \texttt{bibliography}. The main difference is that Pandoc has a different syntax from LaTeX for making in-text citations. Basic Pandoc citations begin with \verb|@| followed by the BibTeX citation key. Square brackets (\verb|[]|) create parentheses around the citation.
Here is an example: <>= This is a citation [@Donoho2009]. @ \noindent Pandoc uses \emph{natbib}\index{LaTeX package!natbib}\index{bibliography}\index{in-text citation} by default, so the citation \verb|[@Donoho2009]| will appear as (Donoho et al., 2009). To add text before and after the citation inside of the parentheses, use something like this: \verb|[see @Donoho2009, 10]|; which creates: (see Donoho et al. 2009, 10). If you do not want the parentheses around the entire citation (only the year) then omit the square brackets. To include only the year and not the authors' surnames, add a minus sign, e.g. \verb|[-@Donoho2009]|. See Table \ref{PandocCitations} for more options. Full bibliographic information for each item that is cited in the text will be produced at the end of the output document. I suggest placing a heading like \verb|# References| at the very end of your document so that the bibliography will be differentiated from the document's text. \index{bibliography |)} \paragraph{Footnotes with Pandoc}\index{Pandoc!footnotes}\index{Markdown!footnotes} You can also include footnotes in documents rendered with \emph{rmarkdown} by using Pandoc's footnote syntax. In the text where you would like a footnote to be located use: \verb|[^NOTE_KEY]|. Then at the end of your document, place \verb|[^NOTE_KEY]: The footnote text|.\footnote{You can actually put this almost anywhere and it will be placed and numbered correctly in the output document, but I find it easier to organize the footnotes when they are placed at the end.} \texttt{NOTE\_KEY}s generally follow the same rules as BibTeX citation keys, so no spaces. The footnotes will be numbered sequentially when rendered. To sum up, here is an example of a document that can be rendered in HTML or PDF using \emph{rmarkdown}. It includes footnotes and a bibliography.
<>= --- title: "Minimal rmarkdown Example" output: pdf_document: toc: true html_document: toc: false bibliography: Main.bib --- This is some text.[^FirstNote] This is a *knitr* code chunk: ```{r} plot(cars$speed, cars$dist) ``` This is a citation [see @Donoho2009, 10]. [^FirstNote]: This is a footnote. # References @ We have only covered a small proportion of Pandoc's capabilities that you can take advantage of with \emph{rmarkdown}. For full range of Pandoc's abilities see: \url{http://johnmacfarlane.net/pandoc/README.html}. \subsection{CSS style files and Markdown}\index{CSS|(} You can customize the formatting of HTML documents created with Markdown files using custom CSS style sheets. CSS files allow you to specify the way a rendered Markdown file looks in a web browser including fonts, margins, background color, and so on. We don't have space to cover CSS syntax here. There are numerous online resources for learning CSS. One of the best ways may be to just copy a CSS style sheet into a new file and play around with it to see how things change. A really good resource for this is Google Chrome's Developer Tools.\index{Google Chrome!Developer Tools} The Developer Tools allows you to edit your webpages, including their CSS, and see a live preview. It is a really nice way to experiment with CSS (and HTML and JavaScript).\index{HTML}\index{JavaScript}\footnote{For more information on how to access and use Developer Tools in Chrome see: \url{https://developers.google.com/chrome-developer-tools/}.} There are also numerous pre-made style sheets available online.\footnote{One small note: when you create a new style sheet or copy an old one, make sure the final line is blank. 
Otherwise you may get an ``incomplete final line'' error when you render the document.} \index{Markdown!custom CSS|(} \paragraph{Rendering R Markdown files to HTML using custom CSS} The simplest way to use a custom CSS style sheet is to include the file path to the CSS file in an \emph{rmarkdown} header. As mentioned earlier, \emph{rmarkdown} has a number of built-in CSS file options that you can access with \texttt{theme}. If you want to use another custom CSS file, use the \texttt{css} option. If our custom CSS file is called \emph{Markdown.css} in the same directory as the R Markdown document, then a basic header would be: <>= --- output: html_document: css: Markdown.css --- @ If you are using the \emph{knitr} package to render an R Markdown document to HTML you can also include a custom CSS file. First use \texttt{knit} to knit the document to a plain Markdown file. Then use the \texttt{markdownToHTML}\index{R function!markdownToHTML} function from the \emph{markdown} package \citep{R-markdown} to render the plain Markdown document in HTML, including the \texttt{stylesheet} argument with the path to the CSS file. \index{Markdown!custom CSS|)} \index{CSS|)} %%%%%%%%%%%%% Presentations %%%%%%%%%%%%%%%%%% \section{Slideshows with Markdown, \emph{rmarkdown}, and HTML} Because R Markdown documents can be compiled into HTML files it is possible to use them to create HTML5\index{HTML5} slideshows.\footnote{The slideshows created by the tools in this section use features introduced in the 5th version of HTML, i.e. HTML5. In this section I often refer to HTML5 as just HTML for simplicity.} There are a number of advantages to creating HTML presentations with Markdown: \begin{itemize} \item You can use the relatively simple Markdown syntax. \item HTML presentations are a nice native way to show content on the web. \item HTML presentations can incorporate virtually any content that can be included in a webpage.
This includes interactive content, like motion charts created by \emph{googleVis}\index{googleVis} (see Chapter \ref{FiguresChapter}). \end{itemize} \noindent Let's look at how to create HTML slideshows from Markdown documents using (a) the \emph{rmarkdown} package and (b) RStudio's built-in slideshow files, called R Presentations. You can also use \emph{rmarkdown} to create beamer\index{beamer} presentations. \paragraph{HTML5 frameworks} Before getting into the details of how to use \emph{rmarkdown} for presentations and R Presentations, let's briefly look more into what an HTML5\index{HTML5} slideshow is and the frameworks that make them possible. HTML5 slideshows rely on a number of web technologies in addition to HTML5, including CSS,\index{CSS} and JavaScript\index{JavaScript} to create a website that behaves like a LaTeX beamer\index{beamer} or PowerPoint\index{PowerPoint} presentation. They run in your web browser and you may need to be connected to the internet for them to work properly as key components are often located remotely. Most browsers have a \texttt{Full Screen} mode you can use to view presentations. There are a number of different HTML5 slideshow frameworks that let you create and style your slideshows. In all of the frameworks you view the slideshow in your web browser and advance through slides with the forward arrow key on your keyboard. You can go back with the back arrow. Despite these similarities, the frameworks have different looks and capabilities. %%%%%%%%%%%%%% rmarkdown Presentations Presentations %%%%%%%% \subsection{HTML Slideshows with \emph{rmarkdown}} It is very easy to create an HTML presentation using \emph{rmarkdown} and the IO Slides\index{IO Slides}\index{io2012}\footnote{\url{https://code.google.com/p/io-2012-slides/}} or Slidy\index{slidy}\footnote{\url{http://www.w3.org/Talks/Tools/Slidy2/#(1)}} HTML5 frameworks. 
The syntax for IO Slides and Slidy presentations with \emph{rmarkdown} is almost exactly the same as the syntax we have seen throughout this chapter. There are two main differences from the syntax we have seen so far. First, \verb|ioslides_presentation| for IO Slides or \verb|slidy_presentation| for Slidy presentations is the output type to set in the header. Second, two hashes (\verb|##|) set a frame's header.\footnote{You can create sections with one hash.} For example, <>= --- title: "Simple rmarkdown Presentation Example" author: "Christopher Gandrud" date: "26 December 2015" output: ioslides_presentation: incremental: true --- ## Access the code The code to create the following figure is available online. @ \noindent This code creates a slide show that begins with the slide in Figure \ref{BasicIO}. Bullet points will be brought in incrementally because we used \verb|incremental: true| under \verb|output: ioslides_presentation|. Bullets are created using Markdown list syntax. \begin{figure} \caption{\emph{rmarkdown}/IO Slides Example Title Slide} \label{BasicIO} \begin{center} \includegraphics[scale=0.3]{Children/Chapter13/images13/rmarkdownIo_slidesExample.png} \end{center} \end{figure} Use three dashes (\verb|---|) to delineate a new slide without a header. You can style the presentation further using the \texttt{css} option in the header to link to a custom CSS file.\index{CSS} You can create a new IO Slides or Slidy \emph{rmarkdown} presentation in RStudio by selecting \texttt{File} \textrightarrow\: \texttt{R Markdown...} then \texttt{Presentation} in the menu on the left of the window (shown in Figure \ref{rmarkdownPresRStudio}). Finally, click \texttt{HTML (ioslides)} or \texttt{HTML (Slidy)}.
\begin{figure} \caption{Create New \emph{rmarkdown} Presentation in RStudio} \label{rmarkdownPresRStudio} \begin{center} \includegraphics[scale=0.5]{Children/Chapter13/images13/rmarkdownPresRStudio.png} \end{center} \end{figure} \subsection{LaTeX Beamer Slideshows with \emph{rmarkdown}}\label{rmarkdownBeamer} \index{beamer|(}\index{LaTeXbeamer|(} As we saw in Chapter \ref{LatexChapter}, creating a presentation with LaTeX beamer involves rather convoluted syntax. Luckily, we can use \emph{rmarkdown} to create beamer presentations using much cleaner Markdown syntax. An \emph{rmarkdown} beamer presentation uses the same syntax that we just saw with HTML presentations. The main difference is in the header where we use \verb|output: beamer_presentation|. You create a new R Markdown beamer document in RStudio in a similar way as IO Slides or Slidy. The only difference is that we select \texttt{PDF (Beamer)} in the window shown in Figure \ref{rmarkdownPresRStudio}. As before, frame titles are delineated with two hashes (\verb|##|). You can mark sections in much the same way with one hash. In the header you can switch the beamer theme, color theme, and font theme with \texttt{theme}, \texttt{colortheme}, and \texttt{fonttheme}, respectively. For example, to create the slide show that begins with the slide in Figure \ref{rmarkdownBeamerExample}:\index{LaTeX command!usetheme}\index{LaTeX command!usecolortheme}\index{LaTeX command!usefonttheme} <>= output: beamer_presentation: incremental: true theme: "Bergen" colortheme: "crane" fonttheme: "structurebold" @ \noindent Note that themes are placed in quotation marks. You can also include a custom template with the \texttt{template} option followed by the path to the custom template file.
\index{beamer|)}\index{LaTeXbeamer|)} \index{rmarkdown|)} \begin{figure} \caption{\emph{rmarkdown}/Beamer Example Title Slide} \label{rmarkdownBeamerExample} \begin{center} \includegraphics[scale=0.45]{Children/Chapter13/images13/rmarkdownBeamerExample.png} \end{center} \end{figure} %%%%%%%%%%%%%% RStudio Presentations Presentations %%%%%%%% \subsection{Slideshows with Markdown and RStudio's R Presentations} Another easy, but less customizable way to create HTML slideshows is with RStudio's R Presentation\index{R Presentation}\index{RStudio!R Presentation} documents. To get started, open RStudio and click \texttt{File} \textrightarrow \: \texttt{New} \textrightarrow \: \texttt{R Presentation}. RStudio will then ask you to give the presentation a name and save it in a particular file. The reason RStudio does this is because an R Presentation is not just one file. Instead it includes: \begin{itemize} \item A \emph{.Rpres}\index{Rpres} file, which is very similar to a \emph{knitr} Markdown \emph{.Rmd} file. \item A \emph{.md} Markdown file created from the \emph{.Rpres} file. \item \emph{knitr} cache and figure folders, also created from the \emph{.Rpres} file. \end{itemize} \paragraph{Editing and compiling the presentation} You change the presentation's content by editing the \emph{.Rpres} file using the normal \emph{knitr} Markdown syntax we've covered. The only difference is how you create new slides. Luckily, the syntax for this is very simple. Just type the slide's title then at least three equal signs (\verb|===|). For example, <>= This is an Example .Rpres Slide Title === @ \noindent The very first slide is automatically the title slide and will be formatted differently from the rest.\footnote{As of this writing it is a blue slide with white letters.} Here is an example of a complete \emph{.Rpres} file: <>= Example R Presentation === ## Christopher Gandrud ## 1 July 2015 Access the Code === The code to create the following figure is available online. 
To access it we type: ```{r, eval=FALSE} # Access and run the code to create a caterpillar plot devtools::source_url("http://bit.ly/VRKphr") ``` Caterpillar Plot === ```{r, echo=FALSE, message=FALSE} # Access and run the code to create a caterpillar plot devtools::source_url("http://bit.ly/VRKphr") ``` Fertilizer Consumption Map (2003) === ```{r CreategvisGeoMap, echo=FALSE, message=FALSE, results='asis'} # Create geo map of global fertilizer consumption for 2003 devtools::source_url("http://bit.ly/VNnZxS") ``` @ \noindent This example includes four slides and three code chunks. The last code chunk uses the \emph{googleVis}\index{googleVis} package to create the global map of fertilizer consumption we saw earlier in Figure \ref{GeoMapImage}. Because the slideshow we are creating is in HTML, the map will be fully dynamic. Note that like before you will not be able to see the map in the RStudio preview, only in a web browser. To compile the slideshow, either click the \texttt{Preview} button (\includegraphics[scale=0.35]{Children/Chapter13/images13/PreviewButton.png}) or save the \emph{.Rpres} document. When you do this, you can view your updated slideshow in the \emph{Presentation} pane.\index{RStudio!Presentation pane} For example, see Figure \ref{PresentPane}. You can navigate through the slideshow using the arrow buttons at the bottom right of the \emph{Presentation} pane. If you click the magnifying glass icon (\includegraphics[scale=0.35]{Children/Chapter13/images13/MagGlass.png}) at the top of the \emph{Presentation} pane you will get a much larger view of the slideshow. You can also view the slideshow in your web browser by clicking on the \texttt{More} icon (\includegraphics[scale=0.35]{Children/Chapter13/images13/MorePres.png}), then \texttt{View in Browser}. \paragraph{Publishing slideshows} You can of course, view your slideshows locally. 
To share your presentation with others, you probably want to either publish the presentation to a standalone HTML file and host it, for example, on a Dropbox\index{Dropbox} \emph{Public}\index{Dropbox!Public folder} folder or publish it directly to RPubs.\index{RPubs} For R Presentations, create a standalone HTML file by simply clicking the \texttt{More} button in the \emph{Presentation} pane, then \texttt{Save as Webpage...}. Under the \texttt{More} button you can also choose the option \texttt{Publish to RPubs...}. \begin{figure} \caption{RStudio R Presentation Pane} \label{PresentPane} \begin{center} \includegraphics[width=\textwidth]{Children/Chapter13/images13/PresentationPane.png} \end{center} \end{figure} \section{Publishing HTML Documents Created by R Markdown} In Chapter \ref{GettingStartedRKnitr} (Section \ref{PublishRPubs}) we saw how to publish other R Markdown documents compiled with RStudio to RPubs. The \emph{knitr} function \texttt{knit2wp} can be used to post a knitted Markdown file to WordPress\footnote{\url{http://wordpress.com}} sites, which are often used for blogging.\index{knitr!knit2wp}\index{WordPress} In this section we will look at two other ways to publish R Markdown documents using Dropbox and GitHub. \subsection{Standalone HTML files} Of course, you can simply open the HTML file rendered from any R Markdown document in your web browser. If the HTML file contains the full information for the page, as it generally does when created by \emph{rmarkdown}, i.e. the file does not depend on any auxiliary files, you can simply share this file via email or whatnot and anyone with a web browser can open it. We can, of course, also send auxiliary files if need be, but this can get unwieldy.
\subsection{Hosting webpages with Dropbox}\index{Dropbox!Public folder} Probably one of the easiest ways to host an HTML file created with R Markdown is on your Dropbox \emph{Public} folder.\footnote{See Section \ref{EnablePublicFolder} for instructions on how to enable this folder if you created your Dropbox account after 4 October 2012.} Any HTML file in the \emph{Public} folder will be rendered and widely accessible simply by entering the public link into a web browser. \subsection{GitHub Pages}\index{GitHub!Pages}\index{GitHub} GitHub also offers a free hosting service for webpages. These can be much more complex than a single HTML file. The simplest way to create one of these pages is to create a repository with a file called \emph{README.Rmd}. You can \texttt{knit} this file and then create your GitHub Page with it. To do this, go to the \texttt{Settings} \textrightarrow{} \texttt{GitHub Pages} on your repository's main GitHub website. Then click \texttt{Automatic Page Generator}. This places the contents of your \emph{README.md} file in the page and provides you with formatting options. Click \texttt{Publish} and you will have a new website. Clicking \texttt{Publish} creates a new orphan branch\footnote{An orphan branch is a branch with a different root from other repository branches. Another way of thinking about this is that they have their own history.}\index{git!orphan branch} called \emph{gh-pages}.\index{GitHub!gh-pages branch} When these branches are pushed to GitHub it will create a website based on a file called \emph{index.html} that you include in the branch. This will be the website's main page. If you want to create more customized and larger websites with GitHub Pages, you can manually create a GitHub Pages orphan branch and push it to GitHub. This is essentially what \emph{slidify} did for us with its \texttt{publish}\index{R function!publish} command.
Imagine we have our working directory set as a repository containing an R Markdown file that we have rendered into an HTML file called \emph{index.html}.\index{HTML} Let's create a new orphan branch: <>= # Create orphan gh-pages branch git checkout --orphan gh-pages @ \noindent Now \texttt{add}\index{Git command!add} the files, \texttt{commit}\index{Git command!commit} the changes and \texttt{push}\index{Git command!push} it to GitHub. Push it to the \emph{gh-pages} branch like this: <>= # Add files git add . # Commit changes git commit -am "First gh-pages commit" # Push branch to GitHub Pages git push origin gh-pages @ \noindent A new webpage will be created at: \emph{USERNAME.GitHub.io/REPO\_NAME} You can also add custom domain names. For details see: \url{https://help.GitHub.com/articles/setting-up-a-custom-domain-with-GitHub-pages/}. \subsection{Further information on R Markdown} We have covered many of the core capabilities of \emph{rmarkdown} for creating reproducible research documents. Please see RStudio's R Markdown documentation (\url{http://rmarkdown.rstudio.com/}) for even more information. Another tool to look into for interactive results presentation is the \emph{shiny} package \citep{R-shiny}.\index{shiny} It gives R the capability to create interactive web applications,\index{web application} not just the static websites that we have covered in this chapter. This package is well integrated with RStudio. For more information please see: \url{http://shiny.rstudio.com/}. \subsection*{Chapter summary} In this chapter we learned a number of tools for dynamically presenting our reproducible research on the web as well as how to create PDFs with the simple R Markdown syntax. Though LaTeX and PDFs will likely remain the main tools for presenting research in published journals and books for some time to come, choosing to also make your research available in online native formats can make it more accessible to general readers. 
It also allows you to take advantage of interactive tools for presenting your research. \emph{rmarkdown} also makes it easy to create documents in a variety of formats using the simple R Markdown format. For relatively simple documents this can be a very useful tool. ================================================ FILE: Old/Source-v2/Children/Chapter14/chapter14.Rnw ================================================ % Chapter Chapter 14 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 5 May 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Conclusion}\label{FinalChapter} \begin{quote} \emph{Well, we have completed our journey. The only thing left to do now is practice, practice, practice.} \citep[][432]{ShottsJr2012} \end{quote} In this book we learned a workflow for highly reproducible computational research and many of the tools needed to actually do it. Hopefully, if you haven't already, you will begin using and benefiting from these tools in your own work. Though we've covered enough material in this book to get you well on your way, there is still a lot more to learn. With most things computational (possibly most things in general), one of the best ways to continue learning is to practice and try new things. Inevitably you will hit walls, but there are almost always solutions that can be found with curiosity and patience. The R and reproducible research community is extremely helpful when it comes to finding and sharing solutions. I highly recommend getting involved in and eventually contributing to this community to get the most out of reproducible research.\footnote{A good point of entry into the R reproducible research community is R-bloggers (\url{http://www.r-bloggers.com/}). The site aggregates many different blogs on R-related topics from both advanced and relatively new R users. 
I have found that beyond just consuming other people's insights, contributing to R-bloggers--having to clearly write down my steps--has sharpened my understanding of the reproducible research process and enabled me to get great feedback. Other really useful resources are the R Stack Overflow (\url{http://stackoverflow.com/questions/tagged/r}) and Cross Validated (\url{http://stats.stackexchange.com/questions/tagged/r}) sites.} Before ending the book, I want to briefly address five issues we have not covered so far that are important for reproducible research: citing reproducible research, licensing this research, sharing your code with R packages, whether or not to make your research files public before publishing the results, and whether or not it is possible to completely future-proof your research. \section{Citing Reproducible Research}\index{citing} There are a number of well-established methods for citing presentation documents, especially published articles and books. However, as we discussed in the beginning, these documents are just the advertising for research findings rather than the actual research \cite[385]{Buckheit1995,Donoho2010}. If other researchers are going to use the data and source code used to create the findings in their own work, they need a way of actually citing the particular data and source code they used. Citing data and source code presents unique problems. Data and source code can change and be updated over time in a way that published articles and books generally are not. As such we have a much less developed, or at least less commonly used set of standards for citing these types of materials. One possibility is a standard for citing quantitative data sets laid out by \cite{Altman2007} \citep[see also][]{King2007}.
They argue that quantitative data set citations should: \begin{itemize} \item allow a reader to quickly understand the nature of the cited data set, \item unambiguously identify a particular version of the data set, and \item enable reliable location, retrieval, and verification of the data set. \end{itemize} \noindent The first issue can be solved by having a citation that includes the author, the date the data set was made public, and its title. However, these things do not unambiguously identify the data set as it may be updated or changed and it does not enable its location and retrieval. To solve this problem, \citeauthor{Altman2007} suggest that these citations also include: \begin{itemize} \item a unique global identifier (UGI),\index{UGI} \item a universal numeric fingerprint (UNF),\index{UNF} and \item a bridge service.\index{bridge service} \end{itemize} \noindent A UGI uniquely identifies the data set. Examples include Digital Object Identifiers (DOI)\index{DOI} and the Handle System.\footnote{See: \url{http://www.handle.net/}.}\index{Handle System} UGIs by themselves do not uniquely identify a particular version of a data set. This is where UNFs come in. They uniquely identify each version of a data set. Finally, a bridge service links the UGI and UNF to an actual document, usually posted online, so that it can be retrieved. There are many ways to register DOIs and Handle UGIs. Most of these also include means for creating UNFs and a bridge service.\index{bridge service} Examples of services that store your work and assign it DOIs are figshare\index{figshare}\footnote{\url{http://figshare.com/}} and Zenodo.\index{Zenodo}\footnote{\url{https://zenodo.org/}} Zenodo can be integrated with GitHub so that it will store and create citations for a specific commit of a GitHub repository whenever you create a tag.\index{git command!tag} For more information about integrating GitHub and Zenodo see: \url{https://guides.GitHub.com/activities/citable-code/}.
Please see \cite{Altman2007} for details of other services.\footnote{The Dataverse Project (\url{http://thedata.org/}) offers a free service to host files that also uses the Handle System to assign UGIs, UNFs, and provides a bridge service. See \cite{Gandrud2013} for a comparison of Dataverse\index{Dataverse Project} with GitHub\index{GitHub} and Dropbox\index{Dropbox} for data storage.} Though \citeauthor{Altman2007} are interested in data sets, their system could easily be applied to source code as well. UGIs could identify a source code file or collection of files. The UNF could identify a particular version and a bridge service would create a link to the actual files. \section{Licensing Your Reproducible Research}\index{licensing} In the United States and many other countries, research, including computer code made available via the internet, is automatically given copyright protection. However, copyright protection works against the scientific goals of reproducible research, because work derived from the research falls under the original copyright protections \cite[36]{Stodden2009}. To solve this problem, some authors have suggested placing code under an open source software license like the GNU General Public License (GPL) \cite[]{Vandewalle2007}.\index{GNU General Public License} \cite{Stodden2009} argues that this type of license is not really adequate for making available the data, code, and other material needed to reproduce research findings in a way that enables scientific validation and knowledge growth. I don't want to explore the intricacies of these issues here. Nonetheless, they are important for computational researchers to think about, especially if their data and source code are publicly available. Two good places to go for more information are \cite{Stodden2009} and \cite{CreativeCommons2012}.
\section{Sharing Your Code in Packages}\index{R!package development}\index{R!functions} Developing R functions and putting them into packages is a good way to enable cumulative knowledge development. Many researchers spend a considerable amount of time writing code to solve problems that no one has addressed yet, or haven't addressed in a way that they believe is adequate. It is very useful if they make this code publicly accessible so that others can perhaps adopt and use it in their own work without having to duplicate the effort used to create the original functions. Abstracting your code into functions so that they can be applied to many problems and distributing them in easily installed packages makes it much easier for other researchers to adopt and use your code to help solve their research problems. The active community of researcher/package developers is one of the main reasons that R has become such a widely used and useful statistical language. Many of the tools we have covered in this book provide a good basis to start making and distributing functions. We have discussed many of the R commands and concepts that are important for creating functions. We have also looked at Git and GitHub, which are very helpful for developing and distributing packages. Learning about Hadley Wickham's \emph{devtools} package is probably the best next step for you to take to be able to develop and distribute functions in packages. He has an excellent introduction to \emph{devtools} and R package development in general at \url{http://adv-r.had.co.nz/Philosophy.html#introduction-to-devtools}. RStudio Projects\index{RStudio!Projects}\index{RStudio!package development} have excellent \emph{devtools} integration and are certainly worth using. To begin creating a new package in RStudio, start a new project, preferably with Git version control (see Section \ref{NewProjectGit}). In the \textbf{New Project} window select \texttt{Package}. 
Now you will have a new Project with all of the files and directories you need to get started making packages that will hopefully be directly useful for the computational research community. \section{Project Development: Public or Private?} Hopefully I have made a convincing case in this book that research results, especially in academia, should almost always be highly reproducible. The files used to create the results need to be publicly available for the research to be really reproducible.\footnote{There are obvious exceptions, such as when a study's participants' identities need to remain confidential.} During the development of a research project, however, should files be public or private? On the one hand, openness encourages transparency and feedback. Other researchers may alert you to mistakes before a result is published. On the other hand, there are worries that you may be ``scooped''. Another researcher might see your files, take your idea, and publish it before you have a chance to. In general, this worry may be a bit overblown. Especially if you use a version control\index{version control} system that clearly dates all of your file versions, it would be very easy to make the case that someone has stolen your work. Hopefully this possibility would discourage any malfeasance. That being said, unlike the clear need to make research files available after publication, during research development there are good reasons for both making files public and keeping them private. Researchers should probably make this decision on a case-by-case basis. In general, I choose to make my research repositories public to increase transparency and encourage feedback. The community of researchers in my field is relatively small and close knit. It would be hard for someone to take my work and pass it off as their own. This is especially true if many people already know that they are my ideas, because I have made my research files publicly available.
However, during the development of this book, which has a more general appeal, I kept the repository private to avoid being ``scooped''. Regardless, cloud storage systems like GitHub\index{GitHub} make it easy to choose whether or not to make your files public or private. You can easily keep a repository private while you create a piece of research and then make it public once the results are published. \section{Is it Possible to Completely Future-Proof Your Research?} \index{future-proof research|(} In this book we've looked at a number of ways to help future-proof\index{future-proof} your research so that future researchers (and you) are able to actually reproduce it. These included storing your research in text files, clearly commenting on your code, and recording information about the software environment you used by, for example, recording your session info.\index{session info} Are these steps enough to completely ensure that your research will always be reproducible? The simple answer is probably no. Software changes, but it is difficult to foresee what these changes will be. Nonetheless, beyond what we have discussed so far there are other steps we can take to make our reproducible research as future-proof as possible. One of the main obstacles to completely future-proofing your research is that no (or at least very few) pieces of software are complete. R packages are updated. R is updated. Your operating system is updated. These and other software programs discussed in this book may not only be updated, but also discontinued. Changes to the software you used to find your results may change the results someone reproducing your research gets. This problem becomes larger as you use more pieces of software in your research. That being said, many of the software tools we have learned about in this book have future-proofing at their heart. TeX, the typesetting system that underlies LaTeX, is probably the best example.
TeX was created in 1978 and has since been maintained with future-proofing in mind \citep{Knuth1990}. Though changes and new versions continue to be made, we are still able to use TeX to recreate documents in their original intended form even if they were written over thirty years ago. We also saw that, though R and especially R packages change rapidly, the Comprehensive R Archive Network\index{CRAN} stores and makes accessible old versions (as the name suggests). Old versions can be downloaded by anyone wishing to reproduce a piece of research, provided the original researcher has recorded which versions they used. This is very easy using \emph{repmis}'s \texttt{LoadandCite} command.\index{R function!LoadandCite} This command lets you specify particular package versions to install and load from the CRAN package archive.\footnote{Do this by entering specific package version numbers in the \texttt{versions} argument.} Another approach is to use the \emph{packrat} \index{packrat}\index{dependency management} R package \citep{R-packrat} for managing the packages your project depends on. Some of the other technologies discussed in this book may be less reliable over time, so some caution should be taken if you intend to use them to create fully reproducible research. In addition to documenting what software you used and using software that archives old versions, some have suggested another step to future-proof reproducible research: encapsulate it in a virtual machine\index{virtual machine} that is available on a cloud storage system. See in particular \cite{Howe2012}. A virtual reproducible research machine would store a ``snapshot [of] a researcher's entire working environment, including data, software, dependencies, notes, logs, scripts, and more''. If the virtual machine is stored on a cloud server, then anyone wanting to reproduce the research could access the full computing environment used to create a piece of research \citep[36]{Howe2012}. 
As long as others could run the virtual machine and access the cloud storage system, you would not have to worry about changing software, because the exact versions of the software you used would be available in one place. We don't have space to cover the specifics of how to create a virtual machine in this book. However, using a virtual machine is a tool that can be added to the workflow discussed in this book, rather than being a replacement for it. Carefully documenting your steps, clearly organizing your files, and dynamically tying together your data gathering, analysis, and presentation files helps you and others understand how you created a result after a research project's results have been published. Being able to understand your research will give it higher research impact as others can more easily build on it. The steps covered in this book will still encourage you to have better work habits from the beginning of your research projects even if you will be using a virtual machine. The tools and workflow will also continue to facilitate collaboration and make it easier to dynamically update your research documents when you make changes. \index{future-proof research|)} \vspace{1cm} \noindent Now, get started with reproducible research! ================================================ FILE: Old/Source-v2/Children/Chapter2/chapter2.Rnw ================================================ % Chapter Chapter 2 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 5 Mays 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Getting Started with Reproducible Research}\label{GettingStartedRR} Researchers often start thinking about making their work reproducible near the end of the research process when they write up their results or maybe even later when a journal requires their data and code be made available for publication. 
Or maybe even later when another researcher asks if they can use the data from a published article to reproduce the findings. By then there may be numerous versions of the data set and records of the analyses stored across multiple folders on the researcher's computers. It can be difficult and time consuming to sift through these files to create an accurate account of how the results were reached. Waiting until near the end of the research process to start thinking about reproducibility can lead to incomplete documentation that does not give an accurate account of how findings were made. Focusing on reproducibility from the beginning of the process and continuing to follow a few simple guidelines throughout your research can help you avoid these problems. Remember ``reproducibility is not an afterthought--it is something that must be built-into the project from the beginning'' \cite[386]{Donoho2010}. This chapter first gives you a brief overview of the reproducible research process: a workflow for reproducible research. Then it covers some of the key guidelines that can help make your research more reproducible. \section{The Big Picture: A Workflow for Reproducible Research} The three basic stages of a typical computational empirical research project are: \begin{itemize} \item data gathering, \item data analysis, \item results presentation. \end{itemize} \noindent Each stage is part of the reproducible research workflow covered in this book. Tools for reproducibly gathering data are covered in Part II. Part III teaches tools for tying the data we gathered to our statistical analyses and presenting the results with tables and figures. Part IV discusses how to tie these findings into a variety of documents you can use to advertise your findings. Instead of starting to use the individual tools of reproducible research as soon as you learn them, I recommend briefly stepping back and considering how the stages of reproducible research {\emph{tie}} together overall. 
This will make your workflow more coherent from the beginning and save you a lot of backtracking later on. Figure \ref{WorkflowTies} illustrates the workflow. Notice that most of the arrows connecting the workflow's parts point in both directions, indicating that you should always be thinking about how to make it easier to go backwards through your research, i.e. reproduce it, as well as forwards. Around the edges of the figure are some of the commands you will learn to make it easier to go forwards and backwards through the process. These commands tie your research together. For example, you can use API-based R packages\index{API} to gather data from the internet. You can use R's \texttt{merge} command to combine data gathered from different sources into one data set. The \texttt{getURL}\index{R function!getURL} function from R's \emph{RCurl} package \citep{R-RCurl} and the {\texttt{read.table}}\index{R function!read.table} function can be used to bring this data set into your statistical analyses. The {\emph{knitr}} or \emph{rmarkdown} package then ties your analyses into your presentation documents. This includes the code you used, the figures you created, and, with the help of tools such as the \texttt{kable}\index{R function!kable} function in the \emph{knitr} package, tables of results. You can even tie multiple presentation documents together. For example, you can access the same figure for use in a LaTeX article and a Markdown-created website with the \texttt{includegraphics}\index{LaTeX command!includegraphics} and \texttt{![]()}\index{![]()} commands, respectively. This helps you maintain a consistent presentation of results across multiple document types. We'll cover these commands in detail throughout the book. 
See Table \ref{TableTieCommands} for a brief but more complete overview of the main {\emph{tie commands}}.\index{tie commands} \subsection{Reproducible theory} An important part of the research process that I do not discuss in this book is the theoretical stage. Ideally, if you are using a deductive research design, the bulk of this work will precede and guide the data gathering and analysis stages. Just because I don't cover this stage of the research process doesn't mean that theory building can't and shouldn't be reproducible. It can in fact be ``the easiest part to make reproducible'' \cite[1254]{Vandewalle2007}. Quotes and paraphrases from previous works in the literature obviously need to be fully cited so that others can verify that they accurately reflect the source material. For mathematically based theory, clear and complete descriptions of the proofs should be given. Though I don't actively cover theory replication in depth in this book, I do touch on some of the ways to incorporate proofs and citations into your presentation documents. These tools are covered in Part IV. \begin{landscape} \begin{figure} \caption{Example Workflow \& a Selection of Commands to Tie It Together} \label{WorkflowTies} \input{Children/Chapter2/images2/WorkFlowLinks.tex} \end{figure} \end{landscape} \section{Practical Tips for Reproducible Research} Before we start learning the details of the reproducible research workflow with R and RStudio, it's useful to cover a few broad tips that will help you organize your research process and put these skills in perspective. The tips are: \begin{enumerate} \item Document everything! \item Everything is a (text) file. \item All files should be human readable. \item Explicitly tie your files together. \item Have a plan to organize, store, and make your files available. \end{enumerate} \noindent Using these tips will help make your computational research really reproducible.
\subsection{Document everything!} In order to reproduce your research, others must be able to know what you did. You have to tell them what you did by documenting as much of your research process as possible. Ideally, you should tell your readers how you gathered your data, analyzed it, and presented the results. Documenting everything is the key to reproducible research and lies behind all of the other tips in this chapter and tools you will learn throughout the book. \paragraph{Document your R session info}\label{SessionInfoHow} Before discussing the other tips it's important to learn a key part of documenting with R. You should \emph{record your session info\index{session info}}. Many things in R have stayed the same since it was introduced in the early 1990s. This makes it easy for future researchers to recreate what was done in the past. However, things can change from one version of R to another and especially from one version of an R package to another. Also, the way R functions and how R packages are handled can vary across different operating systems, so it's important to note what system you used. Finally, you may have R set to load packages\index{packages} by default (see Section \ref{Packages} for information about packages). These packages might be necessary to run your code, but other people might not know what packages and what versions of the packages were loaded from just looking at your source code. The \texttt{sessionInfo} command\index{R function!sessionInfo} in R prints a record of all of these things. The information from the session I used to create this book is: <>= # Print R session info sessionInfo() @ \noindent Chapter \ref{DirectoriesChapter} gives specific details about how to create files with dynamically included session information. If you use non-R tools you should also record what versions of these tools you used. 
\subsection{Everything is a (text) file} Your documentation is stored in files that include data, analysis code, the write-up of results, and explanations of these files (e.g. data set codebooks, session info files, and so on). Ideally, you should use the simplest file format possible to store this information. Usually the simplest file format is the humble, but versatile, text file.\footnote{Plain text files are usually given the file extension \texttt{.txt}. Depending on the size of your data set it may not be feasible to store it as a text file. Nonetheless, text files can still be used for analysis code and presentation files.} Text files are extremely nimble. They can hold your data in, for example, comma-separated values ({\tt{.csv}}) \index{comma-separated values} format. They can contain your analysis code in {\tt{.R}} files. And they can be the basis for your presentations as markup documents like {\tt{.tex}} or {\tt{.md}}, for LaTeX and Markdown files, respectively. All of these files can be opened by any program that can read text files. One reason reproducible research is best stored in text files is that this helps {\emph{future-proof}}\index{future-proof research} your research. Other file formats, like those used by Microsoft Word \index{Microsoft Word} (\texttt{.docx}) or Excel\index{Microsoft Excel} (\texttt{.xlsx}), change regularly and may not be compatible with future versions of these programs. Text files, on the other hand, can be opened by a very wide range of currently existing programs and, more likely than not, future ones as well. Even if future researchers do not have R or a LaTeX distribution, they will still be able to open your text files and, aided by frequent comments (see below), be able to understand how you conducted your research \cite[3]{Bowers2011}. 
Text files are also very easy to search and manipulate with a wide range of programs--such as R and RStudio--that can find and replace text characters as well as merge and separate files. Finally, text files are easy to version and changes can be tracked using programs such as Git (see Chapter \ref{Storing}). \subsection{All files should be human readable} Treat all of your research files as if someone who has not worked on the project will, in the future, try to understand them. Computer code is a way of communicating with the computer. It is `machine readable' in that the computer is able to use it to understand what you want to do.\footnote{Of course, if the computer does not understand it will usually give an error message.} However, there is a very good chance that other people (or you six months in the future) will not understand what you were telling the computer. So, you need to make all of your files `human readable'. To make them human readable, you should comment on your code with the goal of communicating its design and purpose \citep{Wilson2012}. With this in mind it is a good idea to {\emph{comment frequently}} \cite[3]{Bowers2011} and {\emph{format your code using a style guide}} \cite[]{Nagler1995}. For especially important pieces of code you should use {\emph{literate programming}}\index{literate programming}--where the source code and the presentation text describing its design and purpose appear in the same document. Doing this will make it very clear to others how you accomplished a piece of research. \paragraph{Commenting} In R, everything on a line after a hash character--{\tt{\#}}--(also known as number, pound, or sharp) is ignored by R, but is readable to people who open the file. The hash character is a comment declaration\index{comment declaration} character. You can use the {\tt{\#}} to place comments telling other people what you are doing. 
Here are some examples: <>= # A complete comment line 2 + 2 # A comment after R code @ \noindent On the first line the {\tt{\#}} (hash) is placed at the very beginning, so the entire line is treated as a comment. On the second line the {\tt{\#}} is placed after the simple equation \texttt{2 + 2}. R runs the equation and finds the answer {\tt{4}}, but it ignores all of the words after the hash. Different languages have different comment declaration characters. In LaTeX everything after the {\tt{\%}} percent sign is treated as a comment, and in Markdown/HTML comments are placed inside of {\tt{\textless !-- --\textgreater}}. The hash character is used for comment declaration in command-line shell scripts.\index{shell script} Nagler \citeyearpar[491]{Nagler1995} gives some advice on when and how to use comments: \begin{itemize} \item write a comment before a block of code describing what the code does, \item comment on any line of code that is ambiguous. \end{itemize} \noindent In this book I follow these guidelines when displaying code. Nagler also suggests that all of your source code files should begin with a comment header. {\emph{At the least}} the header should include: \begin{itemize} \item a description of what the file does, \item the date it was last updated, \item the name of the file's creator and any contributors. \end{itemize} \noindent You may also want to include other information in the header such as what files it depends on, what output files it produces, what version of the programming language you are using, sources that may have influenced the code, and how the code is licensed. 
Here is an example of a minimal file header for an R source code file that creates the third figure in an article titled `My Article': <>= ################## # R Source code file used to create Figure 3 in 'My Article' # Created by Christopher Gandrud # MIT License ################## @ \noindent Feel free to use things like the long series of hash marks above and below the header, white space, and indentations to make your comments more readable. \paragraph{Style guides} In natural language writing you don't necessarily have to follow a style guide\index{style guide}. People could probably figure out what you are trying to say, but it is a lot easier for your readers if you use consistent rules. The same is true when writing computer code. It's good to follow consistent rules for formatting your code so that it's easier for you and others to understand. There are a number of R style guides. Most of them are similar to the Google R Style Guide\index{Google R Style Guide}.\footnote{See: \url{https://google.github.io/styleguide/Rguide.xml}.} Hadley Wickham also has a nicely presented R style guide.\footnote{You can find it at \url{http://adv-r.had.co.nz/Style.html}.} You may want to use the {\emph{formatR}}\index{formatR} \cite[]{R-formatR} package to automatically reformat your code so that it is easier to read. \paragraph{Literate programming} For particularly important pieces of research code it may be useful to not only comment on the source file, but also display code in presentation text. For example, you may want to include key parts of the code you used for your main statistical models and an explanation of this code in an appendix following your article. This is commonly referred to as literate programming \index{literate programming} \cite[]{Knuth1992}. \subsection{Explicitly tie your files together} If everything is just a text file, then research projects can be thought of as individual text files that have a relationship with one another.
They are tied together. A data file is used as input for an analysis file. The results of an analysis are shown and discussed in a markup file that is used to create a PDF document. Researchers often do not explicitly document the relationships between files that they used in their research. For example, the results of an analysis--a table or figure--may be copied and pasted into a presentation document. It can be very difficult for future researchers to trace the table or figure back to a particular statistical model and a particular data set without clear documentation. Therefore, it is important to make the links between your files explicit. Tie commands are the most dynamic way to explicitly link your files together.\index{tie commands} These commands instruct the computer program you are using to use information from another file. In Table \ref{TableTieCommands} I have compiled a selection of key tie commands you will learn how to use in this book. We'll discuss many more, but these are some of the most important. \begin{table} \caption{A Selection of Commands/Packages/Programs for Tying Together Your Research Files} \label{TableTieCommands} \vspace{0.3cm} {\footnotesize{ \begin{tabular}{p{2.5cm} c p{5.25cm} p{2cm}} \hline Command/Package/ Program & Language & Description & Chapters Discussed \\[0.3cm] \hline \hline {\emph{knitr}} & R & R package with commands for tying analysis code into presentation documents including those written in LaTeX and Markdown. & \hfill Throughout \\[0.25cm] \emph{rmarkdown} & R & R package that builds on \emph{knitr}. It allows you to use Markdown to output to HTML, PDFs compiled with LaTeX or Microsoft Word. & \hfill Throughout \\[0.25cm] {\tt{download.file}} & R & Downloads a file from the internet. & \hfill\ref{DataGather} \\[0.25cm] {\tt{read.table}} & R & Reads a table into R. You can use this to import a plain-text formatted data file into R.
& \hfill\ref{DataGather} \\[0.25cm] {\tt{read.csv}} & R & Same as \texttt{read.table} with default arguments set to import \texttt{.csv} formatted data files. & \hfill\ref{DataGather} \\[0.25cm] {\tt{source\_data}} & R & Reads a table stored on the internet into R. You can use it to import a plain-text formatted data file into R from secure (https) URLs. & \hfill\ref{DataGather} \\[0.25cm] {\tt{source\_DropboxData}} & R & Imports a plain-text data file stored in a Dropbox non-Public folder into R. & \hfill\ref{DataGather} \\[0.25cm] API-based packages & R & Various packages use APIs to gather data from the internet. & \hfill\ref{DataGather} \\[0.25cm] {\tt{merge}} & R & Merges together data frames. & \hfill\ref{DataClean} \\[0.25cm] {\tt{source}} & R & Runs an R source code file. & \hfill\ref{StatsModel} \\[0.25cm] {\tt{source\_url}} & R & From the {\emph{devtools}} package. Runs an R source code file from a secure ({\tt{https}}) url like those used by GitHub. & \hfill\ref{StatsModel} \\[0.25cm] {\tt{kable}} & R & Creates tables from data frames that can be rendered using Markdown or LaTeX. & \hfill\ref{TablesChapter} \\[0.25cm] {\tt{toLaTeX}} & R & Converts R objects to LaTeX. & \hfill\ref{GettingStartedRR} \\[0.25cm] {\tt{input}} & LaTeX & Includes LaTeX files inside of other LaTeX files. & \hfill\ref{LargeDocs} \\[0.25cm] {\tt{include}} & LaTeX & Similar to {\tt{input}}, but puts page breaks on either side of the included text. Usually it is used for including chapters. & \hfill\ref{LargeDocs} \\[0.25cm] {\tt{includegraphics}} & LaTeX & Inserts a figure into a LaTeX document. & \hfill\ref{FiguresChapter} \\[0.25cm] \texttt{![]()} & Markdown & Inserts a figure into a Markdown document. & \hfill\ref{MarkdownChapter} \\ [0.25cm] Pandoc & shell & A shell program for converting files from one markup language to another. Allows you to tie presentation documents together. 
& \hfill\ref{LargeDocs} \& \ref{MarkdownChapter} \\[0.25cm] Make & shell & A shell program for automatically building many files at the same time. & \hfill\ref{DataGather} \\[0.25cm] \hline \end{tabular} }} \end{table} \subsection{Have a plan to organize, store, and make your files available} Finally, in order for independent researchers to reproduce your work, they need to be able to access the files that instruct them how to do this. Files also need to be organized so that independent researchers can figure out how they fit together. So, from the beginning of your research process you should have a plan for organizing your files and a way to make them accessible. One rule of thumb for organizing your research in files is to limit the amount of content any one file has. Files that contain many different operations can be very difficult to navigate, even if they have detailed comments. For example, it would be very difficult to find any particular operation in a file that contained the code used to gather the data, run all of the statistical models, and create the results figures and tables. If you have a hard time finding things in a file you created, think of the difficulties independent researchers will have! Because we have so many ways to link files together, there is really no need to lump many different operations into one file. So, we can make our files modular. One source code file should be used to complete one or just a few tasks. Breaking your operations into discrete parts will also make it easier for you and others to find errors \cite[490]{Nagler1995}. Chapter \ref{DirectoriesChapter} discusses file organization in much more detail. Chapter \ref{Storing} teaches you a number of ways to make your files accessible through the cloud computing services Dropbox\index{Dropbox} and GitHub\index{GitHub}.
================================================ FILE: Old/Source-v2/Children/Chapter3/chapter3.Rnw ================================================ % Chapter Chapter 3 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 22 March 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Getting Started with R, RStudio, and {\normalfont{knitr}}/\normalfont{rmarkdown}}\label{GettingStartedRKnitr} If you have rarely or never used R before, the first section of this chapter gives you enough information to be able to get started and understand the R code I use throughout the book. For more detailed introductions on how to use R please refer to the resources I mentioned in Chapter \ref{Intro} (Section \ref{OtherBooks}). Experienced R users might want to skip the first section. In the second section I'll give a brief overview of RStudio. I highlight the key features of the main RStudio panel (what appears when you open RStudio) and some of its key features for reproducible research. Finally, I discuss the basics of the {\emph{knitr}}/\emph{rmarkdown} packages, how to use them in R, and how they are integrated into RStudio. %%%%%%%%%%%%% Using R \section{Using R: The Basics} To get you started with reproducible research, we'll cover some very basic R\index{syntax} syntax--the rules for talking to R. I cover key parts of the R language including: \begin{itemize} \item objects \& assignment, \item component selection, \item functions and commands, \item arguments, \item the workspace and history, \item packages. 
\end{itemize} Before discussing each of these in detail, let's open R and look around.\footnote{Please see Chapter \ref{Intro} for instructions on how to install R.} When you open the R GUI program by clicking on the R icon you should get a window that looks something like Figure \ref{RBlankMain}.\footnote{This figure and almost all screenshots in this book were taken on a computer using the Mac OS 10.10 operating system.} This window is the {\bf{R console}}\index{R!console}. Below the startup information--information about what version of R you are using, license details, and so on--you should see a \verb|>| (greater-than sign). This prompt is where you enter R code.\footnote{If you are using a Unix-like\index{Unix-like} system such as Linux\index{Linux} Ubuntu\index{Ubuntu} or Mac OS 10\index{Mac}, you can also access R via an application called the Terminal\index{Terminal}. If you have installed R on your computer you can type {\tt{R}} into the Terminal and then the {\tt{Enter}} or {\tt{Return}} key. This will begin a new R session. You will know you are in a new R session because the same type of startup information as in Figure \ref{RBlankMain} will be printed in your Terminal.} To run R code that you have typed after the prompt, hit the {\tt{Enter}} or {\tt{Return}} key. Now that we have a new R session\index{R!session} open we can get started. \begin{figure}[th!] \caption{R Startup Console} \label{RBlankMain} \begin{center} \includegraphics[scale=0.4]{Children/Chapter3/images3/BlankRConsole.png} \end{center} \end{figure} \subsection{Objects}\label{Objects} If you've read a description of R before, you will probably have seen it referred to as an `object-oriented\index{object-oriented} language'. What are objects? Objects are like the R language's nouns. They are things, like a vector of numbers, a data set, a word, a table of results from some analysis, and so on. Saying that R is object-oriented means that R is focused on doing actions to objects. 
We will talk about the actions--commands and functions--later in this section.\footnote{Somewhat confusingly, commands and functions are themselves objects. In this chapter I treat them as distinct from other object types to avoid confusion.} Now let's create a few objects. \paragraph{Numeric \& string objects} Objects can have a number of different types. Let's make two simple objects. The first is a numeric-type object. The other is a character object. We can choose almost any name\label{ObjectNames}\index{R!object names} we want for our objects as long as it begins with an alphabetic character and does not contain spaces.\footnote{It is common for people to use either periods (\texttt{.}) or capital letters (referred to as CamelBack) to separate words in object names instead of using spaces. For example: {\emph{new.data}} or {\emph{NewData}} rather than {\emph{new data}}. For more information on R naming conventions see \cite{Baath2012}.} Let's call our numeric object {\emph{Number}}.\index{R!numeric} It is a good idea to give each object a unique name to avoid conflicts and confusion. Also make sure that object names are different from the names of their components, e.g. individual variable names. This will avoid many complications like accidentally overwriting an object or confusing R about what object or component you are referring to. To put something into the object we use the assignment operator\index{R!assignment operator}\footnote{The assignment operator is sometimes also referred to as the `gets arrow'.} (\verb|<-|). Let's assign the number 10 to our {\emph{Number}} object. <>= Number <- 10 @ \noindent To see the contents of our object, type its name. <>= Number @ \noindent Let's briefly break down this output. \texttt{10} is clearly the contents of {\emph{Number}}. The double hash (\texttt{\#\#}) is included here to tell you that this is output rather than R code.\footnote{The double hash is generated automatically by {\emph{knitr}}.
It makes it easier to copy and paste code into R from a document created by {\emph{knitr}} because R will ignore everything after a hash.} If you type the commands in your R console, you will not get the double hash in your output. Finally, \texttt{[1]} is the row number of the object that 10 is on. Clearly our object only has one row. Creating an object with words and other characters--a character object--is very similar. The only difference is that you enclose the character string\index{R!character strings} (letters in a word for example) inside of single or double quotation\index{R!quotation marks} marks (\verb|''|, or \verb|""|).\footnote{Single and double quotation marks are interchangeable in R for this purpose. In this book I always use double quotes, except for \emph{knitr} code chunk options.} To create an object called \emph{Words} that contains the character string ``Hello World'':\label{StringObject} <>= Words <- "Hello World" @ An object's type is important to keep in mind as it determines what we can do to it. For example, you cannot take the mean\index{R function!mean}\index{mean} of a character object like the {\emph{Words}} object: <>= mean(Words) @ \noindent Trying to find the mean of our {\emph{Words}} object gives us a warning message and returns the value {\tt{NA}}\index{NA}\index{R!NA}: not available. You can also think of {\tt{NA}} as meaning ``missing''.\index{missing}\index{R!missing} To find out an object's type, use the {\tt{class}}\index{R function!class}\index{class} command. For example: <>= class(Words) @ \paragraph{Vector \& data frame objects} So far we have only looked at objects with a single number or character string.\footnote{These might be called scalar objects,\index{scalar} though in R scalars are just vectors with a length of 1.} Clearly we often want to use objects that have many strings and numbers.
In R these are usually data frame\index{data frame}\index{R!data frame}-type objects and are roughly equivalent to the data structures you would be familiar with from using a program such as Microsoft Excel. We will be using data frames extensively throughout the book. Before looking at data frames it is useful to first look at the simpler objects that make up data frames. These are called vectors. Vectors are R's ``workhorse'' \cite[]{Matloff2011}. Knowing how to use vectors\index{vector}\index{R!vector} will be especially helpful when you clean up raw data in Chapter \ref{DataClean} and make tables in Chapter \ref{TablesChapter}.\footnote{If you want information about other types of R objects such as lists\index{list}\index{R!list object} and matrices\index{matrix}\index{R!matrix object}, Chapter 1 of Norman Matloff's \citeyearpar{Matloff2011} book is a really good place to look.} \\[0.25cm] \noindent {\bf{Vectors}} \\[0.25cm] Vectors are the ``fundamental data type'' in R \cite[]{Matloff2011}. They are simply an ordered group of numbers, character strings, and so on.\footnote{In a vector, every member of the group must be of the same type. If you want an ordered group of values with different types you can use lists.} It may be useful to think of basically all R objects as composed of vectors. For example, data frames are basically multiple vectors of the same length--i.e. they have the same number of rows--attached together to form columns. Let's create a simple numeric vector containing the numbers 2.8, 2, and 14.8. To do this we will use the \texttt{c} (combine)\index{combine}\index{R function!c}\index{R function!combine} function: <>= NumericVect <- c(2.8, 2, 14.8) # Show NumericVect's contents NumericVect @ Vectors of character strings are created in a similar way.
The only major difference is that each character string is enclosed in quotation marks like this: <>= CharacterVect <- c("Albania", "Botswana", "Cambodia") # Show CharacterVect's contents CharacterVect @ To give you a preview of what we are going to do when we start working with real data sets, let's combine the two vectors {\emph{NumericVect}} and {\emph{CharacterVect}} into a new object with the \texttt{cbind}\index{R function!cbind} function. This function binds the two vectors together side-by-side as columns.\footnote{If you want to combine objects as if they were rows of the same column(s), use the \texttt{rbind}\index{R function!rbind} function.}\label{cbind} <>= StringNumObject <- cbind(CharacterVect, NumericVect) # Show StringNumObject's contents StringNumObject @ \noindent By binding these two objects together we've created a new matrix\index{matrix} object.\footnote{Matrices are vectors with columns as well as rows.} You can see that the numbers in the {\emph{NumericVect}} column are between quotation marks. Matrices, like vectors, can only have one data type. \\[0.25cm] \noindent {\bf{Data frames}} \\[0.25cm] If we want to have an object with rows and columns and allow the columns to contain data with different types, we need to use data frames\index{data frame}. Let's use the \texttt{data.frame}\index{R function!data.frame} command to combine the {\emph{NumericVect}} and {\emph{CharacterVect}} objects.\label{data.frame} <>= StringNumObject <- data.frame(CharacterVect, NumericVect) # Display contents of StringNumObject data frame StringNumObject @ \noindent There are a few important things to notice in this output. The first is that because we used the same name for the data frame object as the previous matrix object, R deleted the matrix object and replaced it with the data frame. This is something to keep in mind when you are creating new objects. In general it is a better idea to assign elements to new objects rather than overwriting old ones. 
This will help avoid accidentally using an object you had not intended to. It also allows you to more easily change previously run source code. You can see the data frame's names\index{R function!names}\index{R!names attribute} attribute.\footnote{Matrices can also have a names attribute.} It is the column names. You can use the \texttt{names} command to see any data frame's names:\footnote{You can also use \texttt{names} to assign names for the entire data frame. For example, \texttt{names(StringNumObject) <- c("Variable1", "Variable2")}} <>= names(StringNumObject) @ You will also notice that the first column of the data set has no name and is a series of numbers. This is the row.names attribute.\index{row.names attribute} Data frame rows can be given any name as long as each row name is unique. We can use the \texttt{row.names} command\index{R function!row.names} to set the row names from a vector. For example, <>= # Reassign row.names row.names(StringNumObject) <- c("First", "Second", "Third") # Display new row.names row.names(StringNumObject) @ \noindent You can see in this example how the \texttt{row.names} command can also be used to print the row names.\footnote{Note that this is really only useful for data frames with few rows.} The row.names attribute does not behave like a regular data frame column. You cannot, for example, include it as a variable in a regression. You can use the \texttt{row.names} command to assign the row.names values to a regular column (for an example see Section \ref{RowNamesTidy}). You will notice in the output for \emph{StringNumObject} that the strings in the \textbf{CharacterVect} column are no longer in quotation marks. This does not mean that they are somehow now numeric data. 
To prove this, try to find the mean of \textbf{CharacterVect} by running it through the \texttt{mean}\index{R function!mean} command: <>= mean(StringNumObject$CharacterVect) @ \subsection{Component selection}\label{ComponentSelect} The last bit of code we just saw will probably be confusing. Why do we have a dollar sign (\texttt{\$}) between the name of our data frame object and the \textbf{CharacterVect} variable? The dollar sign is called the component selector.\index{R!component selector}\index{R!\$, component selector}\footnote{It's also sometimes called the element name operator.} It basically extracts a part--component--of an object. In the previous example it extracted the \textbf{CharacterVect} column from the {\emph{StringNumObject}} and fed it to the \texttt{mean} command, which tried (in this case unsuccessfully) to find its mean. We can, of course, use the component selector to create new objects with parts of other objects. Imagine that we have the {\emph{StringNumObject}} and want an object with only the information in the numbers column. Let's use the following code: <>= NewNumeric <- StringNumObject$NumericVect # Display contents of NewNumeric NewNumeric @ \noindent Knowing how to use the component selector will be especially useful when we discuss making tables for presentation documents in Chapter \ref{TablesChapter}. \paragraph{{\tt{attach}} and {\tt{with}}} Using the component selector can create long repetitive code if you want to select many components. You have to write the object name, a dollar sign, and the component name every time you want to select a component. You can streamline your code by using commands such as \texttt{attach}\index{R function!attach} and \texttt{with}\index{R function!with}. The \texttt{attach} command attaches a database to R's search path.\footnote{You can see what is in your current search path with the \texttt{search}\index{R function!search} command.
Just type \texttt{search()} into your R console.}\index{R!search path} R will then search the database for variables you specify. You don't need to use the component selector to tell R again to look in a particular data frame after you have attached it. For example, let's attach the \emph{cars} data that comes with R. It has two variables, \textbf{speed} and \textbf{dist}.\footnote{For more information on this data set, type \texttt{?cars}\index{R function!?} into your R console.} {\small <>= # Attach cars to search path attach(cars) # Display speed head(speed) # Display dist head(dist) # Detach cars detach(cars) @ } \noindent We used the \texttt{head}\index{R function!head} command to see just the first few values of each variable. It is a good idea to \texttt{detach}\index{R function!detach} a data frame after you are done using it, to avoid confusing R. Similarly, you can use \texttt{with} when you run commands using a particular database (see Section \ref{FunctionsCommands} for more details about commands). For example, we can find the mean of \emph{NumericVect} \texttt{with} the \emph{StringNumObject} data frame: <>= with(StringNumObject, { mean(NumericVect) } ) @ \noindent You can see that in the \texttt{with} command the data frame object goes first and then the \texttt{mean}\index{R function!mean} command\footnote{Using R terminology, the second ``argument''\index{R!argument} value--the code after the comma--of the \textbf{with} command is called an ``expression'',\index{expression, R} because it can contain more than one R command or statement. See Section \ref{arguments} for a more comprehensive discussion of R command arguments.} goes second in curly brackets (\verb|{}|). For examples in this book I largely avoid using the \texttt{attach} and \texttt{with} commands. I mostly use the component selector. Though it creates longer code, I find that code written with the component selector is easier to follow. 
It's always clear which object we are selecting a component from. Nonetheless, \textbf{attach} and \textbf{with} are very useful for streamlining your R code. \subsection{Subscripts} Another way to select parts of an object is to use subscripts\index{subscripts}\index{R!subscripts}. You have already seen subscripts in the output from our examples so far. They are denoted with square braces (\texttt{[]}). We can use subscripts to select not only columns from data frames but also rows and individual values. As we began to see in some of the previous output, each part of a data frame has an address captured by its row and column number. We can tell R to find a part of an object by putting the row number/name, column number/name, or both in square braces. The first part denotes the rows and separated by a comma (\texttt{,}) are the columns. To give you an idea of how this works let's use the {\emph{cars}} data set that comes with R. Use the \texttt{head} command to get a sense of what this data set looks like. <>= head(cars) @ \noindent We can see a data frame with information on various cars' speeds (\textbf{speed}) and stopping distances (\textbf{dist}). If we want to select only the third through seventh rows we can use the following subscript commands: <>= cars[3:7, ] @ \noindent The colon (\texttt{:})\index{R!sequence}\index{R function!:} creates a sequence of whole numbers from 3 to 7. To select the fourth row of the \textbf{dist} column we can type: <>= cars[4, 2] @ \noindent An equivalent way to do this is: <>= cars[4, "dist"] @ \noindent Finally, we can even include a vector of column names to select: <>= cars[4, c("speed", "dist")] @ \subsection{Functions and commands}\label{FunctionsCommands} If objects are the nouns of the R language, functions and commands\index{R!commands}\index{R!functions}\footnote{For the purposes of this book I treat the two as the same.} are the verbs. They do things to objects. Let's use the \texttt{mean} command as an example. 
This command takes the mean of a numeric vector object. Remember our {\emph{NumericVect}} object from before: <>= # Show contents of NumericVect NumericVect @ \noindent To find the mean of this object simply type: <>= mean(x = NumericVect) @ \noindent We use the assignment operator to place a command's output into an object. For example: <>= MeanNumericVect <- mean(x = NumericVect) @ \noindent Notice that we typed the command's name then enclosed the object name in parentheses immediately afterwards. This is the basic syntax that all commands use, i.e. \texttt{COMMAND(ARGUMENTS)}. If you don't want to explicitly include an argument \emph{you still need to type the parentheses after the command}. \subsection{Arguments}\label{arguments} Arguments\index{R!command argument} modify what commands do. In our most recent example we gave the \texttt{mean} command one argument (\texttt{x = NumericVect}) telling it that we wanted to find the mean of {\emph{NumericVect}}. Arguments use the \texttt{ARGUMENTLABEL = VALUE} syntax.\footnote{Note: you do not have to put spaces between the argument label and the equals sign or the equals sign and the value. However, having spaces can make your code easier for other people to read.} In this case \textbf{x} is the argument label\index{R!argument label}. To find all of the arguments that a command can accept, look at the {\bf{Arguments}} section of the command's help file\index{help file}. To access the help file type: \texttt{?COMMAND}. For example,\index{R!?} <>= ?mean @ \noindent The help file\index{R!help file} will also tell you the default values that the arguments are set to. Clearly, you do not need to explicitly set an argument if you want to use its default value. You do have to be fairly precise with the syntax for your argument's values. 
Values for logical arguments\index{logical}\index{R!logical values} must be written as \texttt{TRUE} or \texttt{FALSE}.\footnote{They can be abbreviated \texttt{T} and \texttt{F}.} Arguments that accept character strings require quotation marks. Let's see how to use multiple arguments with the \texttt{round} command.\index{R function!round} This command rounds a vector of numbers. We can use the \texttt{digits}\index{R!digits} argument to specify how many decimal places we want the numbers rounded to. To round the object \emph{MeanNumericVect} to one decimal place type: <>= round(x = MeanNumericVect, digits = 1) @ \noindent Note that arguments are separated by commas. Some arguments do not need to be explicitly labeled. For example, we could have written: <>= # Find mean of NumericVect mean(NumericVect) @ \noindent R will do its best to figure out what you want and will only give up when it can't. This will generate an error message. However, to avoid any misunderstandings between yourself and R it can be good practice to label most of your arguments. This will also make your code easier for other people to read, i.e. it will be more reproducible. You can stack arguments inside of other arguments. To have R find the mean of {\emph{NumericVect}} and round it to one decimal place use: <>= round(mean(NumericVect), digits = 1) @ \noindent Stacking functions inside of each other can create code that is difficult to read. Another option that potentially makes more easily understandable code is piping\index{pipe} using the pipe function (\texttt{\%>\%}) that you can access from the \emph{magrittr} package \citep{R-magrittr}.\index{magrittr}\index{R function!\%>\%} The basic idea behind the pipe function is that the output of one function is set as the first argument of the next.
For example, to find the mean of \textbf{NumericVect} and then round it to one decimal place use: <>= # Load magrittr package library(magrittr) # Find mean of NumericVect and round to 1 decimal place mean(NumericVect) %>% round(digits = 1) @ \subsection{The workspace \& history} \index{R!workspace|(}\index{R!environment|(} All of the objects you create become part of your workspace, alternatively known as the current working environment. Use the \texttt{ls}\index{R function!ls} command to list all of the objects in your current workspace.\footnote{Note: your workspace will probably include different objects than this example. These are objects created to knit the book.} {\small <>= ls() @ } You can remove specific objects from the workspace using the \texttt{rm} command\index{R function!rm}. For example, to remove the \texttt{CharacterVect} and \texttt{Words} objects type: <>= rm(CharacterVect, Words) @ To save the entire workspace into a binary--not plain-text--RData file use the \texttt{save.image}\index{R function!save.image}\label{SaveLoadWS} command. The main argument of \texttt{save.image} is the location and name of the file in which you want to save the workspace. If you don't specify the file path it will be saved into your current working directory (see Chapter \ref{DirectoriesChapter} for information on file paths and working directories). For example, to save the current workspace in a file called \emph{DecemberWorkspace.RData} in the current working directory type: <>= save.image(file = "DecemberWorkspace.RData") @ \noindent Use the \texttt{load}\index{R function!load} command to load a saved workspace back into R: <>= load(file = "DecemberWorkspace.RData") @ You should generally avoid having R automatically save your workspace when you quit and reload it when you start R again. Instead, when you return to working on a project, rerun the source code files.
This avoids any complications caused when you use an object in your workspace that is left over from running an older version of the source code.\footnote{For example, imagine you create an object, then change the source code you used to create the object. However, there is a syntax error in the new version of the source code. The old object won't be overwritten and you will be mistakenly using the old object in future commands.} In general I also recommend against saving data in binary RData formatted files. Because they are not text files they are not human readable and are much less future-proof. One of the few times when saving your workspace is very useful is when it includes an object that was computationally difficult and took a long time to create. In this case you can save only the large object with the \texttt{save} command.\footnote{The \texttt{save.image} command is just a special case of \texttt{save}.}\index{R function!save}\label{RSave} For example, if we have a very large object called \emph{Comp} we can save it to a file called \emph{Comp.RData} like this: <>= save(Comp, file = "Comp.RData") @ \index{R!workspace|)}\index{R!environment|)} \paragraph{R history} When you enter a command into R it becomes part of your history.\index{R!history} To see the most recent commands in your history use the \texttt{history}\index{R function!history} command. You can also use the up and down arrows on your keyboard when your cursor is in the R console to scroll through your history. \subsection{Global R options}\label{ROptions} In R you can set global options with the \texttt{options} command.\index{R function!options}\index{R!global options} This lets you set how R runs and outputs commands through an entire R session. 
For example, to have output printed with only one significant digit, set the \texttt{digits}\index{digits} argument: <>= options(digits = 1) @ \subsection{Installing new packages and loading functions}\label{Packages} Commands are stored in R packages\index{R!packages}. The commands we have used so far were loaded automatically by default. One of the great things about R is the many user-created packages\footnote{For the latest list see: \url{http://cran.r-project.org/web/packages/available_packages_by_name.html}.} that greatly expand the number of commands we can use. To install commands that do not come with the basic R installation you need to install the add-on packages\label{packages} that contain them. To do this, use the {\tt{install.packages}}\index{R function!install.packages} command. By default this command downloads and installs the packages from the Comprehensive R Archive Network (CRAN)\index{CRAN}. For the code you need to install all of the packages used in this book, see page \pageref{ReqPackages}. When you install a package, you will likely be given a list of mirrors\index{mirrors, CRAN} from which you can download the package. Simply select the mirror closest to you. Once you have installed a package you need to load it so that you can use its functions. Use the \texttt{library} command to load a package.\footnote{You will probably see R packages referred to as ``libraries'', though this is a misnomer. See this blog post by Carlisle Rainey for a discussion: \url{http://www.carlislerainey.com/2013/01/02/packages-v-libraries-in-r/?utm_source=rss&utm_medium=rss&utm_campaign=packages-v-libraries-in-r} (posted 2 January 2013).} Use the following code to load the {\emph{ggplot2}} package that we use in Chapter \ref{FiguresChapter} to create figures. <>= library(ggplot2) @ \noindent Please note that for the examples in this book I only specify what package a command is from if it is not loaded by default when you start an R session.
Finally, if you want to make sure R uses a command from a specific package you can use the double-colon operator (\verb|::|).\index{R function!double-colon operator}\index{R function!::} For example, to make sure that we use the \texttt{qplot}\index{ggplot2!qplot} function from the \emph{ggplot2} package we type: <>= ggplot2::qplot(. . .) @ \noindent We can use the double-colon to simplify our code as we don't need to include \verb|library(. . .)|. Using the double-colon in this way ensures that R will use the command from the particular package you want and makes it clear to a source code reader what package a command comes from. Note that it does not load all of the functions in the package, just the one you ask for. %%%%%%%%%%%%%%% Using RStudio \section{Using RStudio} As I mentioned in Chapter \ref{Intro}, RStudio is an integrated development environment for R. It provides a centralized and well-organized place to do almost anything you want to do with R. As we will see later in this chapter, it is especially well integrated with literate programming tools for reproducible research. Right now let's take a quick tour of the basic RStudio window. \paragraph{The default window} When you first open RStudio\index{RStudio} you should see a default window that looks like Figure \ref{BlankMain}. In this figure you see three window panes\index{RStudio!Pane}. The large one on the left is the {\emph{Console}}. This pane functions exactly the same as the console in regular R. Other panes include the {\emph{Environment/History}} panes, in the upper right-hand corner. The \emph{Environment} pane\index{RStudio!Environment pane} shows you all of the objects in your workspace and some of their characteristics, like how many observations a data frame has. You can click on an object in this pane to see its contents. This is especially useful for quickly looking at a data set in much the same way that you can visually scan a Microsoft Excel spreadsheet. 
The \emph{History} pane\index{RStudio!History pane} records all of the commands you have run. It also allows you to rerun code and insert it into a source code file. \begin{figure}[ht] \caption{RStudio Startup Panel} \label{BlankMain} \begin{center} \includegraphics[width = \textwidth]{Children/Chapter3/images3/BlankMainPanel.png} \end{center} \end{figure} In the lower right-hand corner you will see the {\emph{Files/Plots/Packages/ Help/Viewer}} panes. We will discuss the \emph{Files}\index{RStudio!Files pane} pane in more detail in Chapter \ref{DirectoriesChapter}. Basically, it allows you to see and organize your files. The \emph{Plots} pane\index{RStudio!Plots pane} is where figures you create in R appear. This pane allows you to see all of the figures you have created in a session using the right and left arrow icons. It also lets you save the figures in a variety of formats. The \emph{Packages} pane\index{RStudio!Packages pane} shows the packages you have installed, allows you to load individual packages by clicking on the dialog box next to them, access their help files (just click on the package name), update the packages, and even install new packages. The \emph{Help} pane\index{RStudio!Help pane} shows you help files. You can search for help files and search within help files using this pane. Finally, the \emph{Viewer} pane\index{RStudio!Viewer pane} allows you to view local web content like JavaScript\index{JavaScript} graphics and Shiny\index{Shiny} web apps. \paragraph{The {\normalfont{Source}} pane} There is an important pane that does not show up when you open RStudio for the first time. This is the {\emph{Source}} pane.\index{RStudio!Source pane} The \emph{Source} pane is where you create, edit, and run your source code files. It also functions as an editor for your markup files. It is the center of reproducible research in RStudio. Let's first look at how to use the \emph{Source} pane with regular R files. 
We will then cover how it works with {\emph{knitr}}/\emph{rmarkdown} in more detail in the next section. R source code files have the file extension \texttt{.R}. When you create a new source code document, RStudio will open a new \emph{Source} pane. Do this by going to the menu bar and clicking on \texttt{File} \textrightarrow \: \texttt{New}. In the \texttt{New} drop-down menu you have the option to create a variety of different source code documents. Select the \texttt{R Script} option. You should now see a new pane with a bar across the top that looks like the first image in Figure \ref{SourcePanes}. To run the R code you have in your source code file simply highlight it\footnote{If you are only running one line of code, you don't need to highlight the code; you can simply put your cursor on that line.} and click the \texttt{Run} icon (\includegraphics[scale=0.5]{Children/Chapter3/images3/RunIcon.png}) on the top bar. This sends the code to the console where it is run. The icon to the right of \texttt{Run} simply runs the code above where you have highlighted. The \texttt{Source} icon next to this runs all of the code in the file using R's \texttt{source} command\index{R function!source}. 
\begin{figure}[ht] \caption{RStudio Source Code Pane Top Bars} \label{SourcePanes} \begin{center} \begin{subfigure} \caption{R Source Code} \label{fig:Rsource} \includegraphics[width = \textwidth]{Children/Chapter3/images3/RSourceBar.png} \end{subfigure} \vspace{0.5cm} \begin{subfigure} \caption{R Markdown Files} \label{fig:RMarkdown} \includegraphics[width = \textwidth]{Children/Chapter3/images3/MarkdownSourceBar.png} \end{subfigure} \vspace{0.5cm} \begin{subfigure} \caption{R LaTeX Files} \label{fig:RLaTeX} \includegraphics[width = \textwidth]{Children/Chapter3/images3/LaTeXSourceBar.png} \end{subfigure} \end{center} \end{figure} \setcounter{figure}{3} %%%%%%%%%%%%% Using knitr \section{Using \emph{knitr} and \emph{rmarkdown}: The Basics} To get started with {\emph{knitr}}\index{knitr} and \emph{rmarkdown} in R or RStudio we need to learn some of the basic concepts and syntax. The concepts are the same regardless of the markup language we are knitting R code with, but much of the syntax varies by markup language. \emph{rmarkdown}\index{rmarkdown} relies on \emph{knitr} and a utility called \emph{Pandoc} to create many different types of presentation documents (HTML, PDF, or MS Word) from one document written largely using \emph{knitr}'s R Markdown syntax. \subsection{What \emph{knitr} does}\index{knitr|(} Let's take a quick, abstract look at what the \emph{knitr} package does. As I've mentioned, \emph{knitr} ties together your presentation of results with the creation of those results. The \emph{knitr} process takes three steps (see Figure \ref{KnitProcess}). First we create a knittable markup document. This contains both the analysis code and the presentation document's markup--the text and rules for how to format the text. \emph{knitr} then \emph{knits}: i.e. it runs the analysis code and converts the output into the markup language you are using according to the rules that you tell it to use. 
It inserts the marked-up results into a document that only contains markup for the presentation document. You \emph{compile}\index{compile markup} this markup document as you would if you hadn't used \emph{knitr} into your final PDF document or webpage presenting your results. \begin{figure} \caption{The {\emph{knitr}/\emph{rmarkdown}} Process} \label{KnitProcess} \begin{center} \input{Children/Chapter3/images3/KnitrProcess.tex} \end{center} \end{figure} \index{knitr|)} \subsection{What \emph{rmarkdown} does}\label{rmardownHeader}\index{rmarkdown|(} The \emph{rmarkdown} package implements a variation on this process that utilizes a program called Pandoc to create presentation documents in multiple formats from a knittable document written in Markdown. The main difference between pure \emph{knitr} markdown and \emph{rmarkdown} documents is the inclusion of a header specifying how you want to render the document with Pandoc.\footnote{Note: you can also create an \emph{rmarkdown} document without a header.} \index{rmarkdown!header |(} The header is written in YAML\index{YAML}.\footnote{YAML is a recursive acronym that means ``YAML Ain't Markup Language''.} The YAML header can include information such as the document's title, author, whether or not to include a table of contents, and a link to a BibTeX bibliography file. YAML is a straightforward data format that organizes information in a simple hierarchy. The header begins and ends with three dashes (\verb|---|). Information keys--like ``title'' and ``author''--are separated from their associated ``values'' by a colon (\texttt{:}). Sub-values of a hierarchy are denoted by being placed on a new line and indented.\footnote{It doesn't matter how many spaces you use to indent, as long as all indentations have the same number of spaces.} Here is a basic \emph{rmarkdown} header that indicates the document's title, author, date, and that it will be turned into a PDF document (via LaTeX).
\begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "30 November 2015" output: pdf_document: toc: true --- \end{alltt} \end{kframe} \end{knitrout} \noindent The title, author, and date, will be placed at the beginning of the output document. The final line (\texttt{toc: true}) creates a table of contents near the beginning of the PDF document when we knit it. We will discuss more header options in Chapter \ref{MarkdownChapter}. RStudio can automatically create a basic header for the type of output document that you want when you open a new \emph{rmarkdown} file. Simply select \texttt{File} \textrightarrow \: \texttt{New File} \textrightarrow\: \texttt{R Markdown\ldots}. A window will appear that looks like Figure \ref{rmarkdownWindow}. Simply select the type of output document you want to create and click \texttt{Ok}. \index{rmarkdown!header |)} In addition to the header, \emph{rmarkdown} differs from \emph{knitr} in that you can include Pandoc syntax in your R Markdown document. This can be useful for bibliographies as we will discuss in Chapter \ref{MarkdownChapter}. Nonetheless, \textbf{remember} that apart from the header and ability to include Pandoc syntax, at the simplest level \emph{rmarkdown} documents are \emph{knitr} documents written in R Markdown syntax. Importantly, they have the same code chunk syntax we will see shortly. \begin{figure} \caption{The New R Markdown Options Window} \label{rmarkdownWindow} \begin{center} \includegraphics[scale=0.5]{Children/Chapter3/images3/newRMarkdown.png} \end{center} \end{figure} \index{rmarkdown|)} \subsection{File extensions} When you save a knittable file, use a file extension that indicates (a) that it is knittable and (b) what markup language it is using.
You can use a number of file extensions for R Markdown files including: \texttt{.Rmd} and \texttt{.Rmarkdown}.\footnote{R Markdown files that you compile with \emph{knitr} or \emph{rmarkdown} have the same \texttt{.Rmd} file extension.} LaTeX documents that include {\emph{knitr}} code chunks are generally called R Sweave\index{R Sweave}\index{Sweave} files and have the file extension {\tt{.Rnw}}. This terminology is a little confusing.\footnote{The ``nw'' refers to noweb, the simple literate programming tool that Sweave builds on.} It is a holdover from {\emph{knitr}}'s main literate programming predecessor {\emph{Sweave}}. You can also use the less confusing file extension \texttt{.Rtex}, as regular LaTeX files have the extension \texttt{.tex}. However, the syntax for \texttt{.Rtex} files is different from that used with \texttt{.Rnw} files. We'll look at this issue in more detail below. \subsection{Code chunks} When you want to include R code into your markup presentation documents, place it in a code chunk\index{code chunk}\index{knitr!code chunk}. Code chunk syntax differs depending on the markup language we are using to write our documents. Let's see the syntax for R Markdown and R LaTeX files. If you are unfamiliar with basic LaTeX or Markdown syntax you might want to skim chapters \ref{LatexChapter} and \ref{MarkdownChapter} to familiarize yourself with it before reading this section. \paragraph{R Markdown}\label{RMarkdownChunkBasic} In R Markdown\index{R Markdown} files we begin a code chunk by writing the head:\index{knitr!code chunk head} \verb|```{r}|. A code chunk is closed--ended--simply with: \verb|```|. For example: <>= ```{r} # Example of an R Markdown code chunk StringNumObject <- cbind(CharacterVect, NumericVect) ``` @ \noindent The R Markdown code chunk syntax is exactly the same for files you compile with \emph{knitr} or \emph{rmarkdown}. \paragraph{R LaTeX} There are two different ways to delimit code chunks in R LaTeX documents.
One way largely emulates the established {\emph{Sweave}} syntax.\footnote{The syntax has its genesis in a literate programming tool called noweb \cite[]{Leisch2002,RamseyNoweb}.} {\emph{knitr}} also supports files with the {\tt{.Rtex}} extension, though the code chunk syntax is different. I will cover both types of syntax for code chunks in LaTeX documents. Throughout the book I use the older and more established Sweave-style syntax. \\[0.25cm] \noindent {\bf{Sweave-style}} \\[0.25cm] Traditional Sweave-style\index{knitr!Sweave-style} code chunks begin with the following head: \texttt{\textless\textless \textgreater\textgreater=}. The code chunk is closed with an at sign (\verb|@|). \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} \textless\textless \textgreater\textgreater= \hlcom{# Example of a Sweave-style code chunk} StringNumObject <- \hlkwd{cbind}(CharacterVect, NumericVect) @ \end{alltt} \end{kframe} \end{knitrout} \noindent {\bf{Rtex-style}} \\[0.25cm] Sweave-style code chunk syntax is fairly baroque compared to the Rtex-style syntax.\index{knitr!Rtex} To begin a code chunk in an \texttt{Rtex} file simply type double percent signs followed by \texttt{begin.rcode}, i.e. \texttt{\%\% begin.rcode}. To close the chunk you use double percent signs: \texttt{\%\%}. Each line in the code chunk needs to begin with a single percent sign. For example: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} %% begin.rcode % # Example of a Rtex-style code chunk % StringNumObject <- cbind(CharacterVect, NumericVect) %% \end{verbatim} \end{kframe} \end{knitrout} \paragraph{Code chunk labels} Each chunk has a label.\index{knitr!code chunk label} When a code chunk creates a plot or the output is cached\index{knitr option!cache}--stored for future use--{\emph{knitr}} uses the chunk label for the new file's name. 
If you do not explicitly give the chunk a label it will be assigned one like: \texttt{unnamed-chunk-1}. To explicitly assign chunk labels in R Markdown documents, place the label name inside of the braces after the \texttt{r}. If we wanted to use the label \texttt{ChunkLabel} we type: <>= ```{r ChunkLabel} # Example chunk label ``` @ \noindent The same general format applies to the two types of LaTeX chunks. In Sweave-style chunks we type: \texttt{\textless\textless ChunkLabel\textgreater\textgreater=}. In Rtex-style we use: \texttt{\%\% begin.rcode ChunkLabel}. Try not to use spaces or periods in your label names. Also remember that chunk labels {\emph{must}} be unique. \paragraph{Code chunk options}\index{knitr!code chunk option|(} There are many times when we want to change how our code chunks are knitted and presented. Maybe we only want to show the code and not the results or perhaps we don't want to show the code at all but just a figure that it produces. Maybe we want the figure to be formatted on a page in a certain way. To make these changes and many others we can specify code chunk options. Like chunk labels, you specify options in the chunk head. Place them after the chunk label, separated by a comma. Chunk options are written following pretty much the same rules as regular R command arguments. They have a similar \verb|OPTION_LABEL=VALUE| structure as arguments. The option values must be written in the same way that argument values are. Character strings need to be inside of quotation marks. The logical \texttt{TRUE} and \texttt{FALSE} operators cannot be written \verb|''true''| and \verb|''false''|. For example, imagine we have a Markdown code chunk called \texttt{ChunkLabel}. 
If we want to run the {\emph{knitr}} code chunk, but not show the code in the final presentation document, we can use the option \texttt{echo=FALSE}.\index{knitr option!echo} <>= ```{r ChunkLabel, echo=FALSE} StringNumObject <- cbind(CharacterVect, NumericVect) ``` @ \noindent Note that all labels and code chunk options must be on the same line. Options are separated by commas. The syntax for {\emph{knitr}} options is the same regardless of the markup language. Here is the same chunk option in Rtex-style syntax: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} %% begin.rcode ChunkLabel, echo=FALSE % StringNumObject <- cbind(CharacterVect, NumericVect) %% \end{verbatim} \end{kframe} \end{knitrout} Throughout this book we will look at a number of different code chunk options. Many of the chunk options we will use in this book are listed in Table \ref{ChunkOptionsTable}. For the full list of {\emph{knitr}} options see the {\emph{knitr}} chunk options page maintained by {\emph{knitr}}'s creator Yihui Xie: \url{http://yihui.name/knitr/options}. \begin{table} \caption{A Selection of {\emph{knitr}} Code Chunk Options} \begin{center} \label{ChunkOptionsTable} \begin{tabular}{l c p{6cm}} \hline Chunk Option Label & Type & Description \\[0.25cm] \hline\hline \texttt{cache} & Logical & Whether or not to save results from the code chunk in a cache database. Note: cached chunks are only run when they are changed. \\[0.25cm] \texttt{cache.vars} & Character Vector & Specify the variable names to save in the cache database. \\[0.25cm] \texttt{eval} & Logical & Whether or not to run the chunk. \\[0.25cm] \texttt{echo} & Logical & Whether or not to include the code in the presentation document. \\[0.25cm] \texttt{error} & Logical & Whether or not to include error messages. \\[0.25cm] \texttt{engine} & Character & Set the programming language for {\emph{knitr}} to evaluate the code chunk with. 
\\[0.25cm] \texttt{fig.align} & Character & Align figures. (Note: does not work with R Markdown documents.) \\[0.25cm] \texttt{fig.path} & Character & Set the directory where figures will be saved. \\[0.25cm] \texttt{include} & Logical & When \texttt{include=FALSE} the chunk is evaluated, but the results are not included in the presentation document. \\[0.25cm] \texttt{message} & Logical & Whether or not to include R messages. \\[0.25cm] \texttt{out.height} & Numeric & Set figures' heights in the presentation document. \\[0.25cm] \texttt{out.width} & Numeric & Set figures' widths in the presentation document. \\[0.25cm] \texttt{results} & Character & How to include results in the presentation document. \\[0.25cm] \texttt{tidy} & Logical & Whether or not to have \emph{knitr} format printed code chunks. \\[0.25cm] \texttt{warning} & Logical & Whether or not to include warnings. \\[0.25cm] \hline \end{tabular} \end{center} {\scriptsize{These commands are discussed in more detail in Chapter \ref{StatsModel}.}} \end{table} \index{knitr!code chunk option|)} \subsection{Global chunk options}\index{knitr!global chunk options|(}\label{GlobalChunkOptions} So far we have only looked at how to set local options in {\emph{knitr}} code chunks, i.e. options for only one specific chunk. If we want an option to apply to all of the chunks in our document we can set global chunk options. Options are `global' in the sense that they apply to the entire document. Setting global chunk options helps us create documents that are formatted consistently without having to repetitively specify the same option every time we create a new code chunk. For example, in this book I center almost all of the figures. Instead of using the \verb|fig.align='center'| option in each code chunk that creates a figure, I set the option globally. 
To set a global option, first create a new code chunk at the beginning of your document.\footnote{In Markdown, you can put global chunk options at the very top of the document. In LaTeX they should be placed after the \texttt{\textbackslash{}begin\{document\}} command (see Chapter \ref{LatexChapter} for more information on how LaTeX documents are structured).} You will probably want to set the option {\tt{include=FALSE}} so that {\emph{knitr}} doesn't include the code in your presentation document. Inside the code chunk use {\tt{opts\_chunk\$set}}. You can set any chunk option as an argument to {\tt{opts\_chunk\$set}}. The option will be applied across your document, unless you set a different local option. Here is an example of how you can center align all of the figures in Sweave-style code chunks. Place the following code at the beginning of the document: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{\textless}ChunkLabel, include=FALSE\textgreater{\textgreater}= \hlcom{# Center align all knitr figures} opts_chunk$\hlkwd{set}(fig.align='center') @ \end{alltt} \end{kframe} \end{knitrout} \noindent \texttt{Note:} if you want to use \texttt{opts\_chunk} in a document rendered with \emph{rmarkdown} you will need to load \emph{knitr} in a code chunk preceding the call. \index{knitr!global chunk options|)} \subsection{\emph{knitr} package options}\index{knitr!package options|(} {\emph{knitr}} package options affect how the package itself runs. For example, the {\tt{progress}} option can be set as either {\tt{TRUE}} or {\tt{FALSE}}\footnote{It's set as {\tt{TRUE}} by default.} depending on whether or not you want a progress bar\index{progress bar} to be displayed when you knit a code chunk. 
You can use {\tt{base.dir}} to set the directory where you want all of your figures to be saved (see Chapter \ref{DirectoriesChapter}) or the {\tt{child.path}} option to specify where child documents are located (see Chapter \ref{LargeDocs}). You set package options in a similar way as global chunk options with {\tt{opts\_knit\$set}}. For example, include this code at the beginning of a document to turn off the progress bar when it is knitted: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{\textless}ChunkLabel, include=FALSE\textgreater{\textgreater}= \hlcom{# Turn off knitr progress bar} opts_knit$\hlkwd{set}(progress=FALSE) @ \end{alltt} \end{kframe} \end{knitrout} \index{knitr!package options|)} \subsection{Hooks} You can also set hooks\index{knitr!hook}. Hooks come in two types: chunk hooks and output hooks. Chunk hooks\index{knitr!chunk hooks} run a function before or after a code chunk. Output hooks\index{knitr!output hooks} change how the raw output is formatted. I don't cover hooks in much detail in this book. For more information on hooks, please see Yihui Xie's webpage: \url{http://yihui.name/knitr/hooks}. %%%%%%%%%% Knitr, rmarkdown, & RStudio \subsection{\emph{knitr}, \emph{rmarkdown}, \& RStudio} RStudio is highly integrated with {\emph{knitr}}/\emph{rmarkdown} and the markup languages that they work with. Because of this integration it is easier to create and compile {\emph{knitr}}/\emph{rmarkdown} documents in RStudio than plain R. Most of the RStudio/{\emph{knitr}}/\emph{rmarkdown} features are accessed in the \emph{Source} pane\index{RStudio!Source pane}. The \emph{Source} pane's appearance and capabilities change depending on the type of file you have open in it. 
RStudio uses a file's extension\index{file extension} and, if it is an \emph{rmarkdown} document, its header, to determine what type of file you have open.\footnote{You can manually set how you want the \emph{Source} pane to act by selecting the file type using the drop-down menu in the lower right-hand corner of the \emph{Source} pane.} We have already seen some of the features the \emph{Source} pane has for R source code files. Let's now look at how to use {\emph{knitr}} and \emph{rmarkdown} with R source code files as well as the markup formats we cover in this book: R Markdown\index{R Markdown} and R LaTeX\index{R LaTeX}. \\[0.25cm] \paragraph{Compiling R source code Notebooks} If you want a quick well-formatted account of the code that you ran and the results that you got you can use RStudio's ``Compile Notebook''\index{RStudio!Notebook} capabilities. RStudio uses \emph{rmarkdown} to create a standalone file presenting your source code and results. It will include all of the code from an R source file as well as the output. This can be useful for recording the steps you took to do an analysis. You can see an example RStudio Notebook in Figure \ref{NotebookExample}. If you want to create a Notebook from an open R source code file simply click the \texttt{Compile Notebook} icon (\includegraphics[scale=0.5]{Children/Chapter3/images3/CompileNotebook.png}) in the \emph{Source} pane's top bar.\footnote{Alternatively, \texttt{File} \textrightarrow \; \texttt{Compile Notebook\ldots}} Then in the window that pops up select the output type you would like (HTML, PDF or MS Word) and click the \texttt{Compile} button. For this example I selected HTML. In Figure \ref{NotebookExample} you can see near the top center right a small globe icon next to the word ``Publish''. Clicking this allows you to publish your Notebook to RPubs (\url{http://www.rpubs.com/}).\index{RPubs} RPubs is a site for sharing your Notebooks over the internet. 
You can publish not only Notebooks, but also any {\emph{knitr}}/\emph{rmarkdown} Markdown document you compile in RStudio.\label{PublishRPubs} \begin{figure} \caption{RStudio Notebook Example} \label{NotebookExample} \begin{center} \includegraphics[scale=0.4]{Children/Chapter3/images3/NotebookExample.png} \end{center} \end{figure} \paragraph{R Markdown} The second image in Figure \ref{SourcePanes} is what the \emph{Source} pane's top bar looks like when you have an R Markdown file open. You'll notice the familiar \texttt{Run} button for running R code. At the far right you can see a new \texttt{Chunks} drop-down menu (\includegraphics[scale=0.5]{Children/Chapter3/images3/ChunksIcon.png}). In this menu you can select \texttt{Insert Chunk} to insert the basic syntax required for a code chunk. There is also an option to \texttt{Run Current Chunk}--i.e. the chunk where your cursor is located--\texttt{Run Next Chunk}, and \texttt{Run All} chunks. You can navigate to a specific chunk using a drop-down menu on the bottom left-hand side of the \emph{Source} pane (e.g. \includegraphics[scale=0.5]{Children/Chapter3/images3/ChunkNav.png}). This can be very useful if you are working with a long document. To knit your file, click the \texttt{Knit HTML} icon on the left side of the \emph{Source} pane's top bar. As its name suggests, this knits the file to an HTML file. If you click on the downward arrow on the right of this icon you will be given the opportunity to also knit the document to a PDF or an MS Word\index{MS Word} file using \emph{rmarkdown}. Other useful buttons in the R Markdown \emph{Source} pane's top bar include the \texttt{ABC} spell check icon and question mark icon, which gives you a Markdown syntax reference file in the Help pane. Another useful RStudio {\emph{knitr}}/\emph{rmarkdown} integration feature is that RStudio can properly highlight both the markup language syntax and the R code in the \emph{Source} pane.
This makes your source code much easier to read and navigate. RStudio can also fold code chunks.\index{code folding} This makes navigating through long documents, with long code chunks, much easier. In the first image in Figure \ref{CodeFold} you can see a small downward facing arrow at line 25. If you click this arrow the code chunk will collapse to look like the second image in Figure \ref{CodeFold}. To unfold the chunk, just click on the arrow again. You may also notice that there are code folding arrows on lines 27 and 34 in the first image. These allow us to fold parts of the code chunk. To enable this option, create a comment line with at least one hash before the comment text and at least four after it like this: <>= #### An RStudio Foldable Comment #### @ \noindent You will be able to fold all of the text after this comment up until the next similarly formatted comment (or the end of the chunk). \begin{figure}[ht!] \caption{Folding Code Chunks in RStudio} \label{CodeFold} \setlength{\belowcaptionskip}{5pt} \centering \begin{subfigure} \caption{Not Folded} \includegraphics[width = \textwidth]{Children/Chapter3/images3/MarkdownNoCollapse.png} \end{subfigure} \\[0.5cm] \begin{subfigure} \caption{Folded} \includegraphics[width = \textwidth]{Children/Chapter3/images3/MarkdownCollapse.png} \end{subfigure} \end{figure} \paragraph{R LaTeX} You can see in the final image in Figure \ref{SourcePanes} that many of the \emph{Source} pane options for R LaTeX files are the same as R Markdown files, the key differences being that there is a \texttt{Compile PDF} icon (\includegraphics[scale=0.5]{Children/Chapter3/images3/CompilePDF.png}) instead of \texttt{Knit HTML}. Clicking this icon knits the file and creates a PDF file in your R LaTeX file's directory. There is also a \texttt{Format} icon instead of the question mark icon. This actually inserts LaTeX formatting commands into your document for things such as section headings and bullet lists. 
These commands can be very tedious to type out by hand otherwise. \paragraph{Change default .Rnw knitter} By default RStudio may be set up to use \emph{Sweave}\index{Sweave} for compiling LaTeX documents. To use {\emph{knitr}} instead of \emph{Sweave} to knit \texttt{.Rnw} files you should click on \texttt{Tools} in the RStudio menu bar then click on \texttt{Global Options...}.\index{RStudio!Options window} Once the {\bf{Options}} window opens, click on the \texttt{Sweave} button. Select \texttt{knitr} from the drop-down menu for ``Weave Rnw files using:''. Finally, click \texttt{Apply}.\footnote{In the Mac version of RStudio, you can also access the \texttt{Options} window via \texttt{RStudio} \textrightarrow\: \texttt{Preferences} in the menu bar.} In the \texttt{Sweave} options menu you can also set which LaTeX typesetting engine to use. By default it is set to the more established engine pdfLaTeX.\index{pdfLaTeX} Another option is XeLaTeX.\index{XeLaTeX} XeLaTeX has the ability to use many more characters than pdfLaTeX as it works with UTF-8\index{UTF-8} encoded input. It can also use any font on your computer. XeLaTeX is especially useful compared to pdfLaTeX if you are using characters that are not found in standard English. \subsection{\emph{knitr} \& R} As {\emph{knitr}} is a regular R package, you can of course, knit documents in R (or using the console in RStudio). All of the {\emph{knitr}} syntax in your markup document is the same as before, but instead of clicking a {\tt{Compile PDF}} or {\tt{knit HTML}} button use the {\tt{knit}} function.\index{R function!knit} To knit a hypothetical Markdown file {\emph{Example.Rmd}} you first use the \texttt{setwd} command to set the working directory (for more details see Chapter \ref{DirectoriesChapter}) to the folder where the {\emph{Example.Rmd}} file is located. In this example it is located in the Documents folder.\footnote{Using the directory name {\tt{$\sim$/Documents/}} is for Mac computers. 
Please use alternative syntax discussed in Chapter \ref{DirectoriesChapter} on other types of systems.} <>= setwd("~/Documents/") @ \noindent Then you knit the file: <>= knit(input = "Example.Rmd", output = "Example.md") @ \noindent You use the same steps for all other knittable document types. Note that if you do not specify the output file, {\emph{knitr}} will determine what the file name and extension should be. In this example it would come up with the same name and location as we gave it. In this example, using the \texttt{knit} function only creates a Markdown file and not an HTML file, as clicking the RStudio {\tt{Knit HTML}} button did. Likewise, if you use {\tt{knit}} on a {\tt{.Rnw}} file you will only end up with a basic LaTeX {\tt{.tex}} file and not a compiled PDF. To convert the Markdown file into HTML you need to further run the {\tt{.md}} file through the {\tt{markdownToHTML}}\index{R function!markdownToHTML} command from the {\emph{markdown}} package, i.e. <>= markdownToHTML(file = "Example.md", output = "Example.html") @ \noindent This is a bit tedious. Luckily, there is a command in the {\emph{knitr}} package that combines \texttt{markdownToHTML} and \texttt{knit}. It is called \texttt{knit2html}.\index{R function!knit2html} You use it like this: <>= knit2html(input = "Example.Rmd", output = "Example.html") @ \noindent If we want to compile a {\tt{.tex}} file in R we run it through the {\tt{texi2pdf}}\index{R function!texi2pdf} function in the {\emph{tools}} package. This function will run both LaTeX and BibTeX to create a PDF with a bibliography (see Chapter \ref{LatexChapter} for more details on using BibTeX\index{BibTeX} for bibliographies). Here is a {\tt{texi2pdf}} example: <>= # Load tools package library(tools) # Compile pdf texi2pdf(file = "Example.tex") @ \noindent Just like with \texttt{knit2html}, you can simplify this process by using the \texttt{knit2pdf} command to compile a PDF file from a \texttt{.Rnw} or \texttt{.Rtex} document.
\subsection{\emph{rmarkdown} and R}\index{rmarkdown}\index{rmarkdown!header |(}\label{rmarkdownRender} Just as \emph{knitr} is an R package that you can run from the console, you can also run \emph{rmarkdown} from the console. Instead of the \texttt{knit} function use \texttt{render}. Imagine that \emph{Example.Rmd} now has an \emph{rmarkdown} header: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "30 November 2015" output: pdf_document: toc: true html_document: toc: false --- \end{alltt} \end{kframe} \end{knitrout} \index{rmarkdown!header |)} \noindent This header specifies how the file can be compiled to either PDF or HTML. When compiled to PDF it will include a table of contents. When compiled to HTML it won't. Now we use \texttt{render}: <>= render("Example.Rmd") @ \noindent This call will compile the document to a PDF in the working directory, because PDF is listed as the first output format in the header. The document will be called \emph{Example.pdf}. Alternatively, to compile the R Markdown file to HTML use: <>= render("Example.Rmd", "html_document") @ \noindent We could compile to both formats using: <>= render("Example.Rmd", "all") @ \noindent or <>= render("Example.Rmd", c("pdf_document", "html_document")) @ In all of these cases, \texttt{render} will not keep the intermediate \emph{.md} or \emph{.tex} document. You can have these documents saved by adding \texttt{keep\_md} or \texttt{keep\_tex} to the header. For example: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor}\begin{kframe} \begin{alltt} --- output: pdf_document: keep_tex: true html_document: keep_md: true --- \end{alltt} \end{kframe} \end{knitrout} \noindent Finally, if you want to output to one format with the default rendering style, for example, the HTML document, use \texttt{html\_document: default}. 
\subsection*{Chapter summary} We've covered a lot of ground in this chapter, including R basics, how to use RStudio, and \emph{knitr}/\emph{rmarkdown} syntax for multiple markup languages. These tools, especially R and \emph{knitr}/\emph{rmarkdown}, are fundamental to the reproducible research process we will learn in this book. They enable us to create dynamic text-based files that record our research steps in detail. In the next chapter we will look at how to organize files created with these types of tools into reproducible research projects. \section*{Appendix: knitr and Lyx}\label{LyxAppendix} You may be more comfortable using a what-you-see-is-what-you-get\index{WYSIWYG} (WYSIWYG) editor, similar to Microsoft Word. Lyx\index{Lyx}\index{knitr!Lyx} is a WYSIWYG LaTeX editor that can be used with \emph{knitr}. I don't cover Lyx in detail in this book, but here is a little information to get you started. \paragraph{Set Up} To set up Lyx so that it can compile \texttt{.Rnw} files, click \texttt{Document} in the menu bar then \texttt{Settings}. In the left-hand panel the second option is \texttt{Modules}. Click on \texttt{Modules} and select \texttt{Rnw (knitr)}. Click \texttt{Add} then \texttt{Ok}. Now, compile your LaTeX document in the normal Lyx way. \paragraph{Code Chunks} Enter code chunks into TeX Code blocks within your Lyx documents. To create a new TeX Code block, select \texttt{Insert} from the menu bar then \texttt{TeX Code}. ================================================ FILE: Old/Source-v2/Children/Chapter4/chapter4.Rnw ================================================ % Chapter Chapter 4 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 16 March 2016 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Getting Started with File Management}\label{DirectoriesChapter} Careful file management is crucial for reproducible research. 
Remember two of the guidelines from Chapter \ref{GettingStartedRR}: \begin{itemize} \item Explicitly tie your files together. \item Have a plan to organize, store, and make your files available. \end{itemize} \noindent Apart from the times when you have an email exchange (or even meet in person) with someone interested in reproducing your research, the main information independent researchers have about the procedures is what they access in files you make available: data files, analysis files, and presentation files. If these files are well organized and the way they are tied together is clear, replication will be much easier. File management is also important for you as a researcher, because if your files are well organized you will be able to more easily make changes, benefit from work you have already done, and collaborate with others. Using tools such as R, {\emph{knitr}}/\emph{rmarkdown}, and markup languages like LaTeX requires fairly detailed knowledge of where files are stored in your computer. Handling files to enable reproducibility may require you to use command-line tools to access and organize your files. R and Unix-like shell programs\index{Unix-like shell program}\index{command-line} allow you to control files--creating, deleting, relocating--in powerful and really reproducible ways. By typing these commands you are documenting every step you take. This is a major advantage over graphical user interface-type systems where you organize files by clicking and dragging them with the cursor. However, text commands require you to know your files' specific addresses--their file paths.\index{file path} In this chapter we discuss how a reproducible research project may be organized and cover the basics of file path naming conventions\index{file path naming conventions} in Unix-like operating systems, such as Mac OS X and Linux, as well as Windows. We then learn how to organize them with RStudio Projects\index{RStudio!Projects}. 
Finally, we'll cover some basic R and Unix-like shell commands for manipulating files as well as how to navigate through files in RStudio in the {\emph{Files}} pane. The skills you will learn in this chapter will be heavily used in the next chapter (Chapter \ref{Storing}) and throughout the book. In this chapter we work with locally stored files\index{locally stored}, i.e. files stored on your computer. In the next chapter we will discuss various ways to store and access files remotely stored in the cloud.\index{remotely stored}\index{cloud storage} \section{File Paths \& Naming Conventions} All of the operating systems\index{operating systems} covered in this book organize files in hierarchical directories\index{directories}, also known as file trees. To a large extent, directories can be thought of as the folders you usually see on your Windows or Mac desktop.\footnote{To simplify things, I use the terms `directory' and `folder' interchangeably in this book.} They are called hierarchical because directories\index{hierarchical directories} are located inside of other directories, as in Figure \ref{ExampleTree}. \subsection{Root directories} A root directory\index{root directory} is the first level in a disk, such as a hard drive. It is the root out of which the file tree `grows'.\index{file tree} All other directories are subdirectories\index{subdirectory} of the root directory. On Windows computers you can have multiple root directories, one for each storage device or partition of a storage device. The root directory is given a drive letter assignment\index{drive letter assignment}. If you use Windows regularly you will most likely be familiar with \texttt{C:\textbackslash{}} used to denote the C partition of the hard drive. This is a root directory. On Unix-like systems, including Macs and Linux computers, the root directory is simply denoted by a forward slash (\texttt{/}) with nothing before it. 
\subsection{Subdirectories \& parent directories} You will probably not store all of your files in the root directory. This would get very messy. Instead you will likely store your files in subdirectories of the root directory. Inside of these subdirectories may be further subdirectories and so on. Directories inside of other directories are also referred to as child directories\index{child directory} of a parent directory\index{parent directory}. On Windows computers separate subdirectories are indicated with a back slash (\textbackslash{}). For example, if we have a folder called {\emph{Data}} inside of a folder called {\emph{ExampleProject}} which is located in the C root directory it has the address \texttt{C:\textbackslash{}ExampleProject\textbackslash{}Data}.\footnote{For more information on Windows file path names see this helpful website: \url{http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx}} When you type Windows file paths into R you need to use two backslashes rather than one: e.g. \verb|C:\\ExampleProject\\Data|. This is because the \texttt{\textbackslash{}} is an escape character\index{escape character} in R.\footnote{As we will see in Part IV, it is also a LaTeX and Markdown escape character.} Escape characters tell R to interpret the next character or sequence of characters differently. For example, in Section \ref{TSVEscape} you'll see how \texttt{\textbackslash{}t} can be interpreted by R as a tab rather than the letter ``t''. Add another escape character to neutralize the escape character so that R interprets it as a backslash. In other words, use an escape character to escape the escape character. Another option for writing Windows file names in R is to use one forward slash (\texttt{/}). On Unix-like systems, including Mac computers, directories are indicated with a forward slash (\texttt{/}). The file path of the {\emph{Data}} file on a Unix-like system would be: \texttt{/ExampleProject/Data}. 
Remember that a forward slash with nothing before it indicates the root directory. So \texttt{/ExampleProject/Data} has a different meaning than \texttt{ExampleProject/Data}. In the former, \emph{ExampleProject} is a subdirectory of the root. In the latter, \emph{ExampleProject} is a subdirectory of the current working directory (see below for details about working directories). This is also true in Windows. In this chapter I switch between the two file system naming conventions to expose you to both. For the remainder of the book I use Unix-like file paths. When you use relative paths, these will work across operating systems in R. We'll get to relative paths in a moment. \subsection{Working directories} When you use R, markup languages, and many of the other tools covered in this book, it is important to keep in mind what your current working directory is\index{working directory}.\index{current working directory} The working directory is the directory where the program automatically looks for files and other directories, unless you tell it to look elsewhere. It is also where it will save files. Later in this chapter we will cover commands for finding and changing the working directory. \subsection{Absolute vs. relative paths} For reproducible research, collaborative research, and even if you ever change the computer you work on, it is a good idea to use relative rather than absolute file paths.\index{file path!relative}\index{file path!absolute} Absolute file paths give the entire path of a given file or directory on a specific system. For example, \texttt{/ExampleProject/Data} is an absolute path as it specifies the path of the \emph{Data} child directory all the way back to the root directory. However, if our current working directory is \emph{ExampleProject} and we want to link to the \emph{Data} child directory or a file in it, we don't need the absolute path. We could simply use \texttt{Data/}, i.e. the path relative to the working directory. 
It is good practice to use relative paths when possible and organize your files such that using relative paths is easy. This makes your code less dependent on the particular file structure of a particular computer. For example, imagine you use \texttt{C:\textbackslash{}\textbackslash{}ExampleProject\textbackslash{}\textbackslash{}Data} in your source code to link to the \emph{Data} directory. Someone--a collaborator, a researcher reproducing your work, or even you--then tries to run the code on a different computer. The code will break if they are, for instance, using a Unix-like system or have placed \emph{ExampleProject} in a different partition of their hard drive. This can be fixed relatively straightforwardly by changing the file path in the source. However, this is tedious (often not well documented) and unnecessary if you use relative file paths. \subsection{Spaces in directory \& file names} It is generally good practice to avoid putting spaces in your file and directory names. For example, I called the example project parent directory ``ExampleProject'' rather than ``Example Project''. Spaces in file and directory names can sometimes create problems for computer programs trying to read the file path. The program may believe that the space indicates that the path name has ended. To make multi-word names easily readable without using spaces, adopt a convention such as CamelBack\index{CamelBack}. In CamelBack new words are indicated with capital letters, while all other letters are lower case. For example, ``ExampleProject''. \begin{figure}[th!] \caption{Example Research Project File Tree} \label{ExampleTree} \begin{center} \input{Children/Chapter4/images4/ExampleFilePath.tex} \end{center} \end{figure} \section{Organizing Your Research Project} Figure \ref{ExampleTree} gives an example of how the files in a simple reproducible research project could be organized. The project's parent directory is called {\emph{ExampleProject}}. 
Inside this directory are the primary knittable documents (\emph{Paper.Rnw}, \emph{Slideshow.Rnw}, and \emph{Website.Rmd}). In addition there is an \emph{Analysis} sub-directory with the R files to run the statistical analyses followed by a further \emph{Data} child directory. The nested file structure allows you to use relative file paths. The knittable documents can call \emph{Analysis1.R} with the relative path \emph{Analysis/Analysis1.R}, which in turn could call a file in the \emph{Data/} subdirectory. If all of the directories were at the same level of the file tree then you would need to use absolute file paths.\index{file path!absolute}\index{file path!relative} \begin{wrapfigure}{r}{0.4\textwidth} \caption{An Example RStudio Project Menu} \label{ProjectMenu} \begin{center} \includegraphics[width=0.3\textwidth]{Children/Chapter4/images4/ProjectMenu.png} \end{center} \end{wrapfigure} In addition to the main files and subdirectories in {\emph{ExampleProject}} you will probably notice a file called {\emph{README.md}}. The {\emph{README.md}} file\index{README file} gives an overview of all the files in the project. It should briefly describe the project including things like its title, author(s), topic, any copyright information, and so on. It should also indicate how the folders in the project are organized and give instructions for how to reproduce the project. The README file should be in the main project folder--in our example this is called {\emph{ExampleProject}}--so that it is easy to find. If you are storing your project as a GitHub\index{GitHub} repository (see Chapter \ref{Storing}) and the file is called \emph{README}, its contents will automatically be displayed on the repository's main page. If the \emph{README} file is written using Markdown (e.g. \emph{README.md}), it will also be properly formatted. Figure \ref{BookRepository} shows an example of this.
It is good practice to dynamically include the system information for the R session you used to create the project. To do this you can write your README file with R Markdown. Simply include the \texttt{sessionInfo()}\index{R function!sessionInfo} command in a \emph{knitr} code chunk in the R Markdown document. If you knit this file immediately after knitting your presentation document, it will record the information for that session. You can also dynamically include session info in a LaTeX document. To do this, use the {\tt{toLatex}} command in a code chunk. The code chunk should have the option \verb|results='asis'|. The code is: <>= toLatex(sessionInfo()) @ \section{Setting Directories as RStudio Projects} If you are using RStudio, you may want to organize your files as Projects\index{RStudio!Projects}.\label{CreateRStudioProject} You can turn a normal directory into an RStudio Project by clicking on \texttt{File} in the RStudio menu bar and selecting \texttt{New Project\ldots}. A new window will pop-up. Select the option \texttt{Existing Directory}. Find the directory you want to turn into an RStudio Project by clicking on the \texttt{Browse} button. Finally, select \texttt{Create Project}. You will also notice in the Create Project pop-up window that you can build new project directories and create a project from a directory already under version control\index{version control} (we'll do this at the end of Chapter \ref{Storing}). When you create a new project you will see that RStudio has put a file with the extension \texttt{.Rproj} into the directory. Making your research project directories RStudio Projects is useful for a number of reasons: \begin{itemize} \item The project is listed in RStudio's Project menu where it can be opened easily (see Figure \ref{ProjectMenu}). 
\item When you open the project in RStudio it automatically sets the working directory to the project's directory and loads the workspace, history, and source code files you were last working on. \item You can set project specific options like whether PDF presentation documents should be compiled with \emph{Sweave} or {\emph{knitr}}. \item When you close the project your R workspace and history are saved in the project directory if you want. \item It helps you version control your files. \item You can build your Project--run the files in a specific way--with makefiles. \item Gives you an easy-to-use interface for managing the R packages that your project depends on. \end{itemize} \noindent We will look at many of these points in more detail in the next few chapters. %%%%%%%%%%%%%%% File Manipulation \section{R File Manipulation Commands} R has a range of commands for handling and navigating through files. Including these commands in your source code files allows you to more easily replicate your actions. \paragraph{{\tt{getwd}}}\index{R function!getwd} To find your current working directory use the \texttt{getwd} command: <>= getwd() @ \noindent The example here shows you the current working directory that was used while knitting this chapter. \paragraph{{\tt{list.files}}}\index{R function!list.files} Use the \texttt{list.files} command to see all of the files and subdirectories in the current working directory. You can list the files in other directories too by adding the directory path as an argument to the command. <>= list.files() @ \noindent You can see that the \emph{Chapter4} folder has the file \emph{chapter4.Rnw} (the markup file used to create this chapter) and a child directory called \emph{images4} where I stored the original versions of the figures included in this chapter. \paragraph{{\tt{setwd}}}\index{R function!setwd} The {\tt{setwd}} command sets the current working directory\index{working directory}. 
For example, if we are on a Mac or other Unix-like computer we can set the working directory to the {\emph{Analysis}} directory in our Example Project (see Figure \ref{ExampleTree}) like this: <>= setwd("/ExampleProject/Analysis/") @ \noindent Now R will automatically look in the {\emph{Analysis}} folder for files and will save new files into this folder, unless we explicitly tell it to do otherwise. When working with a knittable document, setting the working directory once in a code chunk changes the working directory for all subsequent code chunks. \paragraph{{\tt{root.dir}}}\index{knitr option!root.dir} By default the root (or working) directory for all of the code chunks in a knittable document is the directory where this document is located. You can reset the directory by feeding a new file path to the \texttt{root.dir} option.\index{knitr option!root.dir} We can set this globally\footnote{See the discussion of global chunk options in Chapter \ref{GettingStartedRKnitr}, page \pageref{GlobalChunkOptions}.} for all of the chunks in the document by including the following code in the document's first chunk. <>= opts_knit$set(root.dir = '/ExampleProject/Analysis') @ \noindent Here we set the \emph{/ExampleProject/Analysis} sub-directory as the root directory for all of the chunks in our presentation document. \textbf{Note:} In general it is preferable to use a nested file structure, as we saw before, rather than specify \texttt{root.dir}. A nested file structure creates one less step for those trying to reproduce your work on a different computer. They do not need to change the \texttt{root.dir} file path. \paragraph{{\tt{dir.create}}}\index{R function!dir.create} Sometimes you may want to create a new directory. 
You can use the {\tt{dir.create}} command to do this.\footnote{Note: you will need the correct system permissions to be able to do this.} For example, to create an {\emph{ExampleProject}} directory in the root \emph{C} directory on a Windows computer type: <>= dir.create("C:\\ExampleProject") @ \paragraph{{\tt{file.create}}}\index{R function!file.create} Similarly, you can create a new blank file with the \texttt{file.create} command. To add a blank R source code file called {\emph{SourceCode.R}} to the {\emph{ExampleProject}} directory on the \emph{C} drive use: <>= file.create("C:\\ExampleProject\\SourceCode.R") @ \paragraph{{\tt{cat}}}\index{R function!cat}\label{catR} If you want to create a new file and put text into it use the \texttt{cat} (concatenate and print) command. For example, to create a new file in the current working directory called \emph{ExampleCat.md} that includes the text ``Reproducible Research with R and RStudio'' type: <>= cat("Reproducible Research with R and RStudio", file = "ExampleCat.md") @ \noindent In this example we created a Markdown formatted file by using the \texttt{.md} file extension. We could, of course, change the file extension to \texttt{.R} to set it as an R source code file, \texttt{.Rnw} to create a \emph{knitr} LaTeX file, and so on. You can use \texttt{cat} to print the contents of one or more objects to a file. \textbf{Warning:} The \texttt{cat} command will overwrite existing files with the new contents. To add the text to existing files use the \texttt{append = TRUE} argument. <>= cat("More Text", file = "ExampleCat.md", append = TRUE) @ \paragraph{{\tt{unlink}}}\index{R function!unlink} You can use the {\tt{unlink}} command to delete files and directories. <>= unlink("C:\\ExampleProject\\SourceCode.R") @ \noindent \textbf{Warning:} the \texttt{unlink} command permanently deletes files, so be very careful using this command.
\paragraph{{\tt{file.rename}}}\index{R function!file.rename} You can use the \texttt{file.rename} command to, obviously, rename a file. It can also be used to move a file from one directory to another. For example, imagine that we want to move the \emph{ExampleCat.md} file from the directory \emph{ExampleProject} to one called \emph{MarkdownFiles} that we already created.\footnote{The \texttt{file.rename} command won't create new directories. To move a file to a new directory you will need to create the directory first with \texttt{dir.create}.} <>= file.rename(from = "C:\\ExampleProject\\ExampleCat.md", to = "C:\\MarkdownFiles\\ExampleCat.md") @ \paragraph{{\tt{file.copy}}}\index{R function!file.copy} The \texttt{file.rename} command fully moves a file from one directory to another. To copy the file to another directory use the \texttt{file.copy} command. It has the same syntax as \texttt{file.rename}: <>= file.copy(from = "C:\\ExampleProject\\ExampleCat.md", to = "C:\\MarkdownFiles\\ExampleCat.md") @ \section{Unix-like Shell Commands for File Management} Though this book is mostly focused on using R for reproducible research it can be useful to use a Unix-like shell program\index{Unix-like shell program} to manipulate files in large projects. Unix-like shell programs including Bash on Mac and Linux\index{Bash} and Windows PowerShell\index{PowerShell} allow you to type commands to interact with your computer's operating system.\footnote{You can access Bash via the Terminal program\index{Terminal} on Mac OS 10 and Linux computers. It is the default shell on Mac and Linux, so it loads automatically when you open the Terminal.
Windows PowerShell comes installed with Windows.} We will especially return to shell commands in the next chapter when we discuss Git\index{Git} version control and makefiles\index{makefile} for collecting data in Chapter \ref{DataGather}, as well as the command-line program\footnote{A command-line program\index{command-line program}\index{command-line} is just a program you run from a shell.} Pandoc (Chapter \ref{LargeDocs} and \ref{MarkdownChapter}). We don't have enough space to fully introduce shell programs or even all of the commands for manipulating files. We are just going to cover some of the basic and most useful commands for file management. For good introductions for Unix and Mac OS 10 computers see William E. Shotts Jr.'s \citeyearpar{ShottsJr2012} book on the Linux command-line. For Windows users, Microsoft maintains a tutorial on Windows PowerShell at \url{http://technet.microsoft.com/en-us/library/hh848793}. The commands discussed in this chapter should work in both Unix-like shells and Windows PowerShell. It's important at this point to highlight a key difference between R and Unix-like shell syntax. In shell commands you don't need to put parentheses around your arguments. For example, if I want to change my working directory to my Mac Desktop in a shell using the \texttt{cd} command I simply type:\footnote{Many shell code examples in other sources include the shell prompt, like the \texttt{\$} in Bash or \texttt{\textgreater{}} in PowerShell. These are like R's \texttt{\textgreater{}} prompt. I don't include the prompt in code examples in this book because you don't type them.} <>= cd /Users/Me/Desktop @ \noindent In this example \texttt{Me} is my user name. \paragraph{{\tt{cd}}}\index{shell command!cd} As we just saw, to change the working directory in the shell just use the {\tt{cd}} (change directory) command. 
Here is an example of changing the directory in Windows PowerShell: <>= cd C:/Users/Me/Desktop @ \noindent If you want to change the working directory back to the previous working directory you were in, simply type: <>= cd - @ \noindent If, for example, our current working directory is \emph{/Users/Me/Desktop}, and we changed into it from \emph{/Users/Me}, then typing \texttt{cd} followed by a minus sign (\texttt{cd -}) would change the working directory back to \emph{/Users/Me}. Note this will not work in PowerShell. \paragraph{{\tt{pwd}}}\index{shell command!pwd} To find your current working directory, use the \texttt{pwd} command (print working directory). This is essentially the same as R's \texttt{getwd} command. <>= pwd ## /Users/Me/Desktop @ \paragraph{{\tt{ls}}}\index{shell command!ls} The \texttt{ls} (list) command works very similarly to R's \texttt{list.files} command. It shows you what is in the current working directory. <>= ls ## chapter4.Rnw images4 @ \noindent As we saw earlier, R also has an \texttt{ls} command.\index{R function!ls} R's \texttt{ls} command lists items in the R workspace. The shell's \texttt{ls} command lists files and directories in the working directory. \paragraph{{\tt{mkdir}}}\index{shell command!mkdir} Use \texttt{mkdir} to create a new directory. For example, if I wanted to create a directory in my Linux root directory called {\emph{NewDirectory}} I would type: <>= mkdir /NewDirectory @ \noindent If running this code on Mac or Linux gives you an error message like this: <>= mkdir: /NewDirectory: Permission denied @ \noindent you simply need to use the \texttt{sudo}\index{shell command!sudo} command to run the command with higher privileges. <>= sudo mkdir /NewDirectory @ \noindent Running this code will prompt you to enter your administrator password. \paragraph{{\tt{echo}}}\index{shell command!echo} There are a number of ways to create new files in Unix-like shells. One of the simplest ways is with the \texttt{echo} command.
This command simply prints its arguments. For example: <>= echo Reproducible Research with R and RStudio @ \noindent If you add the greater-than symbol (\verb|>|) after the text you want to print and then a file name, \texttt{echo} will create the file (if it doesn't already exist) in the current working directory and then print the text into the file. <>= echo Reproducible Research with R and RStudio > ExampleEcho.md @ \noindent Using only one greater-than sign will completely erase the \emph{ExampleEcho.md} file's contents and replace them with \texttt{Reproducible Research with R and RStudio}. To add the text at the end of an existing file, use two greater-than signs (\verb|>>|). <>= echo More text. >> ExampleEcho.md @ \noindent There is also a \texttt{cat} shell command.\index{shell command!cat} It works slightly differently than the R version of the command and I don't cover it here. \paragraph{{\tt{rm}}}\index{shell command!rm} The {\tt{rm}} command is similar to R's {\tt{unlink}} command. It removes (deletes) files or directories. Again, be careful when using this command, because it permanently deletes the files or directories. <>= rm ExampleEcho.md @ \noindent As we saw in Chapter \ref{GettingStartedRKnitr}, R also has an \texttt{rm} command. It is different because it removes objects from your R workspace rather than files from your working directory. \paragraph{{\tt{mv}}}\index{shell command!mv} To move a file from one directory to another with the shell, use the \texttt{mv} (move) command. For example, to move the file \emph{ExampleEcho.md} from \emph{ExampleProject} to \emph{MarkdownFiles} use the following code and imagine both directories are in the root directory:\footnote{If they were not in the root directory we would not place a forward slash at the beginning.} <>= mv /ExampleProject/ExampleEcho.md /MarkdownFiles @ \noindent Note that the \emph{MarkdownFiles} directory must already exist, otherwise it will simply rename the file.
So this command is similar to the R command \texttt{file.rename}. \paragraph{{\tt{cp}}}\index{shell command!cp} The \texttt{mv} command completely moves a file from one directory to another. To copy a version of the file to a new directory use the \texttt{cp} command. The syntax is similar to \texttt{mv}: <>= cp /ExampleProject/ExampleEcho.md /MarkdownFiles @ \paragraph{{\tt{system}} (R command)}\index{R function!system}\label{systemRcommand}\index{R!shell commands} You can run shell commands from within R using R's \texttt{system} command. For example, to run the \texttt{echo} command from within R type: <>= system("echo Text to Add > ExampleEcho.md") @ \section{File Navigation in RStudio}\index{RStudio!Files pane}\index{RStudio} The RStudio \emph{Files} pane allows us to navigate our file tree and do some basic file manipulations. The left panel of Figure \ref{FilesPane} shows us what this pane looks like. The pane allows us to navigate to specific files and folders and delete and rename files. To select a folder as the working directory tick the check box next to the folder then click the \texttt{More} button and select \texttt{Set As Working Directory}. Under the \texttt{More} button (\includegraphics[scale=0.5]{Children/Chapter4/images4/MoreIcon.png}) you will also find options to \texttt{Move} and \texttt{Copy} files (see the right pane of Figure \ref{FilesPane}). The \emph{Files} pane is a GUI, so our actions in the \emph{Files} pane are not as easily reproducible as the commands we learned earlier in this chapter. \begin{figure} \caption{The RStudio Files Pane} \label{FilesPane} \vspace{0.25cm} \centering \mbox{ \subfigure{\includegraphics[width=0.4\textwidth]{Children/Chapter4/images4/RStudioFiles.png}}\quad \subfigure{\includegraphics[width=0.4\textwidth]{Children/Chapter4/images4/MoreMore.png} }} \end{figure} \subsection*{Chapter summary} In this chapter we've learned how to organize our research files to enable dynamic replication.
This included not only how they can be ordered in a computer's file system, but also the file path naming conventions--the addresses--that computers use to locate files. Once we know how these addresses work we can use R and shell commands to refer to and manipulate our files. This skill is particularly useful because it allows us to place code in text-based files to manipulate our project files in highly reproducible ways. In the next few chapters we will begin to put these skills in practice when we learn how to store our files and create data files in reproducible ways. ================================================ FILE: Old/Source-v2/Children/Chapter5/chapter5.Rnw ================================================ % Chapter Chapter 5 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 5 May 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Storing, Collaborating, Accessing Files, and Versioning}\label{Storing} In addition to being well organized, your research files need to be accessible for other researchers to be able to reproduce your findings. A useful way to make your files accessible is to store them on a cloud storage service\footnote{These services store your data on remote servers.} \cite[see][]{Howe2012}. This chapter describes in detail two different cloud storage services--Dropbox and GitHub--that you can use to make your research files easily accessible to others. Not only do these services enable others to reproduce your research, they also have a number of benefits for your research workflow. Researchers often face a number of data management issues that, beyond making their research difficult to reproduce, can make doing the initial research difficult. First, there is the problem of \textbf{storing} data so that it is protected against computer failure--virus infections, spilling coffee on your laptop, and so on. 
Storing data locally--on your computer--or on a flash drive is generally more prone to loss than on remote servers in the cloud. Second, we may work on a project with different computers and mobile devices. For example, we may use a computer at work to run computationally intensive analysis, while editing our presentation document on a tablet computer while riding the train to the office. So, we need to be able to \textbf{access} our files from multiple devices in different locations. We often need a way for our \textbf{collaborators} to access and edit research files as well. Finally, we almost never create a data set or write a paper perfectly all at once. We may make changes and then realize that we liked an earlier version, or parts of an earlier version better. This is a particularly important issue in data management where we may transform our data in unintended ways and want to go back to earlier versions. Also, when working on a collaborative project, one of the authors may accidentally delete something in a file that another author needed. To deal with these issues we need to store our data in a system that has \textbf{version control}. Version control systems keep track of changes we make to our files and allow us to access previous versions if we want to.\index{version control} You can solve all of these problems in a couple of different ways using free or low cost cloud-based storage formats. In this chapter we will learn how to use Dropbox and Git/GitHub for research files: \begin{itemize} \item storage, \item accessing, \item collaboration, \item version control. \end{itemize} \section{Saving Data in Reproducible Formats} Before getting into the details of cloud-based data storage for all of our research files, let's consider what type of formats you should actually save your data in\index{data file formats}.
A key issue for reproducibility is that others are able to not only get hold of the exact data you used in your analysis, but be able to understand and use the data now and in the future. Some file formats make this easier than others. In general, for small to moderately-sized data sets\footnote{I don't cover methods for storing and handling very large data sets--with high hundreds of thousands and more observations. For information on large data and R, not just storage, one place to look is this blog post from RDataMining: \url{http://rdatamining.wordpress.com/2012/05/06/online-resources-for-handling-big-data-and-parallel-computing-in-r/} (posted 6 May 2012). One popular service for large file storage is Amazon S3\index{Amazon S3} (\url{http://aws.amazon.com/s3/}). I haven't used this service and can't suggest ways to integrate it with R.} plain-text formats like comma-separated values\index{comma-separated values} (\texttt{.csv}) or tab-separated values\index{tab-separated values}\footnote{Sometimes this format is called tab-delimited values\index{tab-delimited values}.} (\texttt{.tsv}) are good ways to store your data. These formats simply store a data set as a text file. A row in the data set is a line in the text file. Data is separated into columns with commas or tabs, respectively. These formats are not dependent on a specific program. Any program that can open text files can open them, including a wide variety of statistical programs other than R as well as spreadsheet programs like Microsoft Excel.\index{Microsoft Excel} Using text file formats helps future-proof your research. Version control systems that track changes to text--like Git\index{Git}--are also very effective version control systems for these types of files. Use the \texttt{write.table} command\index{R function!write.table} to save data in plain-text formats from R. 
For example, to save a data frame called {\emph{Data}} as a CSV file called {\emph{MainData.csv}} in our example {\emph{DataFiles}} directory (see Figure \ref{ExampleTree}): <>= write.table(Data, "/ExampleProject/Data/DataFiles/MainData.csv", sep = ",", row.names = FALSE) @ \noindent \texttt{row.names = FALSE}\index{R!row.names} prevents R from including the row names in the output file.\footnote{Frequently the row names are just the row numbers which may have no substantive meaning.} The \texttt{sep = ","} argument specifies that we want to use commas to separate values into columns. For CSV files you can use a modified version of this command called \texttt{write.csv}\index{R function!write.csv}. This command simply makes it so that you don't have to write \texttt{sep = ","}.\footnote{\texttt{write.csv} is a `wrapper'\index{wrapper} for \emph{write.table}.} If you want to save your data with values separated by tabs, rather than commas, simply set the argument \verb|sep = "\t"| and set the file extension to \texttt{.tsv}.\label{TSVEscape} R is able to save data in a wide variety of other file formats, mostly through the {\emph{foreign}} package (see Chapter \ref{DataGather}). These formats may be less future-proof than simple text-formatted data files. One advantage of many other statistical programs' file formats is that they include not only the underlying data but also other information like variable descriptions. If you are using plain-text files to store your data you will need to include a separate file, preferably in the same directory as the data file describing the variables and their sources. In Chapter \ref{TablesChapter} (Section \ref{VarDescriptTables}) we will look at how to automate the creation of variable description files. \section{Storing Your Files in the Cloud: Dropbox} In this book we'll cover two (largely) free cloud storage services that allow you to store, access, collaborate on, and version control your research files. 
These services are Dropbox and GitHub.\footnote{Dropbox provides a minimum amount of storage for free, above which they charge a fee. GitHub lets you create publicly accessible repositories--kind of like project folders--for free, but they charge for private repositories.} Though they both meet our basic storage needs, they do so in different ways and require different levels of effort to set up and maintain. These two services are certainly not the only way to make your research files available. Research-oriented services include the SDSC Cloud,\footnote{\url{https://cloud.sdsc.edu/hp/index.php}} the Dataverse Project,\index{Dataverse Project}\footnote{\url{http://thedata.org/}} figshare,\footnote{\url{http://figshare.com/}} and RunMyCode.\footnote{\url{http://www.runmycode.org/}} These services include good built-in citation systems, unlike Dropbox and GitHub. They may be a very good place to store research files once the research is completed or close to completion. Some journals are beginning to require key reproducibility files be uploaded to these sites. However, these sites' ability to store, access, collaborate on, and version control files \emph{during} the main part of the research process is mixed. Services like Dropbox and GitHub are very capable of being part of the research workflow from the beginning. The easiest types of cloud storage for your research are services like Dropbox\footnote{\url{http://www.dropbox.com/}} and Google Drive.\footnote{\url{https://drive.google.com/}} These services not only store your data in the cloud, but also provide ways to share files. They even include basic version control capabilities. I'm going to focus on Dropbox because it currently offers a complete set of features that allow you to store, version, collaborate, and access your data. I will focus on how to use Dropbox on a computer. Some Dropbox functionality may be different on mobile devices. 
\subsection{Storage} When you sign up for Dropbox and install the program\footnote{See \url{https://www.dropbox.com/downloading} for downloading and installation instructions.} it creates a directory on your computer's hard drive. When you place new files and folders in this directory and make changes to them, Dropbox automatically syncs the directory with a similar folder on a cloud-based server. Typically when you sign up for the service you'll receive a limited amount of storage space for free, usually a few gigabytes. This is probably enough storage space for a number of text file-based research projects. \subsection{Accessing data} \label{EnablePublicFolder}All files stored on Dropbox have a URL address through which they can be accessed from a computer connected to the internet. Files in either the Dropbox \emph{Public}\index{Dropbox!Public folder}\footnote{Note: if you created your Dropbox account after 4 October 2012 you will not automatically have a \emph{Public} folder. To enable the folder on your account see this website: \url{https://www.dropbox.com/help/16/en}. You will need a Pro or Dropbox for Business account to enable a new Public folder. \index{Dropbox!enable Public folder}} folder or in other, non-\emph{Public} folders can be downloaded into R. Downloading files from these two different sources requires two different methods. Let's quickly look at how to download files from the Public folder. In the next chapter (see Section \ref{DropboxNonPublic}) we'll look at how to download data from non-\emph{Public} Dropbox folders into R. If the file is stored in the \emph{Public} folder, right-click on the file icon in your Dropbox folder on your computer. Then click \texttt{Copy Public Link}.\label{PublicLink} This copies the URL into your clipboard, from which you can paste it into your R source code (or wherever). 
If you are logged into the Dropbox website, right-click on files in your \emph{Public} folder and then select \texttt{Copy public link\ldots}. Once you have the URL you can load the file directly into R using the \verb|source_data| command in the \emph{repmis} package\index{repmis} \citep{R-repmis} for plain-text formatted data or use the \texttt{source\_url}\index{R function!source\_url} command in the \emph{devtools} package \citep{R-devtools} for source code files (see Chapter \ref{StatsModel}). Let's download data directly into R from my Dropbox Public folder. The data set's URL is: \url{https://www.dropbox.com/s/130c5ol3o2jjmgk/public.fin.msm.model.csv?dl=1}.\footnote{This data is from \cite{Gandrud2012}. I've shortened the URL using Bitly\index{Bitly} (\url{https://bitly.com/}) so that it will fit on the page.} <>= # Download data on Financial Regulators # stored in a Dropbox Public folder # Load repmis library(repmis) # Place the URL into the object FinURL FinURL <- "https://bit.ly/2xlQ2j5" # Download data FinRegulatorData <- source_data(FinURL, sep = ",", header = TRUE) # Show variables in FinRegulatorData names(FinRegulatorData) @ \noindent Let's go through this code.\label{SepHeadExplain} We already saw in our discussion of \texttt{write.table} how the \texttt{sep = ","} argument specifies that the data file's values are separated by commas. The \texttt{header = TRUE} argument tells R that the first row of the file contains the variable names. Note that from version 0.4 \emph{repmis} automatically guesses how the columns are separated and whether or not to use the first row as the header. Because of this, we usually don't need to set the \texttt{sep} and \texttt{header} arguments explicitly. I've only done so in this example for illustration. You're probably also wondering about the line that begins \emph{\texttt{\#\# SHA-1 hash of}} \ldots in the output. 
The long string of numbers and letters at the end of this line is basically a unique ID that \verb|source_data| assigns to the file. It is called an SHA-1 hash. We'll see SHA-1\index{SHA-1 hash} hashes more in the next section on GitHub (Section \ref{GitHubMain}) and in Chapter \ref{DataGather} (Section \ref{SecureData6}). To give you a preview: it allows us to see if the file that we downloaded is the file we thought we downloaded. To get a file's URL from your local Dropbox folder when the file is \emph{not} in your \emph{Public} folder, you also right-click on the file. Then choose \texttt{Share Dropbox Link}. This will copy the link URL into your clipboard. You can also get these URL links through the online version of your Dropbox. First, log into the Dropbox website. You can again right-click on the file name and then select \texttt{Share}. This will bring up a box displaying the link for you to copy. Alternatively, when you hover your cursor over a file or folder you will see a \texttt{Share} icon (\includegraphics[scale=0.3]{Children/Chapter5/images5/DropboxLink.png}) appear on the far right. Clicking on this icon will also get you the link. In either case, you cannot use the \verb|source_data| command to download data from non-\emph{Public} folders into R. In the next chapter we'll see how to import this type of data into R (see Section \ref{DropboxNonPublic}). To give you a preview: we'll use the \verb|source_DropboxData|\index{R function!source\_DropboxData} function from the \emph{repmis}\index{repmis} package. \subsection{Collaboration} Though others can easily access your data and files with Dropbox URL links, you cannot save files through the link. You must save files in the Dropbox folder on your computer or upload them through the website. If you would like collaborators to be able to modify the research files you will need to `share' the Dropbox folder with them. You cannot fully share your \emph{Public} folder, i.e.
give others write permission, so you will need to keep the files you want collaborators to be able to modify in a \emph{non-Public} folder. Once you create this \emph{non-Public} folder you can share it with your collaborators by going to the Dropbox website and right-clicking on the folder's name. Then select \texttt{Invite people to collaborate\ldots}. Enter your collaborator's email address when prompted. They will be sent an email that will allow them to accept the share request and, if they don't already have an account, sign up for Dropbox. \subsection{Version control} Dropbox has a simple version control system. Every time you save a document a new version is created on Dropbox. To view a previous version, navigate to the file on the Dropbox website. Then right-click on the file. In the menu that pops up select \texttt{Previous Versions}. This will take you to a webpage listing previous versions of the file, who created the version, and when it was created. A new version of a file is created every time you save a file and it is synced to the Dropbox cloud service. You can see a list of changes made to files in your Dropbox folder by going to the website and clicking on \texttt{Events}. Note that with a free Dropbox account, previous versions of a file are only stored for \textbf{30 days}. To be able to save previous versions for more than 30 days you will need a paid account.\footnote{For more details see: \url{https://www.dropbox.com/en/help/11}.} \section{Storing Your Files in the Cloud: GitHub}\label{GitHubMain}\index{GitHub|(}\index{Git|(} Dropbox adequately meets our four basic criteria for reproducible data storage. It is easy to set up and use. GitHub meets the criteria and more, especially when it comes to version control. It is, however, less straightforward at first. In this section we will learn enough of the basics to get you started using GitHub to store, access, collaborate on, and version control your research. 
GitHub is an interface and cloud hosting service built on top of the Git\index{Git} version control system.\footnote{I used Git version 1.7.9.6 for this book.} Git does the version control. GitHub stores the data remotely as well as providing a number of other features, some of which we look at below. GitHub was not explicitly designed to host research projects or even data. It was designed to host ``socially coded'' computer programs--in what Git calls ``repositories''\index{git!repository}--repos\index{GitHub!repo}\index{repo} for short--by making it easy for a number of collaborators to work together to build computer programs. This seems very far from reproducible research. Remember that as reproducible researchers, we are building projects out of interconnected text files. In important ways, this is exactly the same as building a computer program. Computer programs are also basically large collections of interconnected text files. Like computer programmers, we need ways to store, version control, access, and collaborate on our text files. Because GitHub is very actively used by people with similar needs (who are also really good programmers), the interface offers many highly developed and robust features for reproducible researchers. GitHub's extensive features and heart in the computer programming community mean that it takes a longer time than Dropbox for novice users to set up and become familiar with. So we need good reasons to want to invest the time needed to learn GitHub.
Here is a list of GitHub's advantages over Dropbox for reproducible research that will hopefully convince you to get started using it:\footnote{Because many of these features apply to any service that relies on Git, much of this list of advantages also applies to alternative Git cloud storage services such as Bitbucket (\url{https://bitbucket.org/}).} \\[0.25cm] \noindent{\bf{Storage and access}} \begin{itemize} \item Dropbox simply creates folders stored in the cloud which you can share with other people. GitHub makes your projects accessible on a fully featured project website (see Figure \ref{BookRepository}). An example feature is that it automatically renders Markdown files called {\emph{README.md}}\footnote{You can use a variety of other markup languages as well. See \url{https://GitHub.com/GitHub/markup}.} in a GitHub directory on the repository's website. This makes it easy for independent researchers to find the file and read it. \item GitHub can create and host a website for your research project that you could use to present the results, not just the replication files. \end{itemize} \vspace{0.2cm} \noindent{\bf{Collaboration}} \begin{itemize} \item Dropbox allows multiple people to share files and change them. GitHub does this and more. \item GitHub keeps meticulous records of who contributed what to a project. \item Each GitHub repository has an ``Issues'' area where you can note issues and discuss them with your collaborators. Basically, this is an interactive to-do list for your research project. It also stores the issues so you have a full record. \item Each repository can also host a wiki\index{wiki} that, for example, could explain in detail how certain aspects of a research project were done. \item Anyone can suggest changes to files in a public repository. These changes can be accepted or declined by the project's authors. The changes are recorded by the Git version control system. 
This could be especially useful if an independent researcher notices an error. \end{itemize} \vspace{0.2cm} \noindent{\bf{Version control}} \begin{itemize} \item Dropbox's version control system only lets you see files' names, the times they were created, who created them, and revert back to specific versions. Git tracks every change you make. The GitHub website and GUI programs for Mac and Windows provide nice interfaces for examining specific changes in text files. \item Dropbox creates a new version every time you save a file. This can make it difficult to actually find the version you want as the versions quickly multiply. Git's version control system only creates a new version when you tell it to. \item All files in Dropbox are version controlled. Git allows you to ignore specific files. This is helpful if you have large binary files (i.e. not text files) that you do not want to version control because doing so will use up considerable storage space. \item Unless you have a paid account, previous file versions in Dropbox disappear after 30 days. GitHub stores previous versions indefinitely for all account types. \item Dropbox does not merge conflicting versions of a file together. This can be annoying when you are collaborating on a project and more than one author is making changes to documents at the same time. Git identifies conflicts and lets you reconcile them. \item Git is directly integrated into RStudio Projects\index{RStudio!Projects}.\footnote{RStudio also supports the Subversion\index{Subversion version control} version control system, but I don't cover that here.} \end{itemize} \begin{figure} \caption{A Basic Git Repository with Hidden {\emph{.git}} Folder Revealed} \label{BasicGitRepo} \begin{center} \includegraphics[width=0.5\textwidth]{Children/Chapter5/images5/BasicGitRepository.png} \end{center} \end{figure} \subsection{Setting up GitHub: Basic} There are at least three ways to use Git/GitHub on your computer. 
You can use the command-line version of Git. It's available for Mac and Linux (in the Terminal\index{Terminal}) as well as Windows through Git Bash\index{Git Bash}.\footnote{The interface for Git Bash looks a lot like the Terminal or Windows PowerShell.} You can also use the Graphical User Interface GitHub program. Currently, it's only available for Windows and Mac. RStudio also has GUI-style Git functionality for RStudio Projects.\index{RStudio!Projects} In this section I focus on how to use the command-line version, because it will help you understand what the GUI versions are doing and allow you to better explore more advanced Git features not covered in this book. In the next section I will mention how to use Git with RStudio Projects. The first thing to do to set up Git and GitHub is go to the GitHub website (\url{https://GitHub.com/}) and sign up for an account. Second, you should go to the following website for instructions on setting up GitHub:\index{git!install} \url{https://help.GitHub.com/articles/set-up-git}. The instructions on that website are very comprehensive, so I'll direct you there for the full setup information. Note that installing the GUI version of GitHub also installs Git and, on Windows, Git Bash. \subsection{Version control with Git} Git is primarily a version control system, so we will start our discussion of how to use it by looking at how to version your repositories. \paragraph{Setting up Git repositories locally} You can set up a Git repo on your computer with the command-line.\footnote{Much of the discussion of the command-line in this section is inspired by Nick Farina's blog post on Git (see \url{http://nfarina.com/post/9868516270/git-is-simpler}, posted 7 September 2012).} I keep my repositories\index{git!repository} in a folder called {\emph{git\_repositories}},\footnote{To follow along with this code you will first need to create a folder called {\emph{git\_repositories}} in your root directory.
Note also that throughout this section I use Unix file path conventions.} though you can use Git with almost any directory you like. The \emph{git\_repositories} directory has the root folder as its parent. Imagine that we want to set up a repository in this directory for a project called {\emph{ExampleProject}}. Initially it will have one README file called {\emph{README.md}}. To do this, we would first type into the Terminal for Mac and Linux computers: <>= # Make new directory 'ExampleProject' mkdir /git_repositories/ExampleProject # Change to directory 'ExampleProject' cd /git_repositories/ExampleProject # Create new file README.md echo "# An Example Repository" > README.md @ \noindent So far we have only made the new directory and set it as our working directory (see Chapter \ref{DirectoriesChapter}). All of the examples in this section assume your current working directory is set to the repo. Then, with the \texttt{echo} shell command we created a new file named {\emph{README.md}}\index{README file} that includes the text \verb|# An Example Repository|. Note that the code is basically the same in Windows PowerShell\index{PowerShell} or Git Bash\index{Git Bash}. Also, you don't have to do these steps in the command-line. You could just create the new folders and files the same way that you normally do with your mouse in your GUI operating system. Now that we have a directory with a file, we can tell Git that we want to treat the directory {\emph{ExampleProject}} as a repository and that we want to track changes made to the file {\emph{README.md}}. Use Git's \texttt{init} (initialize)\index{Git command!init} command to set the directory as a repository. 
See Table \ref{GitCommandsTable} for the list of Git commands covered in this chapter.\footnote{For a comprehensive guide to Git commands, see \url{http://git-scm.com/}.} Use Git's \texttt{add} command to add a file to the Git repository.\index{Git command!add} For example, <>= # Initialize the Git repository git init # Add README to the repository git add README.md @ \noindent You probably noticed that you always need to put \texttt{git} before the command. This tells the shell what program the command is from. When you initialize a folder as a Git repository, a hidden folder called {\emph{.git}} is added to the directory (see Figure \ref{BasicGitRepo}). This is where all of your changes are kept. If you want to add all of the files in the working directory to the Git repository type: <>= # Add all files to the repository git add . @ \noindent When we want Git to track changes made to files added to the repository we can use the \texttt{commit}\index{Git command!commit} command. In Git language we are ``committing'' the changes to the repository. <>= # Commit changes git commit -a -m "First Commit, created README file" @ \noindent Note: the files won't appear on GitHub yet. Later in the chapter we will learn how to push commits to your remote GitHub repository. The \texttt{-a} (all) option commits changes made to all of the files that have been added to the repository. You can include a message with the commit using the \texttt{-m} option like: \texttt{"First Commit, created README file"}. Messages help you remember general details about individual commits. This is helpful when you want to revert to old versions. \textbf{Remember:} Git only tracks changes when you commit them. Finally, you can use the \texttt{status} command for details about your repository, including uncommitted changes. Generally it's a good idea to use the \texttt{-s} (short) option, so that the output is more readable. 
<>= # Display status git status -s @ \begin{table} \caption{A Selection of Git Commands} \label{GitCommandsTable} \begin{center} \begin{tabular}{l p{7cm}} \hline Command & Description \\[0.25cm] \hline\hline \texttt{add} & Add a file to a Git repository. \\[0.25cm] \texttt{branch} & Create and delete branches. \\[0.25cm] \texttt{checkout} & Checkout a branch. \\[0.25cm] \texttt{clone} & Clone a repository (for example, the remote GitHub version) into the current working directory. \\[0.25cm] \texttt{commit} & Commit changes to a Git repository. \\[0.25cm] \texttt{fetch} & Download objects from the remote (or another) repository. \\[0.25cm] \texttt{.gitignore} & Not a Git command, but a file you can add to your repository to specify what files/file types Git should ignore. \\[0.25cm] \texttt{init} & Initialize a Git repository. \\[0.25cm] \texttt{log} & Show a repo's commit history. \\[0.25cm] \texttt{merge} & Merge two or more commits/branches together. \\[0.25cm] \texttt{pull} & \texttt{fetch} data from a remote repository and try to \texttt{merge} it with your commits. \\[0.25cm] \texttt{push} & Add committed changes to a remote Git repository, i.e. GitHub. \\[0.25cm] \texttt{remote add} & Add a new remote repository to an existing project. \\[0.25cm] \texttt{rm} & Remove files from Git version tracking. \\[0.25cm] \texttt{status} & Show the status of a Git repository including uncommitted changes made to files. \\[0.25cm] \texttt{tag} & Bookmark particularly significant commits. 
\\[0.25cm] \hline \end{tabular} \end{center} {\scriptsize{Note: when you use these commands in the shell, you will need to precede them with \texttt{git} so the shell knows what program they are from.}} \end{table} \begin{figure}[t] \caption{Part of this Book's GitHub Repository Webpage} \label{BookRepository} \begin{center} \includegraphics[width=0.9\textwidth]{Children/Chapter5/images5/GitHubReadme.png} \end{center} \end{figure} \paragraph{Checkout}\index{Git command!checkout} It is useful to step back for a second and try to understand what Git is doing when you commit your changes. In the hidden {\emph{.git}} folder, Git is saving all of the information in compressed form from each of your commits into a sub-folder called {\emph{objects}}. Commit objects\index{git!commit object}\footnote{Other Git objects include trees\index{git!tree} (sort of like directories), tags\index{git!tag} (bookmarks for important points in a repo's history), and blobs\index{git!blob} (individual files).} are everything from a particular commit. I mean everything. If you delete all of the files in your repository (except for the {\emph{.git}} folder) you can completely recover all of the files from your most recent commit with the \texttt{checkout} command: <>= # Checkout latest commit git checkout -- . @ \noindent Note that there is a space between the two dashes and the period. You can also change to any other commit or any committed version of a particular file with \texttt{checkout}. Simply replace the \verb|--| with the commit reference.\index{git!commit reference} Note that the period at the end is still very important to include after the commit reference. The commit reference is easy to find and copy from a repository's GitHub webpage\index{GitHub!repository webpage} (see below for more information on how to create a GitHub webpage).\footnote{You can also search your commit history and roll back to a previous commit using only the command-line.
To see the commit history use the \texttt{log}\index{Git command!log} command (more details at \url{http://git-scm.com/book/en/Git-Basics-Viewing-the-Commit-History}). When a repo has many commits, this can be a very tedious command to use, so I highly recommend the GUI version of GitHub or the repo's GitHub website.} For an example of a GitHub repo webpage, see Figure \ref{BookRepository}. Click on the link that lists the number of repo commits on the left-hand side of the repo's webpage. This will show you all of the commits. A portion of this book's commit history is shown in Figure \ref{BookHistory}. By clicking on the {\tt{Browse Code}} icon (\includegraphics[scale=0.4]{Children/Chapter5/images5/BrowseCodeIcon.png}) you can see what the files at any commit looked like. Next to this button is another with a series of numbers and letters. This is the commit's SHA-1 hash.\footnote{Secure Hash Algorithm}\index{SHA-1 hash} For our purposes, it is the commit's reference number. Click on the {\tt{Copy SHA}} button to the left of the SHA to copy it. You can then paste it as an argument to your {\tt{git checkout}} command. This will revert you to that particular commit. Also include the file name if you want to revert to a particular version of a particular file. \begin{figure}[t] \caption{Part of this Book's GitHub Repository Commit History Page} \label{BookHistory} \begin{center} \includegraphics[width=0.9\textwidth]{Children/Chapter5/images5/CommitHistory.png} \end{center} \end{figure} \paragraph{Tags} SHA-1 hashes are a bit cumbersome to use as references. What was the hash number for that one commit? To solve this problem you can add bookmarks, known as ``tags'',\index{git!tag}\index{Git command!tag} to particularly important commits. Imagine we just committed our first full draft of a project. We want to tag it as version 0.1, i.e. ``v0.1''. 
To do this use Git's tag command: <>= # Tag most recent commit v0.1 git tag -a v0.1 -m "First draft" @ \noindent The \verb|-a| option adds the tag \texttt{v0.1} and \verb|-m| lets us add a message. Now we can checkout this particular commit by using its tag, i.e.: <>= # Checkout v0.1 git checkout v0.1 @ \noindent This will create a new ``branch'' with a generic name \emph{(detached from v0.1)} where you can make changes and commit them. If you plan to checkout a previous tagged version and make changes to it, it is a good idea to specifically name the branch using the \verb|-b| argument.\footnote{If you don't, then the new branch will have a ``detached head'' which will create problems using the branch in the future.} For example, to give it the name \emph{v0.1Branch} type: <>= # Checkout v0.1 as v0.1Branch git checkout v0.1 -b v0.1Branch @ \noindent What is a branch? \paragraph{Branches} Sometimes you may want to work on an alternative version of your project and then merge changes made to this version back into the main one. For example, the main version could be the most stable current copy of your research, while the alternative version could be a place where you test out new ideas. Git allows you to create a new \emph{branch}\index{git!branch}\index{Git command!branch} (alternative version of the repo) which can be merged back into the \emph{master} (main) branch. To see what branch you are using type: <>= # Show git branch git branch @ To create a new branch use, simply enough, the \texttt{branch} command. For example, to create a new branch called \emph{Test}: <>= # Create Test branch git branch Test @ \noindent You can now use \texttt{checkout} to switch to this branch.\footnote{To delete the \emph{Test} branch use the \texttt{-d} argument, i.e.
\texttt{git branch -d Test}.} Here is a shortcut for creating and checking out the branch: <>= # Create and checkout Test branch git checkout -b Test @ \noindent The \texttt{-b} (branch) option for \texttt{checkout} creates the new \emph{Test} branch before switching to it. To merge\index{Git command!merge} changes you commit in the \emph{Test} branch to the \emph{master}, \texttt{add} and \texttt{commit} your changes, \texttt{checkout} the \emph{master} branch, then use the \texttt{merge}\index{merge}\index{git!merge} command.\footnote{Any uncommitted changes are merged with a branch when it is checked out.} <>= # Add files git add . # Commit changes to Test branch git commit -a -m "Commit changes to Test" # Checkout master branch git checkout master # Merge master and Test branches git merge Test @ \noindent Note, when you merge a branch you may encounter conflicts in the files that make it impossible to smoothly merge the files together. Git will tell you what and where these are; you then need to decide what to keep and what to delete. \paragraph{Having Git ignore files} There may be files in your repository that you do not want to keep under version control. Maybe this is because they are very large files or cached files from \emph{knitr} or other files that are byproducts of compiling an R LaTeX document (see Chapter \ref{StatsModel}). To have Git ignore particular files, simply create a file called \emph{.gitignore}.\footnote{Note that like \emph{.git}, \emph{.gitignore} files are hidden.}\index{git!.gitignore}\index{git!ignore files} You can either put this file in the repository's parent directory to create a \emph{.gitignore} file for the whole repository or in a subdirectory to ignore files in that subdirectory. In the \emph{.gitignore} file, add ignore rules by simply including the names of the files that you want to have Git ignore. 
For example, a \emph{.gitignore} file that is useful for ignoring files that are the byproduct of compiling an R LaTeX file would look something like this: <>= # Ignore LaTeX compile byproduct files # ######################################## *.aux *.bbl *.blg cache/* figure/* *.log *.pdf *.gz *.tex @ \noindent The asterisk (\verb|*|) is a ``wildcard''\index{wildcard} and stands for any sequence of characters. In other words, it tells Git to look for files with any name that end with a specified file extension. This is faster than writing out the full name of every file you want to ignore individually. It also makes it easy to copy the rules into new repos. You'll notice the \texttt{cache/*} and \texttt{figure/*} rules. These tell Git to ignore all of the files in the \emph{cache} and \emph{figure} subdirectories. These files are the product of caching code chunks and creating figures with \emph{knitr}, respectively. Git will not ignore files that have already been committed to a repository. To ignore these files you will first need to remove them from Git with Git's \texttt{rm} (remove) command.\index{Git command!rm} If you wanted to remove a file called \emph{ExampleProject.tex} from version tracking type: <>= # Remove ExampleProject.tex from Git version tracking git rm --cached ExampleProject.tex @ \noindent Using the \texttt{--cached} argument tells Git not to track the file, but not to delete it. For more information on \emph{.gitignore} files, see GitHub's reference page on the topic at: \url{https://help.GitHub.com/articles/ignoring-files}. \subsection{Remote storage on GitHub} So far we've been using repos stored locally. Let's now look at how to also store a repository remotely on GitHub. You can either create a new repository on GitHub and download (\texttt{clone}) it to your computer or upload (\texttt{push}) an existing repository to a new GitHub remote repo. In both cases you need to create a new repository on GitHub.
To create a new repository on GitHub go to your main GitHub account webpage and click the \texttt{New repository} button. On the next page that appears, give the repository a name, brief description, and choose whether to make it public or private. If you want to store an existing repository on GitHub give it the same name as the one that already exists on your computer. If you already have files in your local repository do not check the boxes for creating \emph{README.md}, \emph{LICENSE}, and \emph{.gitignore} files. When you then click \texttt{Create Repository} you will be directed to the repository's GitHub webpage.\footnote{Before the repo has any files in it, the webpage will include instructions for how to set it up on your computer.}\label{NewGitHubRepo} \paragraph{Clone a new remote repository} If you are working with a new repository and do not have an existing version on your computer you need to ``clone'' the GitHub repo to your computer.\index{git!clone} The repo's GitHub page contains a button called \texttt{Clone in Desktop}. Clicking this will open GUI GitHub (if it is installed) and prompt you to specify what directory on your computer you would like to clone the repository into. You can also use the \texttt{clone}\index{Git command!clone} command in the shell. Imagine that the URL for a repo called \emph{ExampleProject} is \texttt{https://GitHub.com/USERNAME/ExampleProject.git}.
To clone it into the \emph{/git\_repositories} directory type:\footnote{If you are on the repo's webpage the URL to copy is under \texttt{HTTPS clone URL}.}\label{GitClone} <>= # Change working directory cd /git_repositories/ # Clone ExampleProject git clone https://GitHub.com/USERNAME/ExampleProject.git @ \paragraph{Push an existing repository to a new GitHub repo}\label{RemoteAdd} If you already have a repository with files in it on your computer and you want to store them remotely in a new GitHub repo, you need to add the remote repository and \texttt{push}\index{Git command!push} your files to it. Type Git's \texttt{remote add}\index{Git command!remote add} command. For example, if your repository's GitHub URL is \texttt{https://GitHub.com/USERNAME/ExampleProject.git}, then type: {\small <>= # Change working directory to existing local repo cd /git_repositories/ExampleProject # Add a remote (GitHub) repository to an existing repo git remote add origin https://GitHub.com/USERNAME/ExampleProject.git @ } \noindent This will tell your local repository where the remote one is. Finally, push the repository to GitHub: <>= # Push local repository to GitHub for the first time git push -u origin master @ \noindent The \texttt{-u} (upstream tracking)\index{git!upstream tracking} option adds a tracking reference for the upstream (GitHub) repository branches. \paragraph{Pushing commits to a GitHub repo} Once you have your local repository connected to GitHub you can add new commits with the \texttt{push}\index{Git command!push} command. For example, if your current working directory is the Git repo you want to push and you have already added/committed the changes you want to include in the remote repo, type: <>= # Add changes to the GitHub remote master branch git push origin master @ \noindent The \texttt{origin} is simply the remotely stored repository on GitHub and \texttt{master} is the master branch. You can change this to another branch if you'd like. 
If you have not set up password caching\footnote{See \url{https://help.GitHub.com/articles/set-up-git} for more details.} you will now be prompted to give your GitHub user name and password. You can also push your tags to GitHub.\index{GitHub!tag} To push all of the tags to GitHub type: <>= git push --tags @ \noindent Now on the repo's GitHub page there will be a \texttt{Tags} section that will allow you to view and download the files in each tagged version of the repository. \subsection{Accessing on GitHub} \paragraph{Downloading into R}\label{GitDownload} In general, the process of downloading data directly into R is similar to what we saw earlier for loading data from Dropbox Public folders. We can simply use the \verb|source_data| command.\index{R function!source\_data} First we need to find our plain-text data file's \emph{raw} URL. To do this, go to your repository's GitHub site, navigate to the file you want to load, and click the \texttt{Raw} button on the right just above the file preview.\label{RawGitHub} I have data in comma-separated values format stored in a GitHub\index{GitHub} repository.\footnote{For full information about the disproportionality data set, please see \url{http://christophergandrud.github.io/Disproportionality_Data/}.} The URL for the raw (plain-text) version of the data is \url{https://raw.githubusercontent.com/christophergandrud/Disproportionality_Data/master/Disproportionality.csv}.\footnote{It has been shortened with Bitly\index{Bitly} in the example.} <>= # Place shortened url into UrlAddress UrlAddress <- "http://bit.ly/14aSjxB" # Download data DispropData <- repmis::source_data(UrlAddress) # Show variable names names(DispropData) @ \noindent \verb|source_data| downloaded the most recent version of the file from the master branch. As we saw in Section \ref{SepHeadExplain}, running \verb|source_data| gives us a line beginning \emph{\texttt{\#\# SHA-1 hash of}} \ldots.
\textbf{Note:} this SHA-1 hash\index{SHA-1 hash} is different from the file's Git commit's SHA-1 hash we discussed earlier. The \verb|source_data| SHA-1 hash is specific to the \emph{file}, and has nothing to do with Git. We will look at this hash more in Chapter \ref{DataGather} (Section \ref{SecureData6}). We can actually use \verb|source_data| to download a particular version of a file--from a particular Git commit--directly into R. This makes reproducing a specific result much easier. To do this you just need to use a file's raw URL from a particular commit. To find a file's particular commit raw URL first click on the file on GitHub's website. Then click the \texttt{History} button (\includegraphics[scale=0.3]{Children/Chapter8/images8/GitHistory.png}). This will take you to a page listing all of the file's versions. Click on the \texttt{Browse Code} button (\includegraphics[scale=0.5]{Children/Chapter5/images5/BrowseCodeIcon.png}) next to the version of the file that you want to use. Click on the \texttt{Raw} button to be taken to the text-only version of the file. Finally, copy this page's URL address and use it with \verb|source_data|. For example, I have an old version of the disproportionality data. 
To download it I find this particular version of the file's URL and use it in \verb|source_data|: {\small{ <>= # Create object containing the file's URL OldUrlAddress <- paste0("https://raw.githubusercontent.com/", "christophergandrud/", "Disproportionality_Data/", "1a59d360b36eade3b183d6336a", "2262df4f9555d1/", "Disproportionality.csv") # Download old disproportionality data DispropOld <- repmis::source_data(OldUrlAddress) @ }} \noindent In this example I did not shorten the URL, but instead used the \texttt{paste0}\index{R function!paste0} function to paste it together.\footnote{\texttt{paste0} is the same as \texttt{paste}\index{R function!paste}, but has the argument \texttt{sep = ""} so that white space is not placed between the pasted elements.} You do not have to do this. I did it here so that the URL would fit on the printed page. Notice that the URL is the same as before with one exception: instead of \verb|master| after \verb|Disproportionality_Data| we have this strange series of numbers and letters: \verb|1a59d360b36ea| \ldots. This is the \emph{commit's} SHA-1 hash.\index{SHA-1 hash} As we will see in Chapter \ref{StatsModel} (Section \ref{sourceurl}) we can use a very similar process to easily run source code files in R directly downloaded from GitHub with the \verb|source_url| command. \paragraph{Viewing files} The GitHub web user interface also allows you, your collaborators (see below) or, if the repo is public, anyone to look at text files from a web browser. Collaborators can actually also create, modify, and commit changes in the web user interface. This can be useful for making small changes, especially from a mobile device without a Git installation. Anyone with a GitHub account can suggest changes to files in a public repository on the repo's website. Simply click the \texttt{Edit} button (\includegraphics[scale=0.5]{Children/Chapter5/images5/EditIcon.png}) above the file and make edits.
If the person making the edits is not a designated collaborator, their edits will be sent to the repository's owner for approval.\footnote{This is called a \texttt{pull} request\index{Git command!pull} in Git terminology. See the next section for more details.} This can be a useful way for independent researchers to fix errors. \subsubsection{Collaboration with GitHub} Repositories can have official collaborators that can make changes to files in the repo. Public repositories can have unlimited collaborators. Anyone with a GitHub account can be a collaborator. To add a collaborator to a repository you created, click on the \texttt{Settings} button on the repository's website (see Figure \ref{BookRepository}). Then click the \texttt{Collaborators} button on the left-hand side of the page. You will be given a box to enter your collaborator's GitHub user name. If your collaborator doesn't have a GitHub account, they will have to create a new one. Once you add someone as a collaborator they can clone the repository onto their computer as you did earlier and push changes. \paragraph{Syncing a repository} If you and your collaborators are both making changes to the files in a repo you might create conflicting changes, i.e. different changes to the same part of a file.\index{conflicts, in files} To avoid too many conflicts, it is a good idea to sync your local repository with the remote repository \textbf{before} you push your commits to GitHub. Use the \texttt{pull} command\index{Git command!pull} to sync your local and remote repository. First add and commit your changes, then type: <>= # Sync repository git pull @ \noindent If the files you are pulling conflict with your local files you will probably want to resolve these in the individual files and commit the changes. When there are merge conflicts, Git adds both versions of a piece of text to the file. You then open the file and decide which version to keep and which one to delete.
When the conflicts are resolved and changes committed, push your merged changes up to the remote repository as we did before. \subsection{Summing up the GitHub workflow} We've covered a lot of ground in this section. Let's sum up the basic GitHub workflow you will probably follow once your repo is set up. \begin{enumerate} \item Add any changes you've made with \texttt{git add}. \item \texttt{commit} the changes. \item \texttt{pull} your collaborators' changes from the GitHub repo, resolve any merge conflicts, and \texttt{commit} the changes. \item \texttt{push} your changes to GitHub. \end{enumerate} \begin{figure} \caption{Creating RStudio Projects} \label{NewRStudioProject} \begin{center} \includegraphics[scale=0.5]{Children/Chapter5/images5/GitNewProject.png} \end{center} \end{figure} \begin{figure} \caption{Creating RStudio Projects in New Directories} \label{NewProjectNewDirectory} \begin{center} \includegraphics[scale=0.5]{Children/Chapter5/images5/NewProject_NewDirectory.png} \end{center} \end{figure} \section{RStudio \& GitHub} When you open a Project\index{RStudio!Project}\index{RStudio} with a Git repository in RStudio you will see a new \emph{Git}\index{Git} tab next to \emph{Environment} and \emph{History} (see Figure \ref{GitTab}). From here you can do many of the things we covered in the previous section. Let's look at how to set up and use Git in RStudio Projects. \subsection{Setting up Git/GitHub with Projects} You can Git initialize new RStudio Projects, Git initialize existing projects, and create RStudio Projects\index{RStudio!Projects} from cloned repos. When you do any of these things RStudio automatically adds a \emph{.gitignore}\index{git!.gitignore} file telling Git to ignore \emph{.Rproj.user}, \emph{.Rhistory}, and \emph{.RData} files. \paragraph{Git with a new project}\label{NewProjectGit} To create a new project with Git version control, go to \texttt{File} in the RStudio menu bar. Then click \texttt{New Project\ldots}. 
In the box that appears (see Figure \ref{NewRStudioProject}) select \texttt{New Directory} \textrightarrow \texttt{Empty Project}. Enter the Project's name and desired directory. Make sure to check the dialog box for \texttt{Create a git repository} (see Figure \ref{NewProjectNewDirectory}). \paragraph{Git initialize existing projects} If you have an existing RStudio Project and want to add Git version control to it, first go to \texttt{Tools} in the RStudio menu bar. Then select \texttt{Project Options \ldots}. Select the \texttt{Git/SVN} icon. Finally, select \texttt{Git} from the drop-down menu for \texttt{Version Control System:}. \paragraph{Clone repository into a new project} Again go to \texttt{File} in the RStudio menu bar to create a new project from a cloned GitHub repository.\index{GitHub} Then click \texttt{New Project\ldots}. Select the \texttt{Version Control} option and then \texttt{Git}. Finally, paste the repository's URL in the field called \texttt{Repository URL:}, enter the directory you would like to locate the cloned repo in, and click \texttt{Create Project}. \paragraph{Add existing Project repository to GitHub} You can push an existing Project repository stored on your computer to a new remote repository on GitHub. To do this, first create a new repo on GitHub with the same name as your RStudio Project (see Section \ref{NewGitHubRepo}). Then copy the remote repository's URL like we saw before when we cloned a repository from GitHub (see Section \ref{GitClone}). Open a new shell from within RStudio. To do this, click the \texttt{Shell} button in the \emph{Git} tab's \texttt{More} drop-down menu. Now follow the same steps that we used in Section \ref{RemoteAdd} to connect a locally stored repository to GitHub for the first time. 
\begin{figure} \caption{The RStudio Git Tab} \label{GitTab} \begin{center} \begin{subfigure} \caption{New \emph{ExampleProject} \emph{Git} Tab} \label{fig:NewGitTab} \includegraphics[scale=0.6]{Children/Chapter5/images5/GitTab.png} \end{subfigure} \vspace{0.25cm} \begin{subfigure} \caption{Adding Changes to the Repository} \label{fig:AddingChangesToRepo} \includegraphics[scale=0.6]{Children/Chapter5/images5/GitAdd.png} \end{subfigure} \end{center} \end{figure} \subsection{Using Git in RStudio Projects} The RStudio \emph{Git} tab\index{RStudio!Git tab} allows you to do many of the same things with Git that we covered in the previous section. In the top panel of Figure \ref{GitTab} you will see the \emph{Git} tab for a new RStudio Project called \emph{ExampleProject}. It has two files that have not been added or committed to Git. To add and commit the files to the repository, click on the dialog boxes next to the file names. In the bottom panel of Figure \ref{GitTab} you can see that I've created a new R file called \emph{ExampleScript.R} and clicked the dialog box next to it, along with the other files. The yellow question marks in the top panel have now become green A's for ``add''. Clicking \texttt{Commit} opens a new window called \textbf{Review Changes}\index{RStudio!Review Changes window}\index{Git command!add}\index{Git command!commit} where you can commit the changes. Simply write a commit message in the box called \emph{Commit Message} in the \textbf{Review Changes} window and click \texttt{Commit}. If you add file names to the \emph{.gitignore} file, they will not show up in RStudio's \emph{Git} tab. If you are using a Git repo that is associated with a remote repository on GitHub,\index{GitHub} you can push and pull it with the \texttt{Pull Branches}\index{Git command!pull} and \texttt{Push Branch}\index{Git command!push} buttons in the \emph{Git} tab's menu bar (the blue and green arrows, respectively).
You can use the same buttons in the \textbf{Review Changes} window. The \emph{Git} tab also allows you to change branches, revert to previous commits, add files to \verb|.gitignore|, and view your commit history. You can always use the \texttt{More \textrightarrow{} Shell \ldots} option to open a new shell with the Project set as the working directory to complete any other Git task you might want to do. \index{GitHub|)}\index{Git|)} \subsection*{Chapter summary} In this chapter we have primarily learned how to store text-based reproducible research files in ways that allow us and others to access them easily from many locations, enable collaboration, and keep a record of previous versions. In the next chapter we will learn how to use text-based files to reproducibly gather data that we can use in our statistical analyses. ================================================ FILE: Old/Source-v2/Children/Chapter6/chapter6.Rnw ================================================ % Chapter Chapter 6 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 5 May 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Gathering Data with R}\label{DataGather} How you gather your data directly impacts how reproducible your research will be. You should try your best to document every step of your data gathering process. Reproduction will be easier if your documentation--especially, variable descriptions and source code--makes it easy for you and others to understand what you have done. If all of your data gathering steps are tied together by your source code, then independent researchers (and you) can more easily regather the data. Regathering data will be easiest if running your code allows you to get all the way back to the raw data files--the rawer the better. Of course, this may not always be possible. You may need to conduct interviews or compile information from paper based archives, for example. 
The best you can sometimes do is describe your data gathering process in detail. Nonetheless, R's automated data gathering capabilities for internet-based information are extensive. Learning how to take full advantage of these capabilities greatly increases reproducibility and can save you considerable time and effort over the long run. In this chapter we'll learn how to gather quantitative data in a fully reproducible way. We'll start by learning how to use data gathering makefiles to organize your whole data gathering process so that it can be completely reproduced. Then we will learn the details of how to actually load data into R from various sources, both locally on your computer and remotely via the internet. In the next chapter (Chapter \ref{DataClean}) we'll learn the details of how to clean up raw data so that it can be merged together into data frames that you can use for statistical analyses. %%%%%%%%%%%%% Organizing data gathering \section{Organize Your Data Gathering: Makefiles} Before getting into the details of using R to gather data, let's start by creating a plan to organize the process. Organizing your data gathering process from the beginning of a research project improves the possibility of reproducibility and can save you significant effort over the course of the project by making it easier to add and regather data later on. A key part of reproducible data gathering with R, like reproducible research in general, is segmenting the process into modular\index{modular files} files that can all be run by a common ``makefile''\index{makefile}.
In this chapter we'll learn how to create make-like files run exclusively from R as well as GNU Make makefiles\index{GNU Make}\index{GNU},\footnote{GNU stands for ``GNU's Not Unix'', indicating that it is Unix-like.}\index{GNU Make} which you run from a shell.\footnote{To standardize things, I use the terms ``R make-like file'' for files created and run in R and the standard ``makefile'' for files run by Make.} Learning how to create R make-like files is fairly easy. Using GNU Make does require learning some more new syntax. However, it has one very clear advantage: it only runs a source code file that has been updated since the last time you ran the makefile. This is very useful if part of your data gathering process is very computationally and time intensive. Segmenting your data gathering into modular files and tying them together with some sort of makefile allows you to more easily navigate research text and find errors in the source code. The makefile's output is the data set that you'll use in the statistical analyses. There are two types of source code files that the makefile runs: data gathering/cleanup files and merging files. Data cleanup files bring raw individual data sources into R and transform them so that they can be merged together with data from the other sources. Many of the R tools for data cleanup and merging will be covered in Chapter \ref{DataClean}. In this chapter we mostly cover the ways to bring raw data into R. Merging files are executed by the makefile after it runs the data gathering/cleanup files. It's a good idea to have the source code files use very raw data as input. Your source code should avoid directly changing these raw data files. Instead changes should be put into new objects and data files. Doing this makes it easier to reconstruct the steps you took to create your data set. 
Also, while cleaning and merging your data you may transform it in unintended ways, for example, accidentally deleting some observations that you wanted to keep. Having the raw data makes it easy to go back and correct your mistakes. The files for the examples used in this section can be downloaded from GitHub at: \url{http://bit.ly/YnMKBG}. \subsection{R Make-like files} When you create make-like files in R to organize and run your data gathering you usually only need one or two commands, {\tt{setwd}}\index{R function!setwd} and {\tt{source}}\index{R function!source}. As we talked about in Chapter \ref{DirectoriesChapter}, {\tt{setwd}} simply tells R where to look for and place files. {\tt{source}} tells R to run code in an R source code file.\footnote{We use the {\tt{source}} command more in Chapter \ref{StatsModel}.} Let's see what an R data make file might look like for a project with a file structure similar to the example project in Figure \ref{ExampleTree}. The file paths in this example are for Unix-like systems and the make-like file is called \emph{Makefile.R}. <>= ################ # Example R make-like file # Christopher Gandrud # Updated 15 January 2015 ################ # Set working directory setwd("/ExampleProject/Analysis/Data/") # Gather and cleanup raw data files. source("Gather1.R") source("Gather2.R") source("Gather3.R") # Merge cleaned data frames into data frame object CleanedData source("MergeData.R") @ This code first sets the working directory. Then it runs three source code files to gather data from three different sources. These files gather the data and clean it so that it can be merged together. The cleaned data frames are available in the current workspace. Next the code runs the \emph{MergeData.R} file that merges the data frames and saves the output data frame as a CSV\index{CSV} formatted file. The CSV file could be the main file we use for statistical analysis.
\emph{MergeData.R} also creates a Markdown file with a table describing the variables and their sources. We'll come back to how to create tables in Chapter \ref{TablesChapter}. You can run the commands in this file one by one or run the make-like file by putting it through the \texttt{source} command so that it will run it all at once. \subsection{GNU Make} R make-like files are a simple way to tie together a segmented data gathering process. If one or more of the source files that our example before runs is computationally intensive it is a good idea to run them only when they are updated. However, this can become tedious, especially if there are many segments. The well-established GNU Make\index{GNU Make} command-line program\footnote{GNU Make was originally developed in 1977 by Stuart Feldman as a way to compile computer programs from a series of files, its primary use to this day. For an overview see: \url{http://en.wikipedia.org/wiki/Make_(software)}. For installation instructions please see Section \ref{InstallMake}.} deals with this problem by comparing the output files' time stamps\footnote{A file's time stamp records the time and date when it was last changed.}\index{time stamp} to time stamps of the source files that created them. If a source file has a time stamp that is newer than its output, Make will run it. If the source's time stamp is older than its output, Make will skip it. In Make terminology the output files are called ``targets''\index{Make!targets} and the files that create them are called ``prerequisites''\index{Make!prerequisites}. You specify a ``recipe''\index{Make!recipe} to create the targets from the prerequisites. The recipe is basically just the code you want to run to make the target file. The general form is: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} TARGET ... : PREREQUISITE ... RECIPE ... ... 
\end{verbatim} \end{kframe} \end{knitrout} Note that, unlike in R, tabs are important in Make. They indicate what lines are the recipe. Make uses the recipe to ensure that targets are newer than prerequisites. If a target is newer than its prerequisite, Make does not run the prerequisite. The basic idea of reproducible data gathering with Make is similar to what we saw before, with a few twists and some new syntax. Let's see an example that does what we did before: gather data from three sources, clean and merge the data, and save it in CSV\index{CSV} format. \subsubsection{Example makefile} The first thing we need to do is create a new file called \emph{Makefile}\footnote{Alternatively you can call the file \emph{GNUmakefile} or \emph{makefile}.} and place it in the same directory as the data gathering files we already have. The makefile we are going to create runs prerequisite files by the alphanumeric order of their file names. So we need to ensure that the files are named in the order that we want to run them. Now let's look at the actual makefile: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} ################ # Example Makefile # Christopher Gandrud # Updated 1 July 2013 # Influenced by Rob Hyndman (31 October 2012) # See: http://robjhyndman.com/researchtips/makefiles/ ################ # Key variables to define RDIR = . MERGE_OUT = MergeData.Rout # Create list of R source files RSOURCE = $(wildcard $(RDIR)/*.R) # Files to indicate when the RSOURCE file was run OUT_FILES = $(RSOURCE:.R=.Rout) # Default target all: $(OUT_FILES) # Run the RSOURCE files $(RDIR)/%.Rout: $(RDIR)/%.R R CMD BATCH $< # Remove Out Files clean: rm -fv $(OUT_FILES) # Remove MergeData.Rout cleanMerge: rm -fv $(MERGE_OUT) \end{verbatim} \end{kframe} \end{knitrout} \noindent Ok, let's break down the code. The first part of the file defines variables that will be used later on. 
For example, in the first line of executable code (\texttt{RDIR = .}) we create a simple variable\footnote{Simple string variables are often referred to as ``macros''\index{Make!macros} in GNU Make. A common convention in Make and Unix-like shells generally is to use all caps for variable names.} called \texttt{RDIR} with a period (\texttt{.}) as its value. In Make and Unix-like shells, periods indicate the current directory. The next line allows us to specify a variable for the outfile created by running the \emph{MergeData.R} file. This will be useful later when we create a target for removing this file to ensure that the \emph{MergeData.R} file is always run. The third executed line (\verb|RSOURCE = $(wildcard $(RDIR)/*.R)|) creates a variable containing a list of all of the names of files with the extension \texttt{.R}, i.e. our data gathering and merge source code files. This line has some new syntax, so let's work through it. In Make (and Unix-like shells generally) a dollar sign (\verb|$|)\index{Make!\$} followed by a variable name substitutes the value of the variable in place of the name.\footnote{This is a kind of parameter expansion\index{parameter expansion}. For more information about parameter expansion see \cite{Frazier2008}.} For example, \verb|$(RDIR)| inserts the period \texttt{.} that we defined as the value of \texttt{RDIR} previously. The parentheses are included to clearly demarcate where the variable name begins and ends.\footnote{Braces (\texttt{\{\}}) are also sometimes used for this.} You may remember the asterisk (\verb|*|) from the previous chapter. It is a ``wildcard'',\index{wildcard}\label{AsteriskWildcard} a special character that allows you to select file names that follow a particular pattern. Using \verb|*.R| selects any file name that ends in \texttt{.R}.
Why did we also include the actual word \texttt{wildcard}?\index{Make function!wildcard}\index{Make!wildcard} The \texttt{wildcard} function is different from the asterisk wildcard character. The function creates a list of files that match a pattern. In this case the pattern is \verb|$(RDIR)/*.R|. The general form for writing the \texttt{wildcard} function is: \verb|$(wildcard PATTERN)|. The fourth line (\verb|OUT_FILES = $(RSOURCE:.R=.Rout)|) creates a variable for the \texttt{.Rout} files that Make will use to tell how recently each R file was run.\footnote{The R out-file contains all of the output from the R session used while running the file. These can be a helpful place to look for errors if your makefiles give you an error like \texttt{make: *** [Gather.Rout] Error 1}.\index{Make!Error 1}} \verb|$(RSOURCE:.R=.Rout)| is a variable that uses the same file name as our RSOURCE files, but with the file extension \texttt{.Rout}. The second part of the makefile tells Make what we want to create and how to create it. In the line \verb|all: $(OUT_FILES)| we are specifying the makefile's default target.\index{Make!targets} Targets are the files that you instruct Make to make. \texttt{all:} sets the default target; it is what Make tries to create when you enter the command \texttt{make} in the shell with no arguments. We will see later how to instruct Make to compile different targets. The next two executable lines (\verb|$(RDIR)/%.Rout: $(RDIR)/%.R| and \verb|R CMD BATCH $<|) run the R source code files in the directory. The first line specifies that the \texttt{.Rout} files are the targets of the \texttt{.R} files. The percent sign (\verb|%|) is another wildcard.\index{Make!\%} Unlike the asterisk, it replaces the selected file names throughout the command used to create the target. The dollar and less-than signs (\verb|$<|) indicate the first prerequisite for the target, i.e. the \texttt{.R} files.
\texttt{R CMD BATCH}\index{R CMD BATCH} is a way to call R from a Unix-like shell, run source files, and output the results to other files.\footnote{You will need to make sure that R is in your PATH. Setting this up is different on different systems. If on Mac and Linux you can load R from the Terminal by typing \texttt{R}, R is in your PATH. The standard R installation usually sets this up correctly. There are different methods for changing the file path on different versions of Windows.} The out-files it creates have the extension \texttt{.Rout}. The next two lines specify another target: \texttt{clean}. When you type \texttt{make clean} into your shell Make will follow the recipe: \verb|rm -fv $(OUT_FILES)|.\index{shell command!rm} This removes (deletes) the \texttt{.Rout} files. The \texttt{f} argument (force) ignores files that don't exist and the \texttt{v} argument (verbose) instructs \texttt{rm} to tell you what is happening when it runs. When you delete the \texttt{.Rout} files, Make will run all of the \texttt{.R} files the next time you call it. The last two lines help us solve a problem created by the fact that our simple makefile doesn't push changes downstream. For example, if we make a change to \emph{Gather2.R} and run \texttt{make}, only \emph{Gather2.R} will be rerun. The new data frame will not be added to the final merged data set. To overcome this problem the last two lines of code create a target called \texttt{cleanMerge}, which removes only the \emph{MergeData.Rout} file. \paragraph{Running the Makefile} To run the makefile for the first time, simply change the working directory to where the file is and type \texttt{make} into your shell.
It will create the CSV final data file and four files with the extension \texttt{.Rout}, indicating when the segmented data gathering files were last run.\footnote{If you open these files you will find the output from the R session used when their source file was last run.} When you run \verb|make| in the shell for the first time you should get the output: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} ## R CMD BATCH Gather1.R ## R CMD BATCH Gather2.R ## R CMD BATCH Gather3.R ## R CMD BATCH MergeData.R \end{verbatim} \end{kframe} \end{knitrout} \noindent If you run it a second time without changing the R source files you will get the following output: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} ## make: Nothing to be done for 'all'. \end{verbatim} \end{kframe} \end{knitrout} \noindent To remove all of the \texttt{.Rout} files, set the make target to \texttt{clean}:\index{Make!clean} \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} make clean ## rm -fv ./Gather1.Rout ./Gather2.Rout ./Gather3.Rout ## ./MergeData.Rout ## ./Gather1.Rout ## ./Gather2.Rout ## ./Gather3.Rout ## ./MergeData.Rout \end{verbatim} \end{kframe} \end{knitrout} \noindent If we run the following code:\label{MakeAllCommand} <>= # Remove MergeData.Rout and make all R source files make cleanMerge all @ \noindent then Make will first remove the \emph{MergeData.Rout} file (if there is one) and then run all of the R source files as need be. \emph{MergeData.R} will always be run. This ensures that changes to the gathered data frames are updated in the final merged data set.
\subsubsection{Makefiles and RStudio Projects} You can run makefiles from RStudio's \emph{Build} tab.\index{RStudio!Build} For the type of makefile we have been using, the main advantage of running it from within RStudio is that you don't have to toggle between RStudio and the shell. Everything is in one place. Imagine that the directory with our makefile is an RStudio Project.\index{RStudio!Projects} If a Project already contains a makefile, RStudio will automatically open a \emph{Build} tab on the \emph{Environment/History} pane, the same place where the \emph{Git} tab appears (see Figure \ref{BuildTab}).\footnote{If a project doesn't have a makefile you can still set up RStudio Build. Click on \texttt{Build} in the Menu bar then \texttt{Configure Build Tools . . .}. Select \texttt{Makefile} from the drop-down menu then \texttt{Ok}. You will still need to manually add a Makefile in the Project's root directory.} The \emph{Build} tab has buttons you can click to \texttt{Build All} (this is equivalent to \texttt{make all}), and, in the \texttt{More} drop-down menu, \texttt{Clean all} (i.e., \texttt{make clean}) and \texttt{Clean and Rebuild} (i.e., \texttt{make clean all}). As you can see in Figure \ref{BuildTab}, the \emph{Build} tab shows you the same output you get in the shell. \begin{figure} \caption{The RStudio Build Tab} \label{BuildTab} \begin{center} \includegraphics[scale=0.5]{Children/Chapter6/images6/BuildTab.png} \end{center} \end{figure} \subsubsection{Other information about makefiles} Note that Make relies heavily on commands and syntax of the shell program that you are using. The above example was written and tested on a Mac. It should work on other Unix-like computers without modification. You can use Make to build almost any project from the shell, not just to run R source code files. It was an integral part of early reproducible computational research \citep{Fomel2009, Buckheit1995}. 
Rob Hyndman more recently posted a description of the makefile he uses to create a project with R and LaTeX.\footnote{See his blog at: \url{http://robjhyndman.com/researchtips/makefiles/}. Posted 31 October 2012. This method largely replicates what we do in this book with \emph{knitr}. Nonetheless, it has helpful information about Make that can be used in other tasks. It was in fact helpful for writing this section of the book.} The complete source of information on GNU Make is the official online manual. It is available at: \url{http://www.gnu.org/software/make/manual/}. \section{Importing Locally Stored Data Sets} Now that we've covered the big picture, let's learn the different tools you will need to know to gather data from different types of sources. The most straightforward place to load data from is a local file, e.g. one stored on your computer. Though storing your data locally does not really encourage reproducibility, most research projects will involve loading data this way at some point. The tools you will learn for importing locally stored data files will also be important for most of the other methods further on. Data stored in plain-text files on your computer can be loaded into R using the \texttt{read.table}\index{R function!read.table} command. For example, imagine we have a CSV file called \emph{TestData.csv} stored in the current working directory. To load the data set into R simply type: <>= TestData <- read.table("TestData.csv", sep = ",", header = TRUE) @ \noindent See Section \ref{SepHeadExplain} for a discussion of the arguments in this command. If you are using RStudio you can do the same thing with drop-down menus. To open a plain-text data file click on \texttt{Environment} \textrightarrow\: \texttt{Import Dataset\ldots} \textrightarrow\: \texttt{From Text File\ldots}. In the box that pops up, specify the column separator, whether or not you want the first line to be treated as variable labels, and other options. 
This is initially easier than using \texttt{read.table}. But it is much less reproducible. If the data is not stored in plain-text format, but is instead saved in a format created by another statistical program such as SPSS,\index{SPSS} SAS,\index{SAS} or Stata,\index{Stata} we can import it using commands in the \emph{foreign} package\index{foreign}. For example, imagine we have a data file called \emph{Data1.dta} stored in our working directory. This file was created by the Stata\index{Stata} statistical program. To load the data into an R data frame object called \emph{StataData} simply type: <>= # Load foreign package library(foreign) # Load Stata formatted data StataData <- read.dta(file = "Data1.dta") @ \noindent As you can see, commands in the \emph{foreign} package have similar syntax to \texttt{read.table}. To see the full range of commands and file formats that the \emph{foreign} package supports, use the following command: <>= library(help = "foreign") @ If you have data stored in a spreadsheet format such as Excel's\index{Microsoft Excel} \emph{.xlsx}, it may be best to first cleanup the data in the spreadsheet program by hand and then save the file in plain-text format. When you cleanup the data make sure that the first row has the variable names and that observations are in the following rows. Also, remove any extraneous information such as notes, colors, and so on that will not be part of the data frame. To aid reproducibility, locally stored data should include careful documentation of where the data came from and how, if at all, it was transformed before it was loaded into R. Ideally, the documentation would be written in a text file saved in the same directory as the raw data file. \section{Importing Data Sets from the Internet} There are many ways to import data that is stored on the internet directly into R. We have to use different methods depending on where and how the data is stored. 
\subsection{Data from non-secure ({\tt{http}}) URLs} Importing data into R that is located at a non-secure URL\index{URL}\footnote{URL stands for ``Uniform Resource Locator''.}--ones that start with {\tt{http}}\index{http}--is straightforward provided that: \begin{itemize} \item the data is stored in a simple format, e.g. plain-text, \item the file is not embedded in a larger HTML\index{HTML} website. \end{itemize} \noindent We already discussed the first issue in detail. You can determine if the data file is embedded in a website by opening the URL in your web browser. If you only see the raw plain-text data, you are probably good to go. To import the data, simply include the URL as the file's name in your \texttt{read.table} command. \subsection{Data from secure ({\tt{https}}) URLs}\label{SecureDataDownload} \noindent Storing data at non-secure URLs is becoming less common. Services like Dropbox and GitHub now store their data at secure URLs.\footnote{Dropbox used to host files in the Public folder at non-secure URLs, but switched to secure URLs.} You can tell if the data is stored at a secure web address if it begins with \texttt{https}\index{https} rather than \texttt{http}. We have to use different commands to download data from secure URLs. Let's look at three methods for downloading data into R: \verb|source_data|, \verb|source_DropboxData|, and the \emph{RCurl} package. \paragraph{Loading data from secure URLs with {\tt{source\_data}}}\label{SecureData6} As we saw in Chapter \ref{Storing}, we can use the \verb|source_data| command in the \emph{repmis} package to simplify the process of downloading data from Dropbox \emph{Public} folders (Section \ref{EnablePublicFolder}) and GitHub (Section \ref{GitDownload}).\index{repmis}\index{R function!source\_data} You can use \verb|source_data| to download data in plain-text format from almost any URL, as long as the file is not embedded in a larger HTML website. 
One problem for reproducible research with sourcing data located on the internet is that data files may change without us knowing. This could change the results we get. Luckily, we can solve this problem with \verb|source_data|. In Chapter \ref{Storing} we saw that when we run the \verb|source_data| command we not only download a data file, but also find its SHA-1 hash.\index{SHA-1 hash} The SHA-1 hash is basically a unique number for the file. If the file changes, its SHA-1 hash will change. Once we know the file's SHA-1 hash we can use \verb|source_data|'s \verb|sha1| argument to make sure the file that we downloaded is the same as the one we intended to download. For example, let's find the SHA-1 hash for the disproportionality data set we downloaded in the last chapter (Section \ref{GitDownload}):\footnote{Remember we placed the file's raw GitHub URL address inside of the object \emph{UrlAddress}.} <>= DispropData <- repmis::source_data(UrlAddress) @ \noindent You can see that the file's SHA-1 hash begins \emph{\texttt{20a0b022bbcf}} \ldots. Let's see what happens when we try to download an older version of the same file while placing this SHA-1 hash in \verb|source_data|'s \verb|sha1| argument. The URL of the alternative version of the file is in the object \emph{OldUrlAddress}:\footnote{See Section \ref{GitDownload} for the full URL.} <>= DispropData <- repmis::source_data(OldUrlAddress, sha1 = "20a0b022bbcf947917878680df85f7b4dcaaf44a") @ \noindent If we set the \texttt{sha1} argument in our replication files, others can be sure that they are using the same data files that we used to generate a particular result. It may not be practical to do this while a piece of research is under active development, as the files may be regularly updated. However, it can be very useful for source code files that underlie published results.
\paragraph{Loading data from Dropbox non-Public folders with {\tt{source\_DropboxData}}}\label{DropboxNonPublic} Files stored on Dropbox\index{Dropbox}\index{Dropbox!non-Public folders} non-\emph{Public} folders are a little trickier to download. If you go to the Dropbox website and click the \texttt{Share Link} button next to a file (\includegraphics[scale=0.35]{Children/Chapter5/images5/DropboxLink.png}) you will be given an information box. This is not the raw data file. Luckily, \emph{repmis}\index{repmis} includes a \verb|source_DropboxData|\index{R function!source\_DropboxData} command for downloading data stored in a non-Public Dropbox folder into R. It works in much the same way as \verb|source_data|; the only difference is that instead of using the URL we need (a) the file's name and (b) its Dropbox key.\index{Dropbox!key} To find the file's key simply click on the \texttt{Share Link} button next to the file on the Dropbox website. Look at the URL for the webpage that appears. Here's an example: \url{https://dl.dropboxusercontent.com/s/exh4iobbm2p5p1v/fin_research_note.csv} You can see that the last part of the URL (\texttt{fin\_research\_note.csv}) is the data file's name. The key is the string of letters and numbers between \texttt{/s/} and the file's name, i.e. \texttt{exh4iobbm2p5p1v}. Now that we have the file name and key we can download the data into R using \verb|source_DropboxData|.
For example: <>= # Download data from a Dropbox non-Public folder FinDataFull <- repmis::source_DropboxData("fin_research_note.csv", "exh4iobbm2p5p1v", sep = ",", header = TRUE) @ \paragraph{Loading data using {\normalfont{RCurl}}} A more laborious way to download data from a secure URL that does not rely on \emph{repmis} is to use the \texttt{getURL}\index{getURL}\index{R function!getURL} command in the {\emph{RCurl}} package \cite[]{R-RCurl} as well as \verb|read.table|\index{R function!read.table} and \texttt{textConnection}.\index{R function!textConnection} The latter commands are in base R. The two rules about data being stored in plain-text formats and not being embedded in a larger HTML website apply to this method as well. Let's try an example. To download the data file we used in Section \ref{GitDownload} into R we could use this code: <>= # Put URL address into the object UrlAddress UrlAddress <- paste0("https://raw.githubusercontent.com/", "christophergandrud/Disproportionality", "_Data/master/Disproportionality.csv") # Download Electoral disproportionality data DataUrl <- RCurl::getURL(UrlAddress) # Convert Data into a data frame DispropData <- read.table(textConnection(DataUrl), sep = ",", header = TRUE) # Show variables in the data names(DispropData) @ \noindent If running \texttt{getURL(UrlAddress)} gives you an error about an \texttt{SSL certificate problem} simply add the argument \texttt{ssl.verifypeer = FALSE}. This allows you to skip certificate verification and access the data.\footnote{For more details see the \emph{RCurl} help page at \url{http://www.omegahat.org/RCurl/FAQ.html}.} Because this disables a security check, only use it with sources that you trust. \subsection{Compressed data stored online} Sometimes data files are large, making them difficult to store and download without compressing\index{file compression} them.
There are a number of compression methods such as Zip\index{Zip} and Tar\index{Tar}.\footnote{Tar archives are sometimes referred to as `tar balls'.\index{tar balls}} Zip files have the extension {\tt{.zip}} and Tar files use extensions such as {\tt{.tar}} and {\tt{.gz}}. In most cases\footnote{Some formats that require the {\emph{foreign}} package to open are more difficult. This is because functions such as {\tt{read.dta}} for opening Stata {\tt{.dta}} files only accept file names or URLs as arguments, not connections, which you create for unzipped files.} you can download, decompress, and create data frame objects from these files directly in R. To do this you need to:\footnote{The description of this process is based on a Stack Overflow comment by Dirk Eddelbuettel (see {\url{http://stackoverflow.com/questions/3053833/using-r-to-download-zipped-data-file-extract-and-import-data?answertab=votes\#tab-top}}, posted 10 June 2010.)} \begin{itemize} \item create a temporary file with {\tt{tempfile}} to store the zipped file, which you will later remove with the {\tt{unlink}}\index{R function!unlink} command at the end, \item download the file with {\tt{download.file}},\index{R function!download.file} \item decompress the file with one of the {\tt{connections}}\index{R function!connections} commands in base R,\footnote{To find a full list of commands type {\tt{?connections}} into the R console.} \item read the file with {\tt{read.table}}.\index{R function!read.table} \end{itemize} \noindent The reason that we have to go through so many extra steps is that compressed files are more than just a single file and contain a number of files as well as metadata.\index{metadata} Let's download a compressed file called {\emph{uds\_summary.csv}} from \cite{Pemstein2010}. It's in a compressed file called {\emph{uds\_summary.csv.gz}}. At the time of writing, the file's URL address is {\url{http://www.unified-democracy-scores.org/files/20140312/z/uds_summary.csv.gz}}.
<>= # For simplicity, store the URL in an object called 'URL' URL <- "http://www.unified-democracy-scores.org/files/20140312/z/uds_summary.csv.gz" # Create a temporary file called 'temp' to put the zip file into. temp <- tempfile() # Download the compressed file into the temporary file. download.file(URL, temp) # Decompress the file and convert it into a data frame UDSData <- read.csv(gzfile(temp, "uds_summary.csv")) # Delete the temporary file. unlink(temp) # Show variables in data names(UDSData) @ \subsection{Data APIs \& feeds} There are a growing number of packages that can gather data directly from a variety of internet sources and import them into R. Most of these packages use the sources' web application programming interfaces (API).\index{API} Web APIs allow programs to interact with a website. Needless to say, this is great for reproducible research. It not only makes the data gathering process easier as you don't have to download many Excel files and fiddle around with them before even getting the data into R, but it also makes replicating the data gathering process much more straightforward and makes it easy to update data sets when new information becomes available. 
Some examples of these packages include: \begin{itemize} \item The \emph{openair} package \citep{R-openair},\index{openair} which beyond providing a number of tools for analyzing air quality data also has the ability to gather data directly from sources such as King's College London's London Air (\url{http://www.londonair.org.uk/}) database.\index{air quality} \item The \emph{quantmod} package \citep{R-quantmod} allows you to access data from Google Finance,\footnote{\url{http://www.google.com/finance}} Yahoo Finance,\footnote{\url{http://finance.yahoo.com/}} and the US Federal Reserve's FRED\footnote{\url{http://research.stlouisfed.org/fred2/}} economic database.\index{finance data}\index{Google!Finance}\index{Yahoo Finance}\index{US Federal Reserve} \item The \emph{treebase} package by \cite{Boettiger2012} allows you to access phylogenetic\index{phylogenetic} data from TreeBASE.\footnote{\url{http://treebase.org}}\index{treebase} \item The \emph{twitteR} package \citep{R-twitteR} accesses Twitter's\footnote{\url{https://twitter.com/}} API. This allows you to download data from Twitter\index{Twitter} including tweets\index{tweet} and trending topics.\index{twitteR} \item The \emph{WDI} package \citep{R-WDI} allows you to directly download data from the World Bank's\index{World Bank}\index{WDI} Development Indicators database.\footnote{\url{http://data.worldbank.org/data-catalog/world-development-indicators}} This database includes numerous country-level economic, health, and environment variables. \item The rOpenSci\footnote{\url{http://ropensci.org/}}\index{rOpenSci} group has developed and is developing a number of packages for accessing scientific data from web-based sources with R. They have a comprehensive set of packages for accessing biological data and academic journals. For a list of their packages see: \url{http://ropensci.org/packages/index.html}.
\item Stack Exchange's\index{Stack Exchange} Cross Validated\index{Cross Validated} website\footnote{{\small{\url{http://stats.stackexchange.com/questions/12670/data-apis-feeds-available-as-packages-in-r}}}} also has a fairly comprehensive and regularly updated list of APIs accessible from R packages. \end{itemize} \paragraph{API Package Example: World Bank Development Indicators} Each of these packages has its own syntax and it isn't possible to go over all of them here. Nonetheless, let's look at an example of accessing World Bank data with the \emph{WDI}\index{WDI}\index{World Bank} package to give you a sense of how these packages work. Imagine that we want to gather data on fertilizer consumption.\index{fertilizer} We can use \emph{WDI}'s \texttt{WDIsearch} command to find fertilizer consumption data available at the World Bank: {\small <>= # Load WDI package library(WDI) # Search World Bank for fertilizer consumption data WDIsearch("fertilizer consumption") @ } \noindent This shows us a selection of indicator numbers and their names.\footnote{You can also search the World Bank Development Indicators website. The indicator numbers are at the end of each indicator's URL.} Let's gather data on countries' fertilizer consumption in kilograms per hectare of arable land. The indicator number for this variable is: AG.CON.FERT.ZS. We can use the command \texttt{WDI} to gather the data and put it in an object called \emph{FertConsumpData}. <>= FertConsumpData <- WDI(indicator = "AG.CON.FERT.ZS") @ \noindent The data we downloaded looks like this: <>= # This ensures that the PDF will still compile even if the data is unavailable via WDI #load(file = "Source/Children/Chapter6/FertData.RData") @ <>= head(FertConsumpData) @ \noindent You can see that \texttt{WDI} has downloaded data for four variables: \textbf{iso2c},\footnote{These are the countries' or regions' International Organization for Standardization\index{ISO} (ISO) two-letter codes.
For more details see: \url{http://www.iso.org/iso/country_codes.htm}.} \textbf{country}, \textbf{AG.CON.FERT.ZS} and \textbf{year}. \section{Advanced Automatic Data Gathering: Web Scraping} \index{web scraping|(} If a package does not already exist to access data from a particular website, there are other ways to automatically ``scrape'' data with R. This section briefly discusses some of R's web scraping tools and techniques to get you headed in the right direction to do more advanced data gathering. \paragraph{The general process} Simple web scraping involves downloading a file from the internet, parsing\index{parse} it (i.e. reading it), and extracting the data you are interested in, then putting it into a data frame object. We already saw a simple example of this when we downloaded data from a secure HTTPS website. We downloaded a website's content from a URL address into R with the \texttt{getURL}\index{R function!getURL} command. We then parsed the downloaded text as a CSV formatted data file, extracted it, and put it into a new data frame object. This was a relatively simple process, because the webpage was very simply formatted. It basically only contained the CSV formatted text. So, the process of parsing and extracting the data was very straightforward. You may not be so lucky with other data sources. Data may be stored in an HTML\index{HTML} formatted table within a more complicated HTML marked up webpage. The \emph{XML}\index{XML} package \citep{R-XML} has a number of useful commands such as \texttt{readHTMLTable}\index{R function!readHTMLTable} for parsing and extracting this kind of data.
The \emph{XML} package also clearly has functions for handling XML formatted data.\footnote{XML stands for ``Extensible Markup Language''.}\index{XML} In addition, the helpful \emph{rvest} \citep{R-rvest} package provides an easy to use set of functions with capabilities similar to and often more capable than \emph{XML}.\index{rvest} If the data is stored in JSON\footnote{JSON means ``JavaScript Object Notation''}\index{JSON} you can read it with the \emph{rjson} \citep{R-rjson}\index{rjson} or \emph{RJSONIO} \citep{R-RJSONIO}\index{RJSONIO} packages. There are more websites with APIs\index{API} than R packages designed specifically to access each one. If an API is available, the \emph{httr} package \citep{R-httr}\index{httr} may be useful. It is a wrapper\index{wrapper} for \emph{RCurl} intended to make accessing APIs easier. \paragraph{More tools to learn for web scraping} Beyond learning about the various R packages that are useful for R web scraping, an aspiring web scraper should probably invest time learning a number of other skills: \begin{itemize} \item HTML:\index{HTML} Obviously you will encounter a lot of HTML markup when web scraping. Having a good understanding of the HTML markup language will be very helpful. W3 Schools (\url{http://www.w3schools.com/}) is a free resource for learning HTML as well as JSON, JavaScript, XML, and other languages you will likely come across while web scraping. \item Regular Expressions:\index{regular expressions} Web scraping often involves finding character patterns. Some of this is done for you by the R packages above that parse text. There are times, however, when you are looking for particular patterns, like tag\index{HTML tag} IDs, that are particular to a given website and change across the site based on a particular pattern. You can use regular expressions to deal with these situations. R has a comprehensive if bare-bones introduction to regular expressions. 
To access it type \verb|?regex|\index{R!regex} into your R console. \item Looping:\index{loop} Web scraping often involves applying a function to multiple things, e.g. tables or HTML tags. To do this in an efficient way you will need to use loops and apply functions\index{R!apply functions}. \cite{Matloff2011} provides a comprehensive overview. The \emph{dplyr} \citep{R-dplyr}\index{dplyr} for data frame manipulation is also particularly useful. \noindent Finally, \cite{Munzert2015} provide a comprehensive overview of web scraping and text mining with R. \end{itemize} \index{web scraping|)} \subsection*{Chapter summary} In this chapter we have learned how to reproducibly gather data from a number of sources. If the data we are using is available online we may be able to create really reproducible data gathering files. These files have commands that others can execute with makefiles that allow them to actually regather the exact data we used. The techniques we can use to gather online data also make it easy to update our data when new information becomes available. Of course, it may not always be possible to have really reproducible data gathering. Nonetheless, you should always aim to make it clear to others (and yourself) how you gathered your data. In the next chapter we will learn how to clean and merge multiple data files so that they can easily be used in our statistical analyses. ================================================ FILE: Old/Source-v2/Children/Chapter7/chapter7.Rnw ================================================ % Chapter Chapter 7 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 17 April 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Preparing Data for Analysis}\label{DataClean} Once we have gathered the raw data that we want to include in our statistical analyses we generally need to clean it up so that it can be merged into a single data file. 
In this chapter we will learn how to create the data gathering and merging files we saw in the last chapter. The chapter also includes information on recoding and transforming variables. This is important for merging data and will be useful information in later chapters as well. If you are very familiar with data transformations in R you may want to skip to the next chapter. \section{Cleaning Data for Merging} In order to successfully merge two or more data frames we need to make sure that they are in the same format. Let's look at some of the important formatting issues and how to reformat your data frames so that they can be easily merged. \subsection{Get a handle on your data} Before doing anything to your data it is a good idea to take a look at it and see what needs to be done. Taking a little time to become acquainted with your data will help you avoid many error messages and much frustration. You could type a data frame object's name into the R console. This will print the entire data frame in your console. For data frames with more than a few variables and observations this is very impractical. We have already seen a number of commands that are useful for looking at parts of your data. As we saw in Chapter \ref{GettingStartedRKnitr}, the \texttt{names}\index{R function!names} command shows you the variable names in a data frame object. The \texttt{head}\index{R function!head} command shows the names plus the first few observations in a data frame. The \texttt{tail}\index{R function!tail} shows the last few. Use the \texttt{dim}\index{R function!dim} (dimensions) command to quickly see the number of observations and variables (the number of rows and columns) in a data frame object.
For example, let's use the \emph{FertConsumpData} object we created in Chapter \ref{DataGather} to test out \texttt{dim}: <>= dim(FertConsumpData) @ \noindent The first number is the number of rows in the data frame (\Sexpr{dim(FertConsumpData)[[1]]}) and the second is the number of columns (\Sexpr{dim(FertConsumpData)[[2]]}). You can also use the \texttt{nrow} command to find just the number of rows and \texttt{ncol} to see only the columns.\index{R function!nrow}\index{R function!ncol} The \texttt{summary} command\index{R function!summary} is especially helpful for seeing basic descriptive statistics for all of the variables in a data frame and also the variables' types. Here is an example: {\small <>= # Summarize FertConsumpData data frame object # This was loaded in Chapter 6 summary(FertConsumpData) @ } \noindent We can immediately see that the variables \textbf{iso2c} and \textbf{country} are character strings. Because \texttt{summary} is able to calculate means, medians, and so on for \textbf{AG.CON.FERT.ZS} and \textbf{year}, we know they are numeric. Have a look over the summary to see if there is anything unexpected like lots of missing values (\textbf{NA's}) or unusual maximum and minimum values. You can of course, run \texttt{summary} on a particular variable by using the component selector (\verb|$|): <>= # Summarize fertilizer consumption variable from FertConsumpData summary(FertConsumpData$AG.CON.FERT.ZS) @ \noindent We'll come back to why knowing this type of information is important for merging and data analysis later in this chapter. Another important command for quickly summarizing a data frame is \texttt{table}.\index{R function!table} This creates a contingency table\index{contingency table} with counts of the number of observations per combination of factor variables. 
You can view a portion of a data frame object with the \texttt{View} function.\index{R function!View} This will open a new window that lets you see a selection of the data frame. If you are using RStudio, you can click on the data frame in the \emph{Environment} tab\index{RStudio!Environment tab} and you will get something similar. Note that neither of these viewers are interactive in that you can't use them to manipulate the data. They are only data viewers. To be able to see similar windows that you can interactively edit, use the \texttt{fix}\index{R function!fix} function in the same way that you use \texttt{View}. This can be useful for small edits, but remember that the edits are not reproducible. \subsection{Reshaping data}\index{R!reshaping data} Obviously it is usually a good idea if your data sets are kept in data frame type objects. See Chapter \ref{GettingStartedRKnitr} (Section \ref{data.frame}) for how to convert objects into data frames with the \texttt{data.frame} command.\index{R function!data.frame}\index{R!data frame} Not only do data sets (generally) need to be stored in data frame objects, they also need to have the same layout before they can be merged. Most R statistical analysis tools assume that your data is in ``long'' format\index{long formatted data}.\footnote{For an excellent discussion of ideal data formats see \cite{Wickham2014article}.} This usually means that data frame columns are variables and rows are specific observations (see Table \ref{ExampleLong}). \begin{table}[h!] \caption{Long Formatted Data Example} \label{ExampleLong} \begin{tabular}{l c} \\[0.15cm] \hline Subject & Variable1 \\ \hline \\[0.1cm] Subject1 & \\[0.25cm] Subject2 & \\[0.25cm] Subject3 & \\[0.25cm] \ldots & \\[0.25cm] \hline \end{tabular} \end{table} \noindent In this chapter we will mostly use examples of time-series cross-sectional data (TSCS)\index{time-series cross-sectional}\index{TSCS} that we want to have in long-format. 
Long formatted TSCS data is simply a data frame where rows identify observations of a particular subject at particular points in time and there are multiple observations per subject (see Table \ref{ExampleTSCSLong}). In this chapter our TSCS data is specifically going to be countries that are observed in multiple years. \begin{table}[h!] \caption{Long Formatted Time-Series Cross-Sectional Data Example} \label{ExampleTSCSLong} \begin{tabular}{l c c} \\[0.15cm] \hline Subject & Time & Variable1 \\ \hline \\[0.1cm] Subject1 & 1 & \\[0.25cm] Subject1 & 2 & \\[0.25cm] Subject1 & 3 & \\[0.25cm] Subject2 & 1 & \\[0.25cm] Subject2 & 2 & \\[0.25cm] Subject2 & 3 & \\[0.25cm] \ldots & & \\[0.25cm] \hline \end{tabular} \end{table} If one of our raw data sets is not in this format then we will need to reshape or, using Wickham's \citeyearpar{Wickham2014article} terminology, ``tidy'' it.\index{reshape data}\index{tidy data} Some data sets are in ``wide'' format,\index{wide formatted data} where one of the columns in what would be long formatted data is widened to cover multiple columns. This is confusing to visualize without an example. Table \ref{ExampleWide} shows how Table \ref{ExampleTSCSLong} looks when we widen the time variable. \begin{table}[h!] \caption{Wide Formatted Data Example} \label{ExampleWide} \begin{tabular}{l c c c} \\[0.15cm] \hline Subject & Time1 & Time2 & Time3 \\ \hline \\[0.1cm] Subject1 & & & \\[0.25cm] Subject2 & & & \\[0.25cm] \ldots & & & \\[0.25cm] \hline \end{tabular} \end{table} Tidying data is often the cause of much confusion and frustration. Though probably never easy, there are a number of useful R functions for changing data from wide format to long and vice versa. These include the matrix transpose function (\textbf{t})\footnote{See this example by Rob Kabacoff: \url{http://www.statmethods.net/management/reshape.html}. 
Note also that because the matrix transpose function is denoted with \texttt{t}, you should not give any object the name \emph{t}.}\index{matrix transpose} and the \texttt{reshape}\index{R function!reshape} command, both are loaded in R by default. \emph{tidyr} \citep{R-tidyr} is a very helpful package for reshaping data.\index{tidyr} This provides more general tools for reshaping data and is worth investing some time to learn well. In this section we will look at \emph{tidyr}'s \texttt{gather} function and use it to reshape a TSCS data frame from wide to long format. We will also encounter this function again in Chapter \ref{FiguresChapter} when we want to transform data so that it can be graphed. For illustration let's imagine that the fertilizer consumption data we previously downloaded from the World Bank is in wide, rather than long, format and is in a data frame object called \emph{SpreadFert}. It looks like this:\footnote{See the chapter's Appendix (page \pageref{WideAppendix}) for the code I used to reshape the data from long to wide format.} <>= @ <>= # Reshape Wide SpreadFert <- spread(FertConsumpData, year, AG.CON.FERT.ZS) # Order by country SpreadFert <- arrange(SpreadFert, country) @ <>= # Show the first 5 columns head(SpreadFert[, 1:5]) @ \noindent We can use the \texttt{gather}\index{gather}\index{R function!gather}\label{GatherReshape} command to reshape this data from wide to long format. The term ``gather'' is intended to evoke an image of the many wide columns being gathered together.\footnote{The opposite \texttt{spread} function\index{R function!spread} is supposed to evoke an image of spreading out the data from long to wide format. See Chapter Appendix for an example using the \texttt{spread} command.} Let's think about how we want to tidy the data. We want to create two new columns from the many columns that are now labeled by year. Let's call the new columns \textbf{Year} and \textbf{Fert}. 
The \textbf{Year} column will clearly contain the year of each observation and \textbf{Fert} will contain the fertilizer consumption. \textbf{Year} will be what \texttt{gather} calls the ``key'' and \textbf{Fert} is the ``value''. In our \emph{SpreadFert} data we don't want the \textbf{iso2c} and \textbf{country} variables to be gathered. These variables identify the data set's subjects. So we can tell \texttt{gather} that we only want columns three through nine gathered. The third column is the first one we want to gather and the ninth--the final column in the data set--is the last. <>= # Gather SpreadFert GatheredFert <- gather(SpreadFert, Year, Fert, 3:9) # Show GatheredFert head(GatheredFert) @ \subsection{Renaming variables}\index{R!renaming variables} Frequently, in the data cleaning process we want to change the names of our variables. This will make our data easier to understand and may even be necessary to properly combine data sets (see below). In the previous example, for instance, our \emph{GatheredFert} data frame has two variables--\textbf{Year} and \textbf{Fert}. Imagine, for the sake of demonstration, that we want to rename them \textbf{year} and \textbf{FertilizerConsumption}. Renaming data frame variables is straightforward with the \texttt{rename}\index{R function!rename}\index{rename variable} command in the \emph{dplyr} package \citep{R-dplyr}. To rename both \textbf{Year} and \textbf{Fert} with the \texttt{rename} command type: <>= GatheredFert <- rename(GatheredFert, year = Year, FertilizerConsumption = Fert) # Show GatheredFert head(GatheredFert) @ \subsection{Ordering data}\index{R!ordering data} You may have noticed that as a result of gathering \emph{SpreadFert} the data is now ordered by year rather than country name. Typically, TSCS data is sorted by country then year, or more generally: subject-time.
Though not required for merging in R,\footnote{Unlike in other statistical programs.} some statistical analyses assume that the data is ordered in a specific way. Well-ordered data is also easier for people to read. We can order observations in our data set using the \texttt{order} command.\index{R function!order}\index{sort}\index{order} For example, to order \emph{GatheredFert} by country-year we type: <>= # Order GatheredFert by country-year GatheredFert <- GatheredFert[order(GatheredFert$country, GatheredFert$year), ] # Show GatheredFert head(GatheredFert) @ \emph{dplyr} has a function called \texttt{arrange}\index{R function!arrange} that can also be useful for ordering your data. \noindent \texttt{arrange}'s syntax is much cleaner and easier to remember for data frames than the operation we did with \texttt{order}. To arrange the \emph{GatheredFert} data as in the previous example, but with \texttt{arrange} use: <>= GatheredFert <- arrange(GatheredFert, country, year) @ To arrange a variable in descending order, simply place it in the \texttt{desc} function from \emph{dplyr}, e.g. \verb|arrange(GatheredFert, country, desc(year))|.\index{R function!desc} \subsection{Subsetting data}\index{R!subsetting data} Sometimes you may want to use only a subset of a data frame. For example, the density plot in Figure \ref{FertilizerConsumptionDens} shows us that the \emph{GatheredFert} data has a few very extreme values. We can use the \texttt{subset}\index{subset}\index{R function!subset} command to examine these outliers,\index{outliers} for example, countries that have fertilizer consumption greater-than 1000 kilograms per hectare. 
\begin{figure} \caption{Density Plot of Fertilizer Consumption (kilograms per hectare of arable land)} \label{FertilizerConsumptionDens} <>= # Create density plot of GatheredFert # Load ggplot2 library(ggplot2) # Create histogram ggplot(data = GatheredFert, aes(FertilizerConsumption)) + geom_density() + xlab("\n Fertilizer Consumption") + ylab("Density\n") + theme_bw() @ {\scriptsize{See the chapter's Appendix for the source code to create this figure.}} \end{figure} {\small <>= # Create outlier data frame FertOutliers <- subset(x = GatheredFert, FertilizerConsumption > 1000) # Show FertOutliers head(FertOutliers) @ } \noindent If we want to drop these outliers from our data set we can use \texttt{subset} again. <>= GatheredFertSub <- subset(x = GatheredFert, FertilizerConsumption <= 1000) @ In this data example, non-country units like ``Arab World'' are included. We might want to drop these units with the \texttt{subset} function as well. For example: <>= # Drop Arab World type from GatheredFertSub GatheredFertSub <- subset(x = GatheredFertSub, country != "Arab World") @ \noindent We can also use \texttt{subset} to remove observations with missing values (\texttt{NA}) for \textbf{FertilizerConsumption}. 
<>= # Remove observations of FertilizerConsumption # with missing values GatheredFertSub <- subset(x = GatheredFertSub, !is.na(FertilizerConsumption)) # Summarize FertilizerConsumption summary(GatheredFertSub$FertilizerConsumption) @ \begin{table} \caption{R's Logical Operators} \label{LogicalOp} \begin{center} \begin{tabular}{l l} \hline\vspace{0.15cm} Operator & Meaning \\ \hline\hline \\ \verb|<| & less-than \\ \verb|>| & greater-than \\ \verb|==| & equal to \\ \verb|<=| & less-than or equal to \\ \verb|>=| & greater-than or equal to \\ \verb|!=| & not equal to \\ \verb+a | b + & a or b \\ \verb|a & b| & a \& b \\ \verb|isTRUE(a)| & determine if a is TRUE \\ \hline \\ \verb|is.na| & missing\\ \verb|!is.na| & not missing \\ \verb|duplicated| & duplicated observation \\ \verb|!duplicated| & not a duplicated observation \\ \hline \end{tabular} \end{center} \end{table} Let's step back one second. I've introduced a number of new logical operators\index{R!logical operators} and a new function in the four subsetting examples. The first example included a very simple one, the greater-than sign (\verb|>|). The second example included the less-than or equal to operator: \verb|<=|. The third example included the not equal operator: \verb|!=|.\index{R!not equal} In R, exclamation points (\verb|!|) generally denote `not'.\index{R!exclamation point} We used this again in the final example in combination with the \texttt{is.na} command.\index{R function!is.na} This command indicates if an element is missing, so \verb|!is.na| means ``not missing''. For a list of R's logical operators, see Table \ref{LogicalOp}. You can use these operators and functions when subsetting data and throughout R. \subsection{Recoding string/numeric variables}\index{R!recode} You may want to recode your variables. In particular, when you merge data sets together you need to have \textbf{identical} identification values that R can use to match each observation on. 
If in one data set observations for the Republic of Korea\index{Republic of Korea} are referred to as ``Korea, Rep.'' and in another they are labeled ``South Korea'', R will not know to merge them. We need to recode values in the variables that we want to match our data sets on. For example, in \emph{GatheredFertSub} the southern Korean country is labeled ``Korea, Rep.''. To recode it to ``South Korea'' we type: <>= # Recode country == "Korea, Rep." to "South Korea" GatheredFertSub$country[GatheredFertSub$country == "Korea, Rep."] <- "South Korea" @ \noindent This code assigns ``South Korea'' to all values of the \textbf{country} variable that equal ``Korea, Rep.''.\footnote{The \emph{countrycode} package \citep{R-countrycode}\index{countrycode} is very helpful for creating standardized country identification variables.} You can use a similar technique to recode numeric variables as well. The only difference is that you omit the quotation marks. We will look at how to code factor variables later. \subsection{Creating new variables from old} As part of your data cleanup process (or later during statistical analysis) you may want to create new variables based on existing variables. For example, we could create a new variable that is the natural logarithm of \textbf{FertilizerConsumption}. To do this we run the variable through the \texttt{log}\index{R function!log}\index{logarithmic transformation} function and assign a new variable that we'll call \textbf{logFertConsumption}. 
{\small <>= GatheredFertSub$logFertConsumption <- log( GatheredFertSub$FertilizerConsumption ) # Summarize the log transformed variable summary(GatheredFertSub$logFertConsumption) @ } \noindent We can use a similar procedure to create new variables from R's many other mathematical commands and arithmetic operations.\footnote{E.g. \texttt{+, -, *, /, \^} for addition, subtraction, multiplication, division, and exponentiation, respectively.\index{R!addition}\index{R!subtraction}\index{R!multiplication}\index{R!division}\index{R!exponentiation}} \label{Infinity}Notice that when we summarize the new log transformed variable we have a minimum (and mean) value of \texttt{-Inf}.\index{R!-Inf}\index{infinity} This indicates that by logging the variable we have created observations with the value negative infinity. R calculates the natural logarithm of zero as negative infinity.\footnote{R denotes positive infinity with \texttt{Inf}.\index{positive infinity}\index{R!Inf}}\index{negative infinity} We probably don't want negative infinity values. There are a few ways to deal with this. We could drop all observations of \textbf{FertilizerConsumption} with the value zero before log transforming it. Another common solution is recoding zeros as some small positive number like 0.001. For example: {\small <>= # Recode zeros in Fertilizer Consumption GatheredFertSub$FertilizerConsumption[ GatheredFertSub$FertilizerConsumption == 0 ] <- 0.001 # Natural log transform Fertilizer Consumption GatheredFertSub$logFertConsumption <- log( GatheredFertSub$FertilizerConsumption ) # Summarize the log transformed variable summary(GatheredFertSub$logFertConsumption) @ } \noindent Note that this example is included to demonstrate R syntax rather than to prescribe a certain transformation of skewed data with zeros. The choice of which transformation to make should ultimately be made based on the data, model, and context.
See \cite{Hyndman2010} for more information on various alternatives including Box-Cox \citep{box1964analysis}\index{Box-Cox transformation} and inverse hyperbolic sine transformations\index{inverse hyperbolic sine transformation} \citep{Burbidge1988}. \begin{table} \caption{Example Factor Levels} \label{ExampleFactorRecode} \begin{center} \begin{tabular}{l l p{4cm}} \hline Number & Label & Value of \textbf{FertilizerConsumption} \\ \hline\hline \\ 1 & low & $< 18$ \\ 2 & medium low & $\ge 18$ \& $< 81$ \\ 3 & medium high & $\ge 81$ \& $< 158$ \\ 4 & high & $\ge 158$ \\ \hline \end{tabular} \end{center} \end{table} \paragraph{Creating factor variables}\index{R!factors} We can create factor variables from numeric or string variables. For example, we may want to turn the continuous numeric \textbf{FertilizerConsumption} variable into an ordered categorical (i.e. factor) variable.\index{factor variable} Imagine that we want to create a factor variable called \textbf{FertConsGroup} with four levels called `low', `medium low', `medium high', and `high'. To do this let's first create a new numeric variable based on the values listed in Table \ref{ExampleFactorRecode}. 
Now let's use a procedure that is similar to the variable recoding we did earlier:\footnote{In this code I attached the data frame \emph{GatheredFertSub}\index{R function!attach} so that it is easier to read.\index{R!attach}} <>= #### Create numeric factor levels variable #### # Attach GatheredFertSub data frame attach(GatheredFertSub) # Created new FertConsGroup variable based on # FertilizerConsumption GatheredFertSub$FertConsGroup[FertilizerConsumption < 18] <- 1 GatheredFertSub$FertConsGroup[FertilizerConsumption >= 18 & FertilizerConsumption < 81] <- 2 GatheredFertSub$FertConsGroup[FertilizerConsumption >= 81 & FertilizerConsumption < 158] <- 3 GatheredFertSub$FertConsGroup[FertilizerConsumption >= 158] <- 4 GatheredFertSub$FertConsGroup[is.na(FertilizerConsumption)] <- NA # Detach data frame detach(GatheredFertSub) # Summarize FertConsGroup summary(GatheredFertSub$FertConsGroup) @ \noindent You'll notice that we don't have a factor variable yet; our new variable is numeric. We can use the \texttt{factor} command\index{R function!factor} to convert \emph{FertConsGroup} into a factor variable with the labels we want.\index{R!factor labels} <>= # Create vector of factor level labels FCLabels <- c("low", "medium low", "medium high", "high") # Convert FertConsGroup to a factor GatheredFertSub$FertConsGroup <- factor(GatheredFertSub$FertConsGroup, labels = FCLabels) # Summarize FertConsGroup summary(GatheredFertSub$FertConsGroup) @ \noindent We first created a character vector with the factor-level labels and then applied it using \texttt{factor}'s \texttt{labels} argument. Using \texttt{summary}\index{R function!summary} with a factor variable gives us its level labels as well as the number of observations per level. The \texttt{cut} function \index{R function!cut} provides a less code-intensive way of creating factors from numeric ones and labeling factor levels.
For example: <>= # Create a factor variable with the cut command FertFactor <- cut(GatheredFertSub$FertilizerConsumption, breaks = c(-0.01, 17.99, 80.99, 157.99, 999.99), labels = c("low", "medium low", "medium high", "high")) # Summarize FertFactor summary(FertFactor) @ \noindent The \texttt{labels} argument lets us specify the factor levels' names. The \texttt{breaks} argument lets us specify what values separate the factor levels. Note that we set the first break as \texttt{-0.01}, not because any country had negative fertilizer consumption, but because the intervals created by \texttt{breaks} exclude the left value and include the right value.\footnote{In mathematical notation the ``low'' level includes all values in the interval $(-0.01,\:17.99]$.} If we had used \texttt{0} then all of the observations where a country used effectively no fertilizer would be excluded from the ``low'' category. \subsection{Changing variable types}\index{R!change variable type} Sometimes a variable will have the wrong type. For example, a numeric variable may be incorrectly made a character string when a data set is imported from Excel. You can change variables' types with a number of commands. We already saw how to convert a numeric variable to a factor variable with the \texttt{factor}\index{R function!factor} command. Unsurprisingly, to convert a variable to a character use \texttt{character}\index{R function!character} and \texttt{numeric}\index{R function!numeric} to convert it to a numeric type variable. We can place \texttt{as.} before these commands (e.g. \texttt{as.factor})\index{R function!as.factor}\index{R function!as.} as a way of coercing a change in type.\index{R!coercion} \textbf{Warning:} Though these commands have straightforward names, a word of caution is necessary. Always try to understand why a variable is not of the type you would expect. Oftentimes variables have unexpected types because they are coded (or miscoded) in a way that you didn't anticipate.
Changing the variables' types, especially when using \texttt{as.}, can introduce new errors. Make sure that the conversion made the changes you expected. \section{Merging Data Sets}\index{R!merging}\index{merge} In the previous section we learned crucial skills for cleaning up data sets. When your data sets are (a) in the same format and (b) have variables with identically matching ID values, you can merge your data sets together. In this section we'll look at two different ways to merge data sets: binding and the \texttt{merge} command. We'll also look at ways to address a common issue when merging data: duplicated observations and columns. \subsection{Binding}\index{R!binding} As we saw in Chapter \ref{GettingStartedRKnitr}, if your data sets are in the same order--rows in all of the data sets represent the same observation of the same subject--then you can simply use the \texttt{cbind}\index{R function!cbind} command to bind columns from the data sets together. This situation is unusual when merging real-world data. If your data sets are not in exactly the same order you will create a data set with nonsensical rows that combine data from multiple observations. Therefore, you should avoid using \texttt{cbind} for merging most real-world data. If you have data sets with the exact same columns and variable types and you just want to attach one under the other you can use the \texttt{rbind}\index{rbind} command. It binds the rows in one object to the rows in another.\footnote{Some programming languages and statistical programs refer to this type of action as ``appending''\index{append} one data set to another.} It has the same syntax as \texttt{cbind} (see page \pageref{cbind}). Again, you should be cautious when using this command, though it is more difficult to accidentally create a nonsensical data set with \texttt{rbind}. R will give you an error if it cannot match your objects' columns. 
\subsection{The merge command} Generally, the safest and most effective way to merge two data sets together is with the \texttt{merge}\index{merge}\index{R function!merge} command. Imagine that we want to merge our \emph{GatheredFertSub} data frame with two other data frames we created in Chapter \ref{DataGather}: \emph{FinRegulatorData} and \emph{DispropData}. The simplest way to do this is to use the merge command twice, i.e.: <>= ## Add iso2c codes to FinRegulatorData and DispropData ## as ID variables for merging # Load countrycode library(countrycode) # FinRegulatorData FinRegulatorData$iso2c <- countrycode(FinRegulatorData$country, origin = "country.name", destination = "iso2c") @ <>= # Merge FinRegulatorData and DispropData MergedData1 <- merge(x = FinRegulatorData, y = DispropData, by = "iso2c", all = TRUE) # Merge combined data set with and GatheredFertSub MergedData1 <- merge(x = MergedData1, y = GatheredFertSub, by = "iso2c", all = TRUE) # Show MergedData1 variables names(MergedData1) @ \noindent Let's go through this code. The \texttt{x} and \texttt{y} arguments simply specify which data frames we want to merge. The \texttt{by} argument specifies what variable in the two frames identify the observations so that we can match them. In this example we are merging by countries' ISO country two-letter codes.\footnote{Please see this chapter's Appendix for details on how I created an ISO country two-letter code variable in the \emph{FinRegulatorData} data frame.} We set the argument \texttt{all = TRUE} so that we keep all of the observations from both of the data frames. If the argument is set to \texttt{FALSE} only observations that are common to both data frames will be included in the merged data frame. The others will not be included. You might have noticed that this isn't actually the merge that we want to accomplish with these data frames. 
Remember that observations are not simply identified in this time-series cross-section data by one country name or other country code variable. Instead they are identified by both country and year variables. To merge data frames based on the overlap of two variables (e.g. match Afghanistan-2004 in one data frame with Afghanistan-2004 in the other) we need to add the \texttt{union}\index{R function!union} command to \texttt{merge}'s \texttt{by} argument. Here is a full example:\footnote{You can download a modified version of this example as part of the makefile exercise from Chapter \ref{DataGather}: \url{http://bit.ly/YnMKBG}.} <>= # Merge FinRegulatorData and DispropData MergedData2 <- merge(FinRegulatorData, DispropData, union("iso2c", "year"), all = TRUE) # Merge combined data frame with GatheredFertSub MergedData2 <- merge(MergedData2, GatheredFertSub, union("iso2c", "year"), all = TRUE) # Show MergedData2 variable names names(MergedData2) @ After merging data frames it is always a good idea to look at the result and make sure it is what you expected. Some post-merging cleanup may be required to get the data frame ready for statistical analysis. \paragraph{Big data}\index{big data} Before discussing post-merge cleanup it is important to highlight ways to handle large data sets. The \texttt{merge} function and many of the other data frame manipulation functions covered so far in this chapter may not perform well with very large data sets. If you are using very large data sets it might be worth investing time learning how to use either the \emph{dplyr} or \emph{data.table} packages \citep{R-data.table}. They have many capabilities for working efficiently with large data sets.
Another approach is to learn SQL\footnote{Structured Query Language}\index{SQL}\index{Structured Query Language} or another special purpose data handling language.\footnote{w3schools has an online SQL tutorial at: \url{http://www.w3schools.com/sql/default.asp}.} Once you know how these languages work, you can incorporate them into your R workflow with R packages like \emph{dplyr}\index{dplyr}.\footnote{See the \emph{dplyr} vignette on using the package with SQL databases at \url{http://cran.r-project.org/web/packages/dplyr/vignettes/databases.html}.} \subsection{Duplicate values}\index{R!duplicate values|(} Duplicate observations are one thing to look out for after (and before) merging. You can use the \texttt{duplicated}\index{R function!duplicated}\index{R!duplicates} command to check for duplicates. Use the command in conjunction with subscripts to remove duplicate observations. For example, let's create a new object called \emph{DataDuplicates} from the iso2c-years that are duplicated in \emph{MergedData2}. Remember that \textbf{iso2c} and \textbf{year} are in the first and second columns of the data frame. <>= # Created a data frame of duplicated country-years DataDuplicates <- MergedData2[duplicated( MergedData2[, 1:2]), ] # Show the number of rows in DataDuplicates nrow(DataDuplicates) @ \noindent In this data frame there are \Sexpr{nrow(DataDuplicates)} duplicated iso2c-year observations. We know this because \texttt{nrow}\index{R function!nrow} tells us that the data frame with the duplicated values has \Sexpr{nrow(DataDuplicates)} rows, i.e. \Sexpr{nrow(DataDuplicates)} observations. To create a data set without duplicated observations (if there are duplicates) we just add an exclamation point (\texttt{!}) before \texttt{duplicated}--i.e. not duplicated--in the above code. 
<>= # Created a data frame of unique country-years DataNotDuplicates <- MergedData2[!duplicated( MergedData2[, 1:2]), ] @ \noindent Note that if you do have duplicated values in your data set and you run a similar procedure on it, it will drop duplicated values that have a lower order in the data frame. To keep the lowest ordered value and drop duplicates higher in the data set, use \texttt{duplicated}'s \texttt{fromLast} argument like this: \texttt{fromLast = TRUE}. \textbf{Warning:} look over your data set and the source code that created the data set to try to understand why duplicates occurred. There may be a fundamental problem in the way you are handling your data that resulted in the duplicated observations. \index{R!duplicate values|)} \subsection{Duplicate columns}\index{R!duplicate variables|(} Another common post-merge cleanup issue is duplicate columns, i.e. variables. These are variables from the two data frames with the same name that were not included in \texttt{merge}'s \texttt{by} argument. For example, in our previous merged data examples there are three country name variables: \textbf{country.x}, \textbf{country.y}, and \textbf{country} to signify which data frame they are from.\footnote{The former two were created in the first merge between \emph{FinRegulatorData} and \emph{DispropData}. When the second merge was completed there were no variables named \textbf{country} in the \emph{MergedData2} data frame, so \textbf{country} did not need to be renamed in the new merged data set.} You should, of course, decide what to do with these variables on a case-by-case basis. But if you decide to drop one of the variables\index{drop variable} and rename\index{R function!rename} the other, you can use subscripts (as we saw in Chapter \ref{GettingStartedRKnitr}). The \emph{dplyr}\index{dplyr} package \citep{R-dplyr} has a useful function called \texttt{select}\index{R function!select} that can also remove variables from data frames.
To remove variables simply write a minus sign (\texttt{-}) and then the variable name without quotes. For example, imagine that we want to keep \textbf{country.x} and drop the other variables.\footnote{This version of the country variable is the most complete.} Let's also remove the \textbf{idn} variable: <>= # Remove country.y, country, X, and idn FinalCleanedData <- dplyr::select(DataNotDuplicates, -country.y, -country, -idn) # Rename country.x = country FinalCleanedData <- dplyr::rename(FinalCleanedData, country = country.x) @ {\footnotesize <>= # Show FinalCleanedData variables names(FinalCleanedData) @ } \noindent Alternatively, you can select specific variables to keep with the \texttt{select} function by writing the variables' names without a minus sign. \textbf{Note}: if you are merging many data sets it can sometimes be good to clean up duplicate columns between each \texttt{merge} call. \index{R!duplicate variables|)} \subsection*{Chapter summary} This chapter has provided you with many tools for cleaning up your data to get it ready for statistical analysis. Before moving on to the next chapter to learn how to incorporate statistical analysis as part of a reproducible workflow with \emph{knitr}/\emph{rmarkdown}, it's important to reiterate that the functions we've covered in this chapter should usually be embedded in the types of data creation files we saw in Chapter \ref{DataGather}. These files can then be tied together with a makefile into a process that should be able to relatively easily take very raw data and clean it up for use in your analyses. Embedding these commands in data creation source code files, rather than just typing the commands into your R console\index{R!console} or manually changing data in Excel, will make your research much more reproducible. It will also make it easier to backtrack and find mistakes that you may have made while transforming the data.
Including new or updated data when it becomes available will also be much easier if you use a series of segmented data creation source code files that are tied together with a makefile. \section*{Appendix}\label{WideAppendix} \noindent R code for turning \emph{FertConsumpData} into year-wide format: <>= # Load WDI and tidyr package library(WDI) library(tidyr) # Gather fertilizer consumption data from WDI FertConsumpData <- WDI(indicator = "AG.CON.FERT.ZS") # Spread FertConsumpData to year wide format SpreadFert <- spread(FertConsumpData, year, AG.CON.FERT.ZS) # Order SpreadFert by country SpreadFert <- arrange(SpreadFert, country) @ \noindent R code for creating iso2c country codes with the \emph{countrycode} package:\label{CountryCodeExample} <>= # Load countrycode package library(countrycode) # FinRegulatorData FinRegulatorData$iso2c <- countrycode(FinRegulatorData$country, origin = "country.name", destination = "iso2c") @ \noindent R code for creating Figure \ref{FertilizerConsumptionDens}: <>= # Load ggplot2 library(ggplot2) # Create density plot ggplot(data = GatheredFert, aes(FertilizerConsumption)) + geom_density() + xlab("\n Fertilizer Consumption") + ylab("Density\n") + theme_bw() @ ================================================ FILE: Old/Source-v2/Children/Chapter8/chapter8.Rnw ================================================ % Chapter Chapter 8 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 31 March 2015 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Statistical Modeling and {\emph{knitr}}}\label{StatsModel} When you have your data cleaned and organized you will begin to examine it with statistical analyses. In this book we don't look at how to do statistical analysis in R (a subject that would and does take up many books). Instead we focus on how to make your analyses really reproducible.
To do this you dynamically connect your data gathering and analysis source code to your presentation documents. When you dynamically connect your data gathering makefiles and analysis source code to your markup document you will be able to completely rerun your data gathering and analysis and present the results whenever you compile the presentation documents. Doing this makes it very clear how you found the results that you are advertising. It also automatically keeps the presentation of your results--including tables and figures--up-to-date with any changes you make to your data and analyses source code files. You can dynamically tie your data gathering, statistical analyses, and presentation documents together with \emph{knitr}/\emph{rmarkdown}. In Chapter \ref{GettingStartedRKnitr} you learned basic \emph{knitr}/\emph{rmarkdown} syntax. For the rest of the chapter I'll simply refer to it as ``\emph{knitr} syntax'', but it applies to \emph{rmarkdown} as well. In this chapter we will begin to learn \emph{knitr} syntax in more detail, particularly code chunk options for including dynamic code in your presentation documents. This includes code that is run in the background, i.e. not shown in the presentation document, as well as displaying the code and output in your presentation document both as separate blocks and inline with the text. We will also learn how to dynamically include code from languages other than R. We examine how to use \emph{knitr} with modular source code files. Finally, we will look at how to create reproducible `random' analyses and how to work with computationally intensive code chunks. The goal of this and the next two chapters--which cover dynamically presenting results in tables and figures--is to show you how to tie data gathering and analyses into your presentation documents so closely that every time the documents are compiled they actually reproduce your analysis and present the results. 
Please see the next part of this book, Part IV, for details on how to create the LaTeX and Markdown documents that can include \emph{knitr} code chunks. \textbf{Reminder:} Before discussing the details of how to incorporate your analysis into your source code, it's important to reiterate something we discussed in Chapter \ref{GettingStartedRR}. The syntax and capabilities of R packages and R itself can change with new versions. Also, as we have seen for file path names, syntax can change depending on what operating system you are using. So it is important to have your R session info available (see Section \ref{SessionInfoHow} for details) to make your research more reproducible and future-proof. If someone reproducing your research has this information, they will be able to download your files and use the exact version of the software that you used. For example, CRAN\index{CRAN archive} maintains an archive of previous R package versions that can be downloaded.\footnote{See: \url{http://cran.r-project.org/src/contrib/Archive/}.} Previous versions of R itself can also be downloaded through CRAN.\footnote{See: \url{http://cran.r-project.org/src/base/}.} \section{Incorporating Analyses into the Markup} For a relatively short piece of code that you don't need to run in multiple presentation documents it may be simplest to type the code directly into chunks written in your \emph{knitr} markup document. In this section we will learn how to set \emph{knitr} options for handling these code chunks. For a list of many of the chunk options covered here see Table \ref{ChunkOptionsTable}. \subsection{Full code chunks}\index{knitr!code chunk option|(} By default, {\emph{knitr}} code chunks are run by R, and the code and any text output (including warnings and error messages) are inserted into the text of your presentation documents in blocks. The blocks are positioned in the final presentation document text at the points where the code chunk was written in the knittable markup. 
Figures are inserted as well. Let's look at the main options for determining how code chunks are handled by \emph{knitr}. \paragraph{{\tt{include}}}\index{knitr option!include} Use \texttt{include=FALSE} if you don't want to include anything in the text of your presentation document, but you still want to evaluate a code chunk. It is \texttt{TRUE} by default. \paragraph{{\tt{eval}}}\index{knitr option!eval} The \texttt{eval} option determines whether or not the code in a chunk will be run. Set the \texttt{eval} option to \texttt{FALSE} if you would like to include code in the presentation document text without actually running the code. By default it is set to \texttt{TRUE}, i.e. the code is run. You can alternatively use a numerical vector with \texttt{eval}. The numbers in the vector tell \emph{knitr} which expressions in the chunk to evaluate. For example, if you only want to evaluate the first two expressions, simply set \texttt{eval=1:2}. \paragraph{{\tt{echo}}}\index{knitr option!echo} If you would like to hide a chunk's code from the presentation document you can set \texttt{echo=FALSE}. Note that if you also have \texttt{eval=TRUE} then the chunk will still be evaluated and the output will be included in your presentation document. Clearly, if \texttt{echo=TRUE}, then source code will be included in the presentation document. As with \texttt{eval}, you can alternatively use a numerical vector in \texttt{echo}. The numbers in the vector indicate which expressions to echo in your final document. \paragraph{{\tt{results}}}\index{knitr option!results} We will look at the \texttt{results} option in more detail in the next two chapters (see especially Section \ref{ResultsOptions}). However, let's briefly discuss the option value \texttt{hide}. Setting \verb|results='hide'| is almost the opposite of \texttt{echo=FALSE}. Instead of showing the results of the code chunk and hiding the code, \verb|results='hide'| shows the code, but not the results.
Warnings, errors, and messages will still be printed. \paragraph{{\tt{warning}}, {\tt{message}}, {\tt{error}}}\index{knitr option!warning}\index{knitr option!error}\index{knitr option!message} If you don't want to include the warnings, messages, and error messages that R outputs in the text of your presentation documents, just set the \texttt{warning}, \texttt{message}, and \texttt{error} options to \texttt{FALSE}. They are set to \texttt{TRUE} by default. \paragraph{{\tt{cache}}}\index{knitr option!cache} If you want to run a code chunk once and save the output for when you knit the document again, rather than running the code chunk every time, set the option \texttt{cache=TRUE}. When you do this the first time the document is knitted, the chunk will be run and the output stored in a subdirectory of the working directory called \emph{cache}. When the document is subsequently knitted, the chunk will only be run if the code in the chunk changes or its options change. This is very handy if you have a code chunk that is computationally intensive to run. The \texttt{cache} option is set to \texttt{FALSE} by default. Later in this chapter (Section \ref{CacheVars}) we will see how to use the \texttt{cache.vars} command to cache only certain variables created by a code chunk. \paragraph{{\tt{dependson}}}\index{knitr option!dependson} Cached chunks are only rerun when their code changes. Sometimes one chunk will depend on the results from a prior chunk. In these cases it is good to rerun the chunk if the prior one is also rerun. The \texttt{dependson} option allows you to do this. You can specify either a vector of the labels for the chunks depended on or their numbers in order from the start of the document. For example, \texttt{dependson=c(2, 3)} specifies that if the second or third chunks are rerun, then the current chunk will also be rerun. 
\paragraph{{\tt{cache.extra}}}\index{knitr option!cache.extra} Sometimes to ensure reproducibility it may be useful to rerun a chunk when some other condition changes, such as when a new version of R is installed or a dependent file changes. You can feed a list of conditions to \texttt{cache.extra} to do this. For instance: <>= cache.extra=list(file.info("Data.csv")$mtime, R.version) @ \noindent Here we set two conditions under which the chunk will be rerun. The first specifies that the chunk should be rerun whenever the \emph{Data.csv} file is modified. The \texttt{file.info} function extracts information about the file and \texttt{mtime} gives the last time that the file was modified. If this differs from when the chunk was last run, then it will be run again. This is very useful for keeping your cached chunks and the files they rely on in sync. The second condition enabled by \texttt{R.version}\index{R function!R.version} reruns the chunk whenever the R version or even the operating system changes. If you only want to rerun the chunk when the version of R is different, then use \texttt{R.version.string}.\index{R function!R.version.string} \paragraph{{\tt{size}}}\index{knitr option!size} If you do want to print part or all of your code chunk into your LaTeX document, you may also want to resize the text. To do this, use the \texttt{size} option. By default it is set to \verb|size='normalsize'|. You can use any of the LaTeX font sizes listed in Table \ref{LaTeXFontSize} from Chapter \ref{LatexChapter}. \index{knitr!code chunk option|)} \subsection{Showing code \& results inline}\index{inline} Sometimes you may want to have R code or output show up inline with the rest of your presentation document's text. For example, you may want to include a small chunk of stylized code in your text when you discuss how you did an analysis. Or you may want to dynamically report the mean of some variable in your text so that the text will change when you change the data.
The {\emph{knitr}} syntax for including inline code is different for the LaTeX and Markdown languages. We'll cover both in turn. \subsubsection{LaTeX} \paragraph{Inline static code} There are a number of ways to include a code snippet inline with your text in LaTeX. You can simply use the LaTeX command \verb|\texttt| to have text show up in the \texttt{typewriter} font\index{texttt} commonly used in LaTeX-produced documents to indicate that some text is code (I use typewriter font for this purpose in this book, as you have probably noticed). For example, using \verb|\texttt{2 + 2}| will give you \texttt{2 + 2} in your text. Note that in LaTeX curly brackets (\verb|{}|) work exactly like parentheses in R, i.e. they enclose a command's arguments. However, the \verb|\texttt| command isn't always ideal, because your LaTeX compiler\index{compiler} will still try to run the code inside of the command as if it were LaTeX markup. This can be problematic if you include characters like the backslash \verb|\| or curly brackets \verb|{}|. They have special meanings for LaTeX. The hard way to solve this problem is to use escape characters\index{escape character} (see Chapter \ref{DirectoriesChapter}). The backslash is an escape character in LaTeX. Probably the better option is to use the \verb|\verb| command\index{LaTeX command!verb}. It is equivalent to the \texttt{eval=FALSE} option for full {\emph{knitr}} code chunks. To use the \verb|\verb| command, pick some character you will not use in the inline code. For example, you could use the vertical bar (\texttt{|}). This will be the \verb|\verb| delimiter. Imagine that we want to actually include `\verb|\texttt|' in the text. 
We would type: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{verbatim} \verb|\texttt| \end{verbatim} \end{kframe} \end{knitrout} \noindent The LaTeX compiler will ignore almost anything from the first vertical bar up until the second bar following \verb|\verb|. All of the text in-between the delimiter characters is put in typewriter font.\footnote{For more details see the LaTeX Wikibooks page: \url{http://en.wikibooks.org/wiki/LaTeX/Paragraph_Formatting#Verbatim_Text} (accessed 24 November 2012). Also, for help troubleshooting see the UK List of Frequently Asked Questions: \url{http://www.tex.ac.uk/cgi-bin/texfaq2html?label=verbwithin} (accessed 4 January 2012).} \paragraph{Inline dynamic code} If you want to dynamically show the results of some R code in your \emph{knitr} LaTeX-produced text you can use \verb|\Sexpr|\index{Sexpr}.\index{knitr!Sexpr} This is a pseudo LaTeX command; it looks like LaTeX, but is actually {\emph{knitr}}.\footnote{The command directly descends from \emph{Sweave}.} Its structure is more like a LaTeX command's structure than {\emph{knitr}}'s in that you enclose your R code in curly brackets (\texttt{\{\}}) rather than the \verb|<<>>= . . . @| syntax you use for block code chunks. For example, imagine that you wanted to include the mean of a vector of river lengths--\Sexpr{round(mean(rivers), digits = 0)}--in the text of your document. The {\emph{rivers}} numeric vector, loaded by default in R, has the lengths of 141 major rivers recorded in miles. You can simply use the \texttt{mean}\index{R function!mean} command to find the mean and the \texttt{round}\index{R function!round} command to round the result to the nearest whole number: <>= round(mean(rivers), digits = 0) @ \noindent To have just the output show up inline with the text of your document you would type something like: <>= The mean length of 141 major rivers in North America is \Sexpr{round(mean(rivers), digits = 0)} miles. 
@ \noindent This produces the sentence: \begin{quote} The mean length of 141 major rivers in North America is \Sexpr{round(mean(rivers), digits = 0)} miles. \end{quote} \noindent R code included inline with \texttt{Sexpr} is evaluated using current R options.\index{R!global options} So if you want all of the output from \texttt{Sexpr} to be rounded to the same number of digits, for example, it might be a good idea to set this in a code chunk with R's \texttt{options} command.\index{R function!options} See page \pageref{ROptions} for more details. \subsubsection{Markdown} \paragraph{Inline static code} To include static code inline in an R Markdown (and regular Markdown) document, enclose the code in single backticks (\verb|` . . . ` |). For example: <>= This is example R code: `MeanRiver <- mean(rivers)`. @ \noindent produces:\footnote{The exact look of the text depends on the Cascading Style Sheets (CSS)\index{Cascading Style Sheets}\index{CSS} style file you are using. The example here was created with RStudio's default style file.} \includegraphics[scale = 0.6]{Children/Chapter8/images8/MeanRiverMarkdown.png} \paragraph{Inline dynamic code} Including dynamic code in the body of your R Markdown text is similar to including static code. The only difference is that you put the letter \texttt{r} after the first single backtick. For example: <>= `r mean(rivers)` @ \noindent will include the mean value of the {\emph{rivers}} vector in the text of your Markdown document. \subsection{Dynamically including non-R code in code chunks} You are not limited to dynamically including just R code in your presentation documents. {\emph{knitr}} can run code from a variety of other languages including: Python\index{Python}, Ruby, Bash, Haskell, and Awk. All you have to do to dynamically include code from one of these languages is use the \texttt{engine}\index{knitr option!engine} code chunk option to tell \emph{knitr} which language you are using. 
For example, to dynamically include a simple line of Python code in an R Markdown document type: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{verbatim} ```{r engine='python'} print "Reproducible Research" ``` \end{verbatim} \end{kframe} \end{knitrout} \noindent In the final HTML\index{HTML} file you will get:\footnote{Again, this was created using RStudio's default CSS style file.} \includegraphics[scale = 0.6]{Children/Chapter8/images8/PythonRR.png} Many of the programming language values \texttt{engine} can take are listed in Table \ref{EngineOptions}. To enable more complete syntax highlighting for non-R languages you will need to download Andre Simon's \emph{highlighter} utility.\index{highlighter} Installation instructions can be found on his website at: \url{http://www.andre-simon.de/zip/download.html}. \begin{table}[ht] \caption{A Selection of \emph{knitr} \texttt{engine} Values} \label{EngineOptions} \begin{center} \begin{tabular}{l p{2.25cm}} \hline\vspace{0.15cm} Value & Programming Language \\ \hline\hline \texttt{awk} & Awk\index{Awk} \\ \texttt{bash} & Bash shell \index{Bash} \\ \texttt{coffeescript} & CoffeeScript\index{CoffeeScript} \\ \texttt{gawk} & Gawk\index{Gawk} \\ \texttt{haskell} & Haskell\index{Haskell} \\ \texttt{highlight} & Highlight\index{knitr!highlight} \\ \texttt{python} & Python\index{Python} \\ \texttt{R} & R (default) \\[0.25cm] \texttt{ruby} & Ruby\index{Ruby} \\ \texttt{sas} & SAS\index{SAS} \\ \texttt{sh} & Bourne shell\index{Bourne shell} \\ \hline \end{tabular} \end{center} \end{table} \section{Dynamically Including Modular Analysis Files} There are a number of reasons why you might want to have your R source code located in separate files from your markup documents even if you compile them together with \emph{knitr}. 
First, it can be unwieldy to edit both your markup and long R source code chunks in the same document, even with RStudio's handy \emph{knitr} code folding and chunk management options. There are just too many things going on in one document. Second, you may want to use the same code in multiple documents--an article and slide show presentation, for example. It is nice to not have to copy and paste the same code into multiple places. Instead, it is easier to have multiple documents link to the same source code file. When you make changes to this source code file, the changes will automatically be made across all of your presentation documents. You don't need to make the same changes multiple times. Third, other researchers trying to replicate your work might only be interested in specific parts of your analysis. If you have the analysis broken into separate and clearly labeled modular files that are explicitly tied together in the markup file with \emph{knitr}, it is easy for them to find the specific bits of code that they are interested in. \subsection{Source from a local file} Usually, in the early stages of your research, you may want to run code stored in analysis files located on your computer. Doing this is simple. The \emph{knitr} syntax is the same as for block code chunks. The only change is that instead of writing all of your code in the chunk, you save it to its own file and use the \texttt{source}\index{R function!source} command to access it.\footnote{We used the \texttt{source} command in Chapter \ref{DataGather} in our make-like data gathering file.} For example, in an R Markdown file we could run the R code in a file called \emph{MainAnalysis.R} from our \emph{ExampleProject} like this: <>= ```{r, include=FALSE} # Run main analysis source("/ExampleProject/Analysis/MainAnalysis.R") ``` @ \noindent Notice that we set the option \texttt{include=FALSE}.
This will run the analysis and produce objects created by the analysis code that can be used by other code chunks, but the output will not show up in the presentation document's text. \paragraph{Sourcing a makefile in a code chunk} In Chapter \ref{DataGather} we created a GNU Makefile\index{GNU Make}\index{makefile} to organize our data gathering. You can run makefiles every time you compile your presentation document. This can keep your data, analyses, figures, and tables up-to-date. One way to do this is to run the GNU makefile in an R code chunk with the \texttt{system}\index{R function!system} command (see Section \ref{systemRcommand}). Perhaps a better way to run makefiles from \emph{knitr} presentation documents is to include the commands in a code chunk using the Bash engine. For example, a Sweave-style code chunk for running the makefiles in our example project would look like this: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{\textless}engine='bash', include=FALSE\textgreater{\textgreater}= # Change working directory to /ExampleProject/Analysis/Data cd /ExampleProject/Analysis/Data/ # Run makefile make cleanMerge all # Change to working directory to /ExampleProject/Analysis/ cd /ExampleProject/Analysis/ @ \end{alltt} \end{kframe} \end{knitrout} \noindent Please see page \pageref{MakeAllCommand} for details on the \texttt{make} command arguments used here. You can of course, also use R's \texttt{source} command to run an R make-like data gathering file. Unlike GNU Make, this will rerun all of the data gathering files, even if they have not been updated. This may become very time consuming depending on the size of your data sets and how they are manipulated. 
One final note on including makefiles in your \emph{knitr} presentation document source code: it is important to place the code chunk with the makefile before code chunks containing statistical analyses that depend on the data file it creates. Placing the makefile first will keep the others up-to-date. \subsection{Source from a non-secure URL (\texttt{http})}\index{http} Sourcing from your computer is fine if you are working alone and do not want others to access your code. Once you start collaborating and generally wanting people to be able to reproduce your analyses, you need to use another storage method. The simplest method is to host the replication code in your Dropbox public folder. You can find the file's public URL in the same way that you did in Chapter \ref{Storing}. Then use the \texttt{source}\index{R function!source} command the same way as we did before with the \texttt{read.table}\index{R function!read.table} command.\footnote{You can also make the replication code accessible for download and either instruct others to change the working directory to the replication file or have them change the directory information as necessary. You will need to do this with GNU makefiles like those included with this book.} \subsection{Source from a secure URL (\texttt{https})}\label{SourceSecureURL}\index{https} If you are using GitHub\index{GitHub} or another service that uses secure URLs to host your analysis source code files you need to use the \verb|source_url|\index{R function!source\_url} command in the {\emph{devtools}}\index{devtools} package. For GitHub based source code we find the file's URL the same way we did in Chapter \ref{Storing} (Section \ref{RawGitHub}). Remember to use the URL for the {\emph{raw}}\index{raw} version of the file. I have a short script hosted on GitHub for creating a scatterplot from data in R's {\emph{cars}} data set. 
The script's shortened URL is \url{http://bit.ly/1D5p1w6}.\footnote{The original URL is at \url{https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/SimpleScatter.R}. This is very long, so I shortened it using bitly. You may notice that the shortened URL is not secure. However, it does link to the original secure {\tt{https}} URL.} To run this code and create the scatterplot\index{scatterplot} using \verb|source_url| you simply type:\label{SourceCarsGraph} <>= # Load devtools package library(devtools) # Run the source code to create the scatter plot source_url("http://bit.ly/1D5p1w6") @ \noindent You can also use the {\emph{devtools}} command \verb|source_gist|\index{R function!source\_gist} in a similar way to source GitHub Gists\index{GitHub!Gist}. Gists are a handy way to share code over the internet. For more details see: \url{https://gist.GitHub.com/}. Similar to what we saw in Chapter \ref{Storing} (Section \ref{GitDownload}),\label{sourceurl} if you would like to use a particular version of a file stored on GitHub,\index{GitHub} simply include that version's URL in the \verb|source_url| call. This can be useful for replicating particular results. Linking to a particular version of a source code file will enable replication even if you later make changes to the file. To access the URL for a particular version of a file, first click on the file on GitHub's website. Then click the \texttt{History} button (\includegraphics[scale=0.3]{Children/Chapter8/images8/GitHistory.png}). This will take you to a page listing all of the file's versions. Click on the \texttt{Browse Code} button (\includegraphics[scale=0.5]{Children/Chapter5/images5/BrowseCodeIcon.png}) next to the version of the file that you want to use. Finally, click on the \texttt{Raw} button to be taken to the text-only version of the file. Copy this page's URL and use it in \verb|source_url|. 
Also, just like with \verb|source_data|\index{R function!source\_data}, we can set the \verb|sha1| argument to tell \verb|source_url| to make sure that the source code file it is downloading is the one we intended.\index{SHA-1 hash} This will work regardless of whether or not the file is stored on GitHub. \section{Reproducibly Random: {\tt{set.seed}}} If you are including simulations in your analysis it is often a good idea to specify the random number generator state you used.\index{random number generator} This will allow others to exactly replicate your `randomly'--really pseudo-randomly--generated simulation results. Use the \texttt{set.seed} command in your source code files or code chunks to do this.\index{R function!set.seed} For example, use the following code to set the random number generator state\footnote{See the \texttt{Random} help file for detailed information on R's random number generation capabilities by typing \texttt{?Random} into your console.} and randomly draw 1,000 numbers from a normal distribution\index{normal distribution} with a mean of 0 and a standard deviation of 2\index{standard deviation}. <>= # Set seed as 125 set.seed(125) # Draw 1000 numbers Draw1 <- rnorm(1000, mean = 0, sd = 2) # Summarize Draw1 summary(Draw1) @ \noindent The \texttt{rnorm} command\index{R function!rnorm} draws the 1,000 simulations. The \texttt{mean} argument allows us to set the normal distribution's mean and \texttt{sd} sets its standard deviation. Just to show you that we will draw the same numbers if we use the same seed, let's run the code again: <>= # Set seed as 125 set.seed(125) # Draw 1000 numbers Draw2 <- rnorm(1000, mean = 0, sd = 2) # Summarize Draw2 summary(Draw2) @ \section{Computationally Intensive Analyses} Sometimes you may want to include computationally intensive analyses that take a long time to run as part of a \emph{knitr} document.
This can make writing the document frustrating because it will take a long time to knit it each time you make changes. There are at least two solutions to this problem: the \texttt{cache} chunk option and makefiles.\index{makefile} We discussed makefiles in Chapter \ref{DataGather}, so let's look at how to work with the \texttt{cache} option.\index{knitr option!cache} When you set \verb|cache=TRUE|\index{knitr option!cache} for the code chunk that contains the analysis, the code chunk will only be run when the chunk's contents change\footnote{Note that the chunk will not be run if only the contents of a file the chunk sources are changed. Use the \texttt{dependson} option in cases where it is important to rerun a chunk when a prior chunk changes.} or the chunk options change. This is a very easy solution to the problem. It does have a major drawback: other chunks can't access objects created by the chunk or use commands from packages loaded in it. Solve these problems by (a) having packages loaded in a separate chunk and (b) saving objects created by the cached chunk to a separate RData file that can be loaded in later chunks (see Section \ref{RSave} for information on saving to RData files).\footnote{It's true that when \texttt{knitr} caches a code chunk it saves the chunk's objects to an \texttt{.RData} file. However, it is difficult to load this file directly because the file name changes every time the cached chunk is rerun.} Imagine that in a cached code chunk we create an object called \emph{Sample}. Then in a later code chunk we want to use the \texttt{hist}\index{R function!hist} command to create a histogram\index{histogram} of the sample. In the cached code chunk we save \emph{Sample} to a file called \emph{Sample.RData}.
\begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{}\textless{}Sample, cache=TRUE\textgreater{}\textgreater{}= \hlcom{# Create data} Sample <- \hlkwd{rnorm}(n = 1000, mean = 5, sd = 2) \hlcom{# Save sample} \hlkwd{save}(Sample, file = \hlstr{"Sample.RData"}) @ \end{alltt} \end{kframe} \end{knitrout} \noindent The later code chunk for creating the histogram would go something like this:\footnote{For reference, \emph{Sample} was created by using the \texttt{rnorm} command\index{R function!rnorm} to take a random sample of size 1,000 from a normal distribution\index{normal distribution} with a mean of five and standard deviation of two.} \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1} \color{fgcolor} \begin{kframe} \begin{alltt} \textless{}\textless{}Histogram\textgreater{}\textgreater{}= \hlcom{# Load Sample} \hlkwd{load}(file = \hlstr{"Sample.RData"}) \hlcom{# Create histogram} \hlkwd{hist}(Sample) @ \end{alltt} \end{kframe} \end{knitrout} \noindent \label{CacheVars}If the code chunk you want to cache creates many objects, but you only want to save a few of them, you can use \emph{knitr}'s \texttt{cache.vars} chunk option.\index{knitr option!cache.vars} Simply give it a character vector of the objects' names that you want to save. \subsection*{Chapter summary} In this chapter we covered in more detail key \emph{knitr} syntax for including code chunks in our presentation documents. This and other tools we learned in this chapter are important for tying our statistical analyses directly to their advertising, i.e. our presentation documents. In the next two chapters we will learn how to take the output from our statistical analysis and, using \emph{knitr}, present the results with dynamically created tables and figures.
================================================ FILE: Old/Source-v2/Children/Chapter9/chapter9.Rnw ================================================ % Chapter Chapter 9 For Reproducible Research in R and RStudio % Christopher Gandrud % Created: 16/07/2012 05:45:03 pm CEST % Updated: 9 September 2018 <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter{Showing Results with Tables}\label{TablesChapter} Graphs and other visual methods, discussed in the next chapter, can often be more effective ways to present descriptive and inferential statistics than tables.\footnote{This is especially true of the small-print, high-density coefficient estimate tables that are sometimes descriptively called `train schedule' tables.} Nonetheless, tables of parameter estimates, descriptive statistics, and so on can sometimes be important tools for describing your data and presenting research findings. See \cite{Ehrenberg1977} and \cite{Gelman2011tables} for information on creating tables for effective communication. Learning how to dynamically connect statistical results with tables in your presentation documents aids reproducibility and can ultimately save you a lot of time. Manually typing results into tables by hand is tedious, not very reproducible, and can introduce errors.\footnote{For example, in a replication of Reinhart and Rogoff's much cited \citeyearpar{RR2010} study of economic growth and public debt, \cite{Herndon2014} found a number of apparent transcription errors. Analysis results in the original spreadsheets appear to not have been entered into the paper's tables accurately.} It's especially tedious to retype tables to reflect changes you made to your data and models. Fortunately, you don't actually need to create tables by hand. There are many ways to have R do the work for you. The goal of this chapter is for you to learn how to dynamically create tables for your presentation documents written in LaTeX and Markdown. 
We will first learn the simple \emph{knitr}/\emph{rmarkdown} syntax we need to dynamically include tables created from R objects. Then we will learn how to actually create the tables. There are a number of ways to turn R objects into tables that can be dynamically included in LaTeX or Markdown/HTML markup. In this chapter we mostly focus on three tools for creating tables: the \texttt{kable} function from \emph{knitr}\index{R function!kable}, the \emph{xtable} package, and the \emph{texreg}\index{texreg} package \cite[]{R-texreg}. \texttt{kable} can create tables from data frames for both LaTeX and Markdown/HTML documents. \emph{xtable} does the same, but is much more customizable. \emph{texreg} produces publication-quality tables from objects containing statistical model results--model objects.\index{model object} It allows you to combine results from multiple models into one table. Unfortunately \emph{texreg} is less flexible with objects of classes it does not support.\footnote{These are not the only packages available in R for creating presentation document tables from R objects. I personally really like the \emph{stargazer} package \citep{R-stargazer}. It has a similar syntax to \emph{texreg} and is particularly good for showing results from multiple models estimated using different model types in one table.} \textbf{Warning:} Automating table creation removes the possibility of adding errors to the presentation of your analyses by incorrectly copying output, a big potential problem in hand-created tables. However, it is not error-free. You could easily create inaccurate tables with coding errors. So, as always, it is important to `eyeball' the output. Does it make sense? If you select a couple values in the R output, do they match what is in the presentation document's table? If not, you need to go back to the code and see where things have gone wrong. With that caveat, let's start making tables. 
\section{Basic \emph{knitr} Syntax for Tables} The most important \emph{knitr}/\emph{rmarkdown} chunk option for showing tables is \texttt{results}\index{knitr option!results}.\label{ResultsOptions} The \texttt{results} option can have one of four values: \begin{itemize} \item \verb|'hide'|, \item \verb|'asis'|, \item \verb|'markup'|, \item \verb|'hold'|. \end{itemize} \noindent The value \texttt{hide} clearly hides the results of your code chunk from your presentation document. \texttt{hold} collects all of the output and prints it at the end of the chunk. To include tables created from R objects in your LaTeX or Markdown output you should set \verb|results='asis'| or \verb|results='markup'|. \texttt{asis} is the simplest option as it writes the raw markup form of the table into the presentation document, not as a highlighted code chunk, but as markup. It is then compiled as table markup with the rest of the document. \texttt{markup} uses an output hook\index{knitr!hook}\index{knitr!hook} to mark up the results in a predefined way. In this chapter we will work with examples using the \texttt{asis} option.\footnote{Note that the \texttt{results} option is a major difference in syntax between \emph{knitr} and \emph{Sweave}.\index{Sweave} In \emph{Sweave} the equivalent option is \texttt{results=TEX}.\index{Sweave}} \section{Table Basics} Before getting into the details of how to create tables from R objects we need to first learn how generic tables are created in LaTeX and Markdown/HTML. If you are not familiar with basic LaTeX or Markdown syntax you might want to skip ahead to chapters \ref{LatexChapter} and \ref{MarkdownChapter}, respectively, before coming back to learn about making tables in these languages. \subsection{Tables in LaTeX}\label{LaTeXTables}\index{LaTeX!tables|(} Tables in LaTeX are usually embedded in two environments:\index{LaTeX environment!table} the \texttt{table} and \texttt{tabular} environments. What is a LaTeX environment in general?
A LaTeX environment\label{LaTeXEnviron} is a part of the markup where special commands are executed. A simple environment is the \texttt{center} environment.\footnote{For a comprehensive list of LaTeX environments see: \url{http://latex.wikia.com/wiki/List_of_LaTeX_environments}.} Everything typed in a center environment is, unsurprisingly, centered. Typing: <>= \begin{center} This is a center environment. \end{center} @ \noindent creates the following text in the PDF output: \begin{center} This is a center environment. \end{center} \noindent LaTeX environments all follow the same general syntax: <>= \begin{ENVIRONMENT_NAME} . . . . . . \end{ENVIRONMENT_NAME} @ \noindent You do not have to indent the contents of an environment. Indentations neither affect how the document is compiled nor show up in the final PDF.\footnote{An aside: the \texttt{tabbing}\index{tabbing, LaTeX environment} environment is a useful way to create tabbed text in LaTeX. We don't cover this here though.\index{LaTeX!tabs}} It is conventional to indent them, however, because it makes the markup easier to read. In this chapter we will learn about two types of environments you need for tables in LaTeX. The \texttt{tabular} environment allows you to format the content of a table. The \texttt{table} environment allows you to format a table's location in the text and its caption. \paragraph{The \texttt{tabular} environment}\index{LaTeX environment!tabular|(} The \texttt{tabular} environment allows you to create tables in LaTeX. Let's work through the basic syntax for a simple table.\footnote{For a comprehensive overview, see the LaTeX Wiki page on tables: \url{http://en.wikibooks.org/wiki/LaTeX/Tables}.} To begin a simple tabular environment type \verb|\begin{tabular}{TABLE_SPEC}|. The \verb|TABLE_SPEC| argument allows you to specify the number of columns in a table and the alignment of text in each column. 
For example, to create a table with three columns, the first of which is left-justified and the latter two center-justified we type: <>= \begin{tabular}{l c c} @ \noindent The \texttt{l} argument creates a left-justified column, \texttt{c} creates a centered one. If we wanted a right-justified column we would use \texttt{r}.\footnote{You can also specify a column's width by using \texttt{m\{WIDTH\}} instead. Be sure to load the \emph{array} package in the preamble for this to work.\index{LaTeX package!array} Using \texttt{m} will create a column of a specified width that is vertically justified in the middle. For example, \texttt{m\{3cm\}} would create a column with a width of 3 centimeters. Text in the column would automatically be wrapped onto multiple lines if need be. You can replace the \texttt{m} with either \texttt{p} or \texttt{b}. \texttt{p} vertically aligns the text at the top, \texttt{b} aligns it at the bottom.} Finally, we can add a vertical line between columns by adding a vertical bar \texttt{|} between the column arguments.\footnote{If you add two vertical bars (\texttt{||}) you will get two lines.} For example, to place a vertical line between the first and second column in our example table we would type: <>= \begin{tabular}{l | c c} @ Now let's enter content into our table. We saw earlier how CSV files delimit individual columns with commas. In LaTeX's \texttt{tabular} environment, columns are delimited with ampersands (\verb|&|).\footnote{If you want to include an ampersand in the text of your LaTeX document you need to escape it like this: \texttt{\textbackslash{}\&}.\index{escape character}}\index{LaTeX!ampersand} In CSV tables, new lines are delimited by starting a new line. In LaTeX tables you use two backslashes (\verb|\\|).\footnote{You can use two backslashes outside of the \texttt{tabular} environment as well to force a new line.
Also, to increase the space between the line you can add a vertical width argument to the double backslashes. For example, \texttt{\textbackslash{}\textbackslash{}[3cm]} will give you a 3-centimeter gap between the current line and the next one.} Here is a simple example of the first two lines of a table: <>= \begin{tabular}{l | c c} Observation & Variable1 & Variable2 \\ Subject1 & a & b \\ @ \noindent It is common to demarcate the row with a table's column names--the first row--with horizontal lines. A horizontal line also often visually demarcates a table's end. You can add horizontal lines in the \texttt{tabular} environment with the \verb|\hline| command. <>= \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ \hline @ \noindent Finally, we close the \texttt{tabular} environment with \verb|\end{tabular}|. The full code (with a few extra rows added) is: <>= \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} @ \noindent This produces the following table: \vspace{0.75cm} \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} \index{LaTeX environment!tabular|)} \paragraph{The \texttt{table} float environment}\index{LaTeX environment!table|(} You might notice that the table we created so far lacks a title and is bunched very closely to the surrounding text. In LaTeX we can create a \texttt{table} float environment\index{LaTeX!float} to solve this problem. Float environments allow us to separate a table from the text, specify its location, and give it a caption.\footnote{We will see in the next chapter how to use \texttt{figure} floats as well.} To begin a \texttt{table} float environment use \verb|\begin{table}[POSITION_SPEC]|. 
The \verb|POSITION_SPEC|\label{POSITIONSPEC} argument allows us to determine the location of the table. It can be set to \texttt{h} for here, i.e. where the table is written in the text. It can also be \texttt{t} to place it on the top of a page or \texttt{b} for the bottom of the page. To set a title for the table use the \verb|\caption| command.\index{LaTeX command!caption} LaTeX automatically determines the table's number, so you only need to enter the text. You can also declare a cross-reference key for the table with the \verb|\label| command.\footnote{This command works throughout LaTeX. To reference the table, type in the text of your document \texttt{\textbackslash{}ref\{KEY\}}, where \texttt{KEY} is what you set with the \texttt{\textbackslash{}label} command. Use \texttt{\textbackslash{}pageref}\index{LaTeX command!pageref} to reference the page number.}\index{LaTeX command!label} A \texttt{table} environment is, of course, closed with \verb|\end{table}|. Let's see a full example. <>= \begin{table}[t] \caption{Example Simple LaTeX Table} \label{ExLaTeXTable} \begin{center} \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} \end{center} \end{table} @ \begin{table}[t] \caption{Example Simple LaTeX Table} \label{ExLaTeXTable} \begin{center} \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} \end{center} \end{table} \noindent Notice that the \texttt{tabular} environment is further nested in the \texttt{center} environment. This centers the table while leaving the table's title left-justified. The final result is Table \ref{ExLaTeXTable}.
One final tip: to have the caption placed at the bottom rather than the top of the table in the final document, simply put the \texttt{caption} command after the \texttt{tabular} environment is closed. You can see how typing out a table in LaTeX gets very tedious very fast. For all but the simplest tables it is best to try to have R do the table-making work for you. \index{LaTeX environment!table|)} \index{LaTeX!tables|)} \subsection{Tables in Markdown/HTML}\index{Markdown!tables} Now we will briefly look at the syntax for creating simple Markdown and HTML tables before turning to learn how to have R create these tables for us. \paragraph{Markdown tables}\index{Markdown!tables|(} Markdown table syntax, as with all Markdown syntax, is generally much simpler than LaTeX's tabular syntax. The markup is much more human readable. Nonetheless, larger tables can still be tedious to create. You do not need to declare any new environments to start creating a Markdown table. Just start typing in the content. Columns are delimited in Markdown tables with a vertical bar (\texttt{|}). Rows are started with a new line. To indicate the head of the table--usually the row(s) containing the column names--separate it from the body of the table with a row of dashes (e.g. \verb|----|). Here is an example based on the table we created in the previous section: <>= Observation | Variable1 | Variable2 ----------- | ---------- | --------- Subject1 | a | b @ \noindent Note that it is not necessary to line up the vertical bars. You just need to have the same number of them on each row. You can specify each column's text justification using colons on the dashed row. 
For example, this code will create the left-center-center justified formatted table we made earlier: <>= Observation | Variable1 | Variable2 :---------- | :-------: | :-------: Subject1 | a | b Subject2 | c | d Subject3 | e | f Subject4 | g | c @ \noindent To create a left-justified column simply use a colon on only the left side of the dashes. The ultimate look of a Markdown table is highly dependent on the CSS style file you are using (see Chapter \ref{MarkdownChapter} for how to change your CSS style file). The default RStudio CSS style as of late 2014 formats our table to look like this: \includegraphics[scale = 0.6]{Children/Chapter9/images9/RStudioDefaultTableExample.png} \noindent Using a different CSS style file\footnote{The table was created using the Upstanding Citizen style from the program Marked.\index{Marked}} we can get something like this: \includegraphics[scale = 0.6]{Children/Chapter9/images9/MarkedTableExample.png} \noindent In basic Markdown you can add a caption with the heading syntax (see Section \ref{MarkdownHeader}). In this example the three hashes (\verb|###|) create the header: <>= ### Example Simple Markdown Table Observation | Variable1 | Variable2 :---------- | :-------: | :-------: Subject1 | a | b @ \noindent producing something like this: \includegraphics[scale = 0.6]{Children/Chapter9/images9/MarkedCaptionTableExample.png} \index{Markdown!tables|)} \paragraph{HTML tables}\index{HTML!tables|(} The \texttt{texreg} function that we will learn in the next section doesn't create tables formatted with Markdown syntax. It can create tables with HTML\index{HTML} syntax. This is useful for us because virtually any HTML markup can be incorporated into a Markdown document. In fact, Markdown table syntax is only a stepping stone for more easily producing tables with HTML syntax. So it is useful to also understand the basic syntax for HTML tables. HTML uses element ``tags''\index{HTML!element tag} to begin and end tables. 
The main element we use to create tables is, well, the \texttt{table} element.\index{HTML element!table} This is very similar to LaTeX's \texttt{tabular} environment. An HTML element generally begins with a start tag and ends with an end tag. Clearly this is very similar to LaTeX's \verb|\begin{}| and \verb|\end{}| commands. Start tags are enclosed in less-than and greater-than signs and include the element tag name (\verb|<table>|). End tags are similar, but include a forward slash like this \verb|</table>|. The content of the element goes between the start and end tags. For example:

<>=
<table>
    . . . . . .
</table>
@

\noindent As in LaTeX you are not required to tab the content of a table element; however, it does make the markup document easier to read and, as the number of tags proliferates, easier to write. You can specify element attributes\index{HTML element!attributes} inside of start tags.\footnote{These work like arguments in R in that they change how the element is evaluated.} For example, to add a border to the table use: \verb|<table border="1">|.\footnote{Whether or not a border appears is determined by whether or not the style sheet you are using includes borders.}\index{HTML!table borders} Table rows are put inside of \texttt{tr}\index{HTML element!tr} (table rows) element tags. Individual cells are delimited with \texttt{td} (standard cell) tags.\index{HTML element!td} Here is what the first row of our example table looks like in basic HTML:

<>=
<tr>
    <td> Observation </td>
    <td> Variable1 </td>
    <td> Variable2 </td>
</tr>
@

\noindent We can further delimit a table's header row(s) from its body with the \texttt{thead} and \texttt{tbody} tags. Finally, before making a full table it's useful to mention that table captions can be included with \texttt{caption} tags. Let's put this all together:

{\small
<>=
<table>
    <caption> Example Simple HTML Table </caption>
    <thead>
        <tr>
            <td> Observation </td>
            <td> Variable1 </td>
            <td> Variable2 </td>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td> Subject1 </td>
            <td> a </td>
            <td> b </td>
        </tr>
        <tr>
            <td> Subject2 </td>
            <td> c </td>
            <td> d </td>
        </tr>
        <tr>
            <td> Subject3 </td>
            <td> e </td>
            <td> f </td>
        </tr>
        <tr>
            <td> Subject4 </td>
            <td> g </td>
            <td> h </td>
        </tr>
    </tbody>
</table>
@ } \noindent As with Markdown tables, the ultimate appearance of the table is highly dependent on the style files you use. \index{HTML!tables|)} \section{Creating Tables from Supported Class R Objects} Just as the \texttt{write.csv} command turns an R data frame into a CSV formatted text file, there are a number of methods in R to take an object--e.g. a matrix, a data frame, the output from a statistical analysis, and so on--and turn it into a LaTeX or HTML table. \texttt{kable}, \emph{xtable}, and \emph{texreg} each work most easily with specific object classes that their designers explicitly supported. \subsection{\texttt{kable} for Markdown and LaTeX}\index{kable|(} \texttt{kable} easily converts matrices and data frames into tables for Markdown, HTML, and LaTeX among others. Let's create a simple data frame: <>= kable_ex <- data.frame( Observation = c("Subject1", "Subject2", "Subject3", "Subject4"), Variable1 = c("a", "c", "e", "g"), Variable2 = c("b", "d", "f", "c") ) @ \noindent Then simply place this data frame into a \texttt{kable} call: <>= kable(kable_ex, caption = "Example kable Table") @ \noindent This creates the following table: \vspace{0.75cm} \includegraphics[scale=0.5]{Children/Chapter9/images9/BasicKableExample.png} \vspace{0.75cm} Beyond setting the table's caption with \texttt{caption}, there are a few other alterations that can be made with \texttt{kable} arguments. You can specify new column and row names by passing character vectors to \texttt{col.names} and \texttt{row.names}, respectively. These are very useful as it can be difficult, or at least irritating, for your readers to try to decode the names you give to your data frame rows and columns in R. Another useful argument is \texttt{digits}.
This will round numbers in the table to a specified number of digits after the decimal place.\index{decimal places} To effectively convey your results you should \emph{at the least} only include digits that are significant in that they meaningfully vary in the data \citep[281]{Ehrenberg1977}. You can also change the markup language that the table is created in using the \texttt{format} argument. For example, to create a LaTeX formatted table use \verb|format = 'latex'|. In general, you do not need to specify the format if you are using \emph{knitr} or \emph{rmarkdown} to include the table in a presentation document. This will be done automatically. \index{kable|)} \subsection{\emph{xtable} for LaTeX and HTML}\index{xtable|(} While \texttt{kable} allows you to quickly create simple tables, it can only do so from matrices and data frames. It also has limited customizability. The \emph{xtable} package can create more customizable tables from a wider variety of R objects, including statistical model objects. Different R statistical model estimation commands can produce model objects\index{model object} of different classes. For example, the \texttt{lm} (linear model) command creates model summaries of the \texttt{lm} class. Let's create a simple linear regression using the \emph{swiss} data frame and \texttt{lm}\index{R function!lm}\index{simple linear regression} command. This data frame is included with R by default. The simple linear regression model we are going to make has the \emph{swiss} variable \textbf{Examination} as the dependent variable and \textbf{Education} as the only independent variable.\footnote{For a description of these variables type \texttt{?swiss} into the console} <>= # Fit simple linear regression model M1 <- lm(Examination ~ Education, data = swiss) # Show M1 class class(M1) @ \noindent By using the \texttt{class}\index{R function!class} command we can see that \emph{M1} is of the \texttt{lm} class. 
\emph{M1} contains items estimated by the linear regression model\footnote{If you are unfamiliar with the syntax of R statistical estimation models the previous code might be confusing. In general `response' ($Y$) variables are written first and are separated from the `explanatory' ($X$) variables by a tilde (\texttt{$\sim$}).\index{R!tilde} Crawley \citeyearpar[107]{Crawley2005} notes that you can read $Y \sim X$ as `$Y$ is modeled as a function of $X$'. In later examples we will see that individual explanatory variables are generally separated by plus signs (\texttt{+}), indicating that they are included in the model, not that they are added. For more information see Crawley \citeyearpar[][Ch. 7]{Crawley2005}.} such as the coefficient estimates and their standard errors. To get a summary of a model object's contents use the \texttt{summary}\index{R function!summary} command like this: <>= # Show summary of M1 model object summary(M1) @ \noindent To find a full list of object classes that \emph{xtable} supports, type \texttt{methods(xtable)}\index{R function!methods} into the R Console after you have loaded the package. \paragraph{\emph{xtable} for LaTeX} Let's look at how to create LaTeX tables with \emph{xtable} by creating a table summarizing the estimates from the \emph{M1} model object. \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} \textless{\textless}results='asis', echo=FALSE\textgreater{\textgreater}= \hlcom{# Load xtable} \hlstd{library}(xtable) \hlcom{# Create LaTeX table from M1 and show the output markup} \hlstd{xtable}(M1, \hlkwc{caption} = \hlstr{"Linear Regression,} \hlstr{Dependent Variable: Exam Score"}, \hlkwc{label} = \hlstr{"BasicXtableSummary"}, \hlkwc{digits} = 1) @ \end{alltt} \end{kframe} \end{knitrout} \noindent When included in an R Sweave-style LaTeX document, this code will create a table exactly like Table \ref{BasicXtableSummary}. 
Let's go through this code, working from the outside in. First you'll notice that we've set two \emph{knitr} code chunk options. As we discussed earlier, \verb|results='asis'| allows us to include the LaTeX formatted table created by \emph{xtable}. The next option \verb|echo=FALSE| hides the code from being shown in our final document. The \emph{xtable} command creates the summary table of our \emph{M1} model object. Not only does it produce both complete \texttt{tabular} and \texttt{table} environments, but also through the \texttt{caption} and \texttt{label} arguments it automatically adds in the table's title and cross-reference label, respectively. Finally, notice that I added the \texttt{digits = 1} argument. As in \texttt{kable}, this specifies that I want numbers in the table to be rounded to one decimal digit. % Actually show M1Table in the text <>= # Load xtable library(xtable) # Create LaTeX table from M1 and show the output markup xtable(M1, caption = "Linear Regression, Dependent Variable: Exam Score", label = "BasicXtableSummary", digits = 1) @ \paragraph{\emph{xtable} for Markdown/HTML} We can use \emph{xtable} and the \texttt{print.xtable}\index{print.xtable} command\footnote{Note: you can abbreviate \texttt{print.xtable} simply as \texttt{print}.} to also create tables for Markdown and HTML documents. The \emph{xtable} command produces, unsurprisingly, \texttt{xtable} class objects. We can run these through the \texttt{print} command and add arguments to customize how the table is formatted. By default, \texttt{print.xtable}'s \texttt{type} argument is set to \verb|"latex"|. To create an HTML table that can be inserted into Markdown and HTML documents, set the \texttt{type} argument from \verb|"latex"| to \verb|"html"|. 
For example, to create an HTML version of the table summarizing \emph{M1} and include it in an R Markdown document we type: <>= ```{r, results='asis', echo=FALSE} # Load xtable library(xtable) # Create an xtable object from M1 M1Table <- xtable(M1, caption = "Linear Regression, Dependent Variable: Exam Score", label = "BasicXtableSummary", digits = 1) # Create HTML summary table of M1Table print.xtable(M1Table, type = "html", caption.placement = "top") ``` @ \noindent If you intend to include multiple tables in your R Markdown document you will want to set all of the tables to be printed in HTML. You can place \verb|options("xtable.type" = "html")|\index{R function!options} in a code chunk near the beginning of your document.\footnote{Of course, you will probably want to use the \texttt{include=FALSE} \emph{knitr} option with this code chunk.} This simply makes it so that you don't need to include \verb|type = "html"| every time you use \texttt{print}. Notice in the previous code example that we also added the \texttt{caption.placement = "top"} argument. This will move the caption from the bottom of the table, as it is in Table \ref{BasicXtableSummary}, to the top. See the \emph{xtable} package documentation\footnote{\url{http://cran.r-project.org/web/packages/xtable/xtable.pdf}} for the full list of \texttt{print.xtable} options. \index{xtable|)} \subsection{\emph{texreg} for LaTeX and HTML}\index{texreg|(} \texttt{kable} and \emph{xtable} are limited when it comes to creating tables from statistical model objects. \texttt{kable} only works with matrices and data frames. \emph{xtable} is easiest when working with only one model object at a time. Furthermore, by default these tools do not create output tables that present estimates from multiple statistical models in the style used by many prominent academic journals. The \emph{texreg} package is very useful for creating these types of tables. It also supports more model object types than \emph{xtable}.
\paragraph{\emph{texreg} for LaTeX}\index{R function!texreg|(} Imagine we want to show the estimates from a number of nested regression models in a LaTeX table like Table \ref{Basic_texregTable}. For example, to estimate nested regression models from the remaining variables in the \emph{swiss} data set we would type: <>= # Estimate nested regression models M2 <- lm(Examination ~ Education + Agriculture, data = swiss) M3 <- lm(Examination ~ Education + Agriculture + Catholic, data = swiss) M4 <- lm(Examination ~ Education + Agriculture + Catholic + Infant.Mortality, data = swiss) M5 <- lm(Examination ~ Education + Agriculture + Catholic + Infant.Mortality + Fertility, data = swiss) @ \noindent We can now include these model objects in one LaTeX table with \emph{texreg}. Remember to include \verb|results='asis'| in the code chunk head. <>= # Load texreg package library(texreg) # Create custom coefficient names cust_coef <- c('(Intercept)', 'Education', 'Agriculture', 'Catholic', 'Infant Mortality', 'Fertility') # Create nested regression model table texreg(list(M1, M2, M3, M4, M5), caption = 'Nested Estimates Table with \\emph{texreg}', caption.above = TRUE, label = 'Basic_texregTable', custom.coef.names = cust_coef) @ <>= # Load texreg package library(texreg) # Create custom coefficient names cust_coef <- c('(Intercept)', 'Education', 'Agriculture', 'Catholic', 'Infant Mortality', 'Fertility') # Create nested regression model table texreg(list(M1, M2, M3, M4, M5), caption = 'Nested Estimates Table with \\emph{texreg}', caption.above = TRUE, label = 'Basic_texregTable', custom.coef.names = cust_coef) @ \noindent Notice that we placed the model objects in a list when we called \texttt{texreg}. \texttt{texreg} automatically created the \texttt{table} and \texttt{tabular} environments and by default centers the table.\footnote{Use the \texttt{center = FALSE} argument to override centering.
If you would like to only create the \texttt{tabular} environment use the argument \texttt{table = FALSE}. Creating your own \texttt{table} environment can be useful in situations where you want more customizability.} We added a caption and reference label with the \texttt{caption} and \texttt{label} arguments, respectively. By default, the caption is placed below the table, so we used \texttt{caption.above = TRUE} to place it on top. Finally, we created custom coefficient names with \texttt{custom.coef.names} that are a bit tidier than the variable names in our R dataset. Your readers will appreciate easily discernible coefficient names. In the LaTeX caption you'll notice \verb|\\emph{texreg}|. In LaTeX the \texttt{emph}\index{LaTeX!emph} command italicizes text (we'll see this again in Chapter \ref{LatexChapter}). We added an additional escape character \verb|\| so that R would not try to interpret the \texttt{e} and instead feed it to LaTeX. By default, \texttt{texreg} uses \verb|stars = c(0.001, 0.01, 0.05)| to determine at what p-values to display statistical significance stars. This is the same as the \texttt{lm} model summary default showing three sets of statistical significance stars.\index{significance stars} You can define the significance levels by assigning a different numeric vector to the \texttt{stars} argument. There are many other changes you can make to tables created with \emph{texreg}. You can change the column and coefficient names, determine what type of standard errors to show, and so on. For the full list of arguments, see the help file by typing \texttt{?texreg} into your R Console. \index{R function!texreg|)} \paragraph{\emph{texreg} for HTML} You can also use the \emph{texreg} package to create tables in Markdown/HTML documents. Instead of the \texttt{texreg} function, use \texttt{htmlreg}\index{R function!htmlreg}. The syntax is largely similar, though arguments relating to LaTeX are not available, while others relating to HTML are.
Here is a simple example creating Table \ref{Basic_texregTable} in an HTML document: <>= htmlreg(list(M1, M2, M3, M4, M5), caption = 'Nested Estimates Table in HTML Document', caption.above = TRUE, custom.coef.names = cust_coef) @ \noindent Notice that we did not include the \texttt{label} argument as this is not available in HTML. The resulting table looks like this:\\[0.5cm] \includegraphics[scale=0.6]{Children/Chapter9/images9/htmlregExample.png} \index{texreg|)} \subsection{Fitting Large Tables in LaTeX} Sometimes you may have large tables that are difficult to fit onto a page in LaTeX. There are a number of ways to adjust tables so that they fit on the page. \paragraph{LaTeX landscape tables}\index{LaTeX!landscape} If your LaTeX table is very wide, e.g. because it shows results from many estimation models, you can use LaTeX's \texttt{lscape} package\index{lscape} to create \texttt{landscape} formatting environments. Rather than orienting the text of a page so that it is in profile (a long page), a \texttt{landscape} environment\index{LaTeX environment!landscape} turns it 90 degrees so that it has a landscape orientation (a wide page). To use the \emph{lscape} package, first place \verb|\usepackage{lscape}| in your LaTeX document's preamble. Then begin a \texttt{landscape} environment with \verb|\begin{landscape}| where you would like it located in the text. Then place the \texttt{table} environment information and \emph{knitr} code for creating the table. Finally, close the \texttt{landscape} environment with \verb|\end{landscape}|. \paragraph{LaTeX scalebox for tables}\index{LaTeX!scalebox} In addition, the \texttt{scalebox} command from the \emph{graphics} package\index{LaTeX!graphics package} could be useful for fitting large tables onto a PDF page. This command expands or shrinks the text in the table. \texttt{texreg}\index{R function!texreg} actually has a \texttt{scalebox} argument. 
If you use \texttt{scalebox = 0.5} it will halve the size of the table; \texttt{scalebox = 2} doubles it. More generally, to rescale a table use: \begin{knitrout} \definecolor{shadecolor}{rgb}{1, 1, 1}\color{fgcolor} \begin{kframe} \begin{alltt} \textbackslash{}scalebox\{HORIZONTAL\_SCALE\}{[}VERTICAL\_SCALE{]}\{TABLE\} \end{alltt} \end{kframe} \end{knitrout} \noindent \texttt{HORIZONTAL\_SCALE} is how much to scale the table horizontally. \texttt{VERTICAL\_SCALE} is how much to scale vertically and \texttt{TABLE} is the table or R code chunk to create the table. \subsection{\emph{xtable} with non-supported class objects}\label{NonSupportedClasses} The \texttt{kable}, \emph{texreg}, and \emph{xtable} packages are very convenient for model objects they know how to handle. With supported class objects the functions in these packages know where to look for the vectors containing the things--coefficient names, standard errors, and so on--that they need to create tables. With unsupported classes, however, they don't know where to look for these things. Luckily, there is a work around. You tell \texttt{xtable} where to find elements you want to include in your table. \texttt{xtable} can handle matrix and data frame class objects. The rows of these objects become table rows and the columns become the table columns. So, to create tables with non-supported class objects you need to: \begin{enumerate} \item find and extract the information from the unsupported class object that you want in the table, \item convert this information into a matrix or data frame where the rows and columns of the object correspond to the rows and columns of the table that you want to create, \item use \emph{xtable} with this object to create the table. 
\end{enumerate} Imagine that you want to create a results table showing the covariate names, coefficient means, and quantiles for marginal posterior distributions estimated from a linear regression using the \emph{brms}\index{brms} package \cite[]{R-brms} and data from the \emph{swiss} data frame. Let's fit the model: <>= # Load brms package library(brms) # Fit model linearBRMS <- brm(Examination ~ Education, data = swiss, family = gaussian(link = "identity")) # Find linearBRMS's class class(linearBRMS) @ Using the \texttt{class}\index{R function!class}\index{R function!brm} function we see that the model output object in \emph{linearBRMS} is of the \texttt{brmsfit} class.\index{linear regression}\index{Bayesian} This class is not supported by \emph{xtable}. If you try to create a table summarizing the estimates in \emph{linearBRMS} you will get the following error: {\small <>= # Load xtable library(xtable) # Attempt to create a table with linearBRMS linearBRMStable <- xtable(linearBRMS) @ } \noindent With unsupported class objects you have to create the summary yourself and extract the elements that you want from it manually. A good knowledge of vectors, matrices, and component selection is very handy for this (see Chapter \ref{GettingStartedRKnitr}). First, create a summary\index{R function!summary} of your output object \emph{linearBRMS}: <>= linearBRMSsummary <- summary(linearBRMS) @ \noindent This creates a new object of the class \texttt{brmssummary}. We're still not there yet as this object contains not just the covariate names and so on but also information we don't want to include in the results table, like the estimation formula. The second step is to extract a matrix from inside \emph{linearBRMSsummary} called \emph{fixed} with the component selector (\verb|$|). Remember that to find the components of an object use the \texttt{names}\index{R function!names} command.
<>= names(linearBRMSsummary) @ \noindent The \emph{fixed} matrix is where the things we want in our table are located. I find it easier to work with data frames, so let's also convert the matrix into a data frame. <>= linearBRMSsummaryDF <- data.frame(linearBRMSsummary$fixed) @ \noindent Here is what the model summary data frame looks like: <>= # Show linearBRMSsummaryDF linearBRMSsummaryDF @ \noindent Now we have a data frame object \emph{xtable} can handle. After a little cleaning up (see the chapter's Appendix for more details) you can use \emph{linearBRMSsummaryDF} with \emph{xtable} as before to create Table \ref{CoefEstTable}. <>= # Load dplyr package library(dplyr) # Change posterior summary variable names linearBRMSsummaryDF <- rename(linearBRMSsummaryDF, "2.5%" = l.95..CI) linearBRMSsummaryDF <- rename(linearBRMSsummaryDF, "50%" = Estimate) linearBRMSsummaryDF <- rename(linearBRMSsummaryDF, "97.5%" = u.95..CI) # Reorder variables and remove the Est. Error linearBRMSsummaryDF <- linearBRMSsummaryDF[, c("2.5%", "50%", "97.5%", "Rhat")] # Create table xtable(linearBRMSsummaryDF, caption = "Coefficient Estimates Predicting Examination Scores in Swiss Cantons (1888) Found Using Bayesian Linear Regression", label = "CoefEstTable") @ It may take some hunting to find what you want, but a similar process can be used to create tables from objects of virtually any class.\footnote{This process can also be useful for creating graphics as we will see in Chapter \ref{FiguresChapter}.} Hunting for what you want can be easier if you look inside of objects by clicking on them in RStudio's \emph{Environment} tab.\index{RStudio!Environment tab} \subsection{Creating variable description documents with \emph{xtable}}\label{VarDescriptTables} You can use \emph{xtable} to create a table describing variables in your data set and insert these into Markdown documents created with the concatenate and print (\texttt{cat}) command (see Section \ref{catR}).\index{R function!cat} This is
useful because our data so far has been stored in plain-text files. Unlike binary Stata or SAS data files, plain-text data files do not include variable descriptions. Imagine that we want to create a Markdown file with a table describing the variables from the \emph{swiss} data frame. First we will create two vectors: one for the variable names and the other for the variable descriptions. {\small <>= # Create variable vector from column names Variable <- names(swiss) # Create variable description vector Description <- c("common standardized fertility measures", "% of males involved in agriculture as occupation", "% draftees receiving highest mark on army examination", "% education beyond primary school for draftees", "% 'catholic' (as opposed to 'protestant')", "% live births who live less-than 1 year" ) @ } \noindent In the first line we use the \texttt{names} command to create a vector of the \emph{swiss} data frame's column names. Then we simply create a vector of descriptions with the combine command (\texttt{c}).\index{R function!combine}\index{R function!c} Now we can combine these vectors into a matrix and use it to create an HTML table. <>= # Combine Variable and Description variables into a matrix DescriptionsBound <- cbind(Variable, Description) # Create an xtable object from DescriptionsBound DescriptionsTable <- xtable(DescriptionsBound) # Format table in HTML DescriptTable <- print.xtable(DescriptionsTable, type = "html") @ \noindent Finally, we can use \texttt{cat} to create our Markdown variable description file. <>= # Create variable description file cat("# Swiss Data Variable Descriptions \n", "### Source: Mosteller and Tukey, (1977) \n", DescriptTable, file = "SwissVariableDescriptions.md" ) @ \noindent The first part of the \texttt{cat} command here is the title of the document. As we will see in Chapter \ref{MarkdownChapter}, hashes (\verb|#|) create headers. The \verb|\n| creates a new line in the Markdown document. 
The next line is information on the \emph{swiss} data frame's source. We then include the HTML table in the \emph{DescriptTable} object and save it to a file called \emph{SwissVariableDescriptions.md}. It is convenient to simply include the creation of this table in your data gathering makefiles and have it saved into the same directory as your data. This way it will be easy to update as you update your data and easy to find. If you are storing your data on GitHub it will automatically render the variable description Markdown file and make it easy for others to read. See this book's makefile example for more information: \url{http://bit.ly/1AaOuDx}.\footnote{The long URL is: \url{https://GitHub.com/christophergandrud/Rep-Res-Examples/tree/master/DataGather_Merge}.} \subsection*{Chapter summary} In this chapter we have learned how to take the results from our statistical analyses and other information from our data and dynamically present it in LaTeX and Markdown documents with \emph{knitr}/\emph{rmarkdown}. In the next chapter we will do the same thing with figures. \section*{Appendix} Source code for cleaning \emph{linearBRMSsummaryDF} and using it to create a LaTeX table: <>= # Load packages library(dplyr) library(xtable) # Change posterior summary variable names linearBRMSsummaryDF <- rename(linearBRMSsummaryDF, "2.5%" = l.95..CI) linearBRMSsummaryDF <- rename(linearBRMSsummaryDF, "50%" = Estimate) linearBRMSsummaryDF <- rename(linearBRMSsummaryDF, "97.5%" = u.95..CI) # Reorder variables and remove the Est. 
Error linearBRMSsummaryDF <- linearBRMSsummaryDF[, c("2.5%", "50%", "97.5%", "Rhat")] # Create table xtable(linearBRMSsummaryDF, caption = "Coefficient Estimates Predicting Examination Scores in Swiss Cantons (1888) Found Using Bayesian Linear Regression", label = "CoefEstTable") # Create table xtable(linearBRMSsummaryDF, caption = "Coefficient Estimates Predicting Examination Scores in Swiss Cantons (1888) Found Using Bayesian Normal Linear Regression") @ \noindent Note that the new variable names are in quotation marks, in contrast to the example from Chapter \ref{DataClean}. The quotation marks allow us to specify a name that begins with a number and has special characters like \texttt{\%}. ================================================ FILE: Old/Source-v2/Children/FrontMatter/AdditionalResources/AdditionalResources.Rnw ================================================ % Example Project Explanation For Reproducible Research in R and RStudio % Christopher Gandrud % Updated: 20 March 2015 \chapter*{Additional Resources} \addcontentsline{toc}{chapter}{Additional Resources} Additional resources that supplement the examples in this book can be freely downloaded and experimented with. These resources include longer examples discussed in individual chapters and a complete short reproducible research project. \subsection*{Chapter Examples} %\addcontentsline{toc}{subsection}{Chapter Examples} Longer examples discussed in individual chapters, including files to dynamically download data, code for creating figures, and markup files for creating presentation documents, can be accessed at: \url{https://GitHub.com/christophergandrud/Rep-Res-Examples}. 
Please see Chapter \ref{Storing} for more information on downloading files from GitHub, where the examples are stored.\index{GitHub} \subsection*{Short Example Project} %\addcontentsline{toc}{subsection}{Short Example Project} To download a full (though very short) example of a reproducible research project created using the tools covered in this book go to: \url{https://GitHub.com/christophergandrud/Rep-Res-ExampleProject1}. Please follow the replication instructions in the main \emph{README.md} file to fully replicate the project. It is probably a good idea to hold off looking at this complete example in detail until after you have become acquainted with the individual tools it uses. Become acquainted with the tools by reading through this book and working with the individual chapter examples. The following two figures give you a sense of how the example's files are organized. Figure \ref{ExampProjeFiles} shows how the files are organized in the file system. Figure \ref{ExampProjDiagram} illustrates how the main files are dynamically tied together. In the \emph{Data} directory we have files to gather raw data from the \cite{WorldBank2013} on fertilizer consumption and from \cite{Pemstein2010} on countries' levels of democracy. They are tied to the data through the \texttt{WDI}\index{WDI} and \texttt{download.file} commands.\index{R function!download.file} A \emph{Makefile}\index{Makefile} can run \emph{Gather1.R} and \emph{Gather2.R} to gather and clean the data. It runs \emph{MergeData.R} to merge the data into one data file called \emph{MainData.csv}. It also automatically generates a variable description file and a \emph{README.md}\index{README file} recording the session info.\index{R!session info} The \emph{Analysis} folder contains two files that create figures presenting this data. They are tied to \emph{MainData.csv} with the \texttt{read.csv} command.\index{R function!read.csv} These files are run by the presentation documents when they are knitted. 
The presentation documents tie to the analysis documents with \emph{knitr} and the \texttt{source} command.\index{R function!source} Though a simple example, hopefully these files will give you a complete sense of how a reproducible research project can be organized. Please feel free to experiment with different ways of organizing the files and tying them together to make your research really reproducible. \thispagestyle{plain} \begin{figure}[th!] \caption{Short Example Project File Tree} \label{ExampProjeFiles} \begin{center} \input{Children/FrontMatter/AdditionalResources/imagesExamp/FileTree.tex} \end{center} \end{figure} \clearpage \thispagestyle{plain} \begin{landscape} \begin{figure}[th!] \caption{Short Example Main File Ties} \label{ExampProjDiagram} \begin{center} \input{Children/FrontMatter/AdditionalResources/imagesExamp/ExampDiagram.tex} \end{center} \end{figure} \end{landscape} \subsection*{Updates} Many of the reproducible research tools discussed in this book are improving rapidly. Because of this I will regularly post updates to the content covered in the book at: \url{https://GitHub.com/christophergandrud/Rep-Res-Book}. \subsection*{Corrections} If you notice any corrections that should be made to fix typos, broken URLs, and so on, you can report them at: \url{https://GitHub.com/christophergandrud/Rep-Res-Book/issues}. I'll post notifications of changes to an Errata page at: \url{http://christophergandrud.GitHub.io/RepResR-RStudio/errata.htm}. ================================================ FILE: Old/Source-v2/Children/FrontMatter/Packages.Rnw ================================================ <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter*{Required R Packages} \label{ReqPackages} \addcontentsline{toc}{chapter}{Required R Packages} In this book I discuss how to use a number of user-written R packages for reproducible research. Many of these packages are not included in the default R installation. They need to be installed separately. 
\index{R!packages|(} \textbf{Note:} in general you should aim to minimize the number of packages that your research depends on. Doing so will lessen the possibility that your code will ``break'' when a package is updated. This book depends on relatively many packages because of its special and unusual purpose of illustrating a variety of tools that you can use for reproducible research. To install key user-written packages discussed in this book, copy the following code and paste it into your R console: <>= install.packages(c("brew", "countrycode", "devtools", "dplyr", "ggplot2", "googleVis", "knitr", "MCMCpack", "repmis", "RCurl", "rmarkdown", "texreg", "tidyr", "WDI", "xtable", "brms")) @ \noindent Once you enter this code, you may be asked to select a CRAN ``mirror''\index{CRAN!mirror} to download the packages from.\footnote{CRAN stands for the Comprehensive R Archive Network.} Simply select the mirror closest to you. \index{R!packages|)} \paragraph{Special issues for Windows and Linux Users} If you are using Windows, you will also need to install \emph{Rtools} \cite[]{Rtools}.\index{Rtools} You can download \emph{Rtools} from: \url{http://cran.r-project.org/bin/windows/Rtools/}.\label{RtoolsDownload} Please use the recommended installation to ensure that your system PATH\index{PATH} is set up correctly. Otherwise your computer will not know where the tools are. On Linux you will need to install the \emph{RCurl} \citep{R-RCurl}\index{RCurl} and \emph{XML}\index{XML} \citep{R-XML} packages separately. Use your Terminal\index{Terminal} to install these packages with the following code: <>= sudo apt-get update sudo apt-get install libcurl4-gnutls-dev sudo apt-get install libxml2-dev sudo apt-get install r-cran-xml sudo apt-get install r-cran-rjava @ <>= # Install & load required packages as well as all cited packages # Note: many of these packages are loaded to access their citation information only.
## Based on https://gist.GitHub.com/3710171
## See also http://bit.ly/PbabKd
library(pacman)

# Names of the packages to load and to write citation entries for.
# Note: the list must not end with a comma; an empty final argument makes
# c() stop with an "argument is empty" error.
book_packages <- c(
  "animation", "brew", "brms", "countrycode", "data.table", "devtools",
  "digest", "dplyr", "extrafont", "formatR", "ggplot2", "googleVis",
  "highlight", "htmlwidgets", "httr", "knitcitations", "knitr", "magrittr",
  "markdown", "openair", "packrat", "quantmod", "RCurl", "repmis", "rjson",
  "rmarkdown", "RJSONIO", "rvest", "shiny", "stargazer", "survival",
  "texreg", "tidyr", "twitteR", "WDI", "XML", "xtable"
)

# The package names are held in a character vector, so p_load() must be told
# to use the vector's values (character.only = TRUE). Without it, p_load()
# takes the symbol `book_packages` itself as the name of a package.
p_load(book_packages, character.only = TRUE)

# Write citations to bibtex file
# (uses the same vector that was loaded above, so the two cannot drift apart)
knitr::write_bib(book_packages, file = "rep-res-PackagesCited.bib")

# Remove conflicting packages
detach(package:twitteR)
detach(package:rjson)
detach(package:XML)
@
================================================ FILE: Old/Source-v2/Children/FrontMatter/Preface.Rnw ================================================ <>= set_parent('Rep-Res-Parent.Rnw') @ \chapter*{Preface} \addcontentsline{toc}{chapter}{Preface} \noindent This book has its genesis in my PhD research at the London School of Economics. I started the degree with questions about the 2008/09 financial crisis and planned to spend most of my time researching capital adequacy requirements. But I quickly realized that I would actually spend a large proportion of my time learning the day-to-day tasks of data gathering, analysis, and results presentation. After plodding through for a while with Word, Excel, and Stata, my breaking point came while reentering results into a regression table after I had tweaked one of my statistical models, yet again. Surely there was a better way to \emph{do} research that would allow me to spend more time answering my research questions. Making research reproducible for others also means making it better organized and efficient for yourself. My search for a better way led me straight to the tools for reproducible computational research. The reproducible research community is very active, knowledgeable, and helpful.
Nonetheless, I often encountered holes in this collective knowledge, or at least had no resource that organized it all together as a whole. That is my intention for this book: to bring together the skills I have picked up for actually doing and presenting computational research. Hopefully, the book, along with making reproducible research more widely used, will save researchers hours of googling, so they can spend more time addressing their research questions. \section*{Changes to the Second Edition} The tools of reproducible research have developed rapidly since the first edition of this book was published just two years ago. The second edition has been updated to incorporate the most important of these advancements, including discussions of: \begin{itemize} \item The \emph{rmarkdown} package, which allows you to create reproducible research documents in PDF, HTML, and Microsoft Word formats using the simple and intuitive Markdown syntax. \item Improvements and changes to RStudio's interface and capabilities, such as its new tools for handling R Markdown documents. \item Expanded \emph{knitr} R code chunk capabilities. \item The \texttt{kable} function in the \emph{knitr} package and the \emph{texreg} package for dynamically creating tables to present your data and statistical results. \item An improved discussion of file organization allowing you to take full advantage of relative file paths so that your documents are more easily reproducible across computers and systems. \item The \emph{dplyr}, \emph{magrittr}, and \emph{tidyr} packages for fast data manipulation. \item Numerous changes to R syntax in user-created packages. \item Changes to GitHub's and Dropbox's interfaces. \end{itemize} \section*{Acknowledgements} I would not have been able to write this book without many people's advice and support. Foremost is John Kimmel, acquisitions editor at Chapman and Hall. He approached me in Spring 2012 with the general idea and opportunity for this book.
Other editors at Chapman and Hall and Taylor and Francis have greatly contributed to this project, including Marcus Fontaine. I would also like to thank all of the book's reviewers whose helpful comments have greatly improved it. The first edition's reviewers include: \begin{itemize} \item Jeromy Anglim, Deakin University \item Karl Broman, University of Wisconsin, Madison \item Jake Bowers, University of Illinois, Urbana-Champaign \item Corey Chivers, McGill University \item Mark M. Fredrickson, University of Illinois, Urbana-Champaign \item Benjamin Lauderdale, London School of Economics \item Ramnath Vaidyanathan, McGill University \end{itemize} \vspace{0.5cm} The developer and blogging community has also been incredibly important for making this book possible. Foremost among these people is Yihui Xie. He is the main developer behind the {\emph{knitr}} package, co-developer of \emph{rmarkdown}, and also an avid blog writer and commenter. Without him the ability to do reproducible research would be much harder and the blogging community that spreads knowledge about how to do these things would be poorer. Other great contributors to the reproducible research community include Carl Boettiger, Karl Broman, Markus Gesmann (who developed {\emph{googleVis}}), Rob Hyndman, and Hadley Wickham (who has developed numerous very useful R packages). Thank you also to Victoria Stodden and Michael Malecki for helpful suggestions. And, of course, thank you to everyone at RStudio (especially JJ Allaire) for creating an increasingly useful program for reproducible research. The second edition has benefited immensely from first edition readers' comments and suggestions. For a list of their valuable contributions, please see the book's GitHub Issues page \url{https://GitHub.com/christophergandrud/Rep-Res-Book/issues} and the first edition's Errata page \url{http://christophergandrud.GitHub.io/RepResR-RStudio/errata.htm}. 
My students at Yonsei University were an important part of making the first edition. One of the reasons that I got interested in using many of the tools covered in this book, like using \emph{knitr} in slideshows, was to improve a course I taught there: Introduction to Social Science Data Analysis. I tested many of the explanations and examples in this book on my students. Their feedback has been very helpful for making the book clearer and more useful. Their experience with using these tools on Microsoft Windows computers was also important for improving the book's Windows documentation. Similarly, my students at the Hertie School of Governance inspired and tested key sections of the second edition. The vibrant community at Stack Overflow \url{http://stackoverflow.com/} and Stack Exchange \url{http://stackexchange.com/} are always very helpful for finding answers to problems that plague any computational researcher. Importantly, the sites make it easy for others to find the answers to questions that have already been asked. My wife, Kristina Gandrud, has been immensely supportive and patient with me throughout the writing of this book (and pretty much my entire academic career). Certainly this is not the proper forum for musing about marital relations, but I'll do a musing anyways. Having a person who supports your interests, even if they don't completely share them, is immensely helpful for a researcher. It keeps you going. ================================================ FILE: Old/Source-v2/Children/FrontMatter/StylisticConventions.md ================================================ %% Stylistic Conventions for Reproducible Research with R and RStudio I use the following conventions throughout this book: - **Abstract variables**: Abstract variables, i.e. variables that do not represent specific objects in an example, are in `ALL CAPS TYPEWRITER TEXT`. - **Clickable buttons**: Clickable Buttons are in `typewriter text`. 
- **Code**: All code is in `typewriter text`. - **Filenames and directories**: Filenames and directories more generally are printed in *italics*. I use CamelBack for file and directory names. - **File extensions**: Like filenames, file extensions are *italicized*. - **Individual variable values**: Individual variable values mentioned in the text are in *italics*. - **Objects**: Objects are printed in *italics*. I use CamelBack for object names. - **Object columns**: Data frame object columns are printed in *italics*. - **Packages**: **R** packages are printed in *italics*. - **Windows and RStudio panes**: Open windows and RStudio panes are written in *italics*. - **Variable names**: Variable names are printed in **bold**. I use CamelBack for individual variable names. ================================================ FILE: Old/Source-v2/Children/FrontMatter/rep-res-PackagesCited.bib ================================================ @Manual{R-animation, title = {animation: A gallery of animations in statistics and utilities to create animations}, author = {Yihui Xie}, year = {2014}, note = {R package version 2.3}, url = {http://CRAN.R-project.org/package=animation}, } @Manual{R-brew, title = {brew: Templating Framework for Report Generation}, author = {Jeffrey Horner}, year = {2011}, note = {R package version 1.0-6}, url = {http://CRAN.R-project.org/package=brew}, } @Manual{R-countrycode, title = {countrycode: Convert Country Names and Country Codes}, author = {Vincent Arel-Bundock}, year = {2014}, note = {R package version 0.18}, url = {http://CRAN.R-project.org/package=countrycode}, } @Manual{R-data.table, title = {data.table: Extension of data.frame}, author = {M Dowle and T Short and S Lianoglou and A Srinivasan with contributions from R Saporta and E Antonyan}, year = {2014}, note = {R package version 1.9.4}, url = {http://CRAN.R-project.org/package=data.table}, } @Manual{R-devtools, title = {devtools: Tools to Make Developing R Packages Easier}, author = {Hadley 
Wickham and Winston Chang}, year = {2015}, note = {R package version 1.8.0}, url = {http://CRAN.R-project.org/package=devtools}, } @Manual{R-digest, title = {digest: Create Cryptographic Hash Digests of R Objects}, author = {Dirk Eddelbuettel}, year = {2014}, note = {R package version 0.6.8}, url = {http://CRAN.R-project.org/package=digest}, } @Manual{R-dplyr, title = {dplyr: A Grammar of Data Manipulation}, author = {Hadley Wickham and Romain Francois}, year = {2015}, note = {R package version 0.4.2}, url = {http://CRAN.R-project.org/package=dplyr}, } @Manual{R-extrafont, title = {extrafont: Tools for using fonts}, author = {Winston Chang,}, year = {2014}, note = {R package version 0.17}, url = {http://CRAN.R-project.org/package=extrafont}, } @Manual{R-formatR, title = {formatR: Format R Code Automatically}, author = {Yihui Xie}, year = {2015}, note = {R package version 1.2}, url = {http://CRAN.R-project.org/package=formatR}, } @Manual{R-ggplot2, title = {ggplot2: An Implementation of the Grammar of Graphics}, author = {Hadley Wickham and Winston Chang}, year = {2015}, note = {R package version 1.0.1}, url = {http://CRAN.R-project.org/package=ggplot2}, } @Manual{R-googleVis, title = {googleVis: R Interface to Google Charts}, author = {Markus Gesmann and Diego {de Castillo}}, year = {2015}, note = {R package version 0.5.9}, url = {http://CRAN.R-project.org/package=googleVis}, } @Manual{R-highlight, title = {highlight: Syntax Highlighter}, author = {Romain Francois}, year = {2015}, note = {R package version 0.4.7}, url = {http://CRAN.R-project.org/package=highlight}, } @Manual{R-htmlwidgets, title = {htmlwidgets: HTML Widgets for R}, author = {Ramnath Vaidyanathan and Yihui Xie and JJ Allaire and Joe Cheng and Kenton Russell}, year = {2015}, note = {R package version 0.5}, url = {http://CRAN.R-project.org/package=htmlwidgets}, } @Manual{R-httr, title = {httr: Tools for Working with URLs and HTTP}, author = {Hadley Wickham}, year = {2015}, note = {R package version 
1.0.0}, url = {http://CRAN.R-project.org/package=httr}, } @Manual{R-knitcitations, title = {knitcitations: Citations for Knitr Markdown Files}, author = {Carl Boettiger}, year = {2015}, note = {R package version 1.0.6}, url = {http://CRAN.R-project.org/package=knitcitations}, } @Manual{R-knitr, title = {knitr: A General-Purpose Package for Dynamic Report Generation in R}, author = {Yihui Xie}, year = {2015}, note = {R package version 1.10.5}, url = {http://CRAN.R-project.org/package=knitr}, } @Manual{R-magrittr, title = {magrittr: A Forward-Pipe Operator for R}, author = {Stefan Milton Bache and Hadley Wickham}, year = {2014}, note = {R package version 1.5}, url = {http://CRAN.R-project.org/package=magrittr}, } @Manual{R-markdown, title = {markdown: 'Markdown' Rendering for R}, author = {JJ Allaire and Jeffrey Horner and Vicent Marti and Natacha Porte}, year = {2015}, note = {R package version 0.7.7}, url = {http://CRAN.R-project.org/package=markdown}, } @Manual{R-openair, title = {openair: Tools for the Analysis of Air Pollution Data}, author = {David Carslaw and Karl Ropkins}, year = {2015}, note = {R package version 1.6}, url = {http://CRAN.R-project.org/package=openair}, } @Manual{R-packrat, title = {packrat: A Dependency Management System for Projects and their R Package Dependencies}, author = {Kevin Ushey and Jonathan McPherson and Joe Cheng and JJ Allaire}, year = {2015}, note = {R package version 0.4.4}, url = {http://CRAN.R-project.org/package=packrat}, } @Manual{R-quantmod, title = {quantmod: Quantitative Financial Modelling Framework}, author = {Jeffrey A. Ryan}, year = {2015}, note = {R package version 0.4-5}, url = {http://CRAN.R-project.org/package=quantmod}, } @Manual{R-RCurl, title = {RCurl: General Network (HTTP/FTP/...) 
Client Interface for R}, author = {Duncan {Temple Lang} and the CRAN team}, year = {2015}, note = {R package version 1.95-4.7}, url = {http://CRAN.R-project.org/package=RCurl}, } @Manual{R-repmis, title = {repmis: Miscellaneous Tools for Reproducible Research}, author = {Christopher Gandrud}, year = {2015}, note = {R package version 0.4.4}, url = {http://cran.r-project.org/package=repmis}, } @Manual{R-rjson, title = {rjson: JSON for R}, author = {Alex Couture-Beil}, year = {2014}, note = {R package version 0.2.15}, url = {http://CRAN.R-project.org/package=rjson}, } @Manual{R-RJSONIO, title = {RJSONIO: Serialize R objects to JSON, JavaScript Object Notation}, author = {Duncan {Temple Lang}}, year = {2014}, note = {R package version 1.3-0}, url = {http://CRAN.R-project.org/package=RJSONIO}, } @Manual{R-rmarkdown, title = {rmarkdown: Dynamic Documents for R}, author = {JJ Allaire and Joe Cheng and Yihui Xie and Jonathan McPherson and Winston Chang and Jeff Allen and Hadley Wickham and Rob Hyndman}, year = {2015}, note = {R package version 0.7}, url = {http://CRAN.R-project.org/package=rmarkdown}, } @Manual{R-rvest, title = {rvest: Easily Harvest (Scrape) Web Pages}, author = {Hadley Wickham}, year = {2015}, note = {R package version 0.2.0}, url = {http://CRAN.R-project.org/package=rvest}, } @Manual{R-shiny, title = {shiny: Web Application Framework for R}, author = {Winston Chang and Joe Cheng and JJ Allaire and Yihui Xie and Jonathan McPherson}, year = {2015}, note = {R package version 0.12.2}, url = {http://CRAN.R-project.org/package=shiny}, } @Manual{R-stargazer, title = {stargazer: Well-Formatted Regression and Summary Statistics Tables}, author = {Marek Hlavac}, year = {2015}, note = {R package version 5.2}, url = {http://CRAN.R-project.org/package=stargazer}, } @Manual{R-survival, title = {survival: Survival Analysis}, author = {Terry M Therneau}, year = {2015}, note = {R package version 2.38-3}, url = {http://CRAN.R-project.org/package=survival}, } 
@Manual{R-texreg, title = {texreg: Conversion of R Regression Output to LaTeX or HTML Tables}, author = {Philip Leifeld}, year = {2015}, note = {R package version 1.35}, url = {http://CRAN.R-project.org/package=texreg}, } @Manual{R-tidyr, title = {tidyr: Easily Tidy Data with spread() and gather() Functions.}, author = {Hadley Wickham}, year = {2014}, note = {R package version 0.2.0}, url = {http://CRAN.R-project.org/package=tidyr}, } @Manual{R-twitteR, title = {twitteR: R Based Twitter Client}, author = {Jeff Gentry}, year = {2015}, note = {R package version 1.1.9}, url = {http://CRAN.R-project.org/package=twitteR}, } @Manual{R-WDI, title = {WDI: World Development Indicators (World Bank)}, author = {Vincent Arel-Bundock}, year = {2013}, note = {R package version 2.4}, url = {http://CRAN.R-project.org/package=WDI}, } @Manual{R-XML, title = {XML: Tools for Parsing and Generating XML Within R and S-Plus}, author = {Duncan {Temple Lang} and the CRAN Team}, year = {2015}, note = {R package version 3.98-1.3}, url = {http://CRAN.R-project.org/package=XML}, } @Manual{R-xtable, title = {xtable: Export tables to LaTeX or HTML}, author = {David B. 
Dahl}, year = {2014}, note = {R package version 1.7-4}, url = {http://CRAN.R-project.org/package=xtable}, } @Manual{R-Zelig, title = {Zelig: Everyone's Statistical Software}, author = {Matt Owen and Kosuke Imai and Gary King and Olivia Lau}, year = {2013}, note = {R package version 4.2-1}, url = {http://CRAN.R-project.org/package=Zelig}, } @Manual{R-ZeligBayesian, title = {ZeligBayesian: A Zelig Model}, author = {Matt Owen}, year = {2011}, note = {R package version 0.1}, url = {http://gking.harvard.edu/zelig}, } ================================================ FILE: Old/Source-v2/Rep-Res-Parent.Rnw ================================================ %%%%%%%%%%%%%%% % Parent document for the book Reproducible Research with R and RStudio % Christopher Gandrud % 17 April 2015 %%%%%%%%%%%%%% % Tell RStudio that weaving is to be done with the knitr package % !Rnw weave = knitr \documentclass[krantz1]{krantz} % Load required LaTeX packages \usepackage[authoryear]{natbib} \usepackage{amssymb} \usepackage{amsmath} \usepackage{graphicx} %\usepackage{caption} \usepackage{subfigure} %\usepackage{epsfig} \usepackage{makeidx} \usepackage{emptypage} %\usepackage{showidx} \usepackage{multicol} \frenchspacing \tolerance=5000 %\usepackage[usenames,dvipsnames,svgnames]{xcolor} \usepackage{dcolumn} \usepackage{booktabs} \usepackage{multirow} \usepackage[T1]{fontenc} \usepackage{lmodern} \usepackage{lscape} \usepackage{url} \usepackage{todonotes} \usepackage{tikz} \usetikzlibrary{trees} \usetikzlibrary{decorations.pathmorphing} \usetikzlibrary{shapes,arrows} \usepackage{wrapfig} \usepackage{alltt} \makeatletter \def\section{\@startsection{section}{1}{\z@}{-3.5ex \@plus -1ex \@minus -.2ex}{2.3ex \@plus .2ex} {\normalfont\raggedright\Large\bfseries}} \def\subsection{\@startsection{subsection}{2}{\z@}{-3.25ex\@plus -1ex \@minus -.2ex}{1.5ex \@plus .2ex} {\normalfont\raggedright\large\bfseries}} \def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-3.25ex\@plus -1ex \@minus -.2ex}{1.5ex
\@plus .2ex} {\normalfont\raggedright\normalsize\bfseries}} \makeatother \newcommand{\blankpage}{ \newpage \thispagestyle{empty} \mbox{} \newpage } % Set margins on highlighted code output boxes \setlength\fboxsep{6.25mm} % Set knitr global options <>= opts_chunk$set(concordance=TRUE) opts_chunk$set(fig.align='center') opts_chunk$set(echo=TRUE) opts_chunk$set(background='#FFFFFF') @ \makeatletter \makeatother \makeindex \begin{document} % Title page \title{Reproducible Research with R and RStudio Second Edition} \author{Christopher Gandrud} \maketitle % Set roman numeral page counter \pagenumbering{roman} % Front matter \frontmatter \newpage\null\newpage \newpage\null\newpage \newpage\null\newpage \newpage\null\newpage \newpage\null\newpage % Table of Contents \tableofcontents % Preface <>= @ % Convert Stylistic Conventions child document from Markdown to LaTex and include \chapter*{Stylistic Conventions}\label{StylisticConventions} \addcontentsline{toc}{chapter}{Stylistic Conventions} <>= pandoc Children/FrontMatter/StylisticConventions.md -t latex -o StyleTemp.tex @ \input{StyleTemp.tex} % Include page on installing R packages used in the book <>= @ % Add lists of figures and tables \cleardoublepage \phantomsection \label{listoffig} \addcontentsline{toc}{chapter}{List of Figures} \listoffigures \newpage \phantomsection \label{listoftables} \addcontentsline{toc}{chapter}{List of Tables} \listoftables \mainmatter % Start Arabic numeral page counter \setcounter{page}{1} % Part 1, include child documents \part{Getting Started} <>= @ % Part 2, include child documents \part{Data Gathering and Storage} <>= @ % Part 3, include child documents \part{Analysis and Results} <>= @ % Part 4, include child documents \part{Presentation Documents} <>= @ % Include bibliography \cleardoublepage \addcontentsline{toc}{chapter}{\bibname} \bibliographystyle{apa} \markboth{\bibname}{\bibname} \bibliography{rep-res-book,Children/FrontMatter/rep-res-PackagesCited} %Include index 
\let\myindtmp\indexspace %to tighten up the index \renewcommand{\indexspace}{\myindtmp\vspace*{-2pt}} \cleardoublepage \markboth{\indexname}{\indexname} \printindex \end{document} ================================================ FILE: Old/Source-v2/Rep-Res-Parent.toc ================================================ \contentsline {chapter}{Preface}{xiii} \contentsline {chapter}{Stylistic Conventions}{xvii} \contentsline {chapter}{Required R Packages}{xix} \contentsline {paragraph}{Special issues for Windows and Linux Users}{xx} \contentsline {chapter}{Additional Resources}{xxi} \contentsline {chapter}{List of Figures}{xxv} \contentsline {chapter}{List of Tables}{xxvii} \contentsline {part}{I\hspace {1em}Getting Started}{1} \contentsline {chapter}{\numberline {1}Introducing Reproducible Research}{3} \contentsline {section}{\numberline {1.1}What Is Reproducible Research?}{3} \contentsline {section}{\numberline {1.2}Why Should Research Be Reproducible?}{5} \contentsline {subsection}{\numberline {1.2.1}For science}{5} \contentsline {paragraph}{Standard to judge scientific claims}{5} \contentsline {paragraph}{Avoiding effort duplication \& encouraging cumulative knowledge development}{6} \contentsline {subsection}{\numberline {1.2.2}For you}{6} \contentsline {paragraph}{Better work habits}{6} \contentsline {paragraph}{Better teamwork}{6} \contentsline {paragraph}{Changes are easier}{7} \contentsline {paragraph}{Higher research impact}{7} \contentsline {section}{\numberline {1.3}Who Should Read This Book?}{8} \contentsline {subsection}{\numberline {1.3.1}Academic researchers}{8} \contentsline {subsection}{\numberline {1.3.2}Students}{8} \contentsline {subsection}{\numberline {1.3.3}Instructors}{8} \contentsline {subsection}{\numberline {1.3.4}Editors}{9} \contentsline {subsection}{\numberline {1.3.5}Private sector researchers}{9} \contentsline {section}{\numberline {1.4}The Tools of Reproducible Research}{10} \contentsline {section}{\numberline {1.5}Why Use R, \emph 
{knitr}/\emph {rmarkdown}, and RStudio for Reproducible Research?}{11} \contentsline {paragraph}{Why R?}{11} \contentsline {paragraph}{Why {\normalfont {knitr}} and {\normalfont {rmarkdown}}?}{11} \contentsline {paragraph}{Why RStudio?}{12} \contentsline {subsection}{\numberline {1.5.1}Installing the main software}{13} \contentsline {paragraph}{Installing markup languages}{14} \contentsline {paragraph}{GNU Make}{14} \contentsline {paragraph}{Other Tools}{14} \contentsline {section}{\numberline {1.6}Book Overview}{14} \contentsline {subsection}{\numberline {1.6.1}How to read this book}{16} \contentsline {paragraph}{More-experienced R users}{16} \contentsline {paragraph}{More-experienced LaTeX users}{16} \contentsline {paragraph}{Less-experienced LaTeX/Markdown users}{16} \contentsline {subsection}{\numberline {1.6.2}Reproduce this book}{16} \contentsline {subsection}{\numberline {1.6.3}Contents overview}{17} \contentsline {chapter}{\numberline {2}Getting Started with Reproducible Research}{19} \contentsline {section}{\numberline {2.1}The Big Picture: A Workflow for Reproducible Research}{19} \contentsline {subsection}{\numberline {2.1.1}Reproducible theory}{20} \contentsline {section}{\numberline {2.2}Practical Tips for Reproducible Research}{22} \contentsline {subsection}{\numberline {2.2.1}Document everything!}{22} \contentsline {paragraph}{Document your R session info}{22} \contentsline {subsection}{\numberline {2.2.2}Everything is a (text) file}{24} \contentsline {subsection}{\numberline {2.2.3}All files should be human readable}{24} \contentsline {paragraph}{Commenting}{25} \contentsline {paragraph}{Style guides}{26} \contentsline {paragraph}{Literate programming}{26} \contentsline {subsection}{\numberline {2.2.4}Explicitly tie your files together}{26} \contentsline {subsection}{\numberline {2.2.5}Have a plan to organize, store, and make your files available}{27} \contentsline {chapter}{\numberline {3}Getting Started with R, RStudio, and {\normalfont 
{knitr}}/\normalfont {rmarkdown}}{29} \contentsline {section}{\numberline {3.1}Using R: The Basics}{29} \contentsline {subsection}{\numberline {3.1.1}Objects}{30} \contentsline {paragraph}{Numeric \& string objects}{31} \contentsline {paragraph}{Vector \& data frame objects}{32} \contentsline {subsection}{\numberline {3.1.2}Component selection}{36} \contentsline {paragraph}{{\tt {attach}} and {\tt {with}}}{36} \contentsline {subsection}{\numberline {3.1.3}Subscripts}{38} \contentsline {subsection}{\numberline {3.1.4}Functions and commands}{39} \contentsline {subsection}{\numberline {3.1.5}Arguments}{40} \contentsline {subsection}{\numberline {3.1.6}The workspace \& history}{42} \contentsline {paragraph}{R history}{43} \contentsline {subsection}{\numberline {3.1.7}Global R options}{43} \contentsline {subsection}{\numberline {3.1.8}Installing new packages and loading functions}{44} \contentsline {section}{\numberline {3.2}Using RStudio}{45} \contentsline {paragraph}{The default window}{45} \contentsline {paragraph}{The {\normalfont {Source}} pane}{45} \contentsline {section}{\numberline {3.3}Using \emph {knitr} and \emph {rmarkdown}: The Basics}{47} \contentsline {subsection}{\numberline {3.3.1}What \emph {knitr} does}{47} \contentsline {subsection}{\numberline {3.3.2}What \emph {rmarkdown} does}{49} \contentsline {subsection}{\numberline {3.3.3}File extensions}{50} \contentsline {subsection}{\numberline {3.3.4}Code chunks}{50} \contentsline {paragraph}{R Markdown}{51} \contentsline {paragraph}{R LaTeX}{51} \contentsline {paragraph}{Code chunk labels}{52} \contentsline {paragraph}{Code chunk options}{52} \contentsline {subsection}{\numberline {3.3.5}Global chunk options}{53} \contentsline {subsection}{\numberline {3.3.6}\emph {knitr} package options}{55} \contentsline {subsection}{\numberline {3.3.7}Hooks}{56} \contentsline {subsection}{\numberline {3.3.8}\emph {knitr}, \emph {rmarkdown}, \& RStudio}{56} \contentsline {paragraph}{Compiling R source code 
Notebooks}{56} \contentsline {paragraph}{R Markdown}{58} \contentsline {paragraph}{R LaTeX}{58} \contentsline {paragraph}{Change default .Rnw knitter}{59} \contentsline {subsection}{\numberline {3.3.9}\emph {knitr} \& R}{60} \contentsline {subsection}{\numberline {3.3.10}\emph {rmarkdown} and R}{61} \contentsline {paragraph}{Set Up}{63} \contentsline {paragraph}{Code Chunks}{63} \contentsline {chapter}{\numberline {4}Getting Started with File Management}{65} \contentsline {section}{\numberline {4.1}File Paths \& Naming Conventions}{66} \contentsline {subsection}{\numberline {4.1.1}Root directories}{66} \contentsline {subsection}{\numberline {4.1.2}Subdirectories \& parent directories}{66} \contentsline {subsection}{\numberline {4.1.3}Working directories}{67} \contentsline {subsection}{\numberline {4.1.4}Absolute vs. relative paths}{67} \contentsline {subsection}{\numberline {4.1.5}Spaces in directory \& file names}{68} \contentsline {section}{\numberline {4.2}Organizing Your Research Project}{69} \contentsline {section}{\numberline {4.3}Setting Directories as RStudio Projects}{70} \contentsline {section}{\numberline {4.4}R File Manipulation Commands}{70} \contentsline {paragraph}{{\tt {getwd}}}{71} \contentsline {paragraph}{{\tt {list.files}}}{71} \contentsline {paragraph}{{\tt {setwd}}}{71} \contentsline {paragraph}{{\tt {root.dir}}}{72} \contentsline {paragraph}{{\tt {dir.create}}}{72} \contentsline {paragraph}{{\tt {file.create}}}{72} \contentsline {paragraph}{{\tt {cat}}}{73} \contentsline {paragraph}{{\tt {unlink}}}{73} \contentsline {paragraph}{{\tt {file.rename}}}{73} \contentsline {paragraph}{{\tt {file.copy}}}{74} \contentsline {section}{\numberline {4.5}Unix-like Shell Commands for File Management}{74} \contentsline {paragraph}{{\tt {cd}}}{75} \contentsline {paragraph}{{\tt {pwd}}}{75} \contentsline {paragraph}{{\tt {ls}}}{76} \contentsline {paragraph}{{\tt {mkdir}}}{76} \contentsline {paragraph}{{\tt {echo}}}{76} \contentsline {paragraph}{{\tt {rm}}}{77} 
\contentsline {paragraph}{{\tt {mv}}}{77} \contentsline {paragraph}{{\tt {cp}}}{78} \contentsline {paragraph}{{\tt {system}} (R command)}{78} \contentsline {section}{\numberline {4.6}File Navigation in RStudio}{78} \contentsline {part}{II\hspace {1em}Data Gathering and Storage}{81} \contentsline {chapter}{\numberline {5}Storing, Collaborating, Accessing Files, and Versioning}{83} \contentsline {section}{\numberline {5.1}Saving Data in Reproducible Formats}{84} \contentsline {section}{\numberline {5.2}Storing Your Files in the Cloud: Dropbox}{85} \contentsline {subsection}{\numberline {5.2.1}Storage}{86} \contentsline {subsection}{\numberline {5.2.2}Accessing data}{86} \contentsline {subsection}{\numberline {5.2.3}Collaboration}{88} \contentsline {subsection}{\numberline {5.2.4}Version control}{88} \contentsline {section}{\numberline {5.3}Storing Your Files in the Cloud: GitHub}{89} \contentsline {subsection}{\numberline {5.3.1}Setting up GitHub: Basic}{91} \contentsline {subsection}{\numberline {5.3.2}Version control with Git}{92} \contentsline {paragraph}{Setting up Git repositories locally}{92} \contentsline {paragraph}{Checkout}{95} \contentsline {paragraph}{Tags}{96} \contentsline {paragraph}{Branches}{97} \contentsline {paragraph}{Having Git ignore files}{99} \contentsline {subsection}{\numberline {5.3.3}Remote storage on GitHub}{100} \contentsline {paragraph}{Clone a new remote repository}{100} \contentsline {paragraph}{Push an existing repository to a new GitHub repo}{101} \contentsline {paragraph}{Pushing commits to a GitHub repo}{101} \contentsline {subsection}{\numberline {5.3.4}Accessing on GitHub}{102} \contentsline {paragraph}{Downloading into R}{102} \contentsline {paragraph}{Viewing files}{104} \contentsline {subsubsection}{\numberline {5.3.4.1}Collaboration with GitHub}{104} \contentsline {paragraph}{Syncing a repository}{104} \contentsline {subsection}{\numberline {5.3.5}Summing up the GitHub workflow}{105} \contentsline {section}{\numberline 
{5.4}RStudio \& GitHub}{105} \contentsline {subsection}{\numberline {5.4.1}Setting up Git/GitHub with Projects}{105} \contentsline {paragraph}{Git with a new project}{105} \contentsline {paragraph}{Git initialize existing projects}{107} \contentsline {paragraph}{Clone repository into a new project}{107} \contentsline {paragraph}{Add existing Project repository to GitHub}{107} \contentsline {subsection}{\numberline {5.4.2}Using Git in RStudio Projects}{107} \contentsline {chapter}{\numberline {6}Gathering Data with R}{109} \contentsline {section}{\numberline {6.1}Organize Your Data Gathering: Makefiles}{109} \contentsline {subsection}{\numberline {6.1.1}R Make-like files}{110} \contentsline {subsection}{\numberline {6.1.2}GNU Make}{111} \contentsline {subsubsection}{\numberline {6.1.2.1}Example makefile}{112} \contentsline {paragraph}{Running the Makefile}{115} \contentsline {subsubsection}{\numberline {6.1.2.2}Makefiles and RStudio Projects}{116} \contentsline {subsubsection}{\numberline {6.1.2.3}Other information about makefiles}{116} \contentsline {section}{\numberline {6.2}Importing Locally Stored Data Sets}{117} \contentsline {section}{\numberline {6.3}Importing Data Sets from the Internet}{118} \contentsline {subsection}{\numberline {6.3.1}Data from non-secure ({\tt {http}}) URLs}{118} \contentsline {subsection}{\numberline {6.3.2}Data from secure ({\tt {https}}) URLs}{119} \contentsline {paragraph}{Loading data from secure URLs with {\tt {source\_data}}}{119} \contentsline {paragraph}{Loading data from Dropbox non-Public folders with {\tt {source\_DropboxData}}}{120} \contentsline {paragraph}{Loading data using {\normalfont {RCurl}}}{121} \contentsline {subsection}{\numberline {6.3.3}Compressed data stored online}{121} \contentsline {subsection}{\numberline {6.3.4}Data APIs \& feeds}{123} \contentsline {paragraph}{API Package Example: World Bank Development Indicators}{124} \contentsline {section}{\numberline {6.4}Advanced Automatic Data Gathering: Web 
Scraping}{125} \contentsline {paragraph}{The general process}{125} \contentsline {paragraph}{More tools to learn for web scraping}{126} \contentsline {chapter}{\numberline {7}Preparing Data for Analysis}{129} \contentsline {section}{\numberline {7.1}Cleaning Data for Merging}{129} \contentsline {subsection}{\numberline {7.1.1}Get a handle on your data}{129} \contentsline {subsection}{\numberline {7.1.2}Reshaping data}{131} \contentsline {subsection}{\numberline {7.1.3}Renaming variables}{134} \contentsline {subsection}{\numberline {7.1.4}Ordering data}{134} \contentsline {subsection}{\numberline {7.1.5}Subsetting data}{135} \contentsline {subsection}{\numberline {7.1.6}Recoding string/numeric variables}{137} \contentsline {subsection}{\numberline {7.1.7}Creating new variables from old}{139} \contentsline {paragraph}{Creating factor variables}{140} \contentsline {subsection}{\numberline {7.1.8}Changing variable types}{142} \contentsline {section}{\numberline {7.2}Merging Data Sets}{143} \contentsline {subsection}{\numberline {7.2.1}Binding}{143} \contentsline {subsection}{\numberline {7.2.2}The merge command}{143} \contentsline {paragraph}{Big data}{145} \contentsline {subsection}{\numberline {7.2.3}Duplicate values}{146} \contentsline {subsection}{\numberline {7.2.4}Duplicate columns}{147} \contentsline {part}{III\hspace {1em}Analysis and Results}{151} \contentsline {chapter}{\numberline {8}Statistical Modeling and {\emph {knitr}}}{153} \contentsline {section}{\numberline {8.1}Incorporating Analyses into the Markup}{154} \contentsline {subsection}{\numberline {8.1.1}Full code chunks}{154} \contentsline {paragraph}{{\tt {include}}}{154} \contentsline {paragraph}{{\tt {eval}}}{154} \contentsline {paragraph}{{\tt {echo}}}{155} \contentsline {paragraph}{{\tt {results}}}{155} \contentsline {paragraph}{{\tt {warning}}, {\tt {message}}, {\tt {error}}}{155} \contentsline {paragraph}{{\tt {cache}}}{155} \contentsline {paragraph}{{\tt {dependson}}}{155} \contentsline 
{paragraph}{{\tt {cache.extra}}}{156} \contentsline {paragraph}{{\tt {size}}}{156} \contentsline {subsection}{\numberline {8.1.2}Showing code \& results inline}{156} \contentsline {subsubsection}{\numberline {8.1.2.1}LaTeX}{156} \contentsline {paragraph}{Inline static code}{156} \contentsline {paragraph}{Inline dynamic code}{157} \contentsline {subsubsection}{\numberline {8.1.2.2}Markdown}{158} \contentsline {paragraph}{Inline static code}{158} \contentsline {paragraph}{Inline dynamic code}{158} \contentsline {subsection}{\numberline {8.1.3}Dynamically including non-R code in code chunks}{159} \contentsline {section}{\numberline {8.2}Dynamically Including Modular Analysis Files}{159} \contentsline {subsection}{\numberline {8.2.1}Source from a local file}{160} \contentsline {paragraph}{Sourcing a makefile in a code chunk}{161} \contentsline {subsection}{\numberline {8.2.2}Source from a non-secure URL (\texttt {http})}{162} \contentsline {subsection}{\numberline {8.2.3}Source from a secure URL (\texttt {https})}{162} \contentsline {section}{\numberline {8.3}Reproducibly Random: {\tt {set.seed}}}{163} \contentsline {section}{\numberline {8.4}Computationally Intensive Analyses}{164} \contentsline {chapter}{\numberline {9}Showing Results with Tables}{167} \contentsline {section}{\numberline {9.1}Basic \emph {knitr} Syntax for Tables}{168} \contentsline {section}{\numberline {9.2}Table Basics}{168} \contentsline {subsection}{\numberline {9.2.1}Tables in LaTeX}{169} \contentsline {paragraph}{The \texttt {tabular} environment}{169} \contentsline {paragraph}{The \texttt {table} float environment}{171} \contentsline {subsection}{\numberline {9.2.2}Tables in Markdown/HTML}{173} \contentsline {paragraph}{Markdown tables}{173} \contentsline {paragraph}{HTML tables}{175} \contentsline {section}{\numberline {9.3}Creating Tables from Supported Class R Objects}{177} \contentsline {subsection}{\numberline {9.3.1}\texttt {kable} for Markdown and LaTeX}{177} \contentsline 
{subsection}{\numberline {9.3.2}\emph {xtable} for LaTeX and HTML}{178} \contentsline {paragraph}{\emph {xtable} for LaTeX}{179} \contentsline {paragraph}{\emph {xtable} for Markdown/HTML}{180} \contentsline {subsection}{\numberline {9.3.3}\emph {texreg} for LaTeX and HTML}{181} \contentsline {paragraph}{\emph {texreg} for LaTeX}{181} \contentsline {paragraph}{\emph {xtable} for LaTeX}{182} \contentsline {paragraph}{\emph {texreg} for HTML}{183} \contentsline {subsection}{\numberline {9.3.4}Fitting Large Tables in LaTeX}{184} \contentsline {paragraph}{LaTeX landscape tables}{185} \contentsline {paragraph}{LaTeX scalebox for tables}{185} \contentsline {subsection}{\numberline {9.3.5}\emph {xtable} with non-supported class objects}{185} \contentsline {subsection}{\numberline {9.3.6}Creating variable description documents with \emph {xtable}}{188} \contentsline {chapter}{\numberline {10}Showing Results with Figures}{191} \contentsline {section}{\numberline {10.1}Including Non-knitted Graphics}{191} \contentsline {subsection}{\numberline {10.1.1}Including graphics in LaTeX}{192} \contentsline {paragraph}{{\tt {figure}} float environment}{193} \contentsline {subsection}{\numberline {10.1.2}Including graphics in Markdown/HTML}{194} \contentsline {section}{\numberline {10.2}Basic \emph {knitr}/\emph {rmarkdown} Figure Options}{195} \contentsline {subsection}{\numberline {10.2.1}Chunk options}{195} \contentsline {paragraph}{{\tt {fig.path}}}{195} \contentsline {paragraph}{{\tt {out.height}}}{196} \contentsline {paragraph}{{\tt {out.width}}}{196} \contentsline {paragraph}{{\tt {fig.align}}}{196} \contentsline {paragraph}{Other figure chunk options}{196} \contentsline {subsection}{\numberline {10.2.2}Global options}{196} \contentsline {section}{\numberline {10.3}Knitting R's Default Graphics}{197} \contentsline {section}{\numberline {10.4}Including \emph {ggplot2} Graphics}{200} \contentsline {subsection}{\numberline {10.4.1}Showing regression results with caterpillar 
plots}{204} \contentsline {section}{\numberline {10.5}JavaScript Graphs with \emph {googleVis}}{209} \contentsline {paragraph}{Basic googleVis figures}{209} \contentsline {paragraph}{Including \emph {googleVis} in knitted documents}{210} \contentsline {paragraph}{Note for Motion Charts}{211} \contentsline {subsection}{\numberline {10.5.1}JavaScript Graphs with \emph {htmlwidgets}-based packages}{212} \contentsline {part}{IV\hspace {1em}Presentation Documents}{213} \contentsline {chapter}{\numberline {11}Presenting with \emph {knitr}/LaTeX}{215} \contentsline {section}{\numberline {11.1}The Basics}{215} \contentsline {subsection}{\numberline {11.1.1}Getting started with LaTeX editors}{216} \contentsline {subsection}{\numberline {11.1.2}Basic LaTeX command syntax}{216} \contentsline {subsection}{\numberline {11.1.3}The LaTeX preamble \& body}{217} \contentsline {subsection}{\numberline {11.1.4}Headings}{220} \contentsline {subsection}{\numberline {11.1.5}Paragraphs \& spacing}{221} \contentsline {subsection}{\numberline {11.1.6}Horizontal lines}{221} \contentsline {subsection}{\numberline {11.1.7}Text formatting}{221} \contentsline {paragraph}{Italics \& Bold}{222} \contentsline {paragraph}{Font size}{222} \contentsline {paragraph}{Diacritics}{222} \contentsline {paragraph}{Quotation marks}{223} \contentsline {subsection}{\numberline {11.1.8}Math}{223} \contentsline {subsection}{\numberline {11.1.9}Lists}{224} \contentsline {subsection}{\numberline {11.1.10}Footnotes}{225} \contentsline {subsection}{\numberline {11.1.11}Cross-references}{225} \contentsline {section}{\numberline {11.2}Bibliographies with BibTeX}{225} \contentsline {subsection}{\numberline {11.2.1}The \emph {.bib} file}{225} \contentsline {subsection}{\numberline {11.2.2}Including citations in LaTeX documents}{227} \contentsline {subsection}{\numberline {11.2.3}Generating a BibTeX file of R package citations}{227} \contentsline {section}{\numberline {11.3}Presentations with LaTeX Beamer}{230} 
\contentsline {subsection}{\numberline {11.3.1}Beamer basics}{231} \contentsline {paragraph}{The Beamer preamble}{231} \contentsline {paragraph}{Slide frames}{231} \contentsline {paragraph}{Title frames}{233} \contentsline {paragraph}{Sections \& outlines}{233} \contentsline {paragraph}{Make list items appear}{233} \contentsline {subsection}{\numberline {11.3.2}\emph {knitr} with LaTeX slideshows}{234} \contentsline {chapter}{\numberline {12}Large \emph {knitr}/LaTeX Documents: Theses, Books, and Batch Reports}{237} \contentsline {section}{\numberline {12.1}Planning Large Documents}{237} \contentsline {section}{\numberline {12.2}Large Documents with Traditional LaTeX}{238} \contentsline {subsection}{\numberline {12.2.1}Inputting/including children}{239} \contentsline {subsection}{\numberline {12.2.2}Other common features of large documents}{240} \contentsline {paragraph}{Table of contents}{240} \contentsline {paragraph}{Lists of figures and tables}{240} \contentsline {paragraph}{Blank Pages}{240} \contentsline {paragraph}{Index}{241} \contentsline {section}{\numberline {12.3}\emph {knitr} and Large Documents}{241} \contentsline {subsection}{\numberline {12.3.1}The parent document}{241} \contentsline {subsection}{\numberline {12.3.2}Knitting child documents}{242} \contentsline {paragraph}{Other markup languages}{242} \contentsline {section}{\numberline {12.4}Child Documents in a Different Markup Language}{243} \contentsline {section}{\numberline {12.5}Creating Batch Reports}{244} \contentsline {chapter}{\numberline {13}Presenting on the Web and Other Formats with R Markdown}{249} \contentsline {section}{\numberline {13.1}The Basics}{249} \contentsline {subsection}{\numberline {13.1.1}Getting started with Markdown editors}{250} \contentsline {subsection}{\numberline {13.1.2}Preamble and document structure}{250} \contentsline {subsection}{\numberline {13.1.3}Headings}{252} \contentsline {subsection}{\numberline {13.1.4}Horizontal lines}{253} \contentsline 
{subsection}{\numberline {13.1.5}Paragraphs and new lines}{253} \contentsline {subsection}{\numberline {13.1.6}Italics and bold}{254} \contentsline {subsection}{\numberline {13.1.7}Links}{254} \contentsline {subsection}{\numberline {13.1.8}Special characters and font customization}{254} \contentsline {subsection}{\numberline {13.1.9}Lists}{254} \contentsline {subsection}{\numberline {13.1.10}Escape characters}{255} \contentsline {subsection}{\numberline {13.1.11}Math with MathJax}{255} \contentsline {section}{\numberline {13.2}Further Customizability with \emph {rmarkdown}}{256} \contentsline {subsection}{\numberline {13.2.1}More on \emph {rmarkdown} Headers}{256} \contentsline {paragraph}{Bibliographies with Pandoc}{258} \contentsline {paragraph}{Footnotes with Pandoc}{259} \contentsline {subsection}{\numberline {13.2.2}CSS style files and Markdown}{260} \contentsline {paragraph}{Rendering R Markdown files to HTML using custom CSS}{261} \contentsline {section}{\numberline {13.3}Slideshows with Markdown, \emph {rmarkdown}, and HTML}{261} \contentsline {paragraph}{HTML5 frameworks}{262} \contentsline {subsection}{\numberline {13.3.1}HTML Slideshows with \emph {rmarkdown}}{262} \contentsline {subsection}{\numberline {13.3.2}LaTeX Beamer Slideshows with \emph {rmarkdown}}{264} \contentsline {subsection}{\numberline {13.3.3}Slideshows with Markdown and RStudio's R Presentations}{265} \contentsline {paragraph}{Editing and compiling the presentation}{266} \contentsline {paragraph}{Publishing slideshows}{267} \contentsline {section}{\numberline {13.4}Publishing HTML Documents Created by R Markdown}{268} \contentsline {subsection}{\numberline {13.4.1}Standalone HTML files}{268} \contentsline {subsection}{\numberline {13.4.2}Hosting webpages with Dropbox}{268} \contentsline {subsection}{\numberline {13.4.3}GitHub Pages}{269} \contentsline {subsection}{\numberline {13.4.4}Further information on R Markdown}{270} \contentsline {chapter}{\numberline {14}Conclusion}{271} 
\contentsline {section}{\numberline {14.1}Citing Reproducible Research}{271} \contentsline {section}{\numberline {14.2}Licensing Your Reproducible Research}{273} \contentsline {section}{\numberline {14.3}Sharing Your Code in Packages}{273} \contentsline {section}{\numberline {14.4}Project Development: Public or Private?}{274} \contentsline {section}{\numberline {14.5}Is it Possible to Completely Future-Proof Your Research?}{275} \contentsline {chapter}{Bibliography}{277} \contentsline {chapter}{Index}{285} ================================================ FILE: Old/Source-v2/krantz.cls ================================================ %% %% This is file `Krantz.cls' %%% Created by Shashi Kumar / ITC [August 2008] \NeedsTeXFormat{LaTeX2e}[1995/12/01] \ProvidesClass{krantz} [2005/09/16 v1.4f Standard LaTeX document class] \newcommand\@ptsize{} \newif\if@restonecol \newif\if@titlepage \@titlepagetrue \newif\if@openright \newif\if@mainmatter \@mainmattertrue \if@compatibility\else \DeclareOption{a4paper} {\setlength\paperheight {297mm}% \setlength\paperwidth {210mm}} \DeclareOption{a5paper} {\setlength\paperheight {210mm}% \setlength\paperwidth {148mm}} \DeclareOption{b5paper} {\setlength\paperheight {250mm}% \setlength\paperwidth {176mm}} \DeclareOption{letterpaper} {\setlength\paperheight {11in}% \setlength\paperwidth {8.5in}} \DeclareOption{legalpaper} {\setlength\paperheight {14in}% \setlength\paperwidth {8.5in}} \DeclareOption{executivepaper} {\setlength\paperheight {10.5in}% \setlength\paperwidth {7.25in}} \DeclareOption{landscape} {\setlength\@tempdima {\paperheight}% \setlength\paperheight {\paperwidth}% \setlength\paperwidth {\@tempdima}} \fi \if@compatibility \renewcommand\@ptsize{0} \else \DeclareOption{10pt}{\renewcommand\@ptsize{0}} \fi \DeclareOption{11pt}{\renewcommand\@ptsize{1}} \DeclareOption{12pt}{\renewcommand\@ptsize{2}} \if@compatibility\else \DeclareOption{oneside}{\@twosidefalse \@mparswitchfalse} \fi \DeclareOption{twoside}{\@twosidetrue 
\@mparswitchtrue} \DeclareOption{draft}{\setlength\overfullrule{5pt}} \if@compatibility\else \DeclareOption{final}{\setlength\overfullrule{0pt}} \fi \DeclareOption{titlepage}{\@titlepagetrue} \if@compatibility\else \DeclareOption{notitlepage}{\@titlepagefalse} \fi \if@compatibility \@openrighttrue \else \DeclareOption{openright}{\@openrighttrue} \DeclareOption{openany}{\@openrightfalse} \fi \if@compatibility\else \DeclareOption{onecolumn}{\@twocolumnfalse} \fi \DeclareOption{twocolumn}{\@twocolumntrue} \DeclareOption{leqno}{\input{leqno.clo}} \DeclareOption{fleqn}{\input{fleqn.clo}} \DeclareOption{openbib}{% \AtEndOfPackage{% \renewcommand\@openbib@code{% \advance\leftmargin\bibindent \itemindent -\bibindent \listparindent \itemindent \parsep \z@ }% \renewcommand\newblock{\par}}% } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newif\if@numbysec \DeclareOption{numbysec}{\@numbysectrue} \newif\if@numberinsequence \DeclareOption{numberinsequence}{\@numberinsequencetrue} \newif\if@nocaptionbreak \DeclareOption{NoCaptionBreak}{\@nocaptionbreaktrue} \newif\if@sevenbyten \DeclareOption{sevenbyten}{\@sevenbytentrue} \newif\if@cip \DeclareOption{cip}{\@ciptrue} \newif\if@times \DeclareOption{times}{\@timestrue} \newif\if@chapnumonly \DeclareOption{chapnumonly}{\@chapnumonlytrue} \newif\if@ChapterResetsPage \DeclareOption{ChapterResetsPage}{\@ChapterResetsPagetrue} \newif\if@ChapterTOCs \DeclareOption{ChapterTOCs}{\@ChapterTOCstrue} \newif\if@EOCRefs \DeclareOption{EOCRefs}{\@EOCRefstrue}% \newif\if@SuperscriptCites \DeclareOption{SuperscriptCites}{\@SuperscriptCitestrue}% \newif\if@UnnumberedReferences \DeclareOption{UnnumberedReferences}{\@UnnumberedReferencestrue}% \newif\if@pdf \DeclareOption{pdf}{\@pdftrue} \DeclareOption{krantz1}{\@krantzatrue} \newif\if@krantza \DeclareOption{krantz2}{\@krantzbtrue} \newif\if@krantzb %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
% Class defaults: these options are executed first, then any options the
% user passed to \documentclass are processed and may override them.
\ExecuteOptions{letterpaper,10pt,twoside,onecolumn,final,openright}
\ProcessOptions
%%%%%%%%%%%%%%%%%%%
% Scale factor applied to every `helvetica' font shape declared below.
\def\helv@scale{.82}
%
% ---- T1 (Cork) encoding: all shapes load the `8t' metrics ----
\DeclareFontFamily{T1}{helvetica}{}%
\DeclareFontShape{T1}{helvetica}{m}{n}{<->s*[\helv@scale]phvr8t}{}%
\DeclareFontShape{T1}{helvetica}{m}{it}{<->s*[\helv@scale]phvro8t}{}%
\DeclareFontShape{T1}{helvetica}{m}{sc}{<->s*[\helv@scale]phvrc8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{n}{<->s*[\helv@scale]phvb8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{it}{<->s*[\helv@scale]phvbo8t}{}%
\DeclareFontShape{T1}{helvetica}{m}{sl}{<->s*[\helv@scale]phvro8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{sc}{<->s*[\helv@scale]phvbc8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{sl}{<->s*[\helv@scale]phvbo8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{n}{<->s*[\helv@scale]phvb8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{it}{<->s*[\helv@scale]phvbo8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{sc}{<->s*[\helv@scale]phvbc8t}{}%
% bx/sl is silently substituted by the b/it shape declared above.
\DeclareFontShape{T1}{helvetica}{bx}{sl}{<->ssub * helvetica/b/it}{}%
% ---- OT1 encoding: all shapes load the `7t' metrics ----
\DeclareFontFamily{OT1}{helvetica}{}%
\DeclareFontShape{OT1}{helvetica}{m}{n}{<->s*[\helv@scale]phvr7t}{}%
\DeclareFontShape{OT1}{helvetica}{m}{it}{<->s*[\helv@scale]phvro7t}{}%
\DeclareFontShape{OT1}{helvetica}{m}{sc}{<->s*[\helv@scale]phvrc7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{n}{<->s*[\helv@scale]phvb7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{it}{<->s*[\helv@scale]phvbo7t}{}%
\DeclareFontShape{OT1}{helvetica}{m}{sl}{<->s*[\helv@scale]phvro7t}{}%
% Fixed: was phvbc8t (the T1-encoded metric), which does not match the OT1
% slot layout; every other OT1 shape here uses the 7t metric.
\DeclareFontShape{OT1}{helvetica}{b}{sc}{<->s*[\helv@scale]phvbc7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{sl}{<->s*[\helv@scale]phvbo7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{n}{<->s*[\helv@scale]phvb7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{it}{<->s*[\helv@scale]phvbo7t}{}%
% Fixed: was phvbc8t, same encoding mismatch as b/sc.
\DeclareFontShape{OT1}{helvetica}{bx}{sc}{<->s*[\helv@scale]phvbc7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{sl}{<->s*[\helv@scale]phvbo7t}{}%
%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%% Font Defined %%%%%%%%%%%%%%%%%
% Point-size shorthands (11 and 18).
\def\@xipt{11}
\def\@xviiipt{18}
% Point-size shorthand (24).
\def\@xxivpt{24}
%
% Font-switch macros. Each one selects a fixed size/leading pair via
% \fontsize{size}{baselineskip} ... \selectfont, optionally preceded by
% \reset@font and a series/shape change. The macro names indicate the
% intended use; the call sites lie outside this part of the class.
%
% -- contributor list / title page --
\newcommand{\ContributorAffiliationFont}{\reset@font\fontsize{10}{12}\raggedright\selectfont}
\newcommand{\ContributorNameFont}{\reset@font\fontsize{10}{12}\bfseries\raggedright\selectfont}
\newcommand{\TitlePageTitleFont}{\fontsize{24}{28}\slshape\bfseries\selectfont}
% -- folios and chapter openers --
\newcommand{\PageNumFont}{\reset@font\fontsize{10}{12}\selectfont}
\newcommand{\ChapNumFont}{\reset@font\fontsize{24}{24}\bfseries\selectfont}
\newcommand{\ChapTitleFont}{\reset@font\fontsize{18}{20}\slshape\selectfont}
% -- sectioning heads --
\newcommand{\SectionHeadFont}{\fontsize{12}{14}\bfseries\selectfont}
\newcommand{\SubsectionHeadFont}{\fontsize{11}{13}\bfseries\selectfont}
\newcommand{\SubsubsectionHeadFont}{\fontsize{10}{12}\bfseries\selectfont}
\newcommand{\ParagraphHeadFont}{\fontsize{10}{12}\itshape\selectfont}
\newcommand{\SubParagraphHeadFont}{\fontsize{10}{12}\itshape\selectfont}
% -- front matter, running heads, author block --
\newcommand{\FMHeadFont}{\reset@font\fontsize{18}{20}\slshape\bfseries\selectfont}
\newcommand{\RunningHeadFont}{\fontsize{10}{12}\itshape\selectfont}
\newcommand{\NameFont}{\fontsize{10}{12}\itshape\selectfont}
\newcommand{\AffiliationFont}{\fontsize{8}{10}\selectfont}
% -- figure captions --
\newcommand{\FigCapFont}{\fontsize{10}{12}\bfseries\selectfont}
\newcommand{\FigCapBIFont}{\fontsize{10}{12}\bfseries\itshape\selectfont}
% -- tables --
\newcommand{\TableColHeadFont}{\fontsize{10}{12}\bfseries\selectfont}
\newcommand{\TableTitleFont}{\fontsize{10}{12}\selectfont}
\newcommand{\TableNumberFont}{\fontsize{11}{13}\bfseries\selectfont}
\newcommand{\TableBodyFont}{\reset@font\fontsize{9}{11}\selectfont}
\newcommand{\TableSubheadFont}{\reset@font\fontsize{9}{11}\selectfont}
\newcommand{\TableFootnoteFont}{\reset@font\fontsize{8}{10}\selectfont}
% -- miscellaneous --
\newcommand{\CAPlusOneFont}{\fontsize{10}{12}\bfseries\selectfont}
\newcommand{\CAAPlusOneFont}{\fontsize{10}{12}\itshape\selectfont}
\newcommand{\tocfont}{\fontsize{10}{12}\selectfont}
\newcommand{\extraFont}{\fontsize{24}{28}\selectfont}
\newcommand{\VfFont}{\fontsize{10}{12}\selectfont}
%%%%%%%%%%%%%%%%%
% Load the size-option file bk1<n>.clo, where <n> is \@ptsize
% (set to 0, 1 or 2 by the 10pt/11pt/12pt class options).
\input{bk1\@ptsize.clo}
\setlength{\lineskip}{1\p@}
\setlength\normallineskip{1\p@} \renewcommand\baselinestretch{} \setlength\parskip{0\p@ \@plus \p@} \@lowpenalty 51 \@medpenalty 151 \@highpenalty 301 \@beginparpenalty -\@lowpenalty \@endparpenalty -\@lowpenalty \@itempenalty -\@lowpenalty % \clubpenalty=0 % 'Club line' at bottom of page. \widowpenalty=10000 % 'Widow line' at top of page. \setcounter{topnumber}{2} \renewcommand\topfraction{.7} \setcounter{bottomnumber}{1} \renewcommand\bottomfraction{.3} \setcounter{totalnumber}{3} \renewcommand\textfraction{.2} \renewcommand\floatpagefraction{.5} \setcounter{dbltopnumber}{2} \renewcommand\dbltopfraction{.7} \renewcommand\dblfloatpagefraction{.5} % **************************************** % * PAGE LAYOUT * % **************************************** % % All margin dimensions measured from a point one inch from top and side % of page. % % SIDE MARGINS: % \oddsidemargin 6pc %5pc \evensidemargin 5.7pc %5pc \marginparwidth 4pc \marginparsep 1pc \topmargin 12pt %0pt \headheight 12pt \headsep 12pt \footskip 2pc % % DIMENSION OF TEXT: \newdimen\trimheight \newdimen\trimwidth \newdimen\normaltextheight \newdimen\tempa \newdimen\tempdimen % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Parameter Initializaton %%%%%%%%%%%%%%%%%%%%%%%%%% % \newdimen\htrim \newdimen\vtrimtop \newdimen\vtrimbot \setlength\trimheight{9in} \setlength\trimwidth{6in} % % \if@krantza \textheight = 45\baselineskip %\advance\textheight by \topskip \addtolength\textheight{3pt} \textwidth 28pc \addtolength\textwidth{.5pt} \topmargin0in \oddsidemargin1.1875in \evensidemargin1.1875in \htrim.7365in \vtrimtop1.068in \vtrimbot1.068in \hoffset-15pt \voffset39pt \let\normaltextheight\textheight \else\if@krantzb \textheight = 51pc % \advance\textheight by \topskip \textwidth 33pc \topmargin0in \oddsidemargin.5in \evensidemargin.5in \htrim.75in \vtrimtop.8607in \vtrimbot1.027in \hoffset-.1in \voffset-.15in%.04in \let\normaltextheight\textheight \else \textheight = 43\baselineskip %\advance\textheight by \topskip 
\addtolength\textheight{3pt} \textwidth 26pc \addtolength\textwidth{.5pt} \topmargin0in \oddsidemargin1.1875in \evensidemargin1.1875in \htrim5.05pc \vtrimtop7.7pc \vtrimbot5.44pc % \hoffset-5pt \voffset45pt \let\normaltextheight\textheight \fi \fi % \columnsep 1pc \columnseprule 0pt % % FOOTNOTES % \footnotesep 6.65pt \skip\footins 12pt plus 3pt minus 1.5pt % %%%% Trim marks %%%%%%%%%%% \newsavebox\ul@box \newsavebox\ur@box \newsavebox\ll@box \newsavebox\lr@box \def\top@cornermarks{% \hskip-\htrim \vbox to 0\p@{\vskip-\vtrimtop\llap{\copy\ul@box}\vss}% \vbox to 0\p@{\vskip-\vtrimtop\rlap{\hskip\textwidth\hskip2\htrim\copy\ur@box}\vss}% \vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\llap{\copy\ll@box}\vss}% \vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\rlap{\hskip\textwidth\hskip2\htrim\copy\lr@box}\vss}% \hskip\htrim} \def\make@cornermarks{% \sbox\ul@box{\rule{18\p@}{.25\p@}\hskip8\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}% \sbox\ur@box{\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}% \sbox\ll@box{\rule{18\p@}{.25\p@}\hskip8\p@\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}% \sbox\lr@box{\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}} %%%%%%%%%%%%%%%%%%%% End Trim Marks %%%%%%%%%%%% \def\ps@plain{\let\@mkboth\@gobbletwo \let\@oddhead\top@cornermarks%\@empty \def\@oddfoot{\reset@font\hfil\thepage \hfil}\let\@evenhead\@empty\let\@evenfoot\@oddfoot} \def\even@head{% \top@cornermarks {\@the@page\RunningHeadFont \hfill \leftmark }} \def\odd@head{% \top@cornermarks \hfil{\RunningHeadFont \rightmark } \hfill \@the@page } \def\@the@page{{\PageNumFont\thepage}} \if@twoside \def\ps@headings{% \let\@mkboth\@gobbletwo \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \let\@oddfoot\@empty \let\@evenfoot\@empty \let\@evenhead\even@head \let\@oddhead\odd@head \fi } \else 
\def\ps@headings{\let\@mkboth\@gobbletwo% \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \let\@oddfoot\@empty \let\@evenfoot\@empty \let\@evenhead\even@head \let\@oddhead\odd@head \fi } \fi \def\ps@myheadings{% \let\@oddfoot\@empty\let\@evenfoot\@empty \def\@evenhead{\thepage\hfil\slshape\leftmark}% \def\@oddhead{{\slshape\rightmark}\hfil\thepage}% \let\@mkboth\@gobbletwo \let\chaptermark\@gobble \let\sectionmark\@gobble } \def\ps@empty{% \let\@mkboth\@gobbletwo \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \make@cornermarks \let\@oddhead\top@cornermarks \let\@evenhead\top@cornermarks \let\@oddfoot\@empty \let\@evenfoot\@empty \fi } \def\ps@folio{% \let\@mkboth\@gobbletwo \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \let\@oddhead\top@cornermarks \def\@oddfoot{% \parindent\z@ \baselineskip7\p@ \hbox{% \textwidth\@ciprulewidth \vbox{% \if@cip\rule{\@ciprulewidth}{.25pt}\par \hbox{\vbox{\noindent\copy\@cipboxa\par\noindent\copy\@cipboxb}}\fi}} \hfill\@the@page} \let\@evenhead\top@cornermarks%\odd@head \let\@evenfoot\@oddfoot \fi } \newcommand\HeadingsBookChapter{% \def\chaptermark##1{% \markboth{\@title}{% ##1}}% \def\sectionmark##1{}} \def\HeadingsChapterSection{% \def\chaptermark##1{% \markboth{% ##1}{}}% \def\sectionmark##1{% \markright{% ##1}}} \def\pdfon{\@pdftrue} \def\pdfoff{\@pdffalse} \if@pdf \def\@cip{{\fontsize{6\p@}{8\p@}\selectfont\copyright 2001 by CRC Press LLC}} \else \newsavebox\@cipboxa \newsavebox\@cipboxb \newdimen\@ciprulewidth \def\@cip#1#2{% \sbox\@cipboxa{\fontsize{6\p@}{8\p@}\selectfont #1}% \sbox\@cipboxb{\fontsize{6\p@}{8\p@}\selectfont #2}% \@ciprulewidth\wd\@cipboxa \ifnum\@ciprulewidth<\wd\@cipboxb\@ciprulewidth\wd\@cipboxb\fi}% \fi \if@pdf \else \AtBeginDocument{% \@cip{\rule{0pt}{9pt}0-8493-0052-5/00/\$0.00+\$.50}% {\copyright\ \ 2001 by CRC 
Press LLC}}% \fi \if@titlepage \newcommand\maketitle{\begin{titlepage}% \let\footnotesize\small \let\footnoterule\relax \let \footnote \thanks {\parindent \z@ \raggedright \baselineskip \z@ \lineskip \z@ \parskip \z@ \vbox{ \vskip -7bp {\baselineskip 10bp\lineskip 10bp\NameFont\uppercase{\@author}\par} \vskip 6bp \AffiliationFont \@affiliation \vskip -2bp \crcrule \vskip 22bp {\baselineskip 24bp\lineskip 24bp\TitlePageTitleFont\@title\par}}} \@thanks \vfil\null \end{titlepage}% \setcounter{footnote}{0}% \global\let\thanks\relax \global\let\maketitle\relax \global\let\@thanks\@empty \global\let\@author\@empty \global\let\@date\@empty % \global\let\@title\@empty \global\let\title\relax \global\let\author\relax \global\let\date\relax \global\let\and\relax } \else \newcommand\maketitle{\par \begingroup \renewcommand\thefootnote{\@fnsymbol\c@footnote}% \def\@makefnmark{\rlap{\@textsuperscript{\normalfont\@thefnmark}}}% \long\def\@makefntext##1{\parindent 1em\noindent \hb@xt@1.8em{% \hss\@textsuperscript{\normalfont\@thefnmark}}##1}% \if@twocolumn \ifnum \col@number=\@ne \@maketitle \else \twocolumn[\@maketitle]% \fi \else \newpage \global\@topnum\z@ % Prevents figures from going at top of page. 
\@maketitle \fi \thispagestyle{empty}\@thanks \endgroup \setcounter{footnote}{0}% \global\let\thanks\relax \global\let\maketitle\relax \global\let\@maketitle\relax \global\let\@thanks\@empty \global\let\@author\@empty \global\let\@date\@empty \global\let\@title\@empty \global\let\title\relax \global\let\author\relax \global\let\date\relax \global\let\and\relax } \def\@maketitle{% \newpage \null \vskip 2em% {\parindent \z@ \raggedright \baselineskip \z@ \lineskip \z@ \parskip \z@ \vbox{ \vskip -7bp {\baselineskip 10bp\lineskip 10bp\NameFont\uppercase{\@author}\par} \vskip 6bp \AffiliationFont \@affiliation \vskip 10bp \crcrule \vskip 26bp {\baselineskip 24bp\lineskip 24bp\TitlePageTitleFont\@title\par}}} \par \vskip 1.5em} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand*\chaptermark[1]{} \setcounter{secnumdepth}{3} \newcounter {part} \newcounter {chapter} \newcounter {section}[chapter] \newcounter {subsection}[section] \newcounter {subsubsection}[subsection] \newcounter {paragraph}[subsubsection] \newcounter {subparagraph}[paragraph] \renewcommand \thepart {\@Roman\c@part} \renewcommand \thechapter {\@arabic\c@chapter} \renewcommand \thesection {\thechapter.\@arabic\c@section} \renewcommand\thesubsection {\thesection.\@arabic\c@subsection} \renewcommand\thesubsubsection{\thesubsection .\@arabic\c@subsubsection} \renewcommand\theparagraph {\thesubsubsection.\@arabic\c@paragraph} \renewcommand\thesubparagraph {\theparagraph.\@arabic\c@subparagraph} \newcommand\@chapapp{\chaptername} \newcommand\frontmatter{% \cleardoublepage \@mainmatterfalse \pagenumbering{roman}} \newcommand\mainmatter{% \cleardoublepage \@mainmattertrue \pagenumbering{arabic}} \newcommand\backmatter{% \if@openright \cleardoublepage \else \clearpage \fi \@mainmatterfalse} \newcommand\part{\make@cornermarks% \if@openright \cleardoublepage \else \clearpage \fi \thispagestyle{plain}% \if@twocolumn \onecolumn \@tempswatrue \else \@tempswafalse \fi \null\vfil \secdef\@part\@spart} 
% ---------------------------------------------------------------------
% Parts
% ---------------------------------------------------------------------
% \@part[<toc text>]{<title>}: numbered part heading.  When secnumdepth
% is greater than -2 the part counter is stepped and the TOC line carries
% the part number; otherwise only the text is written.  The running
% heads are cleared and the centred heading is typeset before \@endpart.
\def\@part[#1]#2{%
    \ifnum \c@secnumdepth >-2\relax
      \refstepcounter{part}%
      \addcontentsline{toc}{part}{\thepart\hspace{1em}#1}%
    \else
      \addcontentsline{toc}{part}{#1}%
    \fi
    \markboth{}{}%
    {\centering
     \interlinepenalty \@M
     \normalfont
     \ifnum \c@secnumdepth >-2\relax
       \huge\bfseries \partname\nobreakspace\thepart
       \par
       \vskip 20\p@
     \fi
     \Huge \bfseries #2\par}%
    \@endpart}
% \@spart{<title>}: starred part -- heading only, no counter, no TOC line.
\def\@spart#1{%
    {\centering
     \interlinepenalty \@M
     \normalfont
     \Huge \bfseries #1\par}%
    \@endpart}
% \@endpart: finishes the part page.  In two-sided open-right layouts an
% extra empty page follows; two-column mode is restored when \@tempswa
% is set.
\def\@endpart{\vfil\newpage
              \if@twoside
               \if@openright
                \null
                \thispagestyle{empty}%
                \newpage
               \fi
              \fi
              \if@tempswa
                \twocolumn
              \fi}
% Per-chapter table of contents: output stream and number-column widths,
% allocated only when the ChapterTOCs switch is set.
\if@ChapterTOCs
  \newwrite\@chaptoc
  \def\secnumwidth{21pt}\def\subsecnumwidth{30pt}\def\ssubsecnumwidth{36pt}\fi
% ---------------------------------------------------------------------
% \@trplarg{<cmd>}: argument normaliser used by the sectioning commands.
% It always ends up calling <cmd>[<a>][<b>]{<title>}:
%   <cmd>{T}        -> <cmd>[{T}][{T}]{T}
%   <cmd>[A]{T}     -> <cmd>[{A}][{A}]{T}
%   <cmd>[A][B]{T}  -> <cmd>[A][B]{T}
% ---------------------------------------------------------------------
\long\def\@trplarg#1{\@ifnextchar[{\@xtrplarg{#1}}{\@ztrplarg{#1}}}
\long\def\@xtrplarg#1[#2]{\@ifnextchar[{#1[#2]}{\@ytrplarg{#1}[{#2}]}}
\long\def\@ytrplarg#1[#2]#3{#1[{#2}][{#2}]{#3}}
\long\def\@ztrplarg#1#2{#1[{#2}][{#2}]{#2}}
% ---------------------------------------------------------------------
% Chapters
% ---------------------------------------------------------------------
% \chapter: starts a new page, refreshes the corner marks, closes any
% open per-chapter TOC stream, selects the `headings' page style with a
% `folio' opening page, optionally resets the page counter to 1, and
% points \chapterauthor / \endchapterauthors at the chapter-level
% handlers before dispatching on the star form.
% NOTE(review): \cleardoublepage is issued unconditionally after the
% \if@openright test, so the \clearpage branch appears to have no
% lasting effect -- confirm this is intended.
\newcommand\chapter{\if@openright\cleardoublepage\else\clearpage\fi
                    \make@cornermarks
                    \cleardoublepage
                    \if@ChapterTOCs\if@filesw\immediate\closeout\@chaptoc\fi\fi
                    \pagestyle{headings}%
                    \thispagestyle{folio}%
                    \if@ChapterResetsPage\global\c@page\@ne\fi
                    \global\@topnum\z@
                    \gdef\chapterauthor{\@ca}%
                    \gdef\endchapterauthors{\end@cas}%
                    \@afterindentfalse
%                    \secdef\@chapter\@schapter
                    \@ifstar{\@schapter}{\@trplarg{\@chapter}}}
% \@chapter[<toc text>][<running head>]{<title>}: the chapter counter is
% stepped only in the main matter; elsewhere the TOC line is unnumbered.
% Adds 10pt of space to the lists of figures and tables, typesets the
% heading, and (with ChapterTOCs) opens <chapter number>.toc for writing.
\def\@chapter[#1][#2]#3{%
  \ifnum\c@secnumdepth>\m@ne
    \if@mainmatter
      \refstepcounter{chapter}%
      \typeout{\@chapapp\space\thechapter.}%
      \addcontentsline{toc}{chapter}{\protect\numberline{\thechapter}#1}%
    \else
      \addcontentsline{toc}{chapter}{#1}\fi
  \else
    \addcontentsline{toc}{chapter}{#1}\fi
  \chaptermark{%
    #2}%
  \addtocontents{lof}{\protect\addvspace{10\p@}}%
  \addtocontents{lot}{\protect\addvspace{10\p@}}%
  \if@twocolumn
    \@topnewpage[\@makechapterhead{#3}]%
  \else
    \@makechapterhead{#3}%
    \@afterheading\fi
  \if@ChapterTOCs\if@filesw\immediate\openout\@chaptoc\thechapter.toc\fi\fi
}
% \@makechapterhead{<title>}: numbered chapter opener -- chapter number,
% rule, title, a short 84pt rule, then the per-chapter TOC when enabled.
% Also redefines \theequation as <chapter>.<equation>.
\def\@makechapterhead#1{%
  {\parindent \z@ \raggedright
   \baselineskip \z@
   \lineskip \z@
   \parskip \z@
   \vbox{
     \vskip -2\p@
     \ChapNumFont
     %Remove comment if "Chapter" word required before Number
     %\if@chapnumonly\else
     %  \@chapapp\
     %\fi
     \thechapter
     \vskip -15\p@
     \chap@rule
     \vskip 6\p@
     {\baselineskip 20\p@\lineskip 20\p@\ChapTitleFont #1\par\vskip-15pt}%
     \noindent\hbox{\vrule height.5pt width84pt}
     \vskip28\p@}
   \if@ChapterTOCs
     \make@chaptoc
   \else
   \fi
   \vskip 19.3\p@}
  \def\theequation{\thechapter.\arabic{equation}}}%
% \@schapter{<title>}: starred chapter; typesets the unnumbered heading.
\def\@schapter#1{\if@twocolumn
                   \@topnewpage[\@makeschapterhead{#1}]%
                 \else
                   \@makeschapterhead{#1}%
                   \@afterheading
                 \fi}
% \@makeschapterhead{<title>}: unnumbered chapter opener (rule, title,
% short 84pt rule).  Also redefines \theequation as <chapter>.<equation>.
\def\@makeschapterhead#1{%
  {\parindent \z@ \raggedright
   \baselineskip 6\p@
   \lineskip \z@
   \parskip \z@
   \vbox{
     \vskip 22\p@
     \unnumchap@rule
     \vskip 5\p@
     \FMHeadFont #1\par\vskip-12pt
     \noindent\hbox{\vrule height.5pt width84pt}
     \vskip 41\p@}}%
  \def\theequation{\thechapter.\arabic{equation}}}
% ---------------------------------------------------------------------
% Generic section machinery
% ---------------------------------------------------------------------
% \@startsection{<name>}{<level>}{<indent>}{<beforeskip>}{<afterskip>}{<style>}
% A negative <beforeskip> suppresses indentation of the following
% paragraph.  The star form goes to \@ssect (which, unlike the kernel
% version, also receives <name>); the plain form is normalised by
% \@trplarg and goes to \@sect.
\def\@startsection#1#2#3#4#5#6{%
  \if@noskipsec\leavevmode\fi
  \par
  \@tempskipa #4\relax
  \@afterindenttrue
  \ifdim \@tempskipa <\z@
    \@tempskipa -\@tempskipa \@afterindentfalse
  \fi
  \if@nobreak
    \everypar{}%
  \else
    \addpenalty\@secpenalty\addvspace\@tempskipa
  \fi
  \@ifstar
    {\@ssect{#1}{#3}{#4}{#5}{#6}}%
    {\@trplarg{\@sect{#1}{#2}{#3}{#4}{#5}{#6}}}}
% \@ssect{<name>}{<indent>}{<beforeskip>}{<afterskip>}{<style>}{<title>}
% Unnumbered heading.  A positive <afterskip> gives a display heading;
% otherwise the heading is stored in \@svsechd as a run-in heading.
% In both cases <name>mark is called with the title.
\def\@ssect#1#2#3#4#5#6{%
  \@tempskipa #4\relax
  \ifdim \@tempskipa>\z@
    \begingroup
      #5{%
        \@hangfrom{\hskip #2}%
          \interlinepenalty \@M #6\@@par}%
    \endgroup
    \csname #1mark\endcsname{#6}%
  \else
    \def\@svsechd{#5{\hskip #2\relax #6}\csname #1mark\endcsname{#6}}%
  \fi
  \@xsect{#4}}
% \@sect{<name>}{<level>}{<indent>}{<beforeskip>}{<afterskip>}{<style>}
%       [<toc text>][<mark text>]{<title>}
% Numbered heading.  The counter is stepped unless <level> exceeds
% secnumdepth.  #7 goes to the TOC, #8 to <name>mark, #9 is typeset.
\def\@sect#1#2#3#4#5#6[#7][#8]#9{%
  \ifnum #2>\c@secnumdepth
    \let\@svsec\@empty
  \else
    \refstepcounter{#1}%
    \protected@edef\@svsec{\@seccntformat{#1}\relax}%
  \fi
  \@tempskipa #5\relax
  \ifdim \@tempskipa>\z@
    \begingroup
      #6{%
        \@hangfrom{\hskip #3\relax\@svsec}\interlinepenalty \@M %
          #9\@@par}%
    \endgroup
    \csname #1mark\endcsname{%
      #8}%
    \addcontentsline{toc}{#1}{%
      \ifnum #2>\c@secnumdepth \else
        \protect\numberline{\csname the#1\endcsname}%
      \fi
      #7}%
  \else
    \def\@svsechd{%
      #6{\hskip #3\relax
      \@svsec #9}%
      \csname #1mark\endcsname{%
        #8}%
      \addcontentsline{toc}{#1}{%
        \ifnum #2>\c@secnumdepth \else
          \protect\numberline{\csname the#1\endcsname}%
        \fi
        #7}}%
  \fi
  \@xsect{#5}}
% ---------------------------------------------------------------------
% \section, \subsection, \subsubsection
% Each level has a starred handler (\@ss...) and a three-argument
% handler (\@...[#1][#2]{#3}).  With ChapterTOCs set, a line is also
% written to the per-chapter TOC; the numbered forms step the counter
% temporarily so that \the<level> shows the number the heading is about
% to receive.  Only [#2] and {#3} are forwarded to \@startsection, so
% #1 is used for the per-chapter TOC line only.
% ---------------------------------------------------------------------
% \section also switches \chapterauthor / \endchapterauthors to the
% in-chapter handlers.
\newcommand\section{%
  \gdef\chapterauthor{\@caplusone}%
  \gdef\endchapterauthors{\end@casplusone}%
  \@ifstar{\@ssection}{\@trplarg{\@section}}}
\def\@ssection#1{%
  \if@ChapterTOCs
  \myaddcontentsline{\@chaptoc}{chapsection}{\string\makebox[\secnumwidth][l]{}#1}\fi
  \@startsection{section}{1}{\z@}{-30\p@}{6\p@}{\sec@rule\nopagebreak\vskip9.5\p@\nopagebreak\SectionHeadFont}*{#1}}
\def\@section[#1][#2]#3{%
  \if@ChapterTOCs
  \addtocounter{section}{1}%
  \myaddcontentsline{\@chaptoc}{chapsection}{\string\makebox[\secnumwidth][l]{\thesection}#1}%
  \addtocounter{section}{-1}\fi
  \@startsection{section}{1}{\z@}{-30\p@}{6\p@}{\sec@rule\nopagebreak\vskip9.5\p@\nopagebreak\SectionHeadFont}[#2]{#3}}
% \sectionauthor{<name>}: author name pushed to the right with \hfill.
\def\sectionauthor#1{\hfill{\ChapTOCAuthorFont #1}}
\newcommand\subsection{\@ifstar{\@ssubsection}{\@trplarg{\@subsection}}}
\def\@ssubsection#1{%
  \if@ChapterTOCs
  \myaddcontentsline{\@chaptoc}{chapsubsection}{\string\makebox[\subsecnumwidth][l]{}#1}\fi
  \@startsection{subsection}{2}{\z@}{-18\p@}{6\p@}{%
    \SubsectionHeadFont}*{#1}}
\def\@subsection[#1][#2]#3{%
  \if@ChapterTOCs
  \addtocounter{subsection}{1}%
  \myaddcontentsline{\@chaptoc}{chapsubsection}{\string\makebox[\subsecnumwidth][l]{\thesubsection}#1}%
  \addtocounter{subsection}{-1}\fi
  \@startsection{subsection}{2}{\z@}{-18\p@}{6\p@}{%
    \SubsectionHeadFont}[#2]{#3}}
\newcommand\subsubsection{\@ifstar{\@ssubsubsection}{\@trplarg{\@subsubsection}}}
\def\@ssubsubsection#1{%
  \if@ChapterTOCs
  \myaddcontentsline{\@chaptoc}{chapsubsubsection}{\string\makebox[\subsecnumwidth][l]{}#1}\fi
  \@startsection{subsubsection}{3}{\z@}{-12\p@}{6\p@}{%
    \SubsubsectionHeadFont}*{#1}}
\def\@subsubsection[#1][#2]#3{%
  \if@ChapterTOCs
  \addtocounter{subsubsection}{1}%
  \myaddcontentsline{\@chaptoc}{chapsubsubsection}{\hskip21pt\string\makebox[\ssubsecnumwidth][l]{\thesubsubsection}#1}%
  \addtocounter{subsubsection}{-1}\fi
  \@startsection{subsubsection}{3}{\z@}{-12\p@}{6\p@}{%
    \SubsubsectionHeadFont}[#2]{#3}}
% \paragraph and \subparagraph call \@startsection directly and have no
% per-chapter TOC entry.
\newcommand\paragraph{\@startsection{paragraph}{4}{\z@}%
  {-12\p@}{6\p@}{\ParagraphHeadFont}}
\newcommand\subparagraph{\@startsection{subparagraph}{5}{\parindent}%
  {-12\p@}{6\p@}{\SubParagraphHeadFont}}
% ---------------------------------------------------------------------
% List parameters and labels
% ---------------------------------------------------------------------
\if@twocolumn
  \setlength\leftmargini {2em}
\else
  \setlength\leftmargini {2.5em}
\fi
\leftmargin \leftmargini
\setlength\leftmarginii {2.2em}
\setlength\leftmarginiii {1.87em}
\setlength\leftmarginiv {1.7em}
\if@twocolumn
  \setlength\leftmarginv {.5em}
  \setlength\leftmarginvi {.5em}
\else
  \setlength\leftmarginv {1em}
  \setlength\leftmarginvi {1em}
\fi
\setlength \labelsep {.5em}
\setlength \labelwidth{\leftmargini}
\addtolength\labelwidth{-\labelsep}
\@beginparpenalty -\@lowpenalty
\@endparpenalty -\@lowpenalty
\@itempenalty -\@lowpenalty
% enumerate: 1. / (a) / i. / A.
\renewcommand\theenumi{\@arabic\c@enumi}
\renewcommand\theenumii{\@alph\c@enumii}
\renewcommand\theenumiii{\@roman\c@enumiii}
\renewcommand\theenumiv{\@Alph\c@enumiv}
\newcommand\labelenumi{\theenumi.}
\newcommand\labelenumii{(\theenumii)}
\newcommand\labelenumiii{\theenumiii.}
\newcommand\labelenumiv{\theenumiv.}
% Prefixes used when a nested enumerate item is cross-referenced.
\renewcommand\p@enumii{\theenumi}
\renewcommand\p@enumiii{\theenumi(\theenumii)}
\renewcommand\p@enumiv{\p@enumiii\theenumiii}
% itemize: bullet / bold en-dash / centred asterisk / centred period.
\newcommand\labelitemi{\textbullet}
\newcommand\labelitemii{\normalfont\bfseries \textendash}
\newcommand\labelitemiii{\textasteriskcentered}
\newcommand\labelitemiv{\textperiodcentered}
\newenvironment{description}
               {\list{}{\labelwidth\z@ \itemindent-\leftmargin
                        \let\makelabel\descriptionlabel}}
               {\endlist}
\newcommand*\descriptionlabel[1]{\hspace\labelsep
                                \normalfont\bfseries #1}
\newenvironment{verse}
               {\let\\\@centercr
                \list{}{\itemsep \z@
                        \itemindent -1.5em%
                        \listparindent\itemindent
                        \rightmargin \leftmargin
                        \advance\leftmargin 1.5em}%
                \item\relax}
               {\endlist}
\newenvironment{quotation}
               {\list{}{\listparindent 1.5em%
                        \itemindent \listparindent
                        \rightmargin \leftmargin
                        \parsep \z@ \@plus\p@}%
                \item\relax}
               {\endlist}
\newenvironment{quote}
               {\list{}{\rightmargin\leftmargin}%
                \item\relax}
               {\endlist}
% ---------------------------------------------------------------------
% titlepage: the compatibility-mode variant sets the page counter to 0;
% the normal variant sets it to 1 and, in one-sided documents, sets it
% to 1 again at the end.
% ---------------------------------------------------------------------
\if@compatibility
\newenvironment{titlepage}
    {%
      \cleardoublepage
      \if@twocolumn
        \@restonecoltrue\onecolumn
      \else
        \@restonecolfalse\newpage
      \fi
      \thispagestyle{empty}%
      \setcounter{page}\z@
    }%
    {\if@restonecol\twocolumn \else \newpage \fi
    }
\else
\newenvironment{titlepage}
    {%
      \cleardoublepage
      \if@twocolumn
        \@restonecoltrue\onecolumn
      \else
        \@restonecolfalse\newpage
      \fi
      \thispagestyle{empty}%
      \setcounter{page}\@ne
    }%
    {\if@restonecol\twocolumn \else \newpage \fi
     \if@twoside\else
        \setcounter{page}\@ne
     \fi
    }
\fi
% \appendix: restarts chapter and section numbering and switches chapter
% numbers to capital letters.
\newcommand\appendix{\par
  \setcounter{chapter}{0}%
  \setcounter{section}{0}%
  \gdef\@chapapp{\appendixname}%
  \gdef\thechapter{\@Alph\c@chapter}}
% ---------------------------------------------------------------------
% Array, tabular and box parameters
% ---------------------------------------------------------------------
\setlength\arraycolsep{5\p@}
\setlength\tabcolsep{6\p@}
\setlength\arrayrulewidth{.4\p@}
\setlength\doublerulesep{2\p@}
\setlength\tabbingsep{\labelsep}
\skip\@mpfootins = \skip\footins
\setlength\fboxsep{3\p@}
\setlength\fboxrule{.4\p@}
% ---------------------------------------------------------------------
% Equations, figures and tables are numbered within chapters; the
% chapter prefix is omitted while the chapter counter is zero.
% ---------------------------------------------------------------------
\@addtoreset {equation}{chapter}
\renewcommand\theequation
  {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@equation}
\newcounter{figure}[chapter]
\renewcommand \thefigure
     {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@figure}
\def\fps@figure{tbp}
\def\ftype@figure{1}
\def\ext@figure{lof}
\def\fnum@figure{\figurename\nobreakspace\thefigure}
\newenvironment{figure}
               {\@float{figure}}
               {\end@float}
\newenvironment{figure*}
               {\@dblfloat{figure}}
               {\end@dblfloat}
\newcounter{table}[chapter]
\renewcommand \thetable
     {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@table}
\def\fps@table{tbp}
\def\ftype@table{2}
\def\ext@table{lot}
\def\fnum@table{\tablename\nobreakspace\thetable}
\newenvironment{table}
               {\@float{table}}
               {\end@float}
\newenvironment{table*}
               {\@dblfloat{table}}
               {\end@dblfloat}
\newlength\abovecaptionskip
\newlength\belowcaptionskip
\setlength\abovecaptionskip{10\p@}
\setlength\belowcaptionskip{0\p@}
% \@makecaption{<label>}{<text>}: the label is set in \FigCapFont on its
% own line, followed by the text.  Both branches of the width test
% typeset the same material (the centred one-line form is commented
% out); the \else branch additionally clears \if@minipage.
\long\def\@makecaption#1#2{%
  \vskip\abovecaptionskip
  \sbox\@tempboxa{#1: #2}%
  \ifdim \wd\@tempboxa >\hsize
    {\FigCapFont #1}\par #2\par
  \else
    \global \@minipagefalse
%    \hb@xt@\hsize{\hfil\box\@tempboxa\hfil}%
    {\FigCapFont #1}\par #2\par
  \fi
  \vskip\belowcaptionskip}
% ---------------------------------------------------------------------
% Old-style (LaTeX 2.09) font commands
% ---------------------------------------------------------------------
\DeclareOldFontCommand{\rm}{\normalfont\rmfamily}{\mathrm}
\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf}
\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt}
\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf}
\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit}
\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl}
\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc}
\DeclareRobustCommand*\cal{\@fontswitch\relax\mathcal}
\DeclareRobustCommand*\mit{\@fontswitch\relax\mathnormal}
% ---------------------------------------------------------------------
% Main table of contents
% Every \l@<level> first flushes the pending entry held in \toc@draw and
% then stores its own entry there, so an entry is typeset only once the
% next one (or the end of the TOC) is reached.  \author lines read from
% the .toc file in between are collected by \toc@author and printed
% under the chapter entry by \draw@authors.
% ---------------------------------------------------------------------
\newcommand\@pnumwidth{1.55em}
\newcommand\@tocrmarg{2.55em}
\newcommand\@dotsep{4.5}
\setcounter{tocdepth}{3}
\newcounter{numauthors}
\newif\if@break
\newif\if@firstauthor
\newcommand\tableofcontents{%
    \if@twocolumn
      \@restonecoltrue\onecolumn
    \else
      \@restonecolfalse
    \fi
    \chapter*{\contentsname
        \@mkboth{%
           \MakeUppercase\contentsname}{\MakeUppercase\contentsname}}%
    {\let\break\space
    \let\author\toc@author
    \reset@authors
    \let\toc@draw\relax
    \@starttoc{toc}
    \toc@draw
    }
    \if@restonecol\twocolumn\fi
    }
% \draw@part{<text>}{<page>}: bold part entry; the page number is
% omitted when \if@pdf is true.
\def\draw@part#1#2{%
  \addpenalty{-\@highpenalty}%
  \vskip1em plus\p@
  \@tempdima1.5em
  \begingroup
    \parindent\z@\rightskip\@pnumwidth
    \parfillskip-\rightskip
    \bfseries
    \leavevmode
    \advance\leftskip\@tempdima
    \hskip-\leftskip
    {#1\hfil}\nobreak
    \if@pdf
    \else
      \hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}%
    \fi
    \par
    \penalty\@highpenalty\endgroup}
\let\toc@draw\relax
%
\def\l@part#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@part{\large #1}{\large #2}}}
\def\l@chapter#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@chapter{#1}{#2}}}
% Replaces the 1.55em page-number width set above.
\def\@pnumwidth{1.8em}
% \draw@chapter{<text>}{<page>}: as \draw@part, followed by the collected
% author names.
\def\draw@chapter#1#2{%
  \addpenalty{-\@highpenalty}%
  \vskip1em plus\p@
  \@tempdima1.5em
  \begingroup
    \parindent\z@\rightskip\@pnumwidth
    \parfillskip-\rightskip
    \bfseries
    \leavevmode
    \advance\leftskip\@tempdima
    \hskip-\leftskip
    {#1\hfil}\nobreak
    \if@pdf
    \else
      \hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}%
    \fi
    \par
    {\it\draw@authors}%
    \penalty\@highpenalty\endgroup}
% \toc@author{<name>}{<affiliation>}: the previously seen author is
% appended to \@authors, the new one becomes \last@author, and
% numauthors is stepped.  The second argument is ignored.
\def\toc@author#1#2{%
  \if@firstauthor
    \@firstauthorfalse
  \else
    \ifx\@authors\@empty
      \xdef\@authors{\last@author}%
    \else
      \@cons{\@authors}{, \last@author}\fi\fi
  \stepcounter{numauthors}%
%%%%%%% commented and deleted below the second part to avoid inaccessible error
% shashi
% September-2008
%%  \gdef\last@author{#1 {\rm\fontsize{9\p@}{11\p@}\selectfont #2}}
  \gdef\last@author{#1}
}
% \draw@authors: prints "<authors>[,] and <last author>" (the comma only
% when numauthors > 2), or just the last author when there is only one,
% then resets the collection.
\def\draw@authors{%
  \let\@t\@authors
  \ifx\@t\@empty \let\@t\last@author\fi
  \ifx\@t\@empty\else
    \hskip\leftskip
    \ifx\@authors\@empty
    \else
      \@authors \ifnum\c@numauthors>2,\fi
      \if@break\break\fi
      \ and \fi
    \last@author\break\fi
    \reset@authors}
% \reset@authors: empties the author collection and zeroes numauthors.
\def\reset@authors{%
  \gdef\@authors{}%
  \gdef\last@author{}%
  \@firstauthortrue
  \setcounter{numauthors}{0}}
% Indentation (\...@toc@skip) and number-column width (\...TOCWidth) per
% level; each level's indentation is the previous level's indentation
% plus the previous level's number width.
\newlength\section@toc@skip
\section@toc@skip1.5em
\newlength\SectionTOCWidth
\SectionTOCWidth2.3em
\def\l@section#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@section{#1}{#2}}}
\def\draw@section#1#2{%
  \@dottedtocline{1}{\section@toc@skip}{\SectionTOCWidth}{#1 }{{ \tocfont #2}}}
\newlength\subsection@toc@skip
\subsection@toc@skip\section@toc@skip
\advance\subsection@toc@skip\SectionTOCWidth
\newlength\SubSectionTOCWidth
\SubSectionTOCWidth3.2em
\def\l@subsection#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subsection{#1}{#2}}}
\def\draw@subsection#1#2{%
  \@dottedtocline{2}{\subsection@toc@skip}{\SubSectionTOCWidth}{#1}{{ \tocfont #2}}}
\newlength\subsubsection@toc@skip
\subsubsection@toc@skip\subsection@toc@skip
\advance\subsubsection@toc@skip\SubSectionTOCWidth
\newlength\SubSubSectionTOCWidth
\SubSubSectionTOCWidth4.1em
\def\l@subsubsection#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subsubsection{#1}{#2}}}
\def\draw@subsubsection#1#2{%
  \@dottedtocline{3}{\subsubsection@toc@skip}{\SubSubSectionTOCWidth}{#1}{{ \tocfont #2}}}
\newlength\paragraph@toc@skip
\paragraph@toc@skip\subsubsection@toc@skip
\advance\paragraph@toc@skip\SubSubSectionTOCWidth
% ---------------------------------------------------------------------
% Main TOC: paragraph and subparagraph levels.  As for the higher
% levels, \l@<level> flushes the pending entry in \toc@draw and then
% queues its own.
% ---------------------------------------------------------------------
\newlength\ParagraphTOCWidth
\ParagraphTOCWidth4.1em
\def\l@paragraph#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@paragraph{#1}{#2}}}
\def\draw@paragraph#1#2{%
  \@dottedtocline{4}{\paragraph@toc@skip}{\ParagraphTOCWidth}{#1}{{ \tocfont #2}}}
\newlength\subparagraph@toc@skip
\subparagraph@toc@skip\paragraph@toc@skip
\advance\subparagraph@toc@skip\ParagraphTOCWidth
\def\l@subparagraph#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subparagraph{#1}{#2}}}
\def\draw@subparagraph#1#2{%
  \@dottedtocline{5}{\subparagraph@toc@skip}{6em}{#1}{{ \tocfont #2}}}
% \@dottedtocline{<level>}{<indent>}{<numwidth>}{<text>}{<page>}
% Typesets one TOC line unless <level> exceeds tocdepth.  The dot
% leaders and the page number are omitted when \if@pdf is true.
\def\@dottedtocline#1#2#3#4#5{%
  \ifnum #1>\c@tocdepth \else
    \vskip \z@ \@plus.2\p@
    {\leftskip #2\relax\rightskip\@tocrmarg\parfillskip-\rightskip
     \parindent #2\relax\@afterindenttrue
     \interlinepenalty\@M
     \leavevmode
     \@tempdima #3\relax
     \advance\leftskip\@tempdima\null\hskip-\leftskip
     {#4\hfil}\nobreak
     \if@pdf
     \else
       \leaders\hbox{$\m@th\mkern\@dotsep mu\hbox{.}\mkern\@dotsep mu$}\hfill
       \nobreak
       \hb@xt@\@pnumwidth{\hfil\normalfont\normalcolor #5}%
     \fi
     \par}\fi}
% \chapterauthors: redefines \break and \protect so that they are
% written out as literal text (via \string) instead of being executed.
\newcommand\chapterauthors{%
  \def\break{\string\break\ }%
  \def\protect##1{\string ##1 }}
% Closing hooks selected by \chapter (\end@cas) and \section
% (\end@casplusone) for \endchapterauthors.
\def\end@cas{}
\def\end@casplusone{\vskip4pt\@doendpe}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \make@chaptoc: typesets the chapter-opening author block and the
% per-chapter table of contents.  Depending on numauthors (1 to 4) it
% prints the stored author / affiliation pairs, then clears all eight
% \chapter@author... / \chapter@affiliation... macros globally, prints
% the heading "CONTENTS", inputs <chapter number>.toc and finally resets
% the author collection.
\def\make@chaptoc{% chapter author
  {\parindent\z@
  \newcommand\FolioBoldFont{}%
  \let\@b\bullet
  \def\bullet{\raisebox{2pt}{$\scriptscriptstyle\@b$}}%
  \let\SubsectionItalicFont\it
  %\ifx\chapter@author\@empty\else
  {\rm\fontsize{10\p@}{10\p@}\bfseries\selectfont
  %\the\c@numauthors
  \ifnum\c@numauthors=1
    \chapter@authorone\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
  \fi
  \ifnum\c@numauthors=2
    \chapter@authorone\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
    \chapter@authortwo\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}
  \fi
  \ifnum\c@numauthors=3
    \chapter@authorone\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
    \chapter@authortwo\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}\vskip12\p@
    \chapter@authorthree\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationthree}
  \fi
  \ifnum\c@numauthors=4
    \chapter@authorone\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
    \chapter@authortwo\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}\vskip12\p@
    \chapter@authorthree\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationthree}\vskip12\p@
    \chapter@authorfour\vskip6\p@
    {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationfour}
  \fi
  }
  \gdef\chapter@authorone{}\gdef\chapter@affiliationone{}%
  \gdef\chapter@authortwo{}\gdef\chapter@affiliationtwo{}%
  \gdef\chapter@authorthree{}\gdef\chapter@affiliationthree{}%
  \gdef\chapter@authorfour{}\gdef\chapter@affiliationfour{}%
  \vskip 14.6\p@
  {\leftskip\secnumwidth\def\author##1##2{}\vskip14pt\hbox{\leftskip0pt\SubsectionHeadFont CONTENTS}\vskip6pt\par\@input{\thechapter.toc}\par}%
  }
  \reset@authors}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Switches set by \singleauthorchapter ... \fourauthorchapter below.
% NOTE(review): nothing in this part of the file tests these switches --
% confirm whether they are still used.
\newif\iffinishedfromone
\global\finishedfromonefalse
%
\newif\iffinishedfromtwo
\global\finishedfromtwofalse
%
\newif\iffinishedfromthree
\global\finishedfromthreefalse
%
\newif\iffinishedfromfour
\global\finishedfromfourfalse
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\newcommand\singleauthorchapter{\finishedfromonetrue}
\newcommand\twoauthorchapter{\finishedfromtwotrue}
\newcommand\threeauthorchapter{\finishedfromthreetrue}
\newcommand\fourauthorchapter{\finishedfromfourtrue}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newif\iffinish
\global\finishfalse
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Box registers for up to four chapter authors: name boxes, affiliation
% boxes, and the "final" name boxes.
\newsavebox\@AUonebox
\newsavebox\@AUtwobox
\newsavebox\@AUthreebox
\newsavebox\@AUfourbox
%
\newsavebox\@AUaffonebox
\newsavebox\@AUafftwobox
\newsavebox\@AUaffthreebox
\newsavebox\@AUafffourbox
%
\newsavebox\@finalAUboxfromone
\newsavebox\@finalAUboxfromtwo
\newsavebox\@finalAUboxfromthree
\newsavebox\@finalAUboxfromfour
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \@ca{<author>}{<affiliation>}: records one chapter author.
%  * Writes an \author{<author>}{} line for the main TOC to the .aux
%    file (when file writing is enabled).
%  * Steps numauthors and stores the name and affiliation in the box
%    registers for author 1-4, defining \chapter@author<n> and
%    \chapter@affiliation<n> to copy those boxes.
%  * When the counter has run past 4 it is first reset to 0, so the next
%    author is stored as author 1 again.  A fifth consecutive author
%    (counter value 5) matches none of the cases below and is only
%    written to the .aux file.
\def\@ca#1#2{%
%  \def\chapter@author{#1}%
%  \def\chapter@affiliation{#2}%
  \if@filesw%
    \write\@auxout{%
      \string\@writefile{toc}{\string\author{#1}{}}%
    }%
  \fi
%%%%%%%%%%%%%%%
  \ifnum\c@numauthors>4
    % Was \resetcounter{numauthors}.  The LaTeX kernel has no
    % \resetcounter, so this branch would stop with "Undefined control
    % sequence"; \setcounter performs the intended reset.
    \setcounter{numauthors}{0}
  \fi
  \stepcounter{numauthors}
%%\the\c@numauthors
  \ifnum\c@numauthors=1
%
    \sbox\@AUonebox{\CAPlusOneFont#1}
    \sbox\@AUaffonebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromone{\copy\@AUonebox}
    \def\chapter@authorone{\copy\@finalAUboxfromone}
    \def\chapter@affiliationone{\copy\@AUaffonebox}
  \fi
  \ifnum\c@numauthors=2
    \sbox\@AUtwobox{\CAPlusOneFont#1}
    \sbox\@AUafftwobox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromtwo{\copy\@AUtwobox}
    \def\chapter@authortwo{\copy\@finalAUboxfromtwo}
    \def\chapter@affiliationtwo{\copy\@AUafftwobox}
  \fi
  \ifnum\c@numauthors=3
    \sbox\@AUthreebox{\CAPlusOneFont#1}
    \sbox\@AUaffthreebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromthree{\copy\@AUthreebox}
    \def\chapter@authorthree{\copy\@finalAUboxfromthree}
    \def\chapter@affiliationthree{\copy\@AUaffthreebox}
  \fi
  \ifnum\c@numauthors=4
    \sbox\@AUfourbox{\CAPlusOneFont#1}
    \sbox\@AUafffourbox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromfour{\copy\@AUfourbox}
    \def\chapter@authorfour{\copy\@finalAUboxfromfour}
    \def\chapter@affiliationfour{\copy\@AUafffourbox}
  \fi}
% \@caplusone: meaning of \chapterauthor after the first \section.
% Syntax: \chapterauthor*{<author>}{<affiliation>} or
%         \chapterauthor[<alt>]{<author>}{<affiliation>}.
\def\@caplusone{\@ifstar{\@scaplusone}{\@ifnextchar[{\@xcaplusone}{\@xcaplusone[]}}}
% Records the author through \@ca, using the optional argument as the
% affiliation when it is non-empty, then calls \@scaplusone.
\def\@xcaplusone[#1]#2#3{%
  \def\@@empty{#1}\ifx\@empty\@@empty\@ca{#2}{#3}\else\@ca{#2}{#1}\fi\@scaplusone{#2}{#3}}
% \@scaplusone{<author>}{<affiliation>}: the in-text author line is
% commented out, so only the negative skip in horizontal mode remains.
\def\@scaplusone#1#2{%
  \ifhmode\vskip-12pt\fi
%%Shashi Commented
%%%  \noindent\hskip3pc{\CAPlusOneFont\baselineskip14pt #1\def\@t{#2}\ifx\@t\@empty\else,\fi}\hskip6pt{\CAAPlusOneFont #2}\par
}
% \chapterauthoronly{<author>}{<affiliation>}: records the author with an
% empty affiliation.
\def\chapterauthoronly#1#2{\@ca{#1}{}\@scaplusone{#1}{#2}}
% ---------------------------------------------------------------------
% Per-chapter table of contents
% ---------------------------------------------------------------------
% \myaddcontentsline{<stream>}{<type>}{<text>}: writes
%   \chapcontentsline{<type>}{\raggedright <text>}{<page>}
% to <stream>.  \label, \index and \glossary are disabled inside the
% entry, and \thepage is passed through a token register so that it is
% not expanded by the \edef.
\def\myaddcontentsline#1#2#3{%
  \if@filesw
    \begingroup
      \let\label\@gobble\let\index\@gobble\let\glossary\@gobble
      \def\break{\ }%
      \def\protect##1{\string ##1 }%
      \@temptokena{\thepage}%
      \edef\@tempa{\write#1{\string\chapcontentsline{#2}{\string\raggedright\space #3}{\the\@temptokena}}}\@tempa
      \if@nobreak\ifvmode\nobreak\fi\fi
    \endgroup
  \fi}
% \chapcontentsline{<type>} dispatches to \l@<type>.
\def\chapcontentsline#1{\csname l@#1\endcsname}
\def\l@chapsection{\@mydottedtocline{1}{\z@}{6pt}}
\def\l@chapsubsection{\@mydottedtocline{2}{\secnumwidth}{6pt}}
\def\l@chapsubsubsection{\@mydottedtocline{3}{\subsecnumwidth}{36pt}}
% Depth limit for the per-chapter TOC (allocated with \newcount, set
% through \setcounter).
\newcount\c@chaptocdepth
\setcounter{chaptocdepth}{3}
% \@mytocline{<level>}{<indent>}{<numwidth>}{<text>}{<page>}: undotted
% entry; the page argument is not printed.
\def\@mytocline#1#2#3#4#5{%
  \ifnum #1>\c@chaptocdepth
  \else
    \vskip 2pt plus.2\p@
    \ifnum #1=1\ifnum\c@chaptocdepth>1\addvspace{12pt}\fi\fi
    {\leftskip #2\relax%
     \rightskip \@tocrmarg \parfillskip -\rightskip
     \interlinepenalty\@M
     \leavevmode
     \@tempdima #3\relax
     \rightskip\z@
     \vbox{\ChapTOCFont #4\nobreak}%
     \par}\fi}
% \@mydottedtocline{<level>}{<indent>}{<numwidth>}{<text>}{<page>}:
% dotted entry with the page number in a 1.5pc box; uses a local
% \@dotsep of 1.2.
\def\@mydottedtocline#1#2#3#4#5{%
  \ifnum #1>\c@chaptocdepth
  \else
    \vskip 2pt plus.2\p@
    {\leftskip #2\relax
     \rightskip \@tocrmarg \parfillskip -21pt %-\rightskip
%    \parindent #2\relax\@afterindenttrue
     \interlinepenalty\@M
     \leavevmode
     \def\@dotsep{1.2}%
     \@tempdima #3\relax
     \rightskip\z@
     \advance\hsize-\secnumwidth
     {\fontsize{9.5\p@}{\baselineskip}\selectfont #4
      \nobreak\leaders\hbox{$\m@th\mkern\@dotsep mu.\mkern\@dotsep mu$}
      \hfill\hbox to 1.5pc{\hfill#5}}
     \par}\fi}
% ---------------------------------------------------------------------
% Lists of figures and tables
% ---------------------------------------------------------------------
\newcommand\listoffigures{%
    \if@twocolumn
      \@restonecoltrue\onecolumn
    \else
      \@restonecolfalse
    \fi
    \chapter*{\listfigurename}%
      \@mkboth{\MakeUppercase\listfigurename}%
              {\MakeUppercase\listfigurename}%
    \@starttoc{lof}%
    \if@restonecol\twocolumn\fi
    }
\newcommand*\l@figure{\@dottedtocline{1}{1.5em}{2.3em}}
\newcommand\listoftables{%
    \if@twocolumn
      \@restonecoltrue\onecolumn
    \else
      \@restonecolfalse
    \fi
    \chapter*{\listtablename}%
      \@mkboth{%
          \MakeUppercase\listtablename}%
         {\MakeUppercase\listtablename}%
    \@starttoc{lot}%
    \if@restonecol\twocolumn\fi
    }
\let\l@table\l@figure
\newdimen\bibindent
\setlength\bibindent{1.5em}
% ---------------------------------------------------------------------
% Bibliography and index
% ---------------------------------------------------------------------
% thebibliography{<widest label>}: unnumbered chapter titled \bibname
% with a TOC entry, followed by a list numbered through enumiv whose
% label width is taken from the argument.
\newenvironment{thebibliography}[1]
     {\chapter*{\bibname}%
      \@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}%
      \addcontentsline{toc}{chapter}{\bibname}
      \list{\@biblabel{\@arabic\c@enumiv}}%
           {\settowidth\labelwidth{\@biblabel{#1}}%
            \leftmargin\labelwidth
            \advance\leftmargin\labelsep
            \@openbib@code
            \usecounter{enumiv}%
            \let\p@enumiv\@empty
            \renewcommand\theenumiv{\@arabic\c@enumiv}}%
      \sloppy
      \clubpenalty4000
      \@clubpenalty \clubpenalty
      \widowpenalty4000%
      \sfcode`\.\@m}
     {\def\@noitemerr
       {\@latex@warning{Empty `thebibliography' environment}}%
      \endlist}
\newcommand\newblock{\hskip .11em\@plus.33em\@minus.07em}
\let\@openbib@code\@empty
% theindex: two-column index with an unnumbered chapter head, a TOC
% entry and a `folio' first page; restores the previous column mode at
% the end.
\newenvironment{theindex}
               {\cleardoublepage\if@twocolumn
                  \@restonecolfalse
                \else
                  \@restonecoltrue
                \fi
                \twocolumn[\@makeschapterhead{\indexname}]%
                \@mkboth{\MakeUppercase\indexname}%
                        {\MakeUppercase\indexname}%
                \pagestyle{headings}
                \addcontentsline{toc}{chapter}{\indexname}
                \thispagestyle{folio}\parindent\z@
                \parskip\z@ \@plus .3\p@\relax
                \columnseprule \z@
                \columnsep 35\p@
                \let\item\@idxitem}
               {\if@restonecol\onecolumn\else\clearpage\fi}
\newcommand\@idxitem{\par\hangindent 40\p@}
\newcommand\subitem{\@idxitem \hspace*{20\p@}}
\newcommand\subsubitem{\@idxitem \hspace*{30\p@}}
\newcommand\indexspace{\par \vskip 10\p@ \@plus5\p@ \@minus3\p@\relax}
% ---------------------------------------------------------------------
% Footnotes (numbered within chapters)
% ---------------------------------------------------------------------
\renewcommand\footnoterule{%
  \kern-3\p@
  \hrule\@width.4\columnwidth
  \kern2.6\p@}
\@addtoreset{footnote}{chapter}
\newcommand\@makefntext[1]{%
    \parindent 1em%
    \noindent
    \hb@xt@1.8em{\hss\@makefnmark}#1}
% ---------------------------------------------------------------------
% Fixed names and the date
% ---------------------------------------------------------------------
\newcommand\contentsname{Contents}
\newcommand\listfigurename{List of Figures}
\newcommand\listtablename{List of Tables}
\newcommand\bibname{Bibliography}
\newcommand\indexname{Index}
\newcommand\figurename{FIGURE}
\newcommand\tablename{TABLE}
\newcommand\partname{Part}
\newcommand\chaptername{Chapter}
\newcommand\appendixname{Appendix}
% \today prints "<Month> <day>, <year>".
\def\today{\ifcase\month\or
  January\or February\or March\or April\or May\or June\or
  July\or August\or September\or October\or November\or December\fi
  \space\number\day, \number\year}
% ---------------------------------------------------------------------
% Page-layout defaults
% ---------------------------------------------------------------------
\setlength\columnsep{10\p@}
\setlength\columnseprule{0\p@}
\pagestyle{headings}
\pagenumbering{arabic}
\if@twoside
\else
  \raggedbottom
\fi
\if@twocolumn
  \twocolumn
  \sloppy
  \flushbottom
\else
  \onecolumn
\fi
% ---------------------------------------------------------------------
% Heading rules: an 84pt-wide, 4pt-high bar, alone (\unnumcrcrule) or
% over a .5pt rule of full text width (\crcrule).
% ---------------------------------------------------------------------
\newcommand\unnumcrcrule{\hbox to\textwidth{\rlap{\rule[-3.5\p@]{84\p@}{4\p@}}}}
\newcommand\unnumchap@rule{\unnumcrcrule}
\newcommand\crcrule{\hbox to\textwidth{\rlap{\rule[-3.5\p@]{84\p@}{4\p@}}\rule{\textwidth}{.5\p@}}}
\newcommand\chap@rule{\crcrule}
\newcommand\sec@rule{\crcrule}
% Affiliation storage (\@affiliate) and in-text output (\affiliation).
\def\@affiliate[#1]{\gdef\@affiliation{#1}}
\def\@affiliation{}
% \def@theequation: (re)defines \theequation as <section or chapter>.<n>,
% where <n> is the `shared' counter under \if@numberinsequence and the
% equation counter otherwise.
\def\def@theequation{%
  \if@numberinsequence
    \def\theequation{%
      \if@numbysec\thesection\else\thechapter\fi.%
      \@arabic\c@shared}%
  \else
    \def\theequation{%
      \if@numbysec\thesection\else\thechapter\fi.%
      \@arabic\c@equation}\fi}
\def\affiliation#1{{\AffiliationFont\noindent #1\vskip 36bp}}
% ---------------------------------------------------------------------
% symbollist{<widest symbol>}: two-column "Symbol Description" list; the
% argument fixes the width of the symbol column used by \symbolentry.
% The body uses the multicols environment, which is not defined in this
% part of the file.
% ---------------------------------------------------------------------
\newbox\tempbox
\newdimen\nomenwidth
\newenvironment{symbollist}[1]{%
  \addvspace{12pt}
  \setbox\tempbox\hbox{#1\hskip1em}%
  \global\nomenwidth\wd\tempbox
  %\section*{Symbol Description}
  \noindent{\SectionHeadFont Symbol Description}\vskip6pt
  \begin{multicols}{2}}{%
  \end{multicols}\par\addvspace{12pt}}
\def\symbolentry#1#2{\par\noindent\@hangfrom{\hbox to \nomenwidth{#1\hss}}#2\par}
% ---------------------------------------------------------------------
% Table support: parameters, switches and registers
% ---------------------------------------------------------------------
\tabcolsep 5pt
\arrayrulewidth .5pt
\doublerulesep 1pt
% NOTE(review): the subtable counter declaration below is commented out,
% yet \thesubtable and \resettableletter refer to that counter --
% confirm it is provided elsewhere.
%\newcounter{subtable}[table]
\newif\if@tablerules\@tablerulestrue
\newif\if@centertable\@centertabletrue
\newif\if@centertabletitle\@centertabletitletrue
\newbox\@tablebox
\newbox\@tabletitlebox
\newdimen\@tablewidth
\newdimen\@tabletitlewidth
\newdimen\max@tablewidth
\newcommand\automaticrules{\@tablerulestrue}
\newcommand\noautomaticrules{\@tablerulesfalse}
% Table numbers always carry the chapter prefix.
\def\thetable{%
  \thechapter.%
  \@arabic\c@table}
\def\thesubtable{%
  \thechapter.%
  \@arabic\c@table\alph{subtable}}
\def\resettableletter{\setcounter{subtable}{0}}
\def\@Tabletitle{}
% \tabletitle[<list-of-tables text>]{<title>} or \tabletitle{<title>};
% the form without an optional argument first sets \@tabletitlewidth to
% zero.
\newcommand\tabletitle{\@ifnextchar[{\@xtabletitle}{\@tabletitlewidth\z@\@ytabletitle}}
\def\@@tabletitle{}
\newif\ifshorttabletitle
\global\shorttabletitlefalse
%\def\@xtabletitle#1{\@tabletitlewidth#1\@ytabletitle}
%
% \@xtabletitle[<short>]{<title>}: stores both texts globally, arms
% \@Tabletitle, steps the table counter and writes <short> to the list
% of tables (with footnote commands disabled for that entry).
\def\@xtabletitle[#1]#2{%
  \gdef\@@tabletitle{#1}%
  \gdef\@tabletitle{#2}%
  \let\@Tabletitle\@TableTitle
  \refstepcounter{table}%
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@@tabletitle}}}}
%%%%
%\long\def\@xtabletitle[#1]#2{%
% \setbox\@ttbox\hbox{#1}\global\shorttabletitletrue
%  \def\@@tabletitle{\ifx\@ttbox\@empty\else#1\fi}%
%  \def\@tabletitle{#2}%
%  \let\@Tabletitle\@TableTitle
%  \refstepcounter{table}%
%  {\let\footnotemark\@empty
%   \let\footnote\@gobble
%   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{%
%\ifshorttabletitle\@@tabletitle\else\@tabletitle\fi}}}}
%%%
%
% \@ytabletitle{<title>}: as above, with the title itself written to the
% list of tables.
\long\def\@ytabletitle#1{%
  \def\@tabletitle{#1}%
  \let\@Tabletitle\@TableTitle
  \refstepcounter{table}%
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@tabletitle}}}}
% \tabletitlelet[<width>]{<title>}: title for a lettered sub-table.  The
% table counter is stepped only while the subtable counter is zero, and
% the label for cross-references is \thesubtable.
\def\tabletitlelet{\@ifnextchar[{\@xtabletitlelet}{\@tabletitlewidth\z@\@ytabletitlelet}}
\def\@xtabletitlelet[#1]{\@tabletitlewidth#1\@ytabletitlelet}
\long\def\@ytabletitlelet#1{%
  \def\@tabletitle{#1}%
  \let\@Tabletitle\@TableTitle
  \ifnum\c@subtable=0\stepcounter{table}\fi
  \let\@currentlabel\thesubtable
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@tabletitle}}}}
% \@TableTitle: "TABLE <number>" on its own line, then the title text.
\def\@TableTitle{%
  \noindent
  {%
   \vbox{{\TableNumberFont TABLE\ \thetable}}\par\TableTitleFont\@tabletitle}}
% Redefined float entry points: table* maps \caption onto \tabletitle;
% for table that mapping is commented out.
\def\table{%
  %\long\def\caption##1{\tabletitle{##1}\@TableTitle\par}%
  \@float{table}}
\@namedef{table*}{%
  \long\def\caption##1{\tabletitle{##1}\@TableTitle\par}%
  \@dblfloat{table}}
% ---------------------------------------------------------------------
% tabular: the table body is collected in \@tablebox so that
% \endtabular can measure it and place the title and rules.  Inside the
% table, \footnote stores its text in \@fn for output below the table.
% ---------------------------------------------------------------------
\def\@tabular{%
  \leavevmode
  \if@centertable\hfil\fi
  \vbox\bgroup
    \setbox\@tablebox\hbox\bgroup
    \baselineskip11pt
    \global\let\@fn\@empty
    \def\footnote##1{\footnotemark\gdef\@fn{##1}}
    \renewcommand{\arraystretch}{.916666666667}%
    $\let\@acol\@tabacol
    \let\@classz\@tabclassz
    \let\@classiv\@tabclassiv \let\\\@tabularcr \@tabarray}
% \endtabular: closes the body box, then
%  1. sets the title in \@tabletitlebox -- at the requested width, or at
%     its natural width, re-set at the table width if it is wider than
%     the table;
%  2. outputs the title, widening \@tablewidth to the title width when
%     the title is wider;
%  3. outputs the top rule, the body and the bottom rule (rules only
%     under \if@tablerules), then any stored footnote text;
%  4. clears \@Tabletitle globally.
% The width comparisons use \ifnum on dimension registers.
\def\endtabular{%
  \crcr\egroup\egroup $\egroup
  \@tablewidth\wd\@tablebox
  \ifnum\@tabletitlewidth>0
    {\hsize\@tabletitlewidth\raggedright\global\setbox\@tabletitlebox\vbox{\@Tabletitle}}%
  \else
    \setbox\@tabletitlebox\hbox{\@Tabletitle}%
    \ifnum\wd\@tabletitlebox>\@tablewidth
      {\hsize\@tablewidth\raggedright\global\setbox\@tabletitlebox\vbox{\@Tabletitle}}\fi
    \@tabletitlewidth\wd\@tabletitlebox\fi
  \ifnum\@tabletitlewidth>0
    \ifnum\@tabletitlewidth>\@tablewidth\@tablewidth\@tabletitlewidth\fi
    \hbox to\@tabletitlewidth{\if@centertabletitle\hfil\fi\box\@tabletitlebox\hfil}\par\fi
  \max@tablewidth\@tablewidth
  \ifnum\@tabletitlewidth>\max@tablewidth\max@tablewidth\@tabletitlewidth\fi
  \if@tablerules
    \ifnum\@tabletitlewidth>0\vskip-6pt\fi
    \hbox to\max@tablewidth{\if@centertable\hfil\fi\rule{\@tablewidth}{1pt}\hfil}\par\fi
  \hbox to\max@tablewidth{\if@centertable\hfil\fi\box\@tablebox\hfil}\vskip1pt
  \if@tablerules\hbox to\max@tablewidth{\if@centertable\hfil\fi\rule{\@tablewidth}{1pt}\hfil}\par\fi
  \ifx\@fn\@empty\else\FootnoteFont\parindent\z@\noindent\@makefnmark\@fn\par\fi
  \egroup\hfil
  \vskip 0pt plus 12pt
  \gdef\@Tabletitle{}}
% Column heads, sub-heads and struts for use inside tables.
\def\tch#1{\TableColHeadFont #1\llstrut\hfill}
\def\tsh#1{\TableSubheadFont #1\hfill}
\newcommand\llstrut{\rule[-6pt]{0pt}{14pt}}
\newcommand\flstrut{\rule{0pt}{10pt}}
\newcommand\tabletitlestrut{\rule{0pt}{20pt}}
% \Boxhead{<text>}: bold centred heading for boxed material.
\def\Boxhead#1{\par\addvspace{3pt plus2pt}\noindent{\centering\bfseries#1\par}\vskip3pt}
\newbox\tempbox%
\newdimen\tempdimen%
%
% shortbox: framed box whose text width is 27pc, 32pc or 25pc depending
% on the \if@krantza / \if@krantzb switches.
\newenvironment{shortbox}{\par\addvspace{12pt plus2pt}%
  \if@krantza
    \setbox\tempbox\vbox\bgroup\hsize27pc%
  \else\if@krantzb
    \setbox\tempbox\vbox\bgroup\hsize32pc%
  \else
    \setbox\tempbox\vbox\bgroup\hsize25pc%
  \fi\fi
}{%
  \egroup%
  \noindent\fboxsep6pt\fboxrule.5pt\hspace*{0pt}\fbox{\box\tempbox}
  \par\addvspace{12pt plus2pt}}%
%
% Colour \special commands (CMYK) used by the shaded box.
\def\grayink{\special{color cmyk 0 0 0 0.2}}
\def\blackink{\special{color cmyk 0 0 0 1.0}}
%
\def\whiteink{\special{color cmyk 0 0 0 0}} % 0%
\newenvironment{shadebox}{% \setbox\tempbox\hbox\bgroup\vbox\bgroup\leftskip12pt\rightskip\leftskip}{\par\addvspace{12pt} \egroup\egroup\par\addvspace{25pt} \tempdimen\ht\tempbox \advance\tempdimen by 1pc \noindent{\hbox to \wd\tempbox{\vbox to \ht\tempbox{\hsize\textwidth{\special{color push}\grayink\vspace*{-12pt}\noindent\vrule height\tempdimen width\textwidth \special{color pop}\blackink}}}}% \llap{\unhbox\tempbox}\par\addvspace{12pt}} %%%%%%%%%% Note %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newbox\tempbox \newdimen\notewidth \newenvironment{notelist}[1]{% \addvspace{6pt} \setbox\tempbox\hbox{#1\hskip.57em}% \global\notewidth\wd\tempbox }{% \par\addvspace{6pt}} \def\notes#1#2{\par\noindent\@hangfrom{\hbox to \notewidth{\bf #1\hss}}#2\par} %%%%%%%%%%%%%%%% wherelist %%%%%%%%%%%%%%%% \newbox\wherebox \newdimen\wherewidth \newenvironment{wherelist}[1]{\leftskip10pt% \addvspace{6pt} \setbox\wherebox\hbox{#1\hskip1em}% \global\wherewidth\wd\wherebox \noindent\hspace*{-14pt} where }{% \par\addvspace{6pt}} \def\whereentry#1#2#3{\par\noindent\@hangfrom{\hbox to \wherewidth{#1\hss}#2\hskip6pt}#3\par} %%%%%%%%%%%% \newenvironment{unnumlist}{% \ifnum \@enumdepth >3 \@toodeep\else \advance\@enumdepth\@ne \list{}{% \leftmargini27.5pt \leftmarginii17.5pt\leftmarginiv17.5pt % \leftmargin\parindent \advance\leftmargin-.2em \advance\leftmarginii.2em \advance\leftmarginiii.1em \advance\leftmarginiv.2em \def\makelabel##1{\hss\llap{##1}}} \fi% }{% \endlist} % \newenvironment{extract}{% \par\addvspace{11.5pt minus2pt}% \leftskip2em\rightskip\leftskip \noindent\ignorespaces }{% \par\addvspace{11.5pt minus2pt}% \@endparenv} % % \def\VA#1#2{\addvspace{12pt}\raggedleft #1\rightskip3em\par #2\rightskip3em} % \newenvironment{VF}{\VfFont% \par\addvspace{12pt minus2pt}% \noindent{\vrule height2pt width\textwidth}\par\vskip7.3pt \leftskip3em\rightskip\leftskip \noindent\ignorespaces }{% \par\vskip6pt\leftskip0pt\noindent{{\vrule height2pt width\textwidth}}\par\addvspace{12pt minus2pt}% 
\@endparenv} % \def\VTA#1#2{\addvspace{12pt}\raggedleft #1\rightskip3em\par {\it #2}\rightskip3em} % % \def\VT{\par\addvspace{3.5pt}\noindent} \def\VH#1{{\normalfont\fontsize{12.5}{14.5}\itshape\centering\selectfont #1\par}\addvspace{5.5pt}} % \newenvironment{VT1}{\VfFont% \par\addvspace{12pt minus2pt}% \noindent{\vrule height2pt width\textwidth}\par\vskip7.5pt \leftskip3em\rightskip\leftskip %\@afterheading \parindent0pt \noindent\ignorespaces }{% \par\vskip6pt\leftskip0pt\noindent{{\vrule height2pt width\textwidth}}\par\addvspace{10pt minus2pt}% \@endparenv} % %%%%%%%%%%%% Glossary %%%%%%%%%%%%%%%%%%%%%%% \newenvironment{Glossary} {\list{}{\labelwidth\z@\leftmargin18pt \itemindent-18pt \let\makelabel\glosslabel}} {\endlist} \newcommand\glosslabel[1]{\hspace\labelsep\normalfont\bfseries #1:} %%%%%%%%%%%% \newif\iffnalpha \global\fnalphafalse \newskip\listtextleftmargin\listtextleftmargin 20pt%24pt \newskip\listtextleftmarginii\listtextleftmarginii0pt% 24pt \newskip\listtextleftmarginiii\listtextleftmarginiii0pt% 24pt \newskip\listtextrightmargin\listtextrightmargin12pt%.5pc \newskip\listlabelleftskip \listlabelleftskip4pt%3.3pt \newskip\listlabelleftskipii \listlabelleftskipii0pt%3.3pt \newskip\listlabelleftskipiii \listlabelleftskipiii0pt%3.3pt \newskip\abovelistskipi\abovelistskipi6pt plus2pt \newskip\belowlistskipi\belowlistskipi6pt plus2pt \newskip\abovelistskipii\abovelistskipii0pt plus2pt \newskip\belowlistskipii\belowlistskipii0pt plus2pt \newskip\abovelistskipiii\abovelistskipiii0pt plus2pt \newskip\belowlistskipiii\belowlistskipiii0pt plus2pt \newskip\labelsepi \labelsepi6pt \newskip\labelsepii \labelsepii6pt \newskip\labelsepiii \labelsepiii6pt%\z@ \newskip\itemsepi \itemsepi0pt%10pt \newskip\itemsepii \itemsepii0pt \newskip\itemsepiii \itemsepiii0pt \newdimen\enumdimwd \newif\iflabelrightalign\labelrightaligntrue \newdimen\enumdim% % \def\enummax#1{% \labelsep\csname labelsep\romannumeral\the\@enumdepth\endcsname 
\ifdim\listtextleftmargin>\z@\labelsepi0pt\fi \ifdim\listtextleftmarginii>\z@\labelsepii0pt\fi \ifdim\listtextleftmarginiii>\z@\labelsepiii0pt\fi \setbox\tempbox\hbox{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname#1\hskip\labelsep}% \enumdim\wd\tempbox \setbox\tempbox\hbox{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname#1}% \enumdimwd\wd\tempbox \expandafter\global\csname leftmargin\romannumeral\the\@enumdepth\endcsname\enumdim \ifdim\listtextleftmargin>\z@ \leftmargini\listtextleftmargin \ifdim\listlabelleftskip>\z@ \advance\leftmargini-\listlabelleftskip \fi \fi \ifdim\listtextleftmarginii>\z@ \leftmarginii\listtextleftmarginii \ifdim\listlabelleftskipii>\z@ \advance\leftmarginii-\listlabelleftskipii \fi \fi \ifdim\listtextleftmarginiii>\z@ \leftmarginiii\listtextleftmarginiii \ifdim\listlabelleftskipiii>\z@ \advance\leftmarginiii-\listlabelleftskipiii \fi \fi } % \enummax{1.} % \def\enumerate{\@ifnextchar[{\@enumerate}{\@enumerate[\csname label\@enumctr\endcsname]}}%% % \def\@enumerate[#1]{\par \ifnum \@enumdepth >3 \@toodeep \else \advance\@enumdepth\@ne \edef\@enumctr{enum\romannumeral\the\@enumdepth}% \setcounter{\@enumctr}{1}\enummax{#1}% \list {\csname label\@enumctr\endcsname}{\usecounter{\@enumctr}% \topsep\csname abovelistskip\romannumeral\the\@enumdepth\endcsname \itemsep\csname itemsep\romannumeral\the\@enumdepth\endcsname % \listfont %\listparindent18.25pt \ifnum \@enumdepth=1 \leftmargin32.7pt \rightmargin\listtextrightmargin \advance\rightmargin\rightskip \advance\leftmargin\leftskip \tempdimen\leftmargini \advance\tempdimen-\labelsep %%%%%%%%%%% \iffnalpha \def\makelabel##1{{\hskip\listlabelleftskip{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname{\iflabelrightalign\hss\fi\textlistlabel##1}}}}% \global\fnalphafalse \else \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskip{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to 
\enumdimwd{\iflabelrightalign\hss\fi\textlistlabel##1}}\blackink}}% \fi %%%%%%%%%%%%%%%%%%%%%%%%%%% \else \ifnum \@enumdepth=2 \tempdimen\leftmarginii \advance\tempdimen-\labelsep \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskipii{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi##1}\blackink}}}% \else \ifnum \@enumdepth=3 \tempdimen\leftmarginiii \advance\tempdimen-\labelsep \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskipiii{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi##1}\blackink}}}% \else \def\makelabel##1{\hss\llap{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname##1}}% \fi \fi \fi} \fi} % \def\endenumerate{\@topsepadd\csname belowlistskip\romannumeral\the\@enumdepth\endcsname\endlist}% % \def\textlistlabel{} %%%%%%%%%%%%%%%%%%%%%%%%%%% \newdimen\concolwidth \newbox\stempbox \def\contributor#1#2#3{\addvspace{10pt}{% \setbox\stempbox\hbox{\ContributorAffiliationFont #2} \concolwidth\wd\stempbox \noindent{\ContributorNameFont #1}\par \ifdim\concolwidth>\columnwidth \vspace*{3pt} \else \fi \noindent{\vbox{\hangindent12pt\ContributorAffiliationFont #2}}\vskip-1\p@ \noindent{\vbox{\hangindent12pt\ContributorAffiliationFont #3}}}} %%\def\contributors{% %% \twocolumn[\contributorshead] %% \pagestyle{empty} %% \leftskip1pc %% \parindent-1pc} %%\def\contributorshead{% %% \vbox{}\vskip2pc %% {\centering\HeadFont CONTRIBUTORS\vskip2\p@} %% \noindent\rule{\textwidth}{1\p@}\vskip25\p@} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \@centertabletitlefalse \HeadingsBookChapter %\HeadingsChapterSection \endinput %% %% End of file `krantz.cls'. ================================================ FILE: Old/Source-v2/rep-res-book.bib ================================================ % Main Bibliography For Reproducible Research in R and RStudio % Christopher Gandrud % Updated: 24 April 2015 @article{Burbidge1988, author = {John B. 
Burbidge and Leslie Robb}, title = {Alternative Transformations to Handle Extreme Values of the Dependent Variable}, journal = {Journal of the American Statistical Association}, volume = {83}, number = {401}, year = {1988}, pages = {123--127} } @article{box1964analysis, title={An analysis of transformations}, author={Box, George EP and Cox, David R}, journal={Journal of the Royal Statistical Society. Series B (Methodological)}, pages={211--252}, volume = {26}, year={1964} } @misc{Hyndman2010, author = {Rob J. Hyndman}, title = {Transforming Data with Zeros}, year = {2010}, note = {Available at: \url{http://robjhyndman.com/hyndsight/transformations/}. Accessed March 2015.} } @article{Altman2007, author = {Micah Altman and Gary King}, title = {A Proposed Standard for the Scholarly Citation of Quantitative Data}, year = {2007}, journal = {D-Lib Magazine}, volume = {13}, number = {3/4} } @article{Baath2012, author = {B{\aa}{\aa}th, Rasmus}, title = {{The state of naming conventions in {R}}}, journal = {The R Journal}, year = {2012}, volume = {4}, number = {2}, pages = {74--75} } @book{Bacon1267, author = {Fr. Rogeri Bacon}, title = {Opera quaedam hactenus inedita. Vol. I. containing I.--Opus tertium. II.--Opus minus. 
III.--Compendium philosophiae.}, publisher = {Google eBook}, year = {1267/1859}, note = {Retrieved from \url{http://books.google.com/books?id=wMUKAAAAYAAJ}} } @article{Ball2012, author = {Ball, Richard and Medeiros, Norm}, title = {{Teaching integrity in empirical research: A protocol for documenting data management and analysis}}, journal = {The Journal of Economic Education}, year = {2011}, volume = {43}, number = {2}, pages = {182--189} } @article{Barr2012, author = {Barr, Christopher D}, title = {{Establishing a culture of reproducibility and openness in medical research with an emphasis on the training years}}, journal = {Chance}, year = {2012}, volume = {25}, number = {3}, pages = {8--10} } @BOOK{vanBelle2008, author = {Gerald van Belle}, title = {Statistical Rules of Thumb}, publisher = {John Wiley and Sons}, address = {Hoboken, NJ}, edition = {2nd}, year = {2008} } @article{Boettiger2012, author = {Boettiger, Carl and Temple Lang, Duncan}, title = {{Treebase: An R package for discovery, access and manipulation of online phylogenies}}, journal = {Methods in Ecology and Evolution}, year = {2012}, volume = {3}, number = {6}, pages = {1060--1066} } @ARTICLE{Bowers2011, author = {Jake Bowers}, title = {Six Steps to a Better Relationship with Your Future Self}, journal = {The Political Methodologist}, year = {2011}, volume = {18}, number = {2}, pages = {2-8} } @BOOK{Braude1979, author = {S.E. Braude}, year = {1979}, title = {ESP and Psychokinesis. A Philosophical Examination}, publisher = {Temple University Press}, address = {Philadelphia, PA} } @INCOLLECTION{Buckheit1995, author = {Jonathan B. Buckheit and David L. Donoho}, title = {Wavelab and Reproducible Research}, booktitle = {Wavelets and Statistics}, editor = {A. 
Antoniadis}, publisher = {Springer}, address = {New York}, pages = {55-81}, year = {1995} } @BOOK{Chang2012, author = {Winston Chang}, title = {R Graphics Cookbook: Practical Recipes for Visualizing Data}, year = {2012}, publisher = {O'Reilly Media, Inc.}, address = {Sebastopol, CA} } @INCOLLECTION{Cortez2007, author = {Paulo Cortez and An\'{i}bal Morais}, title = {A Data Mining Approach to Predict Forest Fires Using Meteorological Data}, editor = {J. Neves and M.F. Santos and J. Machado}, booktitle = {New Trends in Artificial Intelligence, Proceedings of the 13th EPIA}, year = {2007}, pages = {512-523}, note = {\url{http://archive.ics.uci.edu/ml/datasets/Forest+Fires}} } @BOOK{Crawley2005, author = {Michael J. Crawley}, title = {Statistics: An Introduction Using R}, publisher = {John Wiley and Sons Ltd.}, address = {Chichester}, year = {2005} } @BOOK{Crawley2013, author = {Michael J. Crawley}, title = {The R Book}, edition = {2nd}, publisher = {John Wiley and Sons Ltd.}, address = {Chichester}, year = {2013} } @unpublished{CreativeCommons2012, author = {{Creative Commons}}, title = {Data}, year = {2012}, journal = {Creative Commons Wiki}, number = {11 December}, note = {\url{http://wiki.creativecommons.org/Data}} } @article{Donoho2002, author = {Donoho, David L}, title = {How to be a highly cited author in mathematical sciences}, journal = {in-cites}, year = {2002}, note = {\url{http://www.in-cites.com/scientists/DrDavidDonoho.html}} } @article{Donoho2009, author = {Donoho, David L and Maleki, Arian and Shahram, Morteza and Rahman, Inam Ur and Stodden, Victoria}, title = {{Reproducible research in computational harmonic analysis}}, journal = {Computing in Science {\&} Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {8--18} } @article{Donoho2010, author = {Donoho, David L}, title = {{An invitation to reproducible computational research}}, journal = {Biostatistics}, year = {2010}, volume = {11}, number = {3}, pages = {385--388} } 
@unpublished{Drummond2012, month = {September}, title = {Reproducible Research: a Dissenting Opinion}, author = {Chris Drummond}, year = {2012}, url = {http://cogprints.org/8675/} } @article{Ehrenberg1977, author = {Ehrenberg, A S C}, title = {{Rudiments of numeracy}}, journal = {Journal of the Royal Statistical Society. Series A General}, year = {1977}, volume = {140}, number = {3}, pages = {277--297} } @article{Fomel2009, author = {Fomel, Sergey and Claerbout, Jon F}, title = {{Reproducible Research}}, journal = {Computing in Science {\&} Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {5--7} } @article{Frazier2008, author = {Mitch Frazier}, title = {Bash Parameter Expansion}, journal = {The Linux Journal}, year = {2008}, note = {Available at: \url{http://www.linuxjournal.com/content/bash-parameter-expansion}} } @article{Gandrud2012, author = {Gandrud, Christopher}, title = {{The diffusion of financial supervisory governance ideas}}, journal = {Review of International Political Economy}, year = {2013}, volume = {20}, number = {4}, pages = {881--916} } @article{GandrudGrafstrom2012, author = {Christopher Gandrud and Cassandra Grafstr\"{o}m}, title = {Inflated Expectations: How government partisanship shapes bureaucrats' inflation forecasts}, year = {2015}, journal = {Political Science Research and Methods}, note = {Available at: \url{http://dx.doi.org/10.1017/psrm.2014.34}} } @article{Gandrud2013, author = {Christopher Gandrud}, title = {GitHub: A Tool for Social Data Set Development and Verification in the Cloud}, year = {2013}, journal = {The Political Methodologist}, volume = {20}, number = {2}, pages = {2-7} } @article{Gelman2011tables, author = {Gelman, Andrew}, title = {{Tables as graphs: The Ramanujan principle}}, journal = {Significance}, year = {2011}, volume = {8}, number = {4}, pages = {183} } @article{Gentleman2004, author = {Gentleman, Robert and Lang, Duncan Temple}, title = {{Statistical Analyses and Reproducible Research}}, 
journal = {Bioconductor Project Working Papers}, year = {2004} } @article{Goodrich2007, author = {Ben Goodrich and Ying Lu}, year = {2007}, title = {normal.bayes: Bayesian Normal Linear Regression}, journal = {Zelig Everyone's Statistical Software}, note = {Available at: \url{http://gking.harvard.edu/zelig}} } @article{Herndon2014, title={Does high public debt consistently stifle economic growth? A critique of {R}einhart and {R}ogoff}, author={Herndon, Thomas and Ash, Michael and Pollin, Robert}, journal={Cambridge Journal of Economics}, volume={38}, number={2}, pages={257--279}, year={2014} } @article{Howe2012, author = {Howe, Bill}, title = {{Virtual appliances, cloud computing, and reproducible research}}, journal = {Computing in Science {\&} Engineering}, year = {2012}, volume = {14}, number = {4}, pages = {36--41} } @BOOK{Kabacoff2012, author = {Robert I. Kabacoff}, title = {R in Action: Data Analysis and Graphics with R}, publisher = {Manning Publications Co.}, address = {Shelter Island, NY}, year = {2011} } @article{Kelly2006, author = {Kelly, Clint D}, title = {{Replicating empirical research in behavioral ecology: How and why it should be done but rarely ever is}}, journal = {The Quarterly Review of Biology}, year = {2006}, volume = {81}, number = {3}, pages = {221--236} } @book{King1994, author = {King, Gary. and Keohane, Robert and Verba, S.}, title = {{Designing Social Inquiry}}, publisher = {Princeton University Press}, year = {1994}, address = {Princeton} } @article{King1995, author = {King, Gary}, title = {{Replication, replication}}, journal = {PS: Political Science and Politics}, year = {1995}, volume = {28}, number = {3}, pages = {444--452} } @article{King2007, author = {King, Gary}, title = {An Introduction to the Dataverse Network as an Infrastructure for Data Sharing}, journal = {Sociological Methods {\&} Research}, year = {2007}, volume = {36}, number = {2}, pages = {173--199} } @article{Knuth1990, author = {Donald E. 
Knuth}, title = {The Future of TeX and MetaFont}, year = {1990}, journal = {NTG: Maps}, volume = {5}, issue = {November}, pages = {145} } @BOOK{Knuth1992, title = {Literate Programming}, author = {Donald E. Knuth}, year = {1992}, publisher = {Center for the Study of Language and Information}, address = {Stanford, CA}, series = {CSLI Lecture Notes} } @inproceedings{Leisch2002, author = {Friedrich Leisch}, title = {Sweave: Dynamic Generation of Statistical Reports Using Literate Data Analysis}, booktitle = {Compstat 2002: Proceedings in Computational Statistics}, pages = {575--580}, year = 2002, editor = {Wolfgang H{\"a}rdle and Bernd R{\"o}nz}, publisher = {Physica Verlag, Heidelberg}, note = {\url{http://www.stat.uni-muenchen.de/~leisch/Sweave}} } @article{Lykken1968, author = {David T. Lykken}, title = {Statistical Significance in Psychological Research}, year = {1968}, journal = {Psychological Bulletin}, volume = {70}, pages = {151-159} } @article{Makel2014, author = {Makel, M C and Plucker, J A}, title = {{Facts are more important than novelty: Replication in the education sciences}}, journal = {Educational Researcher}, year = {2014}, volume = {43}, number = {6}, pages = {304--316} } @BOOK{Matloff2011, author = {Norman Matloff}, title = {The Art of R Programming: A Tour of Statistical Software Design}, publisher = {No Starch Press}, address = {San Francisco}, year = {2011} } @article{McCullough2008, author = {McCullough, B D and McGeary, Kerry Anne and Harrison, Teresa D}, title = {{Do Economics Journal Archives Promote Replicable Research?}}, journal = {Canadian Journal of Economics}, year = {2008}, volume = {41}, number = {4}, pages = {1406--1420} } @BOOK{Munzert2015, title = {Automated Data Collection with R: A Practical Guide to Web Scraping and Text Mining}, author = {Simon Munzert and Christian Rubba and Peter Mei{\ss}ner and Dominic Nyhuis}, year = {2015}, publisher = {Wiley}, address = {Chichester} } @Manual{Pandoc2014, title = {Pandoc: A Universal 
Document Converter}, author = {John MacFarlane}, year = {2014}, note = {Version 1.13.0.1}, url = {http://johnmacfarlane.net/pandoc/index.html} } @article{Mesirov2010, author = {Mesirov, Jill P.}, title = {{Accessible reproducible research}}, journal = {Science}, year = {2010}, volume = {327}, number = {5964}, pages = {415--416} } @article{Meyer2006, author = {Axel Meyer}, title = {Repeating Patterns of Mimicry}, journal = {PLoS Biol}, volume = {4}, number = {10}, year = {2006} } @book{Murrell2011, author = {Paul Murrell}, title = {R Graphics}, publisher = {Chapman and Hall/CRC Press}, address = {Boca Raton, FL}, year = {2011}, edition ={2nd} } @article{Nagler1995, author = {Nagler, Jonathan}, title = {{Coding style and good computing practices}}, journal = {PS: Political Science and Politics}, year = {1995}, volume = {28}, number = {3}, pages = {488--492} } @article{Nosek2012, author = {Nosek, Brian A and Spies, Jeffrey R and Motyl, Matt}, title = {{Scientific utopia: II. Restructuring incentives and practices to promote truth over publishability}}, journal = {Perspectives on Psychological Science}, year = {2012}, volume = {7}, number = {6}, pages = {615-631} } @book{ONeil2013, author = {Cathy O'Neil and Rachel Schutt}, title = {Doing Data Science: Straight Talk from the Frontline}, year = {2013}, address = {Sebastopol, CA}, publisher = {O'Reilly Media Inc.} } @ARTICLE{Pemstein2010, author = {Daniel Pemstein and Stephen A. 
Meserve and James Melton}, title = {Democratic Compromise: A Latent Variable Analysis of Ten Measures of Regime Type}, journal = {Political Analysis}, year = {2010}, volume = {18}, pages = {426-449}, number = {4} } @article{Peng2009, author = {Peng, Roger D}, title = {{Reproducible research and biostatistics}}, journal = {Biostatistics}, year = {2009}, volume = {10}, number = {3}, pages = {405--408} } @article{Peng2011, author = {Peng, Roger D}, title = {{Reproducible research in computational science}}, journal = {Science}, year = {2011}, volume = {334}, pages = {1226-1227} } @article{Peng2014, author = {Roger D. Peng}, title = {The Real Reason Reproducible Research is Important}, journal = {Simply Statistics}, year = {2014}, note = {\url{http://simplystatistics.org/2014/06/06/the-real-reason-reproducible-research-is-important/}} } @article{Piwowar2007, author = {Piwowar, Heather A and Day, Roger S and Fridsma, Douglas B}, title = {{Sharing detailed research data is associated with increased citation rate}}, journal = {PLoS ONE}, year = {2007}, volume = {2}, number = {3}, pages = {1-5} } @article{RR2010, author = {C.M. Reinhart and K.S. 
Rogoff}, title = {Growth in a Time of Debt}, journal = {American Economic Review: Papers \& Proceedings}, volume = {100}, year = {2010} } @Manual{RLanguage, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, organization = {R Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2014}, note = {\url{http://www.R-project.org/}} } @Manual{RStudioCite, title = {RStudio: Integrated development environment for R}, author = {{RStudio,}{ Inc.}}, address= {Boston, MA}, year = {2015}, note = {Version 0.99}, url = {\url{http://www.rstudio.com/}} } @Manual{Rtools, title = {Rtools: Building R for Windows}, author = {Brian Ripley and Duncan Murdoch}, year = {2012}, note = {\url{http://cran.r-project.org/bin/windows/Rtools/}} } @misc{RamseyNoweb, author = {Norman Ramsey}, title = {Noweb: {A} Simple, Extensible Tool for Literate Programming}, year = {2011}, howpublished = {\url{http://www.cs.tufts.edu/~nr/noweb/}} } @book{ShottsJr2012, author = {Shotts Jr., William E}, title = {The Linux Command-line: A Complete Introduction}, publisher = {No Starch Press}, year = {2012}, address = {San Francisco} } @article{Stodden2009, author = {Stodden, Victoria}, title = {{The legal framework for reproducible scientific research}}, journal = {Computing in Science {\&} Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {35--40} } @inproceedings{Stodden2009b, author = {Stodden, Victoria}, year = {2009}, title = {The Reproducible Research Standard: Reducing Legal Barriers to Scientific Knowledge and Innovation}, booktitle = {Communia: Global Science \& Economics of Knowledge-Sharing Institutions Torino, Italy June 30}, note = {\url{http://www.stanford.edu/~vcs/talks/VictoriaStoddenCommuniaJune2009-2.pdf}} } @article{Stodden2010, author = {Stodden, Victoria}, title = {{The Scientific Method in Practice: Reproducibility in the Computational Sciences}}, journal = {MIT Sloan School Working Paper, 4773-10}, year = {2010} } 
@BOOK{Tufte2001, author = {Edward R. Tufte}, title = {The Visual Display of Quantitative Information}, publisher = {Graphics Press}, address = {Cheshire, CT}, year = {2001}, edition = {2nd} } @article{Vandewalle2007, author = {Vandewalle, P and Barrenetxea, G and Jovanovic, I and Ridolfi, A and Vetterli, M}, title = {{Experiences with reproducible research in various facets of signal processing research}}, journal = {Acoustics, Speech and Signal Processing}, year = {2007}, volume = {4}, pages = {1253--1256} } @article{Vandewalle2012, author = {Vandewalle, Patrick}, title = {{Code sharing is associated with research impact in image processing}}, journal = {Computing in Science {\&} Engineering}, year = {2012}, volume = {14}, number = {4}, pages = {42--47} } @book{Whickham2009book, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, year = {2009}, publisher = {Springer}, address = {New York}, edition = {2nd} } @article{Whickham2010journal, author = {Hadley Wickham}, title = {A Layered Grammar of Graphics}, journal = {Journal of Computational and Graphical Statistics}, volume = {19}, number = {1}, year = {2010}, pages = {3-28} } @book{Whickham2014book, author = {Hadley Wickham}, title = {Advanced R}, publisher = {Chapman and Hall/CRC Press}, address = {Boca Raton, FL}, year = {2014} } @article{Wickham2014article, author = {Wickham, Hadley}, title = {{Tidy Data}}, journal = {Journal of Statistical Software}, year = {2014}, volume = {59}, number = {10}, pages = {1--23} } @article{Wilson2012, author = {Wilson, Greg and Aruliah, D A and Brown, C Titus and Hong, Niel P Chue and Davis, Matt and Guy, Richard T and Haddock, Steven H D and Huff, Katy and Mitchell, Ian M and Plumbley, Mark D and Ben Waugh and White, Ethan P and Wilson, Paul}, title = {{Best practices for scientific computing}}, journal = {arXiv}, note = {Available at: \url{http://arxiv.org/pdf/1210.0530v3}}, year = {2012}, volume = {29 November 2012}, pages = {1--6}, month = nov } 
@misc{WorldBank2013, author = {{World Bank}}, title = {World Development Indicators}, year = {2015}, note = {\url{http://data.worldbank.org/data-catalog/world-development-indicators}} } @BOOK{Xie2013, author = {Yihui Xie}, title = {Dynamic Documents with R and knitr}, publisher = {Chapman and Hall/CRC Press}, address = {Boca Raton, FL}, year = {2013} } ================================================ FILE: Old/SourceOld/Chapter1/chapter1.Rmd ================================================ \chapter{Chapter 1: Introducing Reproducible Research} # What is reproducible research? # Why should research be reproducible? # Who should read this book? ## Students ## Researchers ## Industry practitioners # Why use R/RStudio for reproducible research? ## Complete integration of data gathering, analysis, and presentation. ### Presentation There are many LaTeX editors available, both open source and paid, as well as other ways to compile LaTeX documents, including directly through the command-line. **R** is capable of compiling LaTeX documents through . **RStudio** is actually a very nice LaTeX editor. For creating documents that integrate markup and **R** code, at the moment it pretty much can't be beat. It has full syntax highlighting, even for documents with `knitr` code (which it can collapse when you just want to work on the text). It can spell check LaTeX documents. It handles `knitr` code chunks beautifully, making it easy to navigate through complex documents and run individual chunks. Even if you aren't creating documents that integrate **R** code, **RStudio** is still a decent, fully functioning LaTeX editor. It can insert common commands like `\section*{}` for unnumbered sections. Most importantly it can easily compile LaTeX documents and show you a preview. # Book overview ## What this book is not. This book describes a workflow for reproducible research primarily using **R** and **RStudio**. 
It is designed to give you the necessary tools to use this workflow for your own research. It is not designed to be a complete introduction to **R**, **RStudio**, **GitHub**, the command-line, or any other program that is a part of this workflow. Instead it shows you how these tools can fit together to make your research more reproducible. To get the most out of these individual programs I point you to other resources that cover these programs in more detail. That being said, my goal is for this book to be self-sufficient to the extent that a reader without a detailed understanding of these programs will be able to understand and use the commands and procedures I cover in this book. While learning how to use **R** and the other programs I often encountered examples that included commands, variables, and other things that were not well explained in the texts that I was reading. This caused me to waste many hours trying to figure out, for example, what the `$` in **R** is used for. I hope to save you from this wasted time by either providing a brief explanation of these possibly frustratingly mysterious conventions and/or pointing you in the direction of a good explanation. To that end, I can recommend a number of books that cover more of the nitty-gritty of **R** and the command-line. - Michael J. Crawley's encyclopaedic **R** book, appropriately titled, **The R Book** published by Wiley. - Norman Matloff's tour through the programming language aspects of **R** called **The Art of R Programming: A Tour of Statistical Software Design** published by No Starch Press. - For an excellent introduction to the command-line in Linux and Mac, though with pretty clear implications for Windows users if they are running **PowerShell** (see Chapter 2) see William E. Shotts Jr.'s book *The Linux Command-line: A Complete Introduction* also published by No Starch Press. 
- The **RStudio** website () has a number of useful tutorials on how to use `knitr` with LaTeX and Markdown. ## How to read this book. This book tells a story. It has a beginning, middle, and end. So, unlike a reference book it can and should be read like a novel, taking you through an empirical research process from an empty folder maybe called `Research Paper` to a completed set of documents that showcase your findings. That being said, readers with more experience using tools like **R** or \\( LaTeX \\) may want to skip over the nitty-gritty parts of the book that describe how to manipulate data frames or compile a \\( LaTeX \\) document into a PDF. Please feel free to do this. If you are experienced with **R** in particular you may want to skip over Chapter 3: Getting Started with **R**/**RStudio**. ## How this book was written This book practices what it preaches. It can be reproduced. It was written using the programs and methods that it describes. Full documentation and source files can be found at the Book's **GitHub** repository. Feel free to read and even copy (within reason and with attribution, of course) the Book's source code. You can find it at . This is especially useful if you want to know how to do something in the book that I don't directly cover in the text. In the same spirit, I encourage you to make your research files--not just data, but analysis code and markup--available for other researchers to learn from. Not only does reproducibility help us evaluate past work, but it also pushes forward knowledge in the scientific community. ## Contents overview. 
================================================ FILE: Old/SourceOld/Chapter10/chapter10.Rmd ================================================ \chapter{Chapter 10:} ================================================ FILE: Old/SourceOld/Chapter11/chapter11.Rmd ================================================ \chapter{Chapter 11: Presenting with LaTeX} # The Basics ## Editors ## The Header ## Headings ## Footnotes & Bibliographies ### Footnotes Plain, non-bibliographic footnotes are easy to create in LaTeX. Simply place `\footnote{` where you would like the footnote number to appear in the text. Then type in the footnote's text and, of course, remember to close it with a `}`. LaTeX does the rest, including formatting and numbering. ### Bibliographies #### Citing R Packages with \BibTeX Researchers are pretty good about consistently citing others' articles and data. However, citing the **R** packages used in an analysis is very inconsistent. This is unfortunate not only because correct attribution is not being given but also because it makes reproducibility harder because it obscures important steps that were taken in the research process. Fortunately, **R** actually includes the tools to quickly generate citations, including the version of the package you are using. It can also add them directly to an existing bibliography file. You can automatically create citations for **R** packages using the `citation` command in *base* **R**. For example, if we want the citation information for the `Zelig` package we would simply type: ```{r} citation("Zelig") ``` This gives us both the plain citation as well as the \BibTeX version for use in LaTeX and **MultiMarkdown** documents. If you are creating a LaTeX article and only want the \BibTeX version of the citation we can use the `toBibtex` command in the *utils* package. ```{r, message=FALSE} toBibtex(citation("Zelig")) ``` You can append the citation to your existing \BibTeX file using the `sink` command in *base* **R**. 
This command diverts our output and/or the messages to a file. Imagine that our existing \BibTeX file is called `bibliography.bib`. To add the *Zelig* package citation: ```{r, message=FALSE, tidy=TRUE} sink(file = "bibliography.bib", append = TRUE, type = c("output")) toBibtex(citation("Zelig")) sink() ``` This places the citation at the end of our `bibliography.bib` file. It is very important to include the argument `append = TRUE`. If you don't, you will erase the existing file. The argument `type = c("output")` tells **R** to include only the output, not the messages. An even faster way to add citations to a bibliography is with the `write.bibtex` command in the *knitcitations* package. To add the *Zelig* citation to our `bibliography.bib` file we only need to enter: ```{r, message=FALSE} library(knitcitations) write.bibtex(entry = c("Zelig"), file = "bibliography.bib", append = TRUE) ``` In Chapter 13 we'll look at the `knitcitations` package in more detail. ================================================ FILE: Old/SourceOld/Chapter12/chapter12.Rmd ================================================ \chapter{Chapter 12:} ## Editors ================================================ FILE: Old/SourceOld/Chapter13/chapter13.Rmd ================================================ \chapter{Chapter 13: Presenting on the Web and Beyond with Markdown/HTML} # The Basics ## Headings Headings in Markdown are extremely simple. To create a line in the style of the topmost heading--maybe a title--just place one hash mark (`#`) at the beginning of the line. The second tier heading just gets two hashes (`##`) and so on. You can also put the hash mark(s) at the end of the heading, but this is not necessary.
## Footnotes and bibliographies with MultiMarkdown ## Math ## Drawing figures with CSS # Simple webpages ## RPubs ## Hosting webpages with Dropbox # Presentations with `Slidify` # Reproducible websites ## Blogging with Tumblr ## Jekyll-Bootstrap and GitHub see ## Jekyll and GitHub Pages # Using Markdown for non-HTML output with Pandoc ================================================ FILE: Old/SourceOld/Chapter14/chapter14.Rmd ================================================ \chapter{Chapter 14:} ================================================ FILE: Old/SourceOld/Chapter2/chapter2.Rmd ================================================ \chapter{Chapter 2: Getting Started with Reproducible Research} # The Big Picture: A workflow for reproducible research ## Data Gathering ## Data Analysis ## Data Presentation # Practical tips for reproducible research ## Document everything We'll discuss this more later in this chapter, but one important part of reproducible research with **R** is to *record your session info*. Many things in **R** stay the same over time, which makes it easy for future researchers to recreate what was done in the past. However, things do change from one version of **R** to another. Also, the way **R** functions may be slightly different on different operating systems. Finally, you may have **R** set to load packages by default. These packages might be necessary to run your code, but other people might not be able to easily know this from just looking at your source code. The `sessionInfo` command prints a record of all of these things.
## Everything is a (text) file ## All files should be human readable ## Research projects are many files tied together ## Have a plan to organize, store, and make your files available # Introduction to the tools of reproducible research covered in this book ## **R**/**RStudio** ## `knitr` ## Cloud storage & versioning ## The command-line ## Markup languages: LaTeX & Markdown/HTML ================================================ FILE: Old/SourceOld/Chapter3/chapter3.Rmd ================================================ \chapter{Chapter 3:} ================================================ FILE: Old/SourceOld/Chapter4/chapter4.Rmd ================================================ \chapter{Chapter 4:} ================================================ FILE: Old/SourceOld/Chapter5/chapter5.Rmd ================================================ \chapter{Chapter 5: Gathering Data with R} # Importing locally stored data sets ## Single files ## Looping through multiple files # Importing data sets from the internet ## Data from non-secure (`http`) URLs ## Data from secure (`https`) URLs ## Data APIs & feeds There is a growing number of commands that can gather data directly from their sources and import them into **R**. Needless to say, this is great for reproducible research since it not only makes the data gathering process easier (you don't have to download a ton of Excel files and fiddle around with them before even getting the data into **R**), but it also makes replicating the data gathering process much more straightforward. Some examples include: - The *openair* package, which beyond providing a number of tools for analysing air quality data also has the ability to gather data directly from sources such as King's College London's London Air () database with the `importKCL` command.
# Basic web scraping ## Scraping tables ## Gathering and parsing text ================================================ FILE: Old/SourceOld/Chapter6/chapter6.Rmd ================================================ \chapter{Chapter 6: Storing, Collaborating, Accessing Files, Versioning} A stumbling block to actually reproducing a piece of research is getting a hold of the datasets and the codebooks that describe the data used in an analysis. Researchers often face a number of data management issues that, beyond making their research difficult to reproduce, can make doing the initial research difficult. First, there is the problem of **storing** the data so that it is protected against computer failure--virus infections, spilling coffee on your laptop, and so on. Fourth, we almost never create a data set or write a paper perfectly all at once. We may make changes and then realize that we liked an earlier version, or parts of an earlier version better. This is a particularly important issue in data management where we may transform our data in unintended ways and want to go back to an earlier version. Collaborative projects can have regular incidents of one author accidentally deleting something in a file that another author needed, for example. To deal with these issues we need to store our data in a system that has **version control**. Version control systems keep track of changes we make to our files and allow us to access previous versions if we like. The data set can often grow and become disorganized, perhaps even during a data transformation. This creates problems. You can solve all of these problems in a couple of different ways using free or low cost cloud-based storage formats. In this chapter we will learn how to use **Dropbox** and **GitHub** for data: - storage, - accessing, - collaboration, - version control.
# Saving data in reproducible formats Before getting into the details of cloud-based data storage, let's just consider what type of formats you should actually save your data in. A key issue for reproducibility is that others be able to not only get a hold of the exact data you used in your analysis, but be able to understand and use the data not only now, but in the future. Some file formats make this easier than others. **R** is able to read (and write) a very wide variety of file formats, mostly through the `foreign` package in `base` **R**. This includes # Storing data in the cloud Storing data locally--on your computer--or on a flash drive is generally more prone to loss than storing data on remote servers, often referred to as 'the cloud'. # Dropbox The easiest types of cloud storage for your research are services like **Dropbox** and **Google Drive**. These services typically involve a folder based on your computer's hard drive that is automatically synced with a similar folder on a cloud-based server. Typically you can sign up for the service for free and receive a limited amount of storage space (usually a few gigabytes, which should be plenty if your research is made up of text files.). Most of these services not only store your data in the cloud, but also provide some way to share files and maybe even include basic version control. I am going to focus on using **Dropbox** because it currently offers a complete set of features that allow you to store, version, collaborate, and access your data. ## Version control **Dropbox** has a simple version control system. Every time you save a document on **Dropbox** a new version is created. On the **Dropbox** website ## Accessing Data There are two similar, but importantly different ways to access data stored on **Dropbox**. All files stored on **Dropbox** have a URL address through which they can be accessed from any computer connected to the internet.
Some of these files can be easily loaded directly into **R**, while others must be manually (point-and-click) downloaded onto your computer and then loaded into **R**. The key factor is whether or not the files are located in your **Dropbox**'s *Public* folder. Files in the *Public* folder can be downloaded directly into **R**. Files not in the *Public* folder have to be downloaded manually.[^scrapeDropbox] Either way you find a file's URL address by first right-clicking on the file icon in your *Dropbox* folder. If the file is stored in the *Public* folder, you go to **Dropbox** then **Copy Public Link**. This copies the URL into your clipboard from where you can paste it into your **R** source code (or wherever). Once you have the URL you can load the file directly into **R** using the `read.table` command for dataframes or the `source` command for source files. This was covered in more detail in chapter GET. If the file is not in your *Public* folder you also go to **Dropbox** after right-clicking. Then choose **Get Link**. This will open a webpage in your default web browser from where you can download the file. You can copy and paste the page's URL from your browser's address bar. You can also get these URL links through the online version of your **Dropbox**. First log into the **Dropbox** website. When you hover your cursor over a file (or folder) name you will see a chain icon appear on the far right. Clicking on this icon will get you the link. Storing files in the *Public* folder clearly makes replication easier because the files can be downloaded and run directly in **R**. Note that you cannot save files through the URL link. You must save files in the **Dropbox** folder on your computer. # GitHub **Dropbox** does a fine job of meeting our four basic criteria for reproducible data storage. **GitHub** meets these criteria and more. **GitHub** was not explicitly designed to host research projects or even data.
It was designed to host 'socially coded' computer programs. It built an interface on top of the **git** version control system that makes it relatively easy for a number of collaborators to work together to build a computer program. This seems very far from reproducible research. However, remember that as reproducible researchers we are just building projects out of interconnected text files. This is exactly the same as computer programming. And like computer programmers, we need ways to store, version control, access, and collaborate on our text files. Because **GitHub** is very actively used by people with very similar needs (who are also really good programmers), the interface offers many highly developed and robust features for reproducible researchers. As is usually the case, **GitHub**'s added features mean that it takes a longer time than **Dropbox** to set up and become familiar. So we need good reasons to want to invest the time needed to learn **GitHub** rather than just sticking with **Dropbox** or a similar service. Here is a list of **GitHub**'s key features relative to **Dropbox** for reproducible research: - **Git** is directly integrated into **RStudio** projects (**RStudio** also supports the **subversion** version control system, but I don't cover that here). - **Dropbox**'s version control system only lets you see the file names, the times they were created, who created them, and revert back to specific versions. **git** tracks every change you make in a way that makes it relatively easy to find the version you want. The **GitHub** website and GUI programs for Mac and Windows provide nice interfaces for examining specific changes. You can also use the command-line to see changes. - **Dropbox** creates a new version every time you save a file, which can make it difficult to actually find the version you want. **git**'s version control system only creates a new version when you tell it to.
- **Dropbox** does not merge conflicting versions of a file together. This can be annoying when you are collaborating on a project and more than one author is making changes to documents. **GitHub** identifies conflicts and lets you reconcile them. - The **GitHub** website has an ''Issues'' area where you can note and discuss issues you have while doing your research. Basically this is an interactive to-do list for your research project. ### Setting Up GitHub There are a number of ways to set up **GitHub** on your computer. I will briefly cover both the command-line version (available for Windows, Mac, and Linux) and the GUI[^GUI] version currently available only for Windows and Mac. ### Version Control in GitHub **GitHub**'s version control system is much more comprehensive than **Dropbox**'s. However, it also has a steeper learning curve. #### More Practice If you want more practice setting up **GitHub** in the command-line, **GitHub** and the website Code School have an interactive tutorial that you might find interesting. You can find it at: . [^GUI]: Graphical User Interface, i.e. not the command-line version, but the one with windows that you navigate with your mouse. [^scrapeDropbox]: This is not completely true. It is possible to create a web scraper (see Chapter GET) that could download a data file from a file not in your *Public* folder. However, this is kind of a hassle and not practical, especially since accessing files from the *Public* folder is so easy.
================================================ FILE: Old/SourceOld/Chapter7/chapter7.Rmd ================================================ \chapter{Chapter 7:} ================================================ FILE: Old/SourceOld/Chapter8/chapter8.Rmd ================================================ \chapter{Chapter 8: Statistical Modelling and `knitr`} # Incorporating analyses into the markup ## Full code in the main document ### LaTeX ### Markdown ## Showing code & results inline Sometimes we want to have some **R** code or output to show up in the text of our documents. We may want to include stylized code in our text when we discuss how we did an analysis. We may want to report the mean of some variable in our text. ### LaTeX #### Static code If we just want to include a code snippet in our text we can simply use the LaTeX command `\tt`. This sets our text to 'typewriter' font, the standard font for inline code in LaTeX (I use it in this book, as you've probably noticed). #### Dynamic code If we want to dynamically show the results of some **R** code in our text we can use the `\Sexpr` command. This is a pseudo-LaTeX command. Its structure is more like a LaTeX command's structure than `knitr` in that we enclose our **R** code in curly brackets (`{}`) rather than the usual `<<>>= . . . @` syntax for code chunks. For example ### Markdown #### Static code To include static code inline in an **R Markdown** document we enclose the code in single backticks (``). #### Dynamic code To include dynamic code in an **R Markdown** document we use the backticks as before but include the letter `r` after the first one. ## Sourcing R code from another file There are a number of reasons that you might want to have your **R** source code located in a separate file from your markup even if you plan to compile them together with `knitr`.
First, it can be unwieldy to edit both your markup and long **R** source code chunks in the same document, even with **RStudio**'s handy `knitr` code collapsing and chunk management options. There are just too many things going on in one document. Second, you may want to use the same code in multiple documents--an article and presentation for example. It is nice to not have to copy and paste the same code into multiple places, but have multiple documents link to the same source code. Plus if you make changes to the source code, these changes will automatically be made across all of your presentation documents. You don't need to make the same changes multiple times. Third, other researchers trying to replicate your work might only be interested in specific parts of your analysis. If you have the analysis broken into separate and clearly labeled files it is easier for these researchers to find the specific bits of code that they are interested in compared to digging through long markup files. ### Source from a local file Usually in the early stages of research you may want to source analysis files located on your computer. Doing this is simple. The `knitr` syntax is the same as above. The only change is that instead of writing all of our code in the chunk we save it to its own file and use the `source` command in *base* **R** to access it. For example: ### Source from a non-secure URL (`http`) Sourcing from your local computer is fine if you are working alone and do not want others to access your code. Once you start collaborating and generally wanting people to be able to replicate your code, you need to use another method.[^sourceCaveat] The simplest solution to these issues is to host the replication code in your **Dropbox** public folder. You can find the file's public URL the same way we did in Chapter 6. Now use the `source` command the same way as before.
For example: ### Source from a secure URL (`https`) If you are using **GitHub** or another service that uses secure URLs the steps are generally the same, but you need to use the `source_url` command in the *devtools* package. For **GitHub** based source code find the file's URL the same way we did in Chapter 6. Remember to get the URL for the *raw* version of the file. # Saving output objects for future use # Including highlighted syntax in the output ## LaTeX ## Markdown/HTML # Debugging [^sourceCaveat]: Sure you can make the replication code accessible for download and either instruct others to change the working directory to the replication file or have them change the directory information as necessary. However, this usually just adds an extra complicating step that makes replication harder. It is also a pain if you are collaborating and each author has to constantly change the directories. ================================================ FILE: Old/SourceOld/Chapter9/chapter9.Rmd ================================================ \chapter{Chapter 9: Showing Results with Tables} Graphs and other visual methods, discussed in the next chapter, can often be a more effective way to present results than tables.[^tablerant] However, tables of results, descriptive statistics, and so on can sometimes still be an important part of communicating research. Creating tables by hand can be tedious no matter what program you are using to type up your results. Even more tedious is making changes to hand-created tables when you make changes to your data and models. Creating these tables can actually introduce new errors--post-analysis!--if you incorrectly copy what is in your **R** output. This is a very real possibility. The mind can go numb doing that sort of work. Also, creating tables by hand is not very reproducible. Fortunately, we don't actually need to create tables by hand. There are many ways to have **R** do the work for us.
The goal of this chapter is to learn how to **automate table creation** for documents produced with both LaTeX and Markdown/HTML. There are a number of ways to turn **R** objects into tables written in LaTeX or HTML markup. In this chapter I mostly focus on the `xtable` and `texreg` packages. `xtable` can create tables for both of these markup languages. `texreg` only produces output for LaTeX. `knitr` allows us to incorporate these tables directly into our documents. **Warning:** Automating table creation removes the possibility of adding errors to our analyses by incorrectly copying **R** output, which is a big potential problem in hand-created tables. Be warned, it is not an error free process. We could easily create inaccurate tables through coding errors. For example, we may incorrectly merge together columns so that our id variables no longer match the data they are supposed to. So, as always, it is important to 'eyeball' the output. Does it make sense? If we picked a couple values in the **R** output do they match what is in our final table? If not, we need to go back to the code and see where things have gone wrong. With that caveat, let's start making tables. # Table Basics Before getting into the details of how to create tables from **R** objects we need to first learn how generic tables are created in LaTeX and Markdown. ## Tables in LaTeX ## Tables in Markdown # Creating tables from R objects ## `xtable` & `texreg` basics with supported class objects ### `xtable` for LaTeX ### `xtable` for Markdown ## `xtable` with non-supported class objects ## Basic `knitr` syntax for tables The most important `knitr` chunk option for showing the markup created by these packages as tables is `results`. The `results` option can have three values: - `markup`, - `asis`, - `hide`. `hide` clearly hides the results of whatever we have in our code chunk; no results show up.
# Tables with `apsrtable` # [^tablerant]: This is especially true of the small-print, high-density coefficient estimate tables that are sometimes descriptively called 'train schedule' tables. ================================================ FILE: Old/Writing_Setup/Early_Book_Origins.md ================================================ # Description of the Origins of Reproducible Research in R and RStudio ## Christopher Gandrud --- The book began as a class called Introductory Data Analysis for Social Science that I began to put together in April and May 2012. The basic idea for the course was to have a place where students could learn the whole process of data collection, analysis, and presentation. Sure there are lots of statistics courses, some even covering the practicalities of data analysis using computers. However, all through my undergrad, masters, masters again, and PhD I never had a course that brought all of the major parts of quantitative data analysis together into one place. A big stumbling block for me and many other people I know was trying to take the information I learned in a stats class and actually use it in my own research. Just figuring out how to get new data into **R** was a really big challenge. This seemingly simple task has turned away many researchers from tools that could really help their research. I wanted a course that actually gave my students the tools to *do* research.
================================================ FILE: Old/Writing_Setup/HeaderFooter/IndvChapterFoot.tex ================================================ \bibliographystyle{plain} \bibliography{/git_repositories/Rep-Res-Book/Source/rep-res-book.bib} \end{document} ================================================ FILE: Old/Writing_Setup/HeaderFooter/IndvChapterHead.tex ================================================ \documentclass{article} \usepackage{amssymb} \usepackage{amsmath} \usepackage{graphicx} \usepackage{subfigure} %\usepackage{epsfig} \usepackage{makeidx} %\usepackage{showidx} \usepackage{multicol} \tolerance=5000 \usepackage{setspace} \doublespacing \usepackage{hyperref} \hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=cyan, urlcolor=cyan } \usepackage{dcolumn} \usepackage{booktabs} \usepackage{url} \usepackage{tikz} \begin{document} ================================================ FILE: Old/Writing_Setup/IndvChapter.sh ================================================ ########## # Shell script to create individual chapters that are compilable in LaTeX # Christopher Gandrud # Updated 18 July 2012 # Helpful information found in "The Linux Command-line: A Complete Introduction" (Shotts 2012) ########## #!/bin/bash read -p "Please enter the number of the chapter you want to compile -> " i # Make a new directory in the Desktop folder Rep-Res-Book-Test for the chapter mkdir ~/Desktop/Rep-Res-Book-Test/ mkdir ~/Desktop/Rep-Res-Book-Test/Chapter$i cd ~/Desktop/Rep-Res-Book-Test/Chapter$i # Delete \chapter command which is undefined for article class documents. 
cp /git_repositories/Rep-Res-Book/Source/Children/Chapter$i/chapter$i.Rnw chapter$i.Rnw sed '1,6d' chapter$i.Rnw > chapterNoChapter$i.Rnw # Combine the header main document and footer cat /git_repositories/Rep-Res-Book/Writing_Setup/HeaderFooter/IndvChapterHead.tex chapterNoChapter$i.Rnw /git_repositories/Rep-Res-Book/Writing_Setup/HeaderFooter/IndvChapterFoot.tex > IndvChapter$i.Rnw # Remove files that will not be used in the future. rm chapter$i.Rnw chapterNoChapter$i.Rnw echo "Finished" ================================================ FILE: Old/Writing_Setup/IndvChapter1.Rnw ================================================ \documentclass{article} \usepackage{amssymb} \usepackage{amsmath} \usepackage{graphicx} \usepackage{subfigure} %\usepackage{epsfig} \usepackage{makeidx} %\usepackage{showidx} \usepackage{multicol} \tolerance=5000 \usepackage{setspace} \doublespacing \usepackage{hyperref} \hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=cyan, urlcolor=cyan } \usepackage{dcolumn} \usepackage{booktabs} \usepackage{url} \usepackage{tikz} \begin{document} \bibliographystyle{plain} \bibliography{/git_repositories/Rep-Res-Book/Source/rep-res-book.bib} \end{document} ================================================ FILE: Old/Writing_Setup/OldScripts/ConvertRmdtoRnw.sh ================================================ ########## # Shell script for Converting Early Chapter Drafts from Markdown to LaTeX # Christopher Gandrud # Updated 16 July 2012 # Helpful information found in "The Linux Command-line: A Complete Introduction" (Shotts 2012) ########## #!/bin/bash cd /git_repositories/Rep-Res-Book # Use Pandoc to convert markdown files to LaTeX then append to new LaTeX Source files for i in `seq 1 14`; do pandoc -f markdown -t latex SourceOld/Chapter$i/chapter$i.Rmd >> Source/Chapter$i/chapter$i.Rnw done echo "Finished" ================================================ FILE: Old/Writing_Setup/OldScripts/Rmd_Book.sh 
================================================ ########## # Shell script to create directories & files for Reproducible Research in R/RStudio # With Markdown # Christopher Gandrud # Updated 28 June 2012 # Helpful information found in "The Linux Command-line: A Complete Introduction" (Shotts 2012) ########## #!/bin/bash cd /git_repositories/Rep-Res-Book mkdir Source cd /git_repositories/Rep-Res-Book/Source ## Create 14 chapters and add main source files and image directories ## Use loops for 14 Chapters for i in `seq 1 14`; do cd /git_repositories/Rep-Res-Book/Source CHP_NAME=Chapter$i mkdir $CHP_NAME cd /git_repositories/Rep-Res-Book/Source/$CHP_NAME mkdir images$i ## Template Text CHAPTER="Chapter $i" CURRENT_TIME=$(date +"%x %r %Z") cat > chapter$i.Rmd << _EOF_ \chapter{$CHAPTER:} _EOF_ done echo "Finished" ================================================ FILE: Old/Writing_Setup/ProductionNotes.md ================================================ # Reproducible Research for R and RStudio ## Production Notes ### Christopher Gandrud --- --- ## Shell Script for directory and file set up The basic directories and chapter files were created using the shell script: `Rnw_Book.sh`. The script creates a directory called `Source` and Chapter files for all 14 chapters. It also creates a `Chapter*.Rmd` file and `images` folder in each chapter file. To run this script first make sure the file is executable. To see whether or not the file is executable in the shell type: ls -l Rnw_Book.sh It should produce something like: -rwxr-xr-x@ 1 christophergandrud admin 963 28 Jun 17:35 Rnw_Book.sh If not use: chmod 755 Rnw_Book.sh To run the script just type: ./Rnw_Book.sh ## Shell Script for creating individual chapters that are compilable. I wanted to be able to compile individual chapters as I was writing them to be able to preview the output and find syntax errors.
Doing this was not straightforward because the shell script `Rnw_Book.sh` created chapter files that can be used as input in a book parent document. They lacked the header and footer material needed for compiling. They also include a `\chapter` command which is undefined in shorter *article* class documents. To deal with these issues I created another shell script: `IndvChapter.sh`. This script:

- prompts the user to give the number of the chapter that they want to compile,

- makes new directories to hold the individual chapter file,

- copies the chapter file into the new directory,

- deletes the line of code beginning with `\chapter`,

- combines the chapter file with text from files that contain appropriate header and footer material,

- deletes extraneous files.

Follow the instructions above to run the script. See Shotts (2012, Chapter 24) for more details.

================================================
FILE: Old/Writing_Setup/Rnw_Book.sh
================================================
#!/bin/bash
##########
# Shell script to create directories & files for Reproducible Research in R/RStudio
# With LaTeX
# Christopher Gandrud
# Updated 30 July 2012
# Helpful information found in "The Linux Command-line: A Complete Introduction" (Shotts 2012)
#
# Creates, under $REPO_DIR/Source:
#   Parent/Rep-Res-Parent.Rnw                       knitr parent document
#   FrontMatter/Author.tex, Foreword.Rnw, Preface.Rnw
#   Children/ChapterN/chapterN.Rnw                  chapter template (N = 1..14)
#   Children/ChapterN/imagesN/                      image directory for chapter N
#   rep-res-book.bib                                bibliography named by the parent document
#
# REPO_DIR defaults to /git_repositories/Rep-Res-Book and may be overridden
# from the environment. Files that already exist are overwritten.
##########

# Abort on the first error: without this a failed mkdir/cd would let the
# here-documents below write their files into the wrong directory.
set -euo pipefail

REPO_DIR="${REPO_DIR:-/git_repositories/Rep-Res-Book}"
SOURCE_DIR="$REPO_DIR/Source"
PARENT_DIR="$SOURCE_DIR/Parent"
FRONT_DIR="$SOURCE_DIR/FrontMatter"
CHILDREN_DIR="$SOURCE_DIR/Children"
PARENT_FILE="$PARENT_DIR/Rep-Res-Parent.Rnw"
BIB_FILE="$SOURCE_DIR/rep-res-book.bib"

if [ ! -d "$REPO_DIR" ]; then
    echo "Repository directory not found: $REPO_DIR" >&2
    exit 1
fi

# -p: create missing parents and do not fail when a directory already exists.
mkdir -p "$PARENT_DIR" "$FRONT_DIR" "$CHILDREN_DIR"

# NOTE(review): the knitr chunk headers in the templates below read `<>=` in
# this copy of the script. knitr's Rnw syntax opens a chunk with
# `<<options>>=`; confirm the headers against a working manuscript.

## Create parent document ##

CURRENT_TIME=$(date +"%x %r %Z")

# The here-document delimiters are unquoted on purpose so that $CURRENT_TIME
# and the path variables are expanded into the generated files.
cat > "$PARENT_FILE" << _EOF_
%%%%%
% Parent Document For Reproducible Research in R and RStudio
% Christopher Gandrud
% Created: $CURRENT_TIME
% Updated:
%%%%%

% !Rnw weave = knitr

\documentclass[]{krantz}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{subfigure}
%\usepackage{epsfig}
\usepackage{makeidx}
%\usepackage{showidx}
\usepackage{multicol}
\frenchspacing
\tolerance=5000
\usepackage{dcolumn}
\usepackage{booktabs}
\usepackage{url}
\usepackage{todonotes}
\usepackage{tikz}

\makeatletter
\makeatother
\makeindex

\begin{document}
\SweaveOpts{concordance=TRUE}

\title{Reproducible Research with R and RStudio: Data Gathering, Analysis, \& Presentation}
\author{Christopher Gandrud}
\maketitle

\frontmatter
<>=
@

\listoffigures
\listoftables
\tableofcontents

\mainmatter
\setcounter{page}{1}

\part{Getting Started}
<>=
@

\part{Data Gathering and Storage}
<>=
@

\part{Analysis and Results}
<>=
@

\part{Presentation Documents}
<>=
@

\bibliographystyle{plain}
\bibliography{$BIB_FILE}

\clearpage
\printindex

\end{document}
_EOF_

## Create FrontMatter ##

## Author
cat > "$FRONT_DIR/Author.tex" << _EOF_
\chapter*{Author}
\contributor{Christopher Gandrud}{Yonsei University}{Wonju, Republic of Korea}

FILL IN
_EOF_

## Foreword
cat > "$FRONT_DIR/Foreword.Rnw" << _EOF_
<>=
set_parent('$PARENT_FILE')
@

\chapter*{Foreword}
_EOF_

## Preface
cat > "$FRONT_DIR/Preface.Rnw" << _EOF_
<>=
set_parent('$PARENT_FILE')
@

\chapter*{Preface}
_EOF_

## Create 14 chapters and add child source files and image directories ##
## Use loops for 14 Chapters

for i in $(seq 1 14); do
    CHP_NAME="Chapter$i"
    # Each chapter lives in Source/Children/ChapterN next to its images.
    mkdir -p "$CHILDREN_DIR/$CHP_NAME/images$i"

    ## Template Text
    CHAPTER="Chapter $i"
    cat > "$CHILDREN_DIR/$CHP_NAME/chapter$i.Rnw" << _EOF_
<>=
set_parent('$PARENT_FILE')
@
%%%%%
% $CHAPTER For Reproducible Research in R and RStudio
% Christopher Gandrud
% Created: $CURRENT_TIME
% Updated:
%%%%%

\chapter{}

FILL IN

_EOF_
done

## Create BibTeX file
# Written to the path that \bibliography{} in the parent document points at.
cat > "$BIB_FILE" << _EOF_
% Bibliography For Reproducible Research in R and RStudio
% Christopher Gandrud
% Created: $CURRENT_TIME
_EOF_

echo "Finished"

================================================
FILE: Old/Writing_Setup/TableofContentPDF/GandrudRep-Res-Book-TOC.fdb_latexmk
================================================
# Fdb version 3
["pdflatex"] 1359598830 "GandrudRep-Res-Book-TOC.tex" "GandrudRep-Res-Book-TOC.pdf" "GandrudRep-Res-Book-TOC"
  "/git_repositories/Rep-Res-Book/Source/Rep-Res-Parent.toc" 1359598237 21869 3f0bbc975b11974f96103567fe0d9816 ""
  "/usr/local/texlive/2011/texmf-dist/tex/latex/base/bk10.clo" 1254151887 8882 41e4149a0e29b927a7d5a0e13d2ebab4 ""
  "/usr/local/texlive/2011/texmf-dist/tex/latex/base/omscmr.fd" 1254151887 2109 c400bd6c901edc4a09e4d53fcffd3b3c ""
  "/usr/local/texlive/2011/texmf-dist/tex/latex/fancyhdr/fancyhdr.sty" 1160175134 20521 e5d13d98d57bd53d4fed3aa61bd29c86 ""
  "GandrudRep-Res-Book-TOC.aux" 1359598830 8 a94a2480d3289e625eea47cd1b285758 ""
  "GandrudRep-Res-Book-TOC.tex" 1359598829 427 140103c644b85c19fb83a4b4454ba26f ""
  "krantz.cls" 1252659577 59742 6b88abc1b7e64e7d10cb07088c204342 ""
  (generated)
  "GandrudRep-Res-Book-TOC.aux"
  "GandrudRep-Res-Book-TOC.log"
  "GandrudRep-Res-Book-TOC.pdf"

================================================
FILE: Old/Writing_Setup/TableofContentPDF/GandrudRep-Res-Book-TOC.tex
================================================
\documentclass[krantz1]{krantz}

% Load required LaTeX packages
\usepackage[authoryear]{natbib}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{graphicx}
%\usepackage{caption}
\usepackage{subfigure}
%\usepackage{epsfig}
\usepackage{makeidx}
%\usepackage{showidx}
\usepackage{multicol}
\frenchspacing
\tolerance=5000
\usepackage[usenames,dvipsnames,svgnames]{xcolor}
\usepackage{dcolumn}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{lscape}
\usepackage{url}
\usepackage{todonotes}
\usepackage{tikz}
\usetikzlibrary{trees}
\usetikzlibrary{decorations.pathmorphing}
\usetikzlibrary{shapes,arrows}
\usepackage{wrapfig}
\usepackage{alltt}
\usepackage{fancyhdr}
\rhead{Christopher Gandrud (\emph{Draft: Current as of \today})}

\begin{document}

\title{Reproducible Research with R and RStudio \\[2cm] {\large{Chapman \& Hall/CRC Press}} \\ {\normalsize{Expected Publication: August 2013}}}
\author{Christopher Gandrud}
\maketitle

\pagestyle{fancy}
\input{/git_repositories/Rep-Res-Book/Source/Rep-Res-Parent.toc}

\end{document}

================================================
FILE: Old/Writing_Setup/TableofContentPDF/krantz.cls
================================================
%%
%% This is file `Krantz.cls'
%%% Created by Shashi Kumar / ITC [August 2008]
\NeedsTeXFormat{LaTeX2e}[1995/12/01]
\ProvidesClass{krantz}
              [2005/09/16 v1.4f Standard LaTeX document class]
\newcommand\@ptsize{}
\newif\if@restonecol
\newif\if@titlepage
\@titlepagetrue
\newif\if@openright
\newif\if@mainmatter \@mainmattertrue
\if@compatibility\else
\DeclareOption{a4paper}
   {\setlength\paperheight {297mm}%
    \setlength\paperwidth {210mm}}
\DeclareOption{a5paper}
   {\setlength\paperheight {210mm}%
    \setlength\paperwidth {148mm}}
\DeclareOption{b5paper}
   {\setlength\paperheight {250mm}%
    \setlength\paperwidth {176mm}}
\DeclareOption{letterpaper}
   {\setlength\paperheight {11in}%
    \setlength\paperwidth {8.5in}}
\DeclareOption{legalpaper}
   {\setlength\paperheight {14in}%
    \setlength\paperwidth {8.5in}}
\DeclareOption{executivepaper}
   {\setlength\paperheight {10.5in}%
    \setlength\paperwidth {7.25in}}
\DeclareOption{landscape}
   {\setlength\@tempdima {\paperheight}%
    \setlength\paperheight {\paperwidth}%
    \setlength\paperwidth {\@tempdima}}
\fi
\if@compatibility
  \renewcommand\@ptsize{0}
\else
\DeclareOption{10pt}{\renewcommand\@ptsize{0}}
\fi
\DeclareOption{11pt}{\renewcommand\@ptsize{1}}
\DeclareOption{12pt}{\renewcommand\@ptsize{2}}
\if@compatibility\else
\DeclareOption{oneside}{\@twosidefalse \@mparswitchfalse}
\fi
\DeclareOption{twoside}{\@twosidetrue \@mparswitchtrue}
\DeclareOption{draft}{\setlength\overfullrule{5pt}}
\if@compatibility\else
\DeclareOption{final}{\setlength\overfullrule{0pt}}
\fi
% ---- Class options (continued) and option processing ----
% Standard book-class switches: titlepage/notitlepage, openright/openany,
% onecolumn/twocolumn, leqno, fleqn and openbib.
% Class-specific switches, each a \newif flag set to true by the option of
% the same name: numbysec, numberinsequence, NoCaptionBreak, sevenbyten, cip,
% times, chapnumonly, ChapterResetsPage, ChapterTOCs, EOCRefs,
% SuperscriptCites, UnnumberedReferences, pdf, plus krantz1 (\if@krantza)
% and krantz2 (\if@krantzb).
% krantz1/krantz2 are declared before their \newif; the option code is only
% executed by \ProcessOptions, by which point both flags exist.
% Defaults: letterpaper,10pt,twoside,onecolumn,final,openright.
% After \ProcessOptions the scale factor \helv@scale (.82) is defined and the
% "helvetica" font family is declared for the T1 encoding.
\DeclareOption{titlepage}{\@titlepagetrue} \if@compatibility\else \DeclareOption{notitlepage}{\@titlepagefalse} \fi \if@compatibility \@openrighttrue \else \DeclareOption{openright}{\@openrighttrue} \DeclareOption{openany}{\@openrightfalse} \fi \if@compatibility\else \DeclareOption{onecolumn}{\@twocolumnfalse} \fi \DeclareOption{twocolumn}{\@twocolumntrue} \DeclareOption{leqno}{\input{leqno.clo}} \DeclareOption{fleqn}{\input{fleqn.clo}} \DeclareOption{openbib}{% \AtEndOfPackage{% \renewcommand\@openbib@code{% \advance\leftmargin\bibindent \itemindent -\bibindent \listparindent \itemindent \parsep \z@ }% \renewcommand\newblock{\par}}% } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newif\if@numbysec \DeclareOption{numbysec}{\@numbysectrue} \newif\if@numberinsequence \DeclareOption{numberinsequence}{\@numberinsequencetrue} \newif\if@nocaptionbreak \DeclareOption{NoCaptionBreak}{\@nocaptionbreaktrue} \newif\if@sevenbyten \DeclareOption{sevenbyten}{\@sevenbytentrue} \newif\if@cip \DeclareOption{cip}{\@ciptrue} \newif\if@times \DeclareOption{times}{\@timestrue} \newif\if@chapnumonly \DeclareOption{chapnumonly}{\@chapnumonlytrue} \newif\if@ChapterResetsPage \DeclareOption{ChapterResetsPage}{\@ChapterResetsPagetrue} \newif\if@ChapterTOCs \DeclareOption{ChapterTOCs}{\@ChapterTOCstrue} \newif\if@EOCRefs \DeclareOption{EOCRefs}{\@EOCRefstrue}% \newif\if@SuperscriptCites \DeclareOption{SuperscriptCites}{\@SuperscriptCitestrue}% \newif\if@UnnumberedReferences \DeclareOption{UnnumberedReferences}{\@UnnumberedReferencestrue}% \newif\if@pdf \DeclareOption{pdf}{\@pdftrue} \DeclareOption{krantz1}{\@krantzatrue} \newif\if@krantza \DeclareOption{krantz2}{\@krantzbtrue} \newif\if@krantzb %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \ExecuteOptions{letterpaper,10pt,twoside,onecolumn,final,openright} \ProcessOptions %%%%%%%%%%%%%%%%%%% \def\helv@scale{.82} % \DeclareFontFamily{T1}{helvetica}{}% 
% ---- Font shapes for the scaled "helvetica" family (T1, then OT1) ----
% Each shape loads a phv* font scaled by \helv@scale; the T1 bx/sl shape is
% silently substituted by helvetica/b/it.
% NOTE(review): the OT1 b/sc and bx/sc shapes load phvbc8t (an "8t" name, as
% used in the T1 block) while every other OT1 shape loads a "7t" font --
% confirm whether phvbc7t was intended.
% The size macros \@xipt, \@xviiipt and \@xxivpt and the first named
% font-switch command (\ContributorAffiliationFont) follow.
\DeclareFontShape{T1}{helvetica}{m}{n}{<->s*[\helv@scale]phvr8t}{}% \DeclareFontShape{T1}{helvetica}{m}{it}{<->s*[\helv@scale]phvro8t}{}% \DeclareFontShape{T1}{helvetica}{m}{sc}{<->s*[\helv@scale]phvrc8t}{}% \DeclareFontShape{T1}{helvetica}{b}{n}{<->s*[\helv@scale]phvb8t}{}% \DeclareFontShape{T1}{helvetica}{b}{it}{<->s*[\helv@scale]phvbo8t}{}% \DeclareFontShape{T1}{helvetica}{m}{sl}{<->s*[\helv@scale]phvro8t}{}% \DeclareFontShape{T1}{helvetica}{b}{sc}{<->s*[\helv@scale]phvbc8t}{}% \DeclareFontShape{T1}{helvetica}{b}{sl}{<->s*[\helv@scale]phvbo8t}{}% \DeclareFontShape{T1}{helvetica}{bx}{n}{<->s*[\helv@scale]phvb8t}{}% \DeclareFontShape{T1}{helvetica}{bx}{it}{<->s*[\helv@scale]phvbo8t}{}% \DeclareFontShape{T1}{helvetica}{bx}{sc}{<->s*[\helv@scale]phvbc8t}{}% \DeclareFontShape{T1}{helvetica}{bx}{sl}{<->ssub * helvetica/b/it}{}% \DeclareFontFamily{OT1}{helvetica}{}% \DeclareFontShape{OT1}{helvetica}{m}{n}{<->s*[\helv@scale]phvr7t}{}% \DeclareFontShape{OT1}{helvetica}{m}{it}{<->s*[\helv@scale]phvro7t}{}% \DeclareFontShape{OT1}{helvetica}{m}{sc}{<->s*[\helv@scale]phvrc7t}{}% \DeclareFontShape{OT1}{helvetica}{b}{n}{<->s*[\helv@scale]phvb7t}{}% \DeclareFontShape{OT1}{helvetica}{b}{it}{<->s*[\helv@scale]phvbo7t}{}% \DeclareFontShape{OT1}{helvetica}{m}{sl}{<->s*[\helv@scale]phvro7t}{}% \DeclareFontShape{OT1}{helvetica}{b}{sc}{<->s*[\helv@scale]phvbc8t}{}% \DeclareFontShape{OT1}{helvetica}{b}{sl}{<->s*[\helv@scale]phvbo7t}{}% \DeclareFontShape{OT1}{helvetica}{bx}{n}{<->s*[\helv@scale]phvb7t}{}% \DeclareFontShape{OT1}{helvetica}{bx}{it}{<->s*[\helv@scale]phvbo7t}{}% \DeclareFontShape{OT1}{helvetica}{bx}{sc}{<->s*[\helv@scale]phvbc8t}{}% \DeclareFontShape{OT1}{helvetica}{bx}{sl}{<->s*[\helv@scale]phvbo7t}{}% %%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%% Font Defined %%%%%%%%%%%%%%%%% \def\@xipt{11} \def\@xviiipt{18} \def\@xxivpt{24} \newcommand\ContributorAffiliationFont{\reset@font\fontsize{10}{12}\raggedright\selectfont} 
% ---- Named font-switch commands ----
% One \newcommand per typographic element (contributor name, title page,
% page number, chapter number/title, section heads, running head, figure
% captions, table parts, TOC, ...). Each selects a fixed size/leading with
% \fontsize{..}{..} plus a series/shape and ends in \selectfont.
% Afterwards the size-dependent book option file bk1<\@ptsize>.clo is input
% and \lineskip, \normallineskip, \baselinestretch, \parskip and
% \@lowpenalty are set.
\newcommand\ContributorNameFont{\reset@font\fontsize{10}{12}\bfseries\raggedright\selectfont} \newcommand\TitlePageTitleFont{\fontsize{24}{28}\slshape\bfseries\selectfont} \newcommand\PageNumFont{\reset@font\fontsize{10}{12}\selectfont} \newcommand\ChapNumFont{\reset@font\fontsize{24}{24}\bfseries\selectfont} \newcommand\ChapTitleFont{\reset@font\fontsize{18}{20}\slshape\selectfont} \newcommand\SectionHeadFont{\fontsize{12}{14}\bfseries\selectfont} \newcommand\SubsectionHeadFont{\fontsize{11}{13}\bfseries\selectfont} \newcommand\SubsubsectionHeadFont{\fontsize{10}{12}\bfseries\selectfont} \newcommand\ParagraphHeadFont{\fontsize{10}{12}\itshape\selectfont} \newcommand\SubParagraphHeadFont{\fontsize{10}{12}\itshape\selectfont} \newcommand\FMHeadFont{\reset@font\fontsize{18}{20}\slshape\bfseries\selectfont} \newcommand\RunningHeadFont{\fontsize{10}{12}\itshape\selectfont} \newcommand\NameFont{\fontsize{10}{12}\itshape\selectfont} \newcommand\AffiliationFont{\fontsize{8}{10}\selectfont} \newcommand\FigCapFont{\fontsize{10}{12}\bfseries\selectfont} \newcommand\FigCapBIFont{\fontsize{10}{12}\bfseries\itshape\selectfont} \newcommand\TableColHeadFont{\fontsize{10}{12}\bfseries\selectfont} \newcommand\TableTitleFont{\fontsize{10}{12}\selectfont} \newcommand\TableNumberFont{\fontsize{11}{13}\bfseries\selectfont} \newcommand\TableBodyFont{\reset@font\fontsize{9}{11}\selectfont} \newcommand\TableSubheadFont{\reset@font\fontsize{9}{11}\selectfont} \newcommand\TableFootnoteFont{\reset@font\fontsize{8}{10}\selectfont} \newcommand\CAPlusOneFont{\fontsize{10}{12}\bfseries\selectfont} \newcommand\CAAPlusOneFont{\fontsize{10}{12}\itshape\selectfont} \newcommand\tocfont{\fontsize{10}{12}\selectfont} \newcommand\extraFont{\fontsize{24}{28}\selectfont} \newcommand\VfFont{\fontsize{10}{12}\selectfont} %%%%%%%%%%%%%%%%% \input{bk1\@ptsize.clo} \setlength\lineskip{1\p@} \setlength\normallineskip{1\p@} \renewcommand\baselinestretch{} \setlength\parskip{0\p@ \@plus \p@} \@lowpenalty 51 
% ---- Penalties, float parameters and page layout ----
% Sets \@medpenalty, \@highpenalty and the list/paragraph penalties, the
% float placement counters and fractions, margins and header/footer
% dimensions, then declares the trim dimensions (\trimheight = 9in,
% \trimwidth = 6in).
% The text block and trim offsets are chosen by class option: krantz1
% (\if@krantza), krantz2 (\if@krantzb), or the default branch that begins at
% the end of this line.
\@medpenalty 151 \@highpenalty 301 \@beginparpenalty -\@lowpenalty \@endparpenalty -\@lowpenalty \@itempenalty -\@lowpenalty % \clubpenalty=0 % 'Club line' at bottom of page. \widowpenalty=10000 % 'Widow line' at top of page. \setcounter{topnumber}{2} \renewcommand\topfraction{.7} \setcounter{bottomnumber}{1} \renewcommand\bottomfraction{.3} \setcounter{totalnumber}{3} \renewcommand\textfraction{.2} \renewcommand\floatpagefraction{.5} \setcounter{dbltopnumber}{2} \renewcommand\dbltopfraction{.7} \renewcommand\dblfloatpagefraction{.5} % **************************************** % * PAGE LAYOUT * % **************************************** % % All margin dimensions measured from a point one inch from top and side % of page. % % SIDE MARGINS: % \oddsidemargin 6pc %5pc \evensidemargin 5.7pc %5pc \marginparwidth 4pc \marginparsep 1pc \topmargin 12pt %0pt \headheight 12pt \headsep 12pt \footskip 2pc % % DIMENSION OF TEXT: \newdimen\trimheight \newdimen\trimwidth \newdimen\normaltextheight \newdimen\tempa \newdimen\tempdimen % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Parameter Initializaton %%%%%%%%%%%%%%%%%%%%%%%%%% % \newdimen\htrim \newdimen\vtrimtop \newdimen\vtrimbot \setlength\trimheight{9in} \setlength\trimwidth{6in} % % \if@krantza \textheight = 45\baselineskip %\advance\textheight by \topskip \addtolength\textheight{3pt} \textwidth 28pc \addtolength\textwidth{.5pt} \topmargin0in \oddsidemargin1.1875in \evensidemargin1.1875in \htrim.7365in \vtrimtop1.068in \vtrimbot1.068in \hoffset-15pt \voffset39pt \let\normaltextheight\textheight \else\if@krantzb \textheight = 51pc % \advance\textheight by \topskip \textwidth 33pc \topmargin0in \oddsidemargin.5in \evensidemargin.5in \htrim.75in \vtrimtop.8607in \vtrimbot1.027in \hoffset-.1in \voffset-.15in%.04in \let\normaltextheight\textheight \else \textheight = 43\baselineskip %\advance\textheight by \topskip \addtolength\textheight{3pt} \textwidth 26pc \addtolength\textwidth{.5pt} \topmargin0in \oddsidemargin1.1875in 
% Remainder of the default (neither krantz1 nor krantz2) layout branch,
% column and footnote spacing, then the trim (corner) marks:
% \make@cornermarks builds the four corner boxes and \top@cornermarks places
% them around the text block using \htrim, \vtrimtop and \vtrimbot.
% Page styles begin here: "plain", the \even@head/\odd@head running heads,
% \@the@page, and the two-sided/one-sided definitions of "headings".
\evensidemargin1.1875in \htrim5.05pc \vtrimtop7.7pc \vtrimbot5.44pc % \hoffset-5pt \voffset45pt \let\normaltextheight\textheight \fi \fi % \columnsep 1pc \columnseprule 0pt % % FOOTNOTES % \footnotesep 6.65pt \skip\footins 12pt plus 3pt minus 1.5pt % %%%% Trim marks %%%%%%%%%%% \newsavebox\ul@box \newsavebox\ur@box \newsavebox\ll@box \newsavebox\lr@box \def\top@cornermarks{% \hskip-\htrim \vbox to 0\p@{\vskip-\vtrimtop\llap{\copy\ul@box}\vss}% \vbox to 0\p@{\vskip-\vtrimtop\rlap{\hskip\textwidth\hskip2\htrim\copy\ur@box}\vss}% \vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\llap{\copy\ll@box}\vss}% \vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\rlap{\hskip\textwidth\hskip2\htrim\copy\lr@box}\vss}% \hskip\htrim} \def\make@cornermarks{% \sbox\ul@box{\rule{18\p@}{.25\p@}\hskip8\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}% \sbox\ur@box{\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}% \sbox\ll@box{\rule{18\p@}{.25\p@}\hskip8\p@\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}% \sbox\lr@box{\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}} %%%%%%%%%%%%%%%%%%%% End Trim Marks %%%%%%%%%%%% \def\ps@plain{\let\@mkboth\@gobbletwo \let\@oddhead\top@cornermarks%\@empty \def\@oddfoot{\reset@font\hfil\thepage \hfil}\let\@evenhead\@empty\let\@evenfoot\@oddfoot} \def\even@head{% \top@cornermarks {\@the@page\RunningHeadFont \hfill \leftmark }} \def\odd@head{% \top@cornermarks \hfil{\RunningHeadFont \rightmark } \hfill \@the@page } \def\@the@page{{\PageNumFont\thepage}} \if@twoside \def\ps@headings{% \let\@mkboth\@gobbletwo \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \let\@oddfoot\@empty \let\@evenfoot\@empty \let\@evenhead\even@head \let\@oddhead\odd@head \fi } \else \def\ps@headings{\let\@mkboth\@gobbletwo% \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty 
% Page styles continued: end of "headings", then "myheadings", "empty" and
% "folio" (its footer prints the two CIP boxes under a rule when \if@cip is
% set, followed by the page number).
% \HeadingsBookChapter and \HeadingsChapterSection choose what the
% running-head marks contain. \pdfon/\pdfoff toggle \if@pdf.
% \@cip stores its two arguments in \@cipboxa/\@cipboxb and records the wider
% of the two in \@ciprulewidth (with the pdf option it instead expands to a
% fixed copyright line). \maketitle (title-page variant) starts at the end
% of this line.
\def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \let\@oddfoot\@empty \let\@evenfoot\@empty \let\@evenhead\even@head \let\@oddhead\odd@head \fi } \fi \def\ps@myheadings{% \let\@oddfoot\@empty\let\@evenfoot\@empty \def\@evenhead{\thepage\hfil\slshape\leftmark}% \def\@oddhead{{\slshape\rightmark}\hfil\thepage}% \let\@mkboth\@gobbletwo \let\chaptermark\@gobble \let\sectionmark\@gobble } \def\ps@empty{% \let\@mkboth\@gobbletwo \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \make@cornermarks \let\@oddhead\top@cornermarks \let\@evenhead\top@cornermarks \let\@oddfoot\@empty \let\@evenfoot\@empty \fi } \def\ps@folio{% \let\@mkboth\@gobbletwo \if@pdf \let\@evenhead\@empty \let\@oddhead\@empty \def\@oddfoot{\@cip\hfil}% \def\@evenfoot{\@cip\hfil}% \else \let\@oddhead\top@cornermarks \def\@oddfoot{% \parindent\z@ \baselineskip7\p@ \hbox{% \textwidth\@ciprulewidth \vbox{% \if@cip\rule{\@ciprulewidth}{.25pt}\par \hbox{\vbox{\noindent\copy\@cipboxa\par\noindent\copy\@cipboxb}}\fi}} \hfill\@the@page} \let\@evenhead\top@cornermarks%\odd@head \let\@evenfoot\@oddfoot \fi } \newcommand\HeadingsBookChapter{% \def\chaptermark##1{% \markboth{\@title}{% ##1}}% \def\sectionmark##1{}} \def\HeadingsChapterSection{% \def\chaptermark##1{% \markboth{% ##1}{}}% \def\sectionmark##1{% \markright{% ##1}}} \def\pdfon{\@pdftrue} \def\pdfoff{\@pdffalse} \if@pdf \def\@cip{{\fontsize{6\p@}{8\p@}\selectfont\copyright 2001 by CRC Press LLC}} \else \newsavebox\@cipboxa \newsavebox\@cipboxb \newdimen\@ciprulewidth \def\@cip#1#2{% \sbox\@cipboxa{\fontsize{6\p@}{8\p@}\selectfont #1}% \sbox\@cipboxb{\fontsize{6\p@}{8\p@}\selectfont #2}% \@ciprulewidth\wd\@cipboxa \ifnum\@ciprulewidth<\wd\@cipboxb\@ciprulewidth\wd\@cipboxb\fi}% \fi \if@pdf \else \AtBeginDocument{% \@cip{\rule{0pt}{9pt}0-8493-0052-5/00/\$0.00+\$.50}% {\copyright\ \ 2001 by CRC Press LLC}}% \fi \if@titlepage \newcommand\maketitle{\begin{titlepage}% 
% Body of the title-page \maketitle: author (\NameFont, uppercased),
% affiliation (\@affiliation), \crcrule and the title, after which the title
% commands are disabled with \global\let.
% NOTE(review): \@affiliation and \crcrule are not defined in the part of the
% class visible up to this point -- presumably defined later; confirm.
% The non-title-page \maketitle variant starts after \else.
\let\footnotesize\small \let\footnoterule\relax \let \footnote \thanks {\parindent \z@ \raggedright \baselineskip \z@ \lineskip \z@ \parskip \z@ \vbox{ \vskip -7bp {\baselineskip 10bp\lineskip 10bp\NameFont\uppercase{\@author}\par} \vskip 6bp \AffiliationFont \@affiliation \vskip -2bp \crcrule \vskip 22bp {\baselineskip 24bp\lineskip 24bp\TitlePageTitleFont\@title\par}}} \@thanks \vfil\null \end{titlepage}% \setcounter{footnote}{0}% \global\let\thanks\relax \global\let\maketitle\relax \global\let\@thanks\@empty \global\let\@author\@empty \global\let\@date\@empty % \global\let\@title\@empty \global\let\title\relax \global\let\author\relax \global\let\date\relax \global\let\and\relax } \else \newcommand\maketitle{\par \begingroup \renewcommand\thefootnote{\@fnsymbol\c@footnote}% \def\@makefnmark{\rlap{\@textsuperscript{\normalfont\@thefnmark}}}% \long\def\@makefntext##1{\parindent 1em\noindent \hb@xt@1.8em{% \hss\@textsuperscript{\normalfont\@thefnmark}}##1}% \if@twocolumn \ifnum \col@number=\@ne \@maketitle \else \twocolumn[\@maketitle]% \fi \else \newpage \global\@topnum\z@ % Prevents figures from going at top of page. 
% End of the non-title-page \maketitle and its helper \@maketitle, followed
% by the sectioning counters and their printed forms. \frontmatter and
% \mainmatter switch \if@mainmatter and the page-numbering style (roman /
% arabic); \backmatter only clears \if@mainmatter. \part starts a part page
% and dispatches to \@part or \@spart via \secdef.
\@maketitle \fi \thispagestyle{empty}\@thanks \endgroup \setcounter{footnote}{0}% \global\let\thanks\relax \global\let\maketitle\relax \global\let\@maketitle\relax \global\let\@thanks\@empty \global\let\@author\@empty \global\let\@date\@empty \global\let\@title\@empty \global\let\title\relax \global\let\author\relax \global\let\date\relax \global\let\and\relax } \def\@maketitle{% \newpage \null \vskip 2em% {\parindent \z@ \raggedright \baselineskip \z@ \lineskip \z@ \parskip \z@ \vbox{ \vskip -7bp {\baselineskip 10bp\lineskip 10bp\NameFont\uppercase{\@author}\par} \vskip 6bp \AffiliationFont \@affiliation \vskip 10bp \crcrule \vskip 26bp {\baselineskip 24bp\lineskip 24bp\TitlePageTitleFont\@title\par}}} \par \vskip 1.5em} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand*\chaptermark[1]{} \setcounter{secnumdepth}{3} \newcounter {part} \newcounter {chapter} \newcounter {section}[chapter] \newcounter {subsection}[section] \newcounter {subsubsection}[subsection] \newcounter {paragraph}[subsubsection] \newcounter {subparagraph}[paragraph] \renewcommand \thepart {\@Roman\c@part} \renewcommand \thechapter {\@arabic\c@chapter} \renewcommand \thesection {\thechapter.\@arabic\c@section} \renewcommand\thesubsection {\thesection.\@arabic\c@subsection} \renewcommand\thesubsubsection{\thesubsection .\@arabic\c@subsubsection} \renewcommand\theparagraph {\thesubsubsection.\@arabic\c@paragraph} \renewcommand\thesubparagraph {\theparagraph.\@arabic\c@subparagraph} \newcommand\@chapapp{\chaptername} \newcommand\frontmatter{% \cleardoublepage \@mainmatterfalse \pagenumbering{roman}} \newcommand\mainmatter{% \cleardoublepage \@mainmattertrue \pagenumbering{arabic}} \newcommand\backmatter{% \if@openright \cleardoublepage \else \clearpage \fi \@mainmatterfalse} \newcommand\part{\make@cornermarks% \if@openright \cleardoublepage \else \clearpage \fi \thispagestyle{plain}% \if@twocolumn \onecolumn \@tempswatrue \else \@tempswafalse \fi \null\vfil \secdef\@part\@spart} 
% ---- Parts and chapters ----
% \@part/\@spart typeset numbered/starred part pages; \@endpart finishes them.
% \@trplarg lets a command take up to two optional arguments and always
% calls its target as [#1][#2]{#3}: with no optional argument both default
% to the mandatory one, with a single optional argument it is used for both.
% \chapter starts a new page, selects the "headings" page style with "folio"
% on the opening page, and dispatches to \@schapter (starred) or
% \@chapter[TOC entry][running head]{title}.
\def\@part[#1]#2{% \ifnum \c@secnumdepth >-2\relax \refstepcounter{part}% \addcontentsline{toc}{part}{\thepart\hspace{1em}#1}% \else \addcontentsline{toc}{part}{#1}% \fi \markboth{}{}% {\centering \interlinepenalty \@M \normalfont \ifnum \c@secnumdepth >-2\relax \huge\bfseries \partname\nobreakspace\thepart \par \vskip 20\p@ \fi \Huge \bfseries #2\par}% \@endpart} \def\@spart#1{% {\centering \interlinepenalty \@M \normalfont \Huge \bfseries #1\par}% \@endpart} \def\@endpart{\vfil\newpage \if@twoside \if@openright \null \thispagestyle{empty}% \newpage \fi \fi \if@tempswa \twocolumn \fi} \if@ChapterTOCs \newwrite\@chaptoc \def\secnumwidth{21pt}\def\subsecnumwidth{30pt}\def\ssubsecnumwidth{36pt}\fi \long\def\@trplarg#1{\@ifnextchar[{\@xtrplarg{#1}}{\@ztrplarg{#1}}} \long\def\@xtrplarg#1[#2]{\@ifnextchar[{#1[#2]}{\@ytrplarg{#1}[{#2}]}} \long\def\@ytrplarg#1[#2]#3{#1[{#2}][{#2}]{#3}} \long\def\@ztrplarg#1#2{#1[{#2}][{#2}]{#2}} \newcommand\chapter{\if@openright\cleardoublepage\else\clearpage\fi \make@cornermarks \cleardoublepage \if@ChapterTOCs\if@filesw\immediate\closeout\@chaptoc\fi\fi \pagestyle{headings}% \thispagestyle{folio}% \if@ChapterResetsPage\global\c@page\@ne\fi \global\@topnum\z@ \gdef\chapterauthor{\@ca}% \gdef\endchapterauthors{\end@cas}% \@afterindentfalse % \secdef\@chapter\@schapter \@ifstar{\@schapter}{\@trplarg{\@chapter}}} \def\@chapter[#1][#2]#3{% \ifnum\c@secnumdepth>\m@ne \if@mainmatter \refstepcounter{chapter}% \typeout{\@chapapp\space\thechapter.}% \addcontentsline{toc}{chapter}{\protect\numberline{\thechapter}#1}% \else \addcontentsline{toc}{chapter}{#1}\fi \else \addcontentsline{toc}{chapter}{#1}\fi \chaptermark{% #2}% \addtocontents{lof}{\protect\addvspace{10\p@}}% \addtocontents{lot}{\protect\addvspace{10\p@}}% \if@twocolumn \@topnewpage[\@makechapterhead{#3}]% \else \@makechapterhead{#3}% \@afterheading\fi \if@ChapterTOCs\if@filesw\immediate\openout\@chaptoc\thechapter.toc\fi\fi } \def\@makechapterhead#1{% {\parindent \z@ \raggedright 
% Chapter heads: \@makechapterhead (numbered; also emits the per-chapter
% contents via \make@chaptoc when ChapterTOCs is set) and \@makeschapterhead
% (starred). Then the class's own \@startsection, which routes unstarred
% headings through \@trplarg so that sections also accept
% [TOC entry][running head]{title}; \@ssect handles the starred form and
% \@sect the numbered form (title #9, mark #8, TOC entry #7).
\baselineskip \z@ \lineskip \z@ \parskip \z@ \vbox{ \vskip -2\p@ \ChapNumFont %Remove comment if "Chapter" word required before Number %\if@chapnumonly\else % \@chapapp\ %\fi \thechapter \vskip -15\p@ \chap@rule \vskip 6\p@ {\baselineskip 20\p@\lineskip 20\p@\ChapTitleFont #1\par\vskip-15pt}% \noindent\hbox{\vrule height.5pt width84pt} \vskip28\p@} \if@ChapterTOCs \make@chaptoc \else \fi \vskip 19.3\p@} \def\theequation{\thechapter.\arabic{equation}}}% \def\@schapter#1{\if@twocolumn \@topnewpage[\@makeschapterhead{#1}]% \else \@makeschapterhead{#1}% \@afterheading \fi} \def\@makeschapterhead#1{% {\parindent \z@ \raggedright \baselineskip 6\p@ \lineskip \z@ \parskip \z@ \vbox{ \vskip 22\p@ \unnumchap@rule \vskip 5\p@ \FMHeadFont #1\par\vskip-12pt \noindent\hbox{\vrule height.5pt width84pt} \vskip 41\p@}}% \def\theequation{\thechapter.\arabic{equation}}} \def\@startsection#1#2#3#4#5#6{% \if@noskipsec\leavevmode\fi \par \@tempskipa #4\relax \@afterindenttrue \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \@afterindentfalse \fi \if@nobreak \everypar{}% \else \addpenalty\@secpenalty\addvspace\@tempskipa \fi \@ifstar {\@ssect{#1}{#3}{#4}{#5}{#6}}% {\@trplarg{\@sect{#1}{#2}{#3}{#4}{#5}{#6}}}} \def\@ssect#1#2#3#4#5#6{% \@tempskipa #4\relax \ifdim \@tempskipa>\z@ \begingroup #5{% \@hangfrom{\hskip #2}% \interlinepenalty \@M #6\@@par}% \endgroup \csname #1mark\endcsname{#6}% \else \def\@svsechd{#5{\hskip #2\relax #6}\csname #1mark\endcsname{#6}}% \fi \@xsect{#4}} \def\@sect#1#2#3#4#5#6[#7][#8]#9{% \ifnum #2>\c@secnumdepth \let\@svsec\@empty \else \refstepcounter{#1}% \protected@edef\@svsec{\@seccntformat{#1}\relax}% \fi \@tempskipa #5\relax \ifdim \@tempskipa>\z@ \begingroup #6{% \@hangfrom{\hskip #3\relax\@svsec}\interlinepenalty \@M % #9\@@par}% \endgroup \csname #1mark\endcsname{% #8}% \addcontentsline{toc}{#1}{% \ifnum #2>\c@secnumdepth \else \protect\numberline{\csname the#1\endcsname}% \fi #7}% \else \def\@svsechd{% #6{\hskip #3\relax \@svsec #9}% \csname 
% End of \@sect, then \section, \subsection and \subsubsection (starred and
% unstarred forms). With the ChapterTOCs option each also writes an entry to
% the per-chapter contents stream \@chaptoc via \myaddcontentsline; the
% unstarred forms step the counter up and back down around that write so the
% entry carries the number the heading is about to receive.
% NOTE(review): \myaddcontentsline, \sec@rule, \@caplusone and
% \ChapTOCAuthorFont are used here but not defined in the visible part of
% the class -- presumably defined later; confirm.
#1mark\endcsname{% #8}% \addcontentsline{toc}{#1}{% \ifnum #2>\c@secnumdepth \else \protect\numberline{\csname the#1\endcsname}% \fi #7}}% \fi \@xsect{#5}} \newcommand\section{% \gdef\chapterauthor{\@caplusone}% \gdef\endchapterauthors{\end@casplusone}% \@ifstar{\@ssection}{\@trplarg{\@section}}} \def\@ssection#1{% \if@ChapterTOCs \myaddcontentsline{\@chaptoc}{chapsection}{\string\makebox[\secnumwidth][l]{}#1}\fi \@startsection{section}{1}{\z@}{-30\p@}{6\p@}{\sec@rule\nopagebreak\vskip9.5\p@\nopagebreak\SectionHeadFont}*{#1}} \def\@section[#1][#2]#3{% \if@ChapterTOCs \addtocounter{section}{1}% \myaddcontentsline{\@chaptoc}{chapsection}{\string\makebox[\secnumwidth][l]{\thesection}#1}% \addtocounter{section}{-1}\fi \@startsection{section}{1}{\z@}{-30\p@}{6\p@}{\sec@rule\nopagebreak\vskip9.5\p@\nopagebreak\SectionHeadFont}[#2]{#3}} \def\sectionauthor#1{\hfill{\ChapTOCAuthorFont #1}} \newcommand\subsection{\@ifstar{\@ssubsection}{\@trplarg{\@subsection}}} \def\@ssubsection#1{% \if@ChapterTOCs \myaddcontentsline{\@chaptoc}{chapsubsection}{\string\makebox[\subsecnumwidth][l]{}#1}\fi \@startsection{subsection}{2}{\z@}{-18\p@}{6\p@}{% \SubsectionHeadFont}*{#1}} \def\@subsection[#1][#2]#3{% \if@ChapterTOCs \addtocounter{subsection}{1}% \myaddcontentsline{\@chaptoc}{chapsubsection}{\string\makebox[\subsecnumwidth][l]{\thesubsection}#1}% \addtocounter{subsection}{-1}\fi \@startsection{subsection}{2}{\z@}{-18\p@}{6\p@}{% \SubsectionHeadFont}[#2]{#3}} \newcommand\subsubsection{\@ifstar{\@ssubsubsection}{\@trplarg{\@subsubsection}}} \def\@ssubsubsection#1{% \if@ChapterTOCs \myaddcontentsline{\@chaptoc}{chapsubsubsection}{\string\makebox[\subsecnumwidth][l]{}#1}\fi \@startsection{subsubsection}{3}{\z@}{-12\p@}{6\p@}{% \SubsubsectionHeadFont}*{#1}} \def\@subsubsection[#1][#2]#3{% \if@ChapterTOCs \addtocounter{subsubsection}{1}% \myaddcontentsline{\@chaptoc}{chapsubsubsection}{\hskip21pt\string\makebox[\ssubsecnumwidth][l]{\thesubsubsection}#1}% \addtocounter{subsubsection}{-1}\fi 
% End of \@subsubsection, then \paragraph and \subparagraph, the list
% indentation lengths, enumerate/itemize label definitions, and the
% description, verse and quotation environments.
\@startsection{subsubsection}{3}{\z@}{-12\p@}{6\p@}{% \SubsubsectionHeadFont}[#2]{#3}} \newcommand\paragraph{\@startsection{paragraph}{4}{\z@}% {-12\p@}{6\p@}{\ParagraphHeadFont}} \newcommand\subparagraph{\@startsection{subparagraph}{5}{\parindent}% {-12\p@}{6\p@}{\SubParagraphHeadFont}} \if@twocolumn \setlength\leftmargini {2em} \else \setlength\leftmargini {2.5em} \fi \leftmargin \leftmargini \setlength\leftmarginii {2.2em} \setlength\leftmarginiii {1.87em} \setlength\leftmarginiv {1.7em} \if@twocolumn \setlength\leftmarginv {.5em} \setlength\leftmarginvi {.5em} \else \setlength\leftmarginv {1em} \setlength\leftmarginvi {1em} \fi \setlength \labelsep {.5em} \setlength \labelwidth{\leftmargini} \addtolength\labelwidth{-\labelsep} \@beginparpenalty -\@lowpenalty \@endparpenalty -\@lowpenalty \@itempenalty -\@lowpenalty \renewcommand\theenumi{\@arabic\c@enumi} \renewcommand\theenumii{\@alph\c@enumii} \renewcommand\theenumiii{\@roman\c@enumiii} \renewcommand\theenumiv{\@Alph\c@enumiv} \newcommand\labelenumi{\theenumi.} \newcommand\labelenumii{(\theenumii)} \newcommand\labelenumiii{\theenumiii.} \newcommand\labelenumiv{\theenumiv.} \renewcommand\p@enumii{\theenumi} \renewcommand\p@enumiii{\theenumi(\theenumii)} \renewcommand\p@enumiv{\p@enumiii\theenumiii} \newcommand\labelitemi{\textbullet} \newcommand\labelitemii{\normalfont\bfseries \textendash} \newcommand\labelitemiii{\textasteriskcentered} \newcommand\labelitemiv{\textperiodcentered} \newenvironment{description} {\list{}{\labelwidth\z@ \itemindent-\leftmargin \let\makelabel\descriptionlabel}} {\endlist} \newcommand*\descriptionlabel[1]{\hspace\labelsep \normalfont\bfseries #1} \newenvironment{verse} {\let\\\@centercr \list{}{\itemsep \z@ \itemindent -1.5em% \listparindent\itemindent \rightmargin \leftmargin \advance\leftmargin 1.5em}% \item\relax} {\endlist} \newenvironment{quotation} {\list{}{\listparindent 1.5em% \itemindent \listparindent \rightmargin \leftmargin \parsep \z@ \@plus\p@}% \item\relax} 
% End of quotation, then the quote and titlepage environments, \appendix
% (resets the chapter/section counters and switches \thechapter to letters),
% array/tabular/fbox lengths, chapter-prefixed numbering for equations,
% figures and tables, the figure/table float environments, and the start of
% \@makecaption.
{\endlist} \newenvironment{quote} {\list{}{\rightmargin\leftmargin}% \item\relax} {\endlist} \if@compatibility \newenvironment{titlepage} {% \cleardoublepage \if@twocolumn \@restonecoltrue\onecolumn \else \@restonecolfalse\newpage \fi \thispagestyle{empty}% \setcounter{page}\z@ }% {\if@restonecol\twocolumn \else \newpage \fi } \else \newenvironment{titlepage} {% \cleardoublepage \if@twocolumn \@restonecoltrue\onecolumn \else \@restonecolfalse\newpage \fi \thispagestyle{empty}% \setcounter{page}\@ne }% {\if@restonecol\twocolumn \else \newpage \fi \if@twoside\else \setcounter{page}\@ne \fi } \fi \newcommand\appendix{\par \setcounter{chapter}{0}% \setcounter{section}{0}% \gdef\@chapapp{\appendixname}% \gdef\thechapter{\@Alph\c@chapter}} \setlength\arraycolsep{5\p@} \setlength\tabcolsep{6\p@} \setlength\arrayrulewidth{.4\p@} \setlength\doublerulesep{2\p@} \setlength\tabbingsep{\labelsep} \skip\@mpfootins = \skip\footins \setlength\fboxsep{3\p@} \setlength\fboxrule{.4\p@} \@addtoreset {equation}{chapter} \renewcommand\theequation {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@equation} \newcounter{figure}[chapter] \renewcommand \thefigure {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@figure} \def\fps@figure{tbp} \def\ftype@figure{1} \def\ext@figure{lof} \def\fnum@figure{\figurename\nobreakspace\thefigure} \newenvironment{figure} {\@float{figure}} {\end@float} \newenvironment{figure*} {\@dblfloat{figure}} {\end@dblfloat} \newcounter{table}[chapter] \renewcommand \thetable {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@table} \def\fps@table{tbp} \def\ftype@table{2} \def\ext@table{lot} \def\fnum@table{\tablename\nobreakspace\thetable} \newenvironment{table} {\@float{table}} {\end@float} \newenvironment{table*} {\@dblfloat{table}} {\end@dblfloat} \newlength\abovecaptionskip \newlength\belowcaptionskip \setlength\abovecaptionskip{10\p@} \setlength\belowcaptionskip{0\p@} \long\def\@makecaption#1#2{% \vskip\abovecaptionskip \sbox\@tempboxa{#1: #2}% \ifdim 
% Rest of \@makecaption: both branches of the width test typeset the label in
% \FigCapFont on its own line followed by the caption text.
% Then the old two-letter font commands (\rm, \sf, ...), TOC parameters and
% \tableofcontents. TOC entries are drawn one step late: each \l@... macro
% first runs the pending \toc@draw and then stores its own drawing command in
% \toc@draw -- presumably so that \author entries read after a chapter line
% (collected by \toc@author) can be printed beneath it by \draw@chapter.
\wd\@tempboxa >\hsize {\FigCapFont #1}\par #2\par \else \global \@minipagefalse % \hb@xt@\hsize{\hfil\box\@tempboxa\hfil}% {\FigCapFont #1}\par #2\par \fi \vskip\belowcaptionskip} \DeclareOldFontCommand{\rm}{\normalfont\rmfamily}{\mathrm} \DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf} \DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt} \DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf} \DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit} \DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl} \DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc} \DeclareRobustCommand*\cal{\@fontswitch\relax\mathcal} \DeclareRobustCommand*\mit{\@fontswitch\relax\mathnormal} \newcommand\@pnumwidth{1.55em} \newcommand\@tocrmarg{2.55em} \newcommand\@dotsep{4.5} \setcounter{tocdepth}{3} \newcounter{numauthors} \newif\if@break \newif\if@firstauthor \newcommand\tableofcontents{% \if@twocolumn \@restonecoltrue\onecolumn \else \@restonecolfalse \fi \chapter*{\contentsname \@mkboth{% \MakeUppercase\contentsname}{\MakeUppercase\contentsname}}% {\let\break\space \let\author\toc@author \reset@authors \let\toc@draw\relax \@starttoc{toc} \toc@draw } \if@restonecol\twocolumn\fi } \def\draw@part#1#2{% \addpenalty{-\@highpenalty}% \vskip1em plus\p@ \@tempdima1.5em \begingroup \parindent\z@\rightskip\@pnumwidth \parfillskip-\rightskip \bfseries \leavevmode \advance\leftskip\@tempdima \hskip-\leftskip {#1\hfil}\nobreak \if@pdf \else \hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}% \fi \par \penalty\@highpenalty\endgroup} \let\toc@draw\relax % \def\l@part#1#2{% \toc@draw \gdef\toc@draw{\draw@part{\large #1}{\large #2}}} \def\l@chapter#1#2{% \toc@draw \gdef\toc@draw{\draw@chapter{#1}{#2}}} \def\@pnumwidth{1.8em} \def\draw@chapter#1#2{% \addpenalty{-\@highpenalty}% \vskip1em plus\p@ \@tempdima1.5em \begingroup \parindent\z@\rightskip\@pnumwidth \parfillskip-\rightskip \bfseries \leavevmode \advance\leftskip\@tempdima \hskip-\leftskip {#1\hfil}\nobreak \if@pdf \else 
% End of \draw@chapter, then the author-list helpers (\toc@author collects
% names, \draw@authors prints them, \reset@authors clears them) and the
% indentation lengths plus \l@.../\draw@... pairs for section, subsection and
% subsubsection TOC entries; the paragraph-level skip is set at the end.
\hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}% \fi \par {\it\draw@authors}% \penalty\@highpenalty\endgroup} \def\toc@author#1#2{% \if@firstauthor \@firstauthorfalse \else \ifx\@authors\@empty \xdef\@authors{\last@author}% \else \@cons{\@authors}{, \last@author}\fi\fi \stepcounter{numauthors}% %%%%%%% commented and deleted below the second part to aviod inaccessible error % shashi % September-2008 %% \gdef\last@author{#1 {\rm\fontsize{9\p@}{11\p@}\selectfont #2}} \gdef\last@author{#1} } \def\draw@authors{% \let\@t\@authors \ifx\@t\@empty \let\@t\last@author\fi \ifx\@t\@empty\else \hskip\leftskip \ifx\@authors\@empty \else \@authors \ifnum\c@numauthors>2,\fi \if@break\break\fi \ and \fi \last@author\break\fi \reset@authors} \def\reset@authors{% \gdef\@authors{}% \gdef\last@author{}% \@firstauthortrue \setcounter{numauthors}{0}} \newlength\section@toc@skip \section@toc@skip1.5em \newlength\SectionTOCWidth \SectionTOCWidth2.3em \def\l@section#1#2{% \toc@draw \gdef\toc@draw{\draw@section{#1}{#2}}} \def\draw@section#1#2{% \@dottedtocline{1}{\section@toc@skip}{\SectionTOCWidth}{#1 }{{ \tocfont #2}}} \newlength\subsection@toc@skip \subsection@toc@skip\section@toc@skip \advance\subsection@toc@skip\SectionTOCWidth \newlength\SubSectionTOCWidth \SubSectionTOCWidth3.2em \def\l@subsection#1#2{% \toc@draw \gdef\toc@draw{\draw@subsection{#1}{#2}}} \def\draw@subsection#1#2{% \@dottedtocline{2}{\subsection@toc@skip}{\SubSectionTOCWidth}{#1}{{ \tocfont #2}}} \newlength\subsubsection@toc@skip \subsubsection@toc@skip\subsection@toc@skip \advance\subsubsection@toc@skip\SubSectionTOCWidth \newlength\SubSubSectionTOCWidth \SubSubSectionTOCWidth4.1em \def\l@subsubsection#1#2{% \toc@draw \gdef\toc@draw{\draw@subsubsection{#1}{#2}}} \def\draw@subsubsection#1#2{% \@dottedtocline{3}{\subsubsection@toc@skip}{\SubSubSectionTOCWidth}{#1}{{ \tocfont #2}}} \newlength\paragraph@toc@skip \paragraph@toc@skip\subsubsection@toc@skip \advance\paragraph@toc@skip\SubSubSectionTOCWidth 
% ----------------------------------------------------------------------
% TOC entries for \paragraph / \subparagraph, the dotted-leader line
% primitive, and the per-chapter author + "CONTENTS" block.
% NOTE(review): line breaks were re-inserted at existing spaces; a lone
% "%" between definitions is kept as an empty comment line.
% ----------------------------------------------------------------------

% Label width used for \paragraph entries in the table of contents.
\newlength\ParagraphTOCWidth
\ParagraphTOCWidth4.1em

% \l@paragraph{<entry>}{<page>}: execute the pending \toc@draw, then
% store this entry as the new pending \toc@draw.
\def\l@paragraph#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@paragraph{#1}{#2}}}
% Draw a level-4 entry through \@dottedtocline.
\def\draw@paragraph#1#2{%
  \@dottedtocline{4}{\paragraph@toc@skip}{\ParagraphTOCWidth}{#1}{{ \tocfont #2}}}

% Indent of \subparagraph entries = paragraph indent + paragraph label
% width.
\newlength\subparagraph@toc@skip
\subparagraph@toc@skip\paragraph@toc@skip
\advance\subparagraph@toc@skip\ParagraphTOCWidth
% Same deferred-draw scheme as \l@paragraph, at level 5.
\def\l@subparagraph#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subparagraph{#1}{#2}}}
\def\draw@subparagraph#1#2{%
  \@dottedtocline{5}{\subparagraph@toc@skip}{6em}{#1}{{ \tocfont #2}}}

% \@dottedtocline{<level>}{<indent>}{<numwidth>}{<title>}{<page>}
% Nothing is typeset when <level> exceeds the tocdepth counter.
% When \if@pdf is true the dot leaders and the page number are omitted.
\def\@dottedtocline#1#2#3#4#5{%
  \ifnum #1>\c@tocdepth \else
    \vskip \z@ \@plus.2\p@
    {\leftskip #2\relax\rightskip\@tocrmarg\parfillskip-\rightskip
     \parindent #2\relax\@afterindenttrue
     \interlinepenalty\@M
     \leavevmode
     \@tempdima #3\relax
     \advance\leftskip\@tempdima\null\hskip-\leftskip
     {#4\hfil}\nobreak
     \if@pdf
     \else
       \leaders\hbox{$\m@th\mkern\@dotsep mu\hbox{.}\mkern\@dotsep mu$}\hfill
       \nobreak
       \hb@xt@\@pnumwidth{\hfil\normalfont\normalcolor #5}%
     \fi
     \par}\fi}

% Redefine \break and \protect so that they are written out as strings.
\newcommand\chapterauthors{%
  \def\break{\string\break\ }%
  \def\protect##1{\string ##1 }}
\def\end@cas{}
\def\end@casplusone{\vskip4pt\@doendpe}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \make@chaptoc: typeset the stored chapter authors/affiliations (one
% to four, selected by the numauthors counter), clear the stored
% author macros globally, print a "CONTENTS" heading, input
% <chapter-number>.toc, and finally call \reset@authors.
\def\make@chaptoc{% chapter author
  {\parindent\z@
   \newcommand\FolioBoldFont{}%
   \let\@b\bullet
   \def\bullet{\raisebox{2pt}{$\scriptscriptstyle\@b$}}%
   \let\SubsectionItalicFont\it
   %\ifx\chapter@author\@empty\else
   {\rm\fontsize{10\p@}{10\p@}\bfseries\selectfont
    %\the\c@numauthors
    \ifnum\c@numauthors=1
      \chapter@authorone\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
    \fi
    \ifnum\c@numauthors=2
      \chapter@authorone\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
      \chapter@authortwo\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}
    \fi
    \ifnum\c@numauthors=3
      \chapter@authorone\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
      \chapter@authortwo\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}\vskip12\p@
      \chapter@authorthree\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationthree}
    \fi
    \ifnum\c@numauthors=4
      \chapter@authorone\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
      \chapter@authortwo\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}\vskip12\p@
      \chapter@authorthree\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationthree}\vskip12\p@
      \chapter@authorfour\vskip6\p@
      {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationfour}
    \fi
   }
   % Forget the stored authors so the next chapter starts empty.
   \gdef\chapter@authorone{}\gdef\chapter@affiliationone{}%
   \gdef\chapter@authortwo{}\gdef\chapter@affiliationtwo{}%
   \gdef\chapter@authorthree{}\gdef\chapter@affiliationthree{}%
   \gdef\chapter@authorfour{}\gdef\chapter@affiliationfour{}%
   \vskip 14.6\p@
   % \author is made a no-op while the chapter .toc file is read.
   {\leftskip\secnumwidth\def\author##1##2{}\vskip14pt\hbox{\leftskip0pt\SubsectionHeadFont CONTENTS}\vskip6pt\par\@input{\thechapter.toc}\par}%
  }
  \reset@authors}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Flags set by the \...authorchapter commands below; all start false.
\newif\iffinishedfromone
\global\finishedfromonefalse
%
\newif\iffinishedfromtwo
\global\finishedfromtwofalse
%
\newif\iffinishedfromthree
\global\finishedfromthreefalse
%
\newif\iffinishedfromfour
\global\finishedfromfourfalse
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\newcommand\singleauthorchapter{\finishedfromonetrue}
\newcommand\twoauthorchapter{\finishedfromtwotrue}
\newcommand\threeauthorchapter{\finishedfromthreetrue}
\newcommand\fourauthorchapter{\finishedfromfourtrue}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newif\iffinish
\global\finishfalse
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Save boxes for author names, affiliations, and the final author boxes.
\newsavebox\@AUonebox
\newsavebox\@AUtwobox
\newsavebox\@AUthreebox
\newsavebox\@AUfourbox
%
\newsavebox\@AUaffonebox
\newsavebox\@AUafftwobox
\newsavebox\@AUaffthreebox
\newsavebox\@AUafffourbox
%
\newsavebox\@finalAUboxfromone
\newsavebox\@finalAUboxfromtwo
\newsavebox\@finalAUboxfromthree
\newsavebox\@finalAUboxfromfour
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \@ca{<author>}{<affiliation>}: record one chapter author.
%  - writes \author{<author>}{} to the main .toc through the .aux file;
%  - steps the numauthors counter;
%  - stores name and affiliation in the save boxes for slot 1-4 and
%    defines \chapter@author<n> / \chapter@affiliation<n> to copy them.
\def\@ca#1#2{%
  % \def\chapter@author{#1}%
  % \def\chapter@affiliation{#2}%
  \if@filesw%
    \write\@auxout{%
      \string\@writefile{toc}{\string\author{#1}{}}%
    }%
  \fi
  %%%%%%%%%%%%%%%
  % NOTE(review): \resetcounter is not defined in the visible part of
  % this class and is not a LaTeX kernel command -- confirm it exists,
  % otherwise this branch fails once the counter exceeds 4.
  \ifnum\c@numauthors>4
    \resetcounter{numauthors}
  \fi
  \stepcounter{numauthors}
  %%\the\c@numauthors
  \ifnum\c@numauthors=1
    % NOTE(review): the "%" here is read as an empty comment line, so
    % the \sbox below is live -- confirm against the original file.
    %
    \sbox\@AUonebox{\CAPlusOneFont#1}
    \sbox\@AUaffonebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromone{\copy\@AUonebox}
    \def\chapter@authorone{\copy\@finalAUboxfromone}
    \def\chapter@affiliationone{\copy\@AUaffonebox}
  \fi
  \ifnum\c@numauthors=2
    \sbox\@AUtwobox{\CAPlusOneFont#1}
    \sbox\@AUafftwobox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromtwo{\copy\@AUtwobox}
    \def\chapter@authortwo{\copy\@finalAUboxfromtwo}
    \def\chapter@affiliationtwo{\copy\@AUafftwobox}
  \fi
  \ifnum\c@numauthors=3
    \sbox\@AUthreebox{\CAPlusOneFont#1}
    \sbox\@AUaffthreebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromthree{\copy\@AUthreebox}
    \def\chapter@authorthree{\copy\@finalAUboxfromthree}
    \def\chapter@affiliationthree{\copy\@AUaffthreebox}
  \fi
  \ifnum\c@numauthors=4
    \sbox\@AUfourbox{\CAPlusOneFont#1}
    \sbox\@AUafffourbox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
    \sbox\@finalAUboxfromfour{\copy\@AUfourbox}
    \def\chapter@authorfour{\copy\@finalAUboxfromfour}
    \def\chapter@affiliationfour{\copy\@AUafffourbox}
  \fi}

% \@caplusone: starred form -> \@scaplusone; otherwise \@xcaplusone
% with an optional argument (empty when none is given).
\def\@caplusone{\@ifstar{\@scaplusone}{\@ifnextchar[{\@xcaplusone}{\@xcaplusone[]}}}
% If the optional argument is empty, #3 is passed to \@ca as the
% affiliation; otherwise the optional argument #1 is passed instead.
\def\@xcaplusone[#1]#2#3{%
  \def\@@empty{#1}\ifx\@empty\@@empty\@ca{#2}{#3}\else\@ca{#2}{#1}\fi\@scaplusone{#2}{#3}}
% The in-text author line is commented out; only the vertical
% back-skip in horizontal mode remains.
\def\@scaplusone#1#2{%
  \ifhmode\vskip-12pt\fi
  %%Shashi Commented
  %%% \noindent\hskip3pc{\CAPlusOneFont\baselineskip14pt #1\def\@t{#2}\ifx\@t\@empty\else,\fi}\hskip6pt{\CAAPlusOneFont #2}\par
}
% Record an author with an empty affiliation.
\def\chapterauthoronly#1#2{\@ca{#1}{}\@scaplusone{#1}{#2}}

% \myaddcontentsline{<stream>}{<type>}{<text>}: write a
% \chapcontentsline entry to output stream #1. \label, \index and
% \glossary are disabled inside the group; \thepage is passed through
% \@temptokena so the \edef does not expand it.
\def\myaddcontentsline#1#2#3{%
  \if@filesw
    \begingroup
      \let\label\@gobble\let\index\@gobble\let\glossary\@gobble
      \def\break{\ }%
      \def\protect##1{\string ##1 }%
      \@temptokena{\thepage}%
      \edef\@tempa{\write#1{\string\chapcontentsline{#2}{\string\raggedright\space #3}{\the\@temptokena}}}\@tempa
      \if@nobreak\ifvmode\nobreak\fi\fi
    \endgroup
  \fi}
% Dispatch a chapter-TOC entry to \l@<type>.
\def\chapcontentsline#1{\csname l@#1\endcsname}
\def\l@chapsection{\@mydottedtocline{1}{\z@}{6pt}}
\def\l@chapsubsection{\@mydottedtocline{2}{\secnumwidth}{6pt}}
\def\l@chapsubsubsection{\@mydottedtocline{3}{\subsecnumwidth}{36pt}}

% Depth limit for the per-chapter table of contents.
\newcount\c@chaptocdepth
\setcounter{chaptocdepth}{3}

% \@mytocline: chapter-TOC line without leaders; #5 (page) is unused.
\def\@mytocline#1#2#3#4#5{%
  \ifnum #1>\c@chaptocdepth \else
    \vskip 2pt plus.2\p@
    \ifnum #1=1\ifnum\c@chaptocdepth>1\addvspace{12pt}\fi\fi
    {\leftskip #2\relax%
     \rightskip \@tocrmarg \parfillskip -\rightskip
     \interlinepenalty\@M
     \leavevmode
     \@tempdima #3\relax
     \rightskip\z@
     \vbox{\ChapTOCFont #4\nobreak}%
     \par}\fi}

% \@mydottedtocline: chapter-TOC line with dot leaders and the page
% number in a 1.5pc box; \@dotsep is set to 1.2 inside the group.
\def\@mydottedtocline#1#2#3#4#5{%
  \ifnum #1>\c@chaptocdepth \else
    \vskip 2pt plus.2\p@
    {\leftskip #2\relax \rightskip \@tocrmarg \parfillskip -21pt %-\rightskip
     % NOTE(review): the \parindent line below is read as commented
     % out (the sibling \@mytocline has no such line) -- confirm.
     % \parindent #2\relax\@afterindenttrue
     \interlinepenalty\@M
     \leavevmode
     \def\@dotsep{1.2}%
     \@tempdima #3\relax
     \rightskip\z@
     \advance\hsize-\secnumwidth
     {\fontsize{9.5\p@}{\baselineskip}\selectfont #4
      \nobreak\leaders\hbox{$\m@th\mkern\@dotsep mu.\mkern\@dotsep mu$}
      \hfill\hbox to 1.5pc{\hfill#5}}
     \par}\fi}

% List of figures: unnumbered chapter, upper-case running heads,
% reads the .lof file; restores two-column mode if it was active.
\newcommand\listoffigures{%
  \if@twocolumn
    \@restonecoltrue\onecolumn
  \else
    \@restonecolfalse
  \fi
  \chapter*{\listfigurename}%
  \@mkboth{\MakeUppercase\listfigurename}%
          {\MakeUppercase\listfigurename}%
  \@starttoc{lof}%
  \if@restonecol\twocolumn\fi
}
\newcommand*\l@figure{\@dottedtocline{1}{1.5em}{2.3em}}
% List of tables: same scheme, reads the .lot file.
\newcommand\listoftables{%
  \if@twocolumn
    \@restonecoltrue\onecolumn
  \else
    \@restonecolfalse
  \fi
  \chapter*{\listtablename}%
  \@mkboth{%
    \MakeUppercase\listtablename}%
   {\MakeUppercase\listtablename}%
  \@starttoc{lot}%
  \if@restonecol\twocolumn\fi
}
\let\l@table\l@figure
\newdimen\bibindent
\setlength\bibindent{1.5em}
% ----------------------------------------------------------------------
% Bibliography and index environments, footnotes, fixed names, \today,
% page-layout defaults, heading rules, and table registers.
% ----------------------------------------------------------------------

% thebibliography{<widest label>}: unnumbered chapter titled \bibname,
% added to the TOC, followed by a list numbered with counter enumiv.
\newenvironment{thebibliography}[1]
     {\chapter*{\bibname}%
      \@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}%
      \addcontentsline{toc}{chapter}{\bibname}
      \list{\@biblabel{\@arabic\c@enumiv}}%
           {\settowidth\labelwidth{\@biblabel{#1}}%
            \leftmargin\labelwidth
            \advance\leftmargin\labelsep
            \@openbib@code
            \usecounter{enumiv}%
            \let\p@enumiv\@empty
            \renewcommand\theenumiv{\@arabic\c@enumiv}}%
      \sloppy
      \clubpenalty4000
      \@clubpenalty \clubpenalty
      \widowpenalty4000%
      \sfcode`\.\@m}
     {\def\@noitemerr
       {\@latex@warning{Empty `thebibliography' environment}}%
      \endlist}
\newcommand\newblock{\hskip .11em\@plus.33em\@minus.07em}
\let\@openbib@code\@empty

% theindex: two-column index headed by \indexname, added to the TOC;
% \item is mapped to \@idxitem. One-column mode is restored at the end
% when the document was not already two-column.
\newenvironment{theindex}
               {\cleardoublepage\if@twocolumn
                  \@restonecolfalse
                \else
                  \@restonecoltrue
                \fi
                \twocolumn[\@makeschapterhead{\indexname}]%
                \@mkboth{\MakeUppercase\indexname}%
                        {\MakeUppercase\indexname}%
                \pagestyle{headings}
                \addcontentsline{toc}{chapter}{\indexname}
                \thispagestyle{folio}\parindent\z@
                \parskip\z@ \@plus .3\p@\relax
                \columnseprule \z@
                \columnsep 35\p@
                \let\item\@idxitem}
               {\if@restonecol\onecolumn\else\clearpage\fi}
\newcommand\@idxitem{\par\hangindent 40\p@}
\newcommand\subitem{\@idxitem \hspace*{20\p@}}
\newcommand\subsubitem{\@idxitem \hspace*{30\p@}}
\newcommand\indexspace{\par \vskip 10\p@ \@plus5\p@ \@minus3\p@\relax}

% Footnotes: rule of .4\columnwidth; numbering restarts per chapter.
\renewcommand\footnoterule{%
  \kern-3\p@
  \hrule\@width.4\columnwidth
  \kern2.6\p@}
\@addtoreset{footnote}{chapter}
\newcommand\@makefntext[1]{%
    \parindent 1em%
    \noindent
    \hb@xt@1.8em{\hss\@makefnmark}#1}

% Fixed names used in headings and captions.
\newcommand\contentsname{Contents}
\newcommand\listfigurename{List of Figures}
\newcommand\listtablename{List of Tables}
\newcommand\bibname{Bibliography}
\newcommand\indexname{Index}
\newcommand\figurename{FIGURE}
\newcommand\tablename{TABLE}
\newcommand\partname{Part}
\newcommand\chaptername{Chapter}
\newcommand\appendixname{Appendix}
% \today prints "<Month name> <day>, <year>".
\def\today{\ifcase\month\or
  January\or February\or March\or April\or May\or June\or
  July\or August\or September\or October\or November\or December\fi
  \space\number\day, \number\year}

% Layout defaults executed when the class is loaded.
\setlength\columnsep{10\p@}
\setlength\columnseprule{0\p@}
\pagestyle{headings}
\pagenumbering{arabic}
\if@twoside
\else
  \raggedbottom
\fi
\if@twocolumn
  \twocolumn
  \sloppy
  \flushbottom
\else
  \onecolumn
\fi

% Heading rules: an 84pt x 4pt bar; \crcrule adds a .5pt rule of
% \textwidth, \unnumcrcrule does not.
\newcommand\unnumcrcrule{\hbox to\textwidth{\rlap{\rule[-3.5\p@]{84\p@}{4\p@}}}}
\newcommand\unnumchap@rule{\unnumcrcrule}
\newcommand\crcrule{\hbox to\textwidth{\rlap{\rule[-3.5\p@]{84\p@}{4\p@}}\rule{\textwidth}{.5\p@}}}
\newcommand\chap@rule{\crcrule}
\newcommand\sec@rule{\crcrule}

\def\@affiliate[#1]{\gdef\@affiliation{#1}}
\def\@affiliation{}

% \def@theequation: (re)define \theequation as <prefix>.<n>, where the
% prefix is \thesection or \thechapter (per \if@numbysec) and <n> is
% counter "shared" or "equation" (per \if@numberinsequence).
\def\def@theequation{%
  \if@numberinsequence
    \def\theequation{%
      \if@numbysec\thesection\else\thechapter\fi.%
      \@arabic\c@shared}%
  \else
    \def\theequation{%
      \if@numbysec\thesection\else\thechapter\fi.%
      \@arabic\c@equation}\fi}
\def\affiliation#1{{\AffiliationFont\noindent #1\vskip 36bp}}

% symbollist{<widest symbol>}: two-column list under a "Symbol
% Description" heading; \symbolentry hangs text after a box of width
% \nomenwidth.
\newbox\tempbox
\newdimen\nomenwidth
\newenvironment{symbollist}[1]{%
  \addvspace{12pt}
  \setbox\tempbox\hbox{#1\hskip1em}%
  \global\nomenwidth\wd\tempbox
  %\section*{Symbol Description}
  \noindent{\SectionHeadFont Symbol Description}\vskip6pt
  \begin{multicols}{2}}{%
  \end{multicols}\par\addvspace{12pt}}
\def\symbolentry#1#2{\par\noindent\@hangfrom{\hbox to \nomenwidth{#1\hss}}#2\par}

% Table defaults, switches and scratch registers.
\tabcolsep 5pt
\arrayrulewidth .5pt
\doublerulesep 1pt
% NOTE(review): the subtable counter declaration below is commented
% out although \thesubtable and \resettableletter use that counter --
% confirm it is declared elsewhere.
%\newcounter{subtable}[table]
\newif\if@tablerules\@tablerulestrue
\newif\if@centertable\@centertabletrue
\newif\if@centertabletitle\@centertabletitletrue
\newbox\@tablebox
\newbox\@tabletitlebox
\newdimen\@tablewidth
\newdimen\@tabletitlewidth
\newdimen\max@tablewidth
% User switches for the rules drawn above and below a tabular.
\newcommand\automaticrules{\@tablerulestrue}
\newcommand\noautomaticrules{\@tablerulesfalse}
\def\thetable{%
  \thechapter.%
  \@arabic\c@table}
\def\thesubtable{%
  \thechapter.%
  \@arabic\c@table\alph{subtable}}
\def\resettableletter{\setcounter{subtable}{0}}
\def\@Tabletitle{}
% \tabletitle[<short>]{<title>}: with an optional argument ->
% \@xtabletitle; without one, zero \@tabletitlewidth and call
% \@ytabletitle.
\newcommand\tabletitle{\@ifnextchar[{\@xtabletitle}{\@tabletitlewidth\z@\@ytabletitle}}
\def\@@tabletitle{}
\newif\ifshorttabletitle
% ----------------------------------------------------------------------
% Table titles, the table floats, the customised tabular environment,
% and the shortbox environment.
% ----------------------------------------------------------------------
\global\shorttabletitlefalse
%\def\@xtabletitle#1{\@tabletitlewidth#1\@ytabletitle}
%
% \@xtabletitle[<short>]{<title>}: store both titles globally, step
% the table counter, and write the SHORT title to the list of tables
% (footnote commands are disabled while writing).
\def\@xtabletitle[#1]#2{%
  \gdef\@@tabletitle{#1}%
  \gdef\@tabletitle{#2}%
  \let\@Tabletitle\@TableTitle
  \refstepcounter{table}%
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@@tabletitle}}}}
%%%%
% Earlier variant, kept commented out:
%\long\def\@xtabletitle[#1]#2{%
%  \setbox\@ttbox\hbox{#1}\global\shorttabletitletrue
%  \def\@@tabletitle{\ifx\@ttbox\@empty\else#1\fi}%
%  \def\@tabletitle{#2}%
%  \let\@Tabletitle\@TableTitle
%  \refstepcounter{table}%
%  {\let\footnotemark\@empty
%   \let\footnote\@gobble
%   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{%
%\ifshorttabletitle\@@tabletitle\else\@tabletitle\fi}}}}
%%%
%
% \@ytabletitle{<title>}: as above without a short title; the full
% title goes to the list of tables.
\long\def\@ytabletitle#1{%
  \def\@tabletitle{#1}%
  \let\@Tabletitle\@TableTitle
  \refstepcounter{table}%
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@tabletitle}}}}

% Lettered sub-table titles. The optional argument sets
% \@tabletitlewidth. The table counter is stepped only while the
% subtable counter is 0; \@currentlabel becomes \thesubtable.
\def\tabletitlelet{\@ifnextchar[{\@xtabletitlelet}{\@tabletitlewidth\z@\@ytabletitlelet}}
\def\@xtabletitlelet[#1]{\@tabletitlewidth#1\@ytabletitlelet}
\long\def\@ytabletitlelet#1{%
  \def\@tabletitle{#1}%
  \let\@Tabletitle\@TableTitle
  \ifnum\c@subtable=0\stepcounter{table}\fi
  \let\@currentlabel\thesubtable
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@tabletitle}}}}

% Typeset "TABLE <number>" followed by the stored title.
\def\@TableTitle{%
  \noindent
  {%
   \vbox{{\TableNumberFont TABLE\ \thetable}}\par\TableTitleFont\@tabletitle}}

% table: plain float (the \caption redefinition is commented out).
\def\table{%
  %\long\def\caption##1{\tabletitle{##1}\@TableTitle\par}%
  \@float{table}}
% table*: double-column float in which \caption sets a \tabletitle.
\@namedef{table*}{%
  \long\def\caption##1{\tabletitle{##1}\@TableTitle\par}%
  \@dblfloat{table}}

% \@tabular: start collecting the tabular into \@tablebox. \footnote
% is redefined to save its text in \@fn, which \endtabular prints
% below the table.
\def\@tabular{%
  \leavevmode
  \if@centertable\hfil\fi
  \vbox\bgroup
  \setbox\@tablebox\hbox\bgroup
  \baselineskip11pt
  \global\let\@fn\@empty
  \def\footnote##1{\footnotemark\gdef\@fn{##1}}
  \renewcommand{\arraystretch}{.916666666667}%
  $\let\@acol\@tabacol
   \let\@classz\@tabclassz
   \let\@classiv\@tabclassiv
   \let\\\@tabularcr
   \@tabarray}

% \endtabular: close the boxes, measure the table, set the title
% (wrapped to the table width when wider than the table), then output
% title, optional 1pt top rule, table, optional 1pt bottom rule and
% the saved footnote. \@Tabletitle is cleared globally at the end.
% Dimensions are compared with \ifnum throughout.
\def\endtabular{%
  \crcr\egroup\egroup $\egroup
  \@tablewidth\wd\@tablebox
  \ifnum\@tabletitlewidth>0
    {\hsize\@tabletitlewidth\raggedright\global\setbox\@tabletitlebox\vbox{\@Tabletitle}}%
  \else
    \setbox\@tabletitlebox\hbox{\@Tabletitle}%
    \ifnum\wd\@tabletitlebox>\@tablewidth
      {\hsize\@tablewidth\raggedright\global\setbox\@tabletitlebox\vbox{\@Tabletitle}}\fi
    \@tabletitlewidth\wd\@tabletitlebox\fi
  \ifnum\@tabletitlewidth>0
    \ifnum\@tabletitlewidth>\@tablewidth\@tablewidth\@tabletitlewidth\fi
    \hbox to\@tabletitlewidth{\if@centertabletitle\hfil\fi\box\@tabletitlebox\hfil}\par\fi
  \max@tablewidth\@tablewidth
  \ifnum\@tabletitlewidth>\max@tablewidth\max@tablewidth\@tabletitlewidth\fi
  \if@tablerules
    \ifnum\@tabletitlewidth>0\vskip-6pt\fi
    \hbox to\max@tablewidth{\if@centertable\hfil\fi\rule{\@tablewidth}{1pt}\hfil}\par\fi
  \hbox to\max@tablewidth{\if@centertable\hfil\fi\box\@tablebox\hfil}\vskip1pt
  \if@tablerules\hbox to\max@tablewidth{\if@centertable\hfil\fi\rule{\@tablewidth}{1pt}\hfil}\par\fi
  \ifx\@fn\@empty\else\FootnoteFont\parindent\z@\noindent\@makefnmark\@fn\par\fi
  \egroup\hfil
  \vskip 0pt plus 12pt
  \gdef\@Tabletitle{}}

% Column-head / sub-head helpers and struts for table cells.
\def\tch#1{\TableColHeadFont #1\llstrut\hfill}
\def\tsh#1{\TableSubheadFont #1\hfill}
\newcommand\llstrut{\rule[-6pt]{0pt}{14pt}}
\newcommand\flstrut{\rule{0pt}{10pt}}
\newcommand\tabletitlestrut{\rule{0pt}{20pt}}
\def\Boxhead#1{\par\addvspace{3pt plus2pt}\noindent{\centering\bfseries#1\par}\vskip3pt}

% \tempbox is allocated again here (it is also allocated elsewhere in
% the file).
\newbox\tempbox%
\newdimen\tempdimen%
%
% shortbox: framed box; text width 27pc (\if@krantza), 32pc
% (\if@krantzb) or 25pc otherwise.
\newenvironment{shortbox}{\par\addvspace{12pt plus2pt}%
  \if@krantza
    \setbox\tempbox\vbox\bgroup\hsize27pc%
  \else\if@krantzb
    \setbox\tempbox\vbox\bgroup\hsize32pc%
  \else
    \setbox\tempbox\vbox\bgroup\hsize25pc%
  \fi\fi
}{%
  \egroup%
  \noindent\fboxsep6pt\fboxrule.5pt\hspace*{0pt}\fbox{\box\tempbox}
  \par\addvspace{12pt plus2pt}}%
%
% CMYK colour specials.
\def\grayink{\special{color cmyk 0 0 0 0.2}}
\def\blackink{\special{color cmyk 0 0 0 1.0}}
% NOTE(review): the "%" before \whiteink is read as an empty comment
% line, leaving the definition live -- confirm against the original.
%
\def\whiteink{\special{color cmyk 0 0 0 0}} % 0%
% ----------------------------------------------------------------------
% Display environments (shadebox, notelist, wherelist, unnumlist,
% extract, VF, VT1, Glossary), list parameters, the customised
% enumerate, \contributor, and the end of the class file.
% NOTE(review): line breaks were re-inserted at existing spaces; each
% place where a comment's extent had to be inferred is marked below.
% ----------------------------------------------------------------------

% shadebox: collect the body in \tempbox, draw a gray rule block of
% \textwidth behind it (height of the box plus 1pc), then overlay the
% text with \llap.
\newenvironment{shadebox}{%
  \setbox\tempbox\hbox\bgroup\vbox\bgroup\leftskip12pt\rightskip\leftskip}{\par\addvspace{12pt}
  \egroup\egroup\par\addvspace{25pt}
  \tempdimen\ht\tempbox
  \advance\tempdimen by 1pc
  \noindent{\hbox to \wd\tempbox{\vbox to \ht\tempbox{\hsize\textwidth{\special{color push}\grayink\vspace*{-12pt}\noindent\vrule height\tempdimen width\textwidth
  \special{color pop}\blackink}}}}%
  \llap{\unhbox\tempbox}\par\addvspace{12pt}}

%%%%%%%%%% Note %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% notelist{<widest label>}: \notes{<label>}{<text>} hangs the text
% after a bold label box of width \notewidth.
\newbox\tempbox
\newdimen\notewidth
\newenvironment{notelist}[1]{%
  \addvspace{6pt}
  \setbox\tempbox\hbox{#1\hskip.57em}%
  \global\notewidth\wd\tempbox
}{%
  \par\addvspace{6pt}}
\def\notes#1#2{\par\noindent\@hangfrom{\hbox to \notewidth{\bf #1\hss}}#2\par}

%%%%%%%%%%%%%%%% wherelist %%%%%%%%%%%%%%%%
% wherelist{<widest symbol>}: prints "where", then
% \whereentry{<symbol>}{<relation>}{<text>} entries.
\newbox\wherebox
\newdimen\wherewidth
\newenvironment{wherelist}[1]{\leftskip10pt%
  \addvspace{6pt}
  \setbox\wherebox\hbox{#1\hskip1em}%
  \global\wherewidth\wd\wherebox
  \noindent\hspace*{-14pt} where
}{%
  \par\addvspace{6pt}}
\def\whereentry#1#2#3{\par\noindent\@hangfrom{\hbox to \wherewidth{#1\hss}#2\hskip6pt}#3\par}

%%%%%%%%%%%%
% unnumlist: list without labels; shares the \@enumdepth nesting
% counter and limit with enumerate.
\newenvironment{unnumlist}{%
  \ifnum \@enumdepth >3 \@toodeep\else
    \advance\@enumdepth\@ne
    \list{}{%
      \leftmargini27.5pt
      \leftmarginii17.5pt\leftmarginiv17.5pt
      % NOTE(review): the "%" here is read as an empty comment line,
      % so \leftmargin\parindent is live -- confirm against the
      % original file.
      %
      \leftmargin\parindent
      \advance\leftmargin-.2em
      \advance\leftmarginii.2em
      \advance\leftmarginiii.1em
      \advance\leftmarginiv.2em
      \def\makelabel##1{\hss\llap{##1}}}
  \fi%
}{%
  \endlist}
%
% extract: block quotation indented 2em on both sides.
\newenvironment{extract}{%
  \par\addvspace{11.5pt minus2pt}%
  \leftskip2em\rightskip\leftskip
  \noindent\ignorespaces
}{%
  \par\addvspace{11.5pt minus2pt}%
  \@endparenv}
%
%
% \VA{<line 1>}{<line 2>}: two right-aligned lines.
\def\VA#1#2{\addvspace{12pt}\raggedleft #1\rightskip3em\par #2\rightskip3em}
%
% VF: text between two 2pt rules of \textwidth, indented 3em on both
% sides, set in \VfFont.
\newenvironment{VF}{\VfFont%
  \par\addvspace{12pt minus2pt}%
  \noindent{\vrule height2pt width\textwidth}\par\vskip7.3pt
  \leftskip3em\rightskip\leftskip
  \noindent\ignorespaces
}{%
  \par\vskip6pt\leftskip0pt\noindent{{\vrule height2pt width\textwidth}}\par\addvspace{12pt minus2pt}%
  \@endparenv}
%
% As \VA, with the second line in italics.
\def\VTA#1#2{\addvspace{12pt}\raggedleft #1\rightskip3em\par {\it #2}\rightskip3em}
%
%
\def\VT{\par\addvspace{3.5pt}\noindent}
\def\VH#1{{\normalfont\fontsize{12.5}{14.5}\itshape\centering\selectfont #1\par}\addvspace{5.5pt}}
%
% VT1: as VF, with slightly different vertical spacing.
\newenvironment{VT1}{\VfFont%
  \par\addvspace{12pt minus2pt}%
  \noindent{\vrule height2pt width\textwidth}\par\vskip7.5pt
  \leftskip3em\rightskip\leftskip
  % NOTE(review): only \@afterheading is read as commented out;
  % \parindent0pt is taken to be live -- confirm.
  %\@afterheading
  \parindent0pt
  \noindent\ignorespaces
}{%
  \par\vskip6pt\leftskip0pt\noindent{{\vrule height2pt width\textwidth}}\par\addvspace{10pt minus2pt}%
  \@endparenv}
%
%%%%%%%%%%%% Glossary %%%%%%%%%%%%%%%%%%%%%%%
% Glossary: list whose bold labels end in a colon; 18pt hanging
% indent.
\newenvironment{Glossary}
  {\list{}{\labelwidth\z@\leftmargin18pt
   \itemindent-18pt \let\makelabel\glosslabel}}
  {\endlist}
\newcommand\glosslabel[1]{\hspace\labelsep\normalfont\bfseries #1:}
%%%%%%%%%%%%
% Parameters of the customised enumerate. The suffixes (none/i, ii,
% iii) select the nesting depth; trailing comments record former
% values.
\newif\iffnalpha
\global\fnalphafalse
\newskip\listtextleftmargin\listtextleftmargin 20pt%24pt
\newskip\listtextleftmarginii\listtextleftmarginii0pt% 24pt
\newskip\listtextleftmarginiii\listtextleftmarginiii0pt% 24pt
\newskip\listtextrightmargin\listtextrightmargin12pt%.5pc
\newskip\listlabelleftskip \listlabelleftskip4pt%3.3pt
\newskip\listlabelleftskipii \listlabelleftskipii0pt%3.3pt
\newskip\listlabelleftskipiii \listlabelleftskipiii0pt%3.3pt
\newskip\abovelistskipi\abovelistskipi6pt plus2pt
\newskip\belowlistskipi\belowlistskipi6pt plus2pt
\newskip\abovelistskipii\abovelistskipii0pt plus2pt
\newskip\belowlistskipii\belowlistskipii0pt plus2pt
\newskip\abovelistskipiii\abovelistskipiii0pt plus2pt
\newskip\belowlistskipiii\belowlistskipiii0pt plus2pt
\newskip\labelsepi \labelsepi6pt
\newskip\labelsepii \labelsepii6pt
\newskip\labelsepiii \labelsepiii6pt%\z@
\newskip\itemsepi \itemsepi0pt%10pt
\newskip\itemsepii \itemsepii0pt
\newskip\itemsepiii \itemsepiii0pt
\newdimen\enumdimwd
\newif\iflabelrightalign\labelrightaligntrue
\newdimen\enumdim%
%
% \enummax{<widest label>}: measure the label at the current
% \@enumdepth with and without \labelsep (\enumdim, \enumdimwd), set
% the matching \leftmargin<depth> globally, then override
% \leftmargini/ii/iii from any positive \listtextleftmargin* value,
% less the corresponding \listlabelleftskip* when that is positive.
\def\enummax#1{%
  \labelsep\csname labelsep\romannumeral\the\@enumdepth\endcsname
  \ifdim\listtextleftmargin>\z@\labelsepi0pt\fi
  \ifdim\listtextleftmarginii>\z@\labelsepii0pt\fi
  \ifdim\listtextleftmarginiii>\z@\labelsepiii0pt\fi
  \setbox\tempbox\hbox{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname#1\hskip\labelsep}%
  \enumdim\wd\tempbox
  \setbox\tempbox\hbox{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname#1}%
  \enumdimwd\wd\tempbox
  \expandafter\global\csname leftmargin\romannumeral\the\@enumdepth\endcsname\enumdim
  \ifdim\listtextleftmargin>\z@
    \leftmargini\listtextleftmargin
    \ifdim\listlabelleftskip>\z@
      \advance\leftmargini-\listlabelleftskip
    \fi
  \fi
  \ifdim\listtextleftmarginii>\z@
    \leftmarginii\listtextleftmarginii
    \ifdim\listlabelleftskipii>\z@
      \advance\leftmarginii-\listlabelleftskipii
    \fi
  \fi
  \ifdim\listtextleftmarginiii>\z@
    \leftmarginiii\listtextleftmarginiii
    \ifdim\listlabelleftskipiii>\z@
      \advance\leftmarginiii-\listlabelleftskipiii
    \fi
  \fi
}
% NOTE(review): the surrounding "%" lines are read as empty comment
% lines, so this load-time call is live -- confirm.
%
\enummax{1.}
%
% enumerate[<widest label>]: without an optional argument the
% counter's own label is used as the sample.
\def\enumerate{\@ifnextchar[{\@enumerate}{\@enumerate[\csname label\@enumctr\endcsname]}}%%
%
% \@enumerate[<widest label>]: refuse nesting beyond depth 4; set the
% counter to 1 and call \enummax with the sample label; start a \list
% whose \makelabel depends on the depth (depths 1-3 use fixed-width
% boxes and end with \blackink; the fallback uses \llap). At depth 1,
% \iffnalpha selects an alternative label and is reset globally.
\def\@enumerate[#1]{\par
  \ifnum \@enumdepth >3 \@toodeep
  \else
    \advance\@enumdepth\@ne
    \edef\@enumctr{enum\romannumeral\the\@enumdepth}%
    \setcounter{\@enumctr}{1}\enummax{#1}%
    \list
      {\csname label\@enumctr\endcsname}{\usecounter{\@enumctr}%
        \topsep\csname abovelistskip\romannumeral\the\@enumdepth\endcsname
        \itemsep\csname itemsep\romannumeral\the\@enumdepth\endcsname
        % NOTE(review): \listfont is read as commented out (it is not
        % defined in the visible code) -- confirm.
        % \listfont
        %\listparindent18.25pt
        \ifnum \@enumdepth=1
          \leftmargin32.7pt
          \rightmargin\listtextrightmargin
          \advance\rightmargin\rightskip
          \advance\leftmargin\leftskip
          \tempdimen\leftmargini
          \advance\tempdimen-\labelsep
          %%%%%%%%%%%
          \iffnalpha
            \def\makelabel##1{{\hskip\listlabelleftskip{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname{\iflabelrightalign\hss\fi\textlistlabel##1}}}}%
            \global\fnalphafalse
          \else
            \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskip{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi\textlistlabel##1}}\blackink}}%
          \fi
          %%%%%%%%%%%%%%%%%%%%%%%%%%%
        \else
          \ifnum \@enumdepth=2
            \tempdimen\leftmarginii
            \advance\tempdimen-\labelsep
            \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskipii{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi##1}\blackink}}}%
          \else
            \ifnum \@enumdepth=3
              \tempdimen\leftmarginiii
              \advance\tempdimen-\labelsep
              \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskipiii{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi##1}\blackink}}}%
            \else
              \def\makelabel##1{\hss\llap{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname##1}}%
            \fi
          \fi
        \fi}
  \fi}
%
% End of enumerate: add the depth-specific skip below the list.
\def\endenumerate{\@topsepadd\csname belowlistskip\romannumeral\the\@enumdepth\endcsname\endlist}%
%
\def\textlistlabel{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \contributor{<name>}{<affiliation 1>}{<affiliation 2>}: name line
% followed by two hanging affiliation blocks; 3pt extra space when
% affiliation 1 is wider than the column.
\newdimen\concolwidth
\newbox\stempbox
\def\contributor#1#2#3{\addvspace{10pt}{%
  \setbox\stempbox\hbox{\ContributorAffiliationFont #2}
  \concolwidth\wd\stempbox
  \noindent{\ContributorNameFont #1}\par
  \ifdim\concolwidth>\columnwidth
    \vspace*{3pt}
  \else
  \fi
  \noindent{\vbox{\hangindent12pt\ContributorAffiliationFont #2}}\vskip-1\p@
  \noindent{\vbox{\hangindent12pt\ContributorAffiliationFont #3}}}}
%%\def\contributors{%
%%  \twocolumn[\contributorshead]
%%  \pagestyle{empty}
%%  \leftskip1pc
%%  \parindent-1pc}
%%\def\contributorshead{%
%%  \vbox{}\vskip2pc
%%  {\centering\HeadFont CONTRIBUTORS\vskip2\p@}
%%  \noindent\rule{\textwidth}{1\p@}\vskip25\p@}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Class defaults: table titles are not centred, and the heading
% scheme is \HeadingsBookChapter.
\@centertabletitlefalse
\HeadingsBookChapter
%\HeadingsChapterSection
\endinput
%%
%% End of file `krantz.cls'.
================================================ FILE: README.Rmd ================================================ --- output: github_document --- # Reproducible Research with R and RStudio (Third Edition) [](http://amzn.com/1498715370) Christopher Gandrud [CRC Press/Chapman & Hall](http://www.tandf.net/books/details/9781498715379/) The files in this repository comprise the source code for creating **Reproducible Research with R and RStudio**. ### File Organization ### Reproduce the Book The book can be reproduced by using the R package *bookdown*. To do this: ### Session Info The current version of the book manuscript was compiled with [RStudio](http://www.rstudio.com/) (v. 1.2.5019 preview build) with the following R session: ```{r, echo=FALSE} sessionInfo() ``` --- ( c ) Christopher Gandrud (2020) ================================================ FILE: README.md ================================================ # Reproducible Research with R and RStudio (Third Edition) [](http://amzn.com/1498715370) Christopher Gandrud [CRC Press/Chapman & Hall](http://www.tandf.net/books/details/9781498715379/) The files in this repository comprise the source code for creating **Reproducible Research with R and RStudio**. ### File Organization ### Reproduce the Book The book can be reproduced by using the R package *bookdown*. To do this: ### Session Info The current version of the book manuscript was compiled with [RStudio](http://www.rstudio.com/) (v. 
1.2.5019 preview build) with the following R session: ## R version 3.6.2 (2019-12-12) ## Platform: x86_64-apple-darwin15.6.0 (64-bit) ## Running under: macOS Catalina 10.15.2 ## ## Matrix products: default ## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib ## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib ## ## locale: ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## loaded via a namespace (and not attached): ## [1] compiler_3.6.2 magrittr_1.5 tools_3.6.2 htmltools_0.4.0 ## [5] yaml_2.2.0 Rcpp_1.0.3 stringi_1.4.3 rmarkdown_2.0 ## [9] knitr_1.26 stringr_1.4.0 xfun_0.11 digest_0.6.23 ## [13] rlang_0.4.2 evaluate_0.14 ----- ( c ) Christopher Gandrud (2020) ================================================ FILE: rep-res-3rd-edition/.gitignore ================================================ .Rproj.user .Rhistory .RData .Ruserdata ================================================ FILE: rep-res-3rd-edition/01-author.Rmd ================================================ # About the Author {-} **Christopher Gandrud** is Head of Economics and Experimentation at Zalando SE. He leads teams of social data scientists and software engineers building and evaluating large-scale automated decision-making systems. He was previously a research fellow at the Institute for Quantitative Social Science, Harvard University developing statistical software for the social and physical sciences. He has held posts at City, University of London, the Hertie School of Governance, Yonsei University, and the London School of Economics where in 2012 he completed a PhD in quantitative political science. 
================================================ FILE: rep-res-3rd-edition/01-stylistic-conventions.Rmd ================================================ # Stylistic Conventions {-} I use the following conventions throughout the book: - **Abstract variables**: Abstract variables, i.e. variables that do not reference specific objects, are in `ALL CAPS TYPEWRITER TEXT`. - **Clickable buttons**: Clickable buttons are in `typewriter text`. - **Code**: All code is in `typewriter text`. - **File names and directories**: File names and directories more generally are printed in *italics*. Words are separated by hyphens (*kebab-case*).[^kebab_url] - **File extensions**: Like file names, file extensions are *italicized*. - **Individual variable values**: Individual variable values mentioned in the text are in *italics*. - **Objects**: Objects are printed in *italics*. I use underscores (`_`) to separate words in object names. - **Object columns**: Data frame object columns are printed in **bold**. - **R function names**: R function names are followed by parentheses (e.g., `stats::lm()`). - **Packages**: **R** packages are printed in *italics*. When a system, rather than the package that shares its name, is referred to, it is not italicized, e.g. R Markdown (system) vs. *rmarkdown* (package).[^system_names] - **Windows and RStudio panes**: Open windows and RStudio panes are written in *italics*. - **Variable names**: Variable names are printed in **bold**. Underscores (`_`) separate words in variable names. [^system_names]: See Yihui Xie's comment at: . Posted 14 January 2016. [^kebab_url]: See . Posted 23 July 2013. ================================================ FILE: rep-res-3rd-edition/02-additional-resources.Rmd ================================================ # Additional Resources {-} You can freely download additional resources supplementing examples in this book.
These resources include longer examples discussed in individual chapters and a complete short reproducible research project. ## Chapter Examples {-} Longer examples discussed in individual chapters, including files to dynamically download data, code for creating figures, and markup files for creating presentation documents, can be accessed at: . Please see Chapter \@ref(Storing) for more information on downloading files from GitHub, where the examples are stored.\index{GitHub} ## Short Example Project {-} To download a full (though very short) example of a reproducible research project created using the tools covered in this book, go to: . Please follow the replication instructions in the main *README.md*. It is a good idea to hold off looking at this complete example in detail until after you have become acquainted with the individual tools it uses. Become acquainted with the tools by reading through this book and working with the chapter examples. The following two figures give you a sense of how the example's files are organized. Figure \@ref(fig:ExampProjeFiles) shows how the files are organized in the file system. Figure \@ref(fig:ExampProjDiagram) illustrates how the main files are dynamically tied together. In the *data* directory, we have files to gather raw data from the @worldbank2013 on fertilizer consumption and from @pemstein2010 on countries' levels of democracy. They are tied to the data through the `WDI()`\index{R function!WDI} and `download.file()` functions.\index{R function!download.file()} A *Makefile*\index{Makefile} can run *gather-1* and *gather-2.R* to gather and clean the data. It runs *merge-data.R* to merge the data into one data file called *main-data.csv*. It also automatically generates a variable description file and a *README.md*\index{README file} recording the session info.\index{R!session info} The *analysis* folder contains two files that create figures presenting this data. 
They are tied to *main-data.csv* with the `import()` function.\index{R function!import} These files are run by the presentation documents when they are knitted. The presentation documents tie to the analysis documents with *knitr* and the `source()` function.\index{R function!source()} Though a simple example, hopefully these files will give you a complete sense of how a reproducible research project can be organized. Please feel free to experiment with different ways of organizing the files and tying them together to make your research really reproducible. ```{r ExampProjeFiles, engine = "tikz", fig.cap = "Short Example Project File Tree", cache=TRUE, echo=FALSE, fig.ext=if (knitr:::is_latex_output()) 'pdf' else 'png'} \usetikzlibrary{trees} % Set node styles \tikzstyle{DirBox} = [draw=black, rectangle, minimum width=5em, very thick, font=\small] \tikzstyle{every node} = [draw=gray, thin, anchor=west, font=\small] % Begin tikz picture \begin{tikzpicture}[% grow via three points={one child at (0.5,-0.7) and two children at (0.5,-0.7) and (0.5,-1.4)}, edge from parent path={(\tikzparentnode.south) |- (\tikzchildnode.west)}] % Root Directory \node (root) at (5, 10) [DirBox]{root}; % Project Directory \node (project) at (4, 8.5) [DirBox]{rep-res-book-v3-examples} child {node {{\small{paper.Rmd}}}} child {node {{\small{slideshow.Rmd}}}} child {node {{\small{website.Rmd}}}} child {node {{\small{main.bib}}}} ; % Data Directory \node (data) at (0, 4.5) [DirBox]{data} child {node {{\small{main-data.csv}}}} child {node {{\small{Makefile}}}} child {node {{\small{merge-data.R}}}} child {node {{\small{gather-1}}}} child {node {{\small{gather-2.R}}}} child {node {{\small{main-data-variable-descriptions.md}}}} child {node {{\small{README.Rmd}}}} ; % analysis subdirectores/files \node (analysis) at (1.5, 7) [DirBox]{analysis} child {node {{\small{googlevis-map.R}}}} child {node {{\small{scatter-uds-fert.R}}}} ; % README file \node (readme) at (9.5, 7) {README.md}; \node (rproj) 
at (10, 6) {rep-res-book-v3-examples.Rproj}; % Connect boxes that are not explicit children \draw (root) -- (project); \draw (project) -| (analysis); \draw (project) -| (data); \draw (project) -| (readme); \draw (project) -| (rproj); \end{tikzpicture} ``` ```{r ExampProjDiagram, engine = "tikz", fig.cap = "Short Example Main File Ties", cache=TRUE, echo=FALSE, fig.ext=if (knitr:::is_latex_output()) 'pdf' else 'png'} \usetikzlibrary{trees} \usetikzlibrary{decorations.pathmorphing} \usetikzlibrary{shapes,arrows} \definecolor{Blue}{HTML}{7BCCC4} \definecolor{LiteBlue}{HTML}{A8DDB5} \definecolor{DarkBlue}{HTML}{08589E} \definecolor{GrayLine}{HTML}{BDBDBD} % Set node styles %% File nodes \tikzstyle{File} = [draw=Blue, rectangle, text width=6.3em, font=\scriptsize] % Raw Data nodes \tikzstyle{RawData} = [draw=LiteBlue, %fill=LiteBlue, decorate, decoration={random steps, segment length=2pt, amplitude=2pt}, inner sep=0.25cm, font=\scriptsize] % Separator line style \tikzstyle{sepline} = [draw, very thick, color=GrayLine] % Link command nodes \tikzstyle{Links} = [draw=none, text width=6em, text=DarkBlue, font=\small] % Begin tikz picture \begin{tikzpicture} % Nodes \node (Data1) at (-3.5, 7) [RawData]{Raw WDI Data}; \node (Gather1) at (-3, 6) [File]{gather-1}; \node (Data2) at (-3.5, 5) [RawData]{Raw UDS Data}; \node (gather-2) at (-3, 4) [File]{gather-2.R}; \node (merge-data) at (0.5, 5) [File]{Makefile \\ merge-data.R}; \node (DataFile) at (0.5, 4) [File]{main-data.csv}; \node (Scatter) at (3.8, 4.5) [File]{scatter-uds-fert.R}; \node (GoogleVis) at (3.8, 3.5) [File]{googlevis-map.R}; \node (ArticleK) at (7, 5) [File]{article.Rmd}; \node (SlideshowK) at (7, 4) [File]{slideshow.Rmd}; \node (WebsiteK) at (7, 3) [File]{website.Rmd}; \node (Article) at (10, 5) [File]{article.pdf}; \node (Slideshow) at (10, 4) [File]{slideshow.pdf}; \node (Website) at (10, 3) [File]{website.html}; % Lines \draw [->] (Data1) -- (Gather1); \draw [->] (Data2) -- (gather-2); \draw [->] (Gather1) -- 
(merge-data); \draw [->] (gather-2) -- (merge-data); \draw [->] (merge-data) -- (DataFile); \draw [->] (DataFile) -- (Scatter); \draw [->] (DataFile) -- (GoogleVis); \draw [->] (Scatter) -- (ArticleK); \draw [->] (Scatter) -- (SlideshowK); \draw [->] (GoogleVis) -- (WebsiteK); \draw [->] (ArticleK) -- (Article); \draw [->] (SlideshowK) -- (Slideshow); \draw [->] (WebsiteK) -- (Website); \path [sepline] (-3.5, 0.75) -- (11, 0.75); % Link command nodes \node (importData) at (-1, -1) [Links]{\texttt{download.file()} \\ \texttt{Make} \\ \texttt{merge()}\\ \texttt{WDI()} }; \node (Figs) at (3, -1) [Links]{\texttt{import()}}; \node (knitr) at (7.5, -1) [Links]{ {\emph{knitr}} \\ \texttt{source()}}; \end{tikzpicture} ``` ## Updates {-} Many of the reproducible research tools discussed in this book are improving rapidly. Because of this, I will regularly post updates to the content covered in the book at: . ## Corrections {-} If you notice any corrections that should be made to fix typos, broken URLs, and so on, you can report them at: . I'll post notifications of changes to an Errata page at: . ================================================ FILE: rep-res-3rd-edition/03-introduction.Rmd ================================================ \mainmatter # (PART) Getting Started {-} # Introducing Reproducible Research{#Intro} Research is typically presented in very selective containers: slideshows, journal articles, books, or websites. These presentation documents announce a project's findings and try to convince us that the results are correct [@mesirov2010]. It's important to remember that these documents are not the research. Especially in the computational and statistical sciences, these documents are the "advertising". The research is the "full software environment, code, and data that produced the results" [@buckheit1995; @donoho2010 385]. When we separate the research from its advertisement, we are making it difficult for others to verify the findings by reproducing them. 
This book gives you the tools to dynamically combine your research with the presentation of your findings. The first tool is a workflow for reproducible research that weaves the principles of reproducibility throughout your entire research project, from data gathering to the statistical analysis, and the presentation of results. You will also learn how to use a number of computer tools that make this workflow easier and more robust. These tools include: - the **R** statistical language that will allow you to gather data and analyze it; - the **LaTeX** and **Markdown** markup languages that you can use to create documents--slideshows, articles, books, and webpages--for presenting your findings; - the *knitr* and *rmarkdown* **packages** for R and other tools, including **command-line programs** like GNU Make and Git version control, for dynamically tying your data gathering, analysis, and presentation documents together so that they can be easily reproduced; - **RStudio**, a program that brings all of these tools together. ## What Is Reproducible Research? Though there is some debate over the necessary and sufficient conditions for a full replication [@makel2014 2], research results are generally considered[^chapter1_1_1] *replicable* if there is sufficient information available for independent researchers to make the same findings using the same procedures with new data.[^chapter1_1] For research that relies on experiments, this can mean a researcher not involved in the original research being able to rerun the experiment, including sampling, and validate that the new results are comparable to the original results. In computational and quantitative empirical sciences, results are replicable if independent researchers can recreate findings by following the procedures originally used to gather the data and run the computer code. 
Of course, it is sometimes difficult to replicate the original data set because of issues such as limited resources to gather new data or because the original study already sampled the full universe of cases. So as a next-best standard, we can aim for "*really reproducible research*" [@peng2011 1226].[^chapter1_2] In computational sciences[^chapter1_3] this means: > the data and code used to make a finding are available and they are sufficient for an independent researcher to recreate the finding. In practice, research needs to be *easy* for independent researchers to reproduce [@ball2012]. If a study is difficult to reproduce, it's more likely that no one will reproduce it. If someone does attempt to reproduce this research, it will be difficult for them to tell if any errors they find were in the original research or problems they introduced during the reproduction. In this book, you will learn how to avoid these problems. In particular, you will learn tools for dynamically "*knitting*"[^chapter1_4] the data and the source code together with your presentation documents. Combined with well-organized source files and clearly and completely commented code, independent researchers will be able to understand how you obtained your results. This will make your computational research easily reproducible. ## Why Should Research Be Reproducible? Reproducible research is one of the main components of science. If that's not enough reason for you to make your research reproducible, consider that the tools of reproducible research also have direct benefits for you as a researcher. ### For science Replicability has been a key part of scientific inquiry from perhaps the 1200s [@bacon1267; @nosek2012]. It has even been called the "demarcation between science and non-science" [@braude1979 2]. Why is replication so important for scientific inquiry? 
#### Standard to judge scientific claims {-} *Replication* opens claims to scrutiny, allowing us to keep what works and discard what doesn't. Science, according to the American Physical Society, "is the systematic enterprise of gathering knowledge . . . organizing and condensing that knowledge into testable laws and theories". The "ultimate standard" for evaluating scientific claims is whether or not the claims can be replicated [@peng2011; @kelly2006]. Research findings cannot even really be considered "genuine contributions to human knowledge" until they have been verified through replication [@stodden2009 38]. Replication "requires the complete and open exchange of data, procedures, and materials". Scientific conclusions that are not replicable should be abandoned or modified "when confronted with more complete or reliable . . . evidence".[^chapter1_5] *Reproducibility enhances replicability*. If other researchers are able to clearly understand how a finding was originally made, then they will be better able to conduct comparable research in meaningful attempts to replicate the original findings. Sometimes strict replicability is not feasible, for example, when it is only possible to gather one data set on a population of interest. In these cases reproducibility is a "minimum standard" for judging scientific claims [@peng2011]. It is important to note that though reproducibility is a minimum standard for judging scientific claims, "a study can be reproducible and still be wrong" [@peng2014]. For example, a statistically significant finding in one study may remain statistically significant when reproduced using the original data/code, but when researchers try to replicate it using new data and even methods, they are unable to find a similar result. The original finding could have been noise, even though it is fully reproducible. 
#### Avoiding effort duplication and encouraging cumulative knowledge development {-} Not only is reproducibility important for evaluating scientific claims, it can also contribute to the cumulative growth of scientific knowledge [@kelly2006; @king1995]. Reproducible research cuts down on the amount of time scientists have to spend gathering data or developing procedures that have already been collected or figured out. Because researchers do not have to discover on their own things that have already been done, they can more quickly build on established findings and develop new knowledge. ### For you Working to make your research reproducible does require extra upfront effort. For example, you need to put effort into learning the tools of reproducible research by doing things such as reading this book. But beyond the clear benefits for science, why should you make this effort? Using reproducible research tools can make your research process more effective and (hopefully) ultimately easier. #### Better work habits {-} Making a project reproducible from the start encourages you to use better work habits. It can spur you to more effectively plan and organize your research. It should push you to bring your data and source code up to a higher level of quality than you might if you "thought 'no one was looking'" [@donoho2010 386]. This forces you to root out errors--a ubiquitous part of computational research--earlier in the research process [@donoho2010 385]. Clear documentation also makes it easier to find errors.[^chapter1_6] Reproducible research needs to be stored so that other researchers can actually access the data and source code. By taking steps to make your research accessible for others, you are also making it easier for yourself to find your data and methods when you revise your work or begin a new project. You are avoiding personal effort duplication, allowing you to cumulatively build on your own work more effectively. 
#### Better teamwork {-} The steps you take to make sure an independent researcher can figure out what you have done also make it easier for your collaborators to understand your work and build on it. This applies not only to current collaborators, but also to future collaborators. Bringing new members of a research team up to speed on a cumulatively growing research project is faster if they can easily understand what has been done already [@donoho2010 386]. #### Changes are easier {-} A third person may or may not actually reproduce your research even if you make it easy for them to do so. But, *you will almost certainly reproduce parts or even all of your own research*. No actual research process is completely linear. You almost never gather data, run analyses, and present your results without going backwards to add variables, make changes to your statistical models, create new graphs, alter results tables in light of new findings, and so on. You will probably try to make these changes long after you last worked on the project and long since you remembered the details of how you did it. Whether your changes are because of journal reviewers' and conference participants' comments or you discover that new and better data has been made available since beginning the project, designing your research to be reproducible from the start makes it much easier to change things later on. Dynamic reproducible documents make changes much easier. Changes made to one part of a research project have a way of cascading through the other parts. For example, adding a new variable to a largely completed analysis requires gathering new data and merging it with existing data sets. If you used data imputation or matching methods, you may need to rerun these models. You then have to update your main statistical analyses, and recreate the tables and graphs you used to present the results. Adding a new variable essentially forces you to reproduce large portions of your research. 
If when you started the project you used tools that make it easier for others to reproduce your research, you also made it easier to reproduce the work yourself. You will have taken steps to have a "better relationship with your future self" [@bowers2011 2]. #### Higher research impact {-} Reproducible research is more likely to be useful for other researchers than non-reproducible research. Useful research is cited more frequently [@donoho2002; @piwowar2007; @vandewalle2012]. Research that is fully reproducible contains more information, i.e. more reasons to use and cite it, than presentation documents merely showing findings. Independent researchers may use the reproducible data or code to look at other, often unanticipated, questions. When they use your work for a new purpose they will (should) cite your work. Because of this, Vandewalle et al. even argue that "the goal of reproducible research is to have more impact with our research" [-@vandewalle2007 1253]. A reason researchers often avoid making their research fully reproducible is that they are afraid other people will use their data and code to compete with them. I'll let Donoho et al. address this one: > *True. But competition means that strangers will read your papers, try > to learn from them, cite them, and try to do even better. If you > prefer obscurity, why are you publishing?* [-@donoho2009 16] ## Who Should Read This Book? This book is intended primarily for researchers who want to use a systematic workflow that encourages reproducibility as well as practical state-of-the-art computational tools to put this workflow into practice. These people include professional researchers, upper-level undergraduate, and graduate students working on computational data-driven projects. Hopefully, editors at academic publishers will also find the book useful for improving their ability to evaluate and edit reproducible research. The more researchers that use the tools of reproducibility, the better. 
So I include enough information in the book for people who have very limited experience with these tools, including limited experience with R, LaTeX, and Markdown. They will be able to start incorporating reproducible research tools into their workflow right away. The book will also be helpful for people who already have general experience using technologies such as R and LaTeX, but would like to know how to tie them together for reproducible research. ### Academic researchers Hopefully so far in this chapter I've convinced you that reproducible research has benefits for you as a member of the scientific community and personally as a computational researcher. This book is intended to be a practical guide for how to actually make your research reproducible. Even if you already use tools such as R and LaTeX, you may not be getting their full potential. This book will teach you useful ways to get the most out of them as part of a reproducible research workflow. ### Students Upper-level undergraduate and graduate students conducting original computational research should make their research reproducible for the same reasons that professional researchers should. Forcing yourself to clearly document the steps you took will also encourage you to think more clearly about what you are doing and reinforce what you are learning. It will hopefully give you a greater appreciation of research accountability and integrity early in your career [@barr2012; @ball2012 183]. Even if you don't have extensive experience with computer languages, this book will teach you specific habits and tools that you can use throughout your student research and hopefully your careers. Learning these things earlier will save you considerable time and effort later. 
### Instructors When instructors incorporate the tools of reproducible research into their assignments, they not only build students' understanding of research best practice, but are also better able to evaluate and provide meaningful feedback on students' work [@ball2012 183]. This book provides a resource that you can use with students to put reproducibility into practice. If you are teaching computational courses, you may also benefit from making your lecture material dynamically reproducible. Your slides will be easier to update for the same reasons that it is easier to update research. Making the methods you used to create the material available to students will give them more information. Clearly documenting how you created lecture material can also pass information on to future instructors. ### Editors When the first edition of this book was published, there was a worrying lack of reproducibility in published research. The infrastructure was weak [@peng2011] and many journals did not require it. However, the situation has largely changed for the better: many journals require all analyses to be in some sense reproducible. The journal *Biostatistics* is a good example of a publication that is encouraging (actually requiring) reproducible research. From 2009 the journal has had an editor for reproducibility who ensures replication files are available and that results can be replicated using these files [@peng2009]. The more editors there are with the skills to work with reproducible research, the more likely it is that researchers will do it. We need to maintain and continuously improve these standards. This book is useful for editors at academic publishers who want to be better at evaluating reproducible research, editing it, and developing systems to make it more widely available. ### Private sector researchers Researchers in the private sector may or may not want to make their work easily reproducible outside of their organization.
Data compliance legislation, such as the European Union's General Data Protection Regulation (GDPR),\index{GDPR} may even make it legally problematic to share data even within a company in order to protect personal information. However, that does not mean that significant benefits cannot be gained from using the methods of reproducible research, even if only in part. Even if a company has only one person doing research, it benefits from using reproducible research methods. Just as with academic research, this person actually does have a collaborator: their future self. As discussed above, reproducible research makes this collaboration easier. Companies with more than one researcher do (or likely should) act as a research community, even if public reproducibility is ruled out to guard proprietary information.[^chapter1_7] Making as much of your research reproducible (e.g. your source code, but not the raw data if it contains personal information) to members of your organization can spread valuable information about how analyses were done and data was collected. This will help build your organization's knowledge and avoid effort duplication. Just as a lack of reproducibility hinders the spread of information in the scientific community, it can hinder it inside of a private organization. Using the sort of dynamic automated processes run with clearly documented source code we will learn in this book can also help create robust data analysis methods that help your organization avoid errors that may come from cutting-and-pasting data across spreadsheets.[^chapter1_8] ## The Tools of Reproducible Research This book will teach you the tools you need to make your research highly reproducible. Reproducible research involves two broad sets of tools. 
The first is a **reproducible research environment** that includes the statistical tools you need to run your analyses as well as "the ability to automatically track the provenance of data, analyses, and results and to package them (or pointers to persistent versions of them) for redistribution". The second set of tools is a **reproducible research publisher**, which prepares dynamic documents for presenting results and is easily linked to the reproducible research environment [@mesirov2010 415]. In this book, we will focus on learning how to use the widely available and highly flexible reproducible research environment--R/RStudio [@rlanguage; @rstudiocite].[^chapter1_9] R/RStudio can be linked to numerous reproducible research publishers such as LaTeX and Markdown with Yihui Xie's *knitr* package [-@R-knitr] or the related *rmarkdown* package [@R-rmarkdown]. The main tools covered in this book include: - **R**: a programming language primarily for statistics and graphics. It can also be useful for data gathering and creating presentation documents. - ***knitr* and *rmarkdown***: related R packages for literate programming. They allow you to combine your statistical analysis and the presentation of the results into one document. They work with R and a number of other languages such as Bash, Python, and Ruby. - **Markup languages**: instructions for how to format a presentation document. In this book, we cover LaTeX, Markdown, and a little HTML. - **RStudio**: an integrated development environment (IDE) for R that tightly combines R, *knitr*, *rmarkdown*, and markup languages. - **Cloud storage and versioning**: Git/GitHub, which can store data, code, and presentation files, save previous versions of these files, and make this information widely available.
- **Unix-like shell programs**: These tools are useful for working with large research projects.[^chapter1_10] They also allow us to use command-line tools including GNU Make for compiling projects and Pandoc, a program useful for converting documents from one markup language to another. ### Why Use R, knitr/R Markdown, and RStudio for Reproducible Research? #### Why use R? {-} Why use a statistical programming language like R for reproducible research? R has a very active development community that is constantly expanding what it is capable of. As we will see in this book, R enables researchers across a wide range of disciplines to gather data and run statistical analyses. Using the *knitr* or *rmarkdown* package, you can connect your R-based analyses to presentation documents created with markup languages such as LaTeX and Markdown. This allows you to dynamically and reproducibly present results in articles, slideshows, and webpages. The way you interact with R has benefits for reproducible research. In general you interact with R (or any other programming and markup language) by explicitly writing down your steps as source code. This promotes reproducibility more than your typical interactions with Graphical User Interface (GUI) programs like SPSS[^chapter1_11] and Microsoft Word. When you write R code and embed it in presentation documents created using markup languages, you are forced to explicitly state the steps you took to do your research. When you do research by clicking through drop-down menus in GUI programs, your steps are lost, or at least documenting them requires considerable extra effort. Also it is generally more difficult to dynamically embed your analysis in presentation documents created by GUI word processing programs in a way that will be accessible to other researchers both now and in the future. I'll come back to these points in Chapter \@ref(GettingStartedRR). #### Why use knitr and R Markdown? 
{-} Literate programming is a crucial part of reproducible quantitative research.[^chapter1_12] Being able to directly link your analyses, your results, and the code you used to produce the results makes tracing your steps much easier. There are many different literate programming tools for a number of different programming languages.[^chapter1_13] Previously, one of the most common tools for researchers using R and the LaTeX markup language was *Sweave* [@leisch2002]. The packages I am going to focus on in this book are newer and have more capabilities. They are called *knitr* and *rmarkdown*. Why are we going to use these tools in this book and not *Sweave* or some other tool? The simple answer is that they are more capable than *Sweave*. Both *knitr* and *rmarkdown* can work with markup languages other than LaTeX including Markdown and HTML. *rmarkdown* can even output Microsoft Word documents. They can work with programming languages other than R. They highlight R code in presentation documents making it easier for your readers to follow.[^chapter1_14] They give you better control over the inclusion of graphics and can cache code chunks, i.e. save the output for later. *knitr* has the ability to understand *Sweave*-like syntax, so it will be easy to convert backwards to *Sweave* if you want to.[^chapter1_15] You also have the choice to use much simpler and more straightforward syntax with *knitr* and *rmarkdown*. *knitr* and *rmarkdown* have broadly similar capabilities and syntax. They both are literate programming tools that can produce presentation documents from multiple markup languages. They have almost identical syntax when used in Markdown. Their main difference is that they take different approaches to creating presentation documents. *knitr* documents must be written using the markup language associated with the desired output. 
For example, with *knitr*, LaTeX must be used to create PDF output documents and Markdown or HTML must be used to create webpages. R Markdown builds directly on knitr, the key difference being that it uses the straightforward Markdown markup language to generate PDF, HTML, and MS Word documents.[^chapter1_16] Because you write with the simple Markdown syntax, R Markdown is generally easier to use. It has the advantage of being able to take the same markup document and output multiple types of presentation documents. Nonetheless, for complex documents like books and long articles or work that requires custom formatting, knitr LaTeX is often preferable and extremely flexible, though the syntax is more complicated. #### Why use RStudio? {-} Why use the RStudio integrated development environment for reproducible research? R by itself has the capabilities necessary to gather data, analyze it, and, with a little help from knitr/R Markdown and markup languages, present results in a way that is highly reproducible. RStudio allows you to do all of these things, but simplifies many of them and allows you to navigate through them more easily. It also is a happy medium between R's text-based interface and a pure GUI. Not only does RStudio do many of the things that R can do but more easily, it is also a very good standalone editor for writing documents with LaTeX and Markdown. For LaTeX documents it can, for example, insert frequently used commands like `\section{}` for numbered sections (see Chapter \@ref(LatexChapter)).[^chapter1_17] There are many LaTeX editors available, both open source and paid. But RStudio is currently the best program for creating reproducible LaTeX and Markdown documents. It has full syntax highlighting. Its syntax highlighting can even distinguish between R code and markup commands in the same document. It can spell check LaTeX and Markdown documents. It handles knitr/R Markdown code chunks beautifully (see Chapter \@ref(GettingStartedRKnitr)). 
Finally, RStudio not only has tight integration with various markup languages, it also has capabilities for using other tools such as C++, CSS, JavaScript, Python, and a few other programming languages. It is closely integrated with the version control programs Git and SVN. Both of these programs allow you to keep track of the changes you make to your documents (see Chapter \@ref(Storing)). This is important for reproducible research since version control programs can document many of your research steps. It also has a built-in ability to make HTML slideshows from knitr/R Markdown documents. Basically, RStudio makes it easy to create and navigate through complex reproducible research documents. ## Installing the main software {#InstallR} Before you read this book you should install the main software. All of the software programs covered in this book are open source and can be easily downloaded for free. They are available for Windows, Mac, and Linux operating systems. They should run well on most modern computers. You should install R before installing RStudio. You can download the programs from the following websites: - **R**: <https://www.r-project.org/>, - **RStudio Desktop (Open Source License)**: <https://posit.co/download/rstudio-desktop/>. The webpages for downloading these programs have comprehensive information on how to install them. Please refer to those pages for more information. After installing R and RStudio, you will probably also want to install a number of user-written packages that are covered in this book. To install all of these user-written packages, please see this chapter's Appendix. ### Installing markup languages {#InstallMarkup} You will need to install the R package *rmarkdown* [@R-rmarkdown]\index{R package!rmarkdown} to turn your markdown documents into polished output that can be presented (e.g. as a website or PDF).
To do this in R, use: ```{r rmarkdown-install, eval=FALSE} install.packages("rmarkdown") ``` If you plan to render your R Markdown documents from the console without RStudio, you will need to install Pandoc.\index{Pandoc} For instructions, see Pandoc's download page: <https://pandoc.org/installing.html>. If you use RStudio, this step is unnecessary as Pandoc will be installed automatically. If you want to create LaTeX (PDF) documents, you can install a TeX distribution.[^chapter1_18] The simplest way to get all of the LaTeX capabilities you will need for this book is to use the *tinytex*\index{R package!tinytex} [@R-tinytex] R package: ```{r tinytex-install, eval=FALSE} install.packages('tinytex') tinytex::install_tinytex() ``` If you want a full LaTeX distribution, see <https://www.latex-project.org/get/> for installation information. ### GNU Make {#InstallMake} If you are using a Linux computer, you already have GNU Make installed.[^chapter1_19] Mac users will need to install the command-line developer tools. There are two ways to do this. One is to go to the App Store and download Xcode (it's free). Once Xcode is installed, install command-line tools, which you will find by opening Xcode then clicking on `Preferences` and then `Downloads`. However, Xcode is a very large download and you only need the command-line tools for Make. To install just the command-line tools, open the Terminal and try to run Make by typing `make` and hitting return. A box should appear asking you if you want to install the command-line developer tools. Click `Install`. Windows users will have Make installed if they have already installed *Rtools* (see this chapter's Appendix). Mac and Windows users will need to install this software not only so that GNU Make runs properly, but also so that other command-line tools work well. ### Other tools We will discuss other tools such as Git that can be a useful part of a reproducible research workflow. Installation instructions for these tools will be discussed below.
## Book Overview {#OtherBooks} The purpose of this book is to give you the tools that you will need to do reproducible research with R and RStudio. This book describes a workflow for reproducible research primarily using R and RStudio. It is designed to give you the necessary tools to use this workflow for your own research. It is not designed to be a complete reference for R, RStudio, *knitr*/*rmarkdown*, Git, or any other program that is a part of this workflow. Instead, it shows you how these tools can fit together to make your research more reproducible. To get the most out of these individual programs, I will point you along the way to other resources that cover these programs in more detail. To that end, I can recommend a number of resources that cover more of the nitty-gritty: - Michael J. Crawley's [-@crawley2013] encyclopedic R book, appropriately titled ***The R Book***, published by Wiley. - Hadley Wickham [-@whickham2014book] has a great new book out from Chapman & Hall on ***Advanced R***. - Yihui Xie's [-@xie2018] book ***R Markdown: The Definitive Guide***, published by Chapman & Hall, is needless to say the definitive guide on R Markdown syntax. It's a good complement to this book's generally more research project--level focus. - Cathy O'Neil and Rachel Schutt [-@oneil2013] give an introduction to the field of data science generally in ***Doing Data Science***, published by O'Reilly Media Inc. - For many real-world examples of reproducible research in action see Kitzes et al.'s [-@kitzes2018] collection of case studies ***The Practice of Reproducible Research***. - For an excellent introduction to the command-line in Linux and Mac, see William E. Shotts Jr.'s [-@shottsjr2012] book ***The Linux Command-line: A Complete Introduction*** published by No Starch Press. It is also helpful for Windows users running PowerShell (see Chapter \@ref(DirectoriesChapter)).
Sean Kross' [-@kross2018] ***The Unix Workbench*** is also a great freely available online introduction to the topic. - The RStudio website () has a number of useful tutorials on how to use *knitr* with LaTeX and Markdown. They also have very good documentation for *rmarkdown* at . That being said, my goal is for this book to be *self-sufficient*. A reader without a detailed understanding of these programs will be able to understand and use the commands and procedures I cover in this book. While learning how to use R and the other programs, I personally often encountered illustrative examples that included commands, variables, and other things that were not well explained in the texts that I was reading. This caused me to waste many hours trying to figure out, for example, what the `$` is used for (preview: it's the component selector). I hope to save you from this wasted time by either providing a brief explanation of possibly frustrating and mysterious things and/or pointing you in the direction of good explanations. ### How to read this book This book gives you a workflow. It has a beginning, middle, and end. So, unlike a reference book, it can and should be read linearly as it takes you through the organizational steps of an empirical research process from an empty folder to a completed set of documents that reproducibly showcase your findings. That being said, readers with more experience using tools like R or LaTeX may want to skip over the nitty-gritty parts of the book that describe how to manipulate data frames or compile LaTeX documents into PDFs. Please feel free to skip these sections. #### More experienced R users {-} If you are an experienced R user you may want to skip over the first section of Chapter \@ref(GettingStartedRKnitr): Getting Started with R, RStudio, and *knitr*/*rmarkdown*. But don't skip over the whole chapter. The latter parts contain important information on the *knitr*/*rmarkdown* packages. 
If you are experienced with R data manipulation, you may also want to skip all of Chapter \@ref(DataClean). #### More experienced LaTeX users {-} If you are familiar with LaTeX, you might want to skip the first part of Chapter \@ref(LatexChapter). The second part may be useful as it includes information on how to dynamically create BibTeX bibliographies with *knitr* and how to include *knitr*/*rmarkdown* output in a Beamer slideshow. #### Less experienced LaTeX/Markdown users {-} If you do not have experience with LaTeX or Markdown, you may benefit from reading, or at least skimming, the introductory chapters on these topics (Chapters \@ref(LatexChapter) and \@ref(MarkdownChapter)) before reading Part III. ### Reproduce this book This book practices what it preaches. It can be reproduced. I wrote the book using the programs and methods that I describe. Full documentation and source files can be found at the book's GitHub repository. Feel free to read and even use (within reason and with attribution, of course) the book's source code. You can find it at: . This is especially useful if you want to know how to do something in the book that I don't directly cover in the text. If you notice any errors or places where the book can be improved please report them on the book's GitHub Issues page: . Corrections will be posted at: . ### Contents overview The book is broken into four parts. Chapters \@ref(GettingStartedRR), \@ref(GettingStartedRKnitr), and \@ref(DirectoriesChapter) give an overview of the reproducible research workflow as well as the general computer skills that you'll need to use this workflow. Each of the next three parts of the book guides you through the specific skills you will need for each part of the reproducible research process. Chapters \@ref(Storing), \@ref(DataGather), and \@ref(DataClean) cover the data gathering and file storage process.
Chapters \@ref(StatsModel), \@ref(TablesChapter), and \@ref(FiguresChapter) teach you how to dynamically incorporate your statistical analysis, results figures, and tables into your presentation documents. Finally, Chapters \@ref(LatexChapter) and \@ref(MarkdownChapter) cover how to create reproducible presentation documents including LaTeX articles, slideshows, and webpages. ## Appendix: Additional R Setup {-} Some setup is required to reproduce this book. Here are key R packages you should consider installing and specific instructions for Windows and Linux users. ### R Packages {-} In this book, I discuss how to use a number of user-written R packages for reproducible research. Many of these packages are not included in the default R installation. They need to be installed separately. \index{R!packages|(} **Note:** in general you should aim to minimize the number of packages that your research depends on. Doing so will lessen the possibility that your code will "break" when a package is updated. This book depends on relatively many packages because of its special and unusual purpose of illustrating a variety of tools that you can use for reproducible research. To install key user-written packages discussed in this book, copy the following code and paste it into your R console: ```{r package_install, results='hide'} # Packages to install pkg_to_install <- c("brew", "brms", "bookdown", "devtools", "googleVis", "knitr", "rio", "rmarkdown", "tidyverse", "WDI", "xfun", "texreg", "xtable") # Check if the packages are installed, if not install them lapply( pkg_to_install, function(pkg) { if (system.file(package = pkg) == "") { install.packages(pkg, repos = "http://cran.us.r-project.org" ) } } ) ``` Note that I specified a US based R Project CRAN "mirror"\index{CRAN!mirror} to download the packages from.^[CRAN stands for the Comprehensive R Archive Network.] There are many others to choose from. See: . 
The *xfun* package [@R-xfun]\index{R package!xfun} contains a function called `pkg_attach2()`. When supplied with a vector of package names like those in `pkg_to_install` above, it will install all non-installed packages. `p_load()` from the *pacman* package [@R-pacman] works in a similar way. These functions are much less verbose than the example above, but they do require the user to install the package separately before `pkg_attach2()` or `p_load()` can be used. The example above relies only on functions available in the basic **R** installation. \index{R!packages|)} ### Special issues {-} You may need to install ImageMagick \index{ImageMagick} to compile the book from source. \index{Windows|(} If you are using Windows, you will also need to install *Rtools*.\index{R package!Rtools} You can install *Rtools* from: .\label{RtoolsDownload} Please use the recommended installation to ensure that your system PATH\index{PATH} is set up correctly. Otherwise, your computer will not know where the tools are. Alternatively, use the `install.Rtools()` function from the *installr* [@galili2018]\index{R package!installr} package to install it. \index{Windows|)} \index{Linux|(} On Linux, you will need to install the *RCurl* [@R-RCurl]\index{RCurl} package separately. Use your Terminal\index{Terminal} to install these packages with the following (or similar depending on your system) code: ```{sh, eval=FALSE} apt-get update apt-get install libcurl4-gnutls-dev apt-get install r-cran-rcurl-dev ``` \index{Linux|)} [^chapter1_1_1]: @rokem2018 [3-4] note that some disciplines, e.g. computing machinery and meteorology, give "replicable" and "reproducible" the exact opposite meanings from the way they are used in this book and many other disciplines such as biology, economics, and epidemiology. [^chapter1_1]: This is close to what @lykken1968 calls "operational replication".
[^chapter1_2]: The idea of really reproducible computational research originates in the 1980s and early 1990s with Jon Claerbout and the Stanford Exploration Project [@fomel2009; @donoho2009]. Further seminal advances were made by Jonathan B. Buckheit and David L. Donoho who created the Wavelab library of MATLAB routines for their research on wavelets in the mid-1990s [@buckheit1995]. [^chapter1_3]: Reproducibility is important for both quantitative and qualitative research [@king1994]. Nonetheless, we will focus mainly on methods for reproducibility in quantitative computational research. [^chapter1_4]: Much of the reproducible computational research and literate programming literatures have traditionally used the term "weave" to describe the process of combining source code and presentation documents [see @knuth1992 101]. In the R community, the term "weave" is usually used to describe the combination of source code and LaTeX documents. The term "knit" reflects the vocabulary of the *knitr* R package (knit + R). It is used more generally to describe weaving with a variety of markup languages. The term is used by RStudio if you are using the *rmarkdown* package, which is similar to *knitr*. We also cover the *rmarkdown* package in this book. Because of this, I use the term knit rather than weave in this book. [^chapter1_5]: See the American Physical Society's website at . See also @fomel2009. [^chapter1_6]: Of course, it's important to keep in mind that reproducibility is "neither necessary nor sufficient to prevent mistakes" [@stodden2009b]. [^chapter1_7]: There are ways to enable some public reproducibility without revealing confidential information. See @vandewalle2007 for a discussion of one approach. [^chapter1_8]: See this post by David Smith about how the J.P. Morgan "London Whale" problem may have been prevented with the type of processes covered in this book: (posted 11 February 2013).
[^chapter1_9]: The book was created with R version 3.6.2 and RStudio preview release version 1.2.5019. [^chapter1_10]: In this book, I cover the Bash shell for Linux and Mac as well as Windows PowerShell. [^chapter1_11]: I know you can write scripts in statistical programs like SPSS, but doing so is not encouraged by the program's interface and you often have to learn multiple languages for writing scripts that run analyses, create graphics, and deal with matrices. [^chapter1_12]: Donald Knuth coined the term literate programming in the 1970s to refer to a source file that could be both run by a computer and "woven" with a formatted presentation document [@knuth1992]. [^chapter1_13]: A very interesting tool that is worth taking a look at for the Python programming language is HTML Notebooks created with Jupyter. For more details see . We will also discuss these at the end of Chapter \@ref(GettingStartedRKnitr). [^chapter1_14]: Syntax highlighting uses different colors and fonts to distinguish different types of text. [^chapter1_15]: Note that the Sweave-style syntax is not identical to actual *Sweave* syntax. See Yihui Xie's discussion of the differences between the two at: . *knitr* has a function (`Sweave2knitr`) for converting *Sweave* to *knitr* syntax. [^chapter1_16]: It does this by relying on a tool called Pandoc [@pandoc2014]. \index{Pandoc} [^chapter1_17]: If you are more comfortable with a what-you-see-is-what-you-get (WYSIWYG) word processor like Microsoft Word, you might be interested in exploring Lyx. It is a WYSIWYG-like LaTeX editor that works with *knitr*. It doesn't work with the other markup languages covered in this book. For more information, see: . I give some brief information on using Lyx with *knitr* in Chapter 3's Appendix. [^chapter1_18]: LaTeX is really a set of macros for the TeX typesetting system. It is included in all major TeX distributions. 
[^chapter1_19]: To verify this, open the Terminal and type: `make --version` (I used version 3.81 for this book). This should output details about the current version of Make installed on your computer. ================================================ FILE: rep-res-3rd-edition/04-getting-started.Rmd ================================================ # Getting Started with Reproducible Research {#GettingStartedRR} Researchers often start thinking about making their work reproducible near the end of the research process when they write up their results or maybe later when a journal requires their data and code be made available for publication. Or maybe later when another researcher asks if they can use the data from a published article to reproduce the findings. By then there may be numerous versions of the data set and records of the analyses stored across multiple folders on the researcher's computers. It can be difficult and time consuming to sift through these files to create an accurate account of how the results were reached. Waiting until near the end of the research process to start thinking about reproducibility can lead to incomplete documentation that does not give an accurate account of how findings were made. Focusing on reproducibility from the beginning of the process and continuing to follow a few simple guidelines throughout your research can help you avoid these problems. Remember "reproducibility is not an afterthought–it is something that must be built-into the project from the beginning" [@donoho2010 386]. This chapter first gives you a brief overview of the reproducible research process: a workflow for reproducible research. Then it covers some of the key guidelines that can help make your research more reproducible. ## The Big Picture: A Workflow for Reproducible Research The three basic stages of a typical computational empirical research project are: - data gathering, - data analysis, and - results presentation.
Each stage is part of the reproducible research workflow covered in this book. Tools for reproducibly gathering data are covered in Part II. Part III teaches tools for tying the data we gathered to our statistical analyses and presenting the results with tables and figures. Part IV discusses how to tie these findings into a variety of documents you can use to advertise your findings. Instead of starting to use the individual tools of reproducible research as soon as you learn them, I recommend briefly stepping back and considering how the stages of reproducible research *tie* together. This will make your workflow more coherent from the beginning and save you a lot of backtracking later on. Figure \@ref(fig:WorkflowTies) illustrates the workflow. Notice that most of the arrows connecting the workflow's parts point in both directions, indicating that you should always be thinking about how to make it easier to go backward through your research, i.e. reproduce it, as well as forward. Around the edges of the figure are some of the functions you will learn to make it easier to go forward and backward through the process. These functions tie your research together. For example, you can use API-based R packages to gather data from the internet. You can use R's `merge()`\index{R function!merge} function to combine data gathered from different sources into one data set. The `getURL()`\index{R function!getURL} function from R's *RCurl* package [@R-RCurl] and the `read.table()`\index{R function!read.table} function in base R or the much more versatile `import()` function from the *rio* package [@R-rio]\index{R package!rio}\index{R function!import} can be used to bring this data set into your statistical analyses. The *knitr* or *rmarkdown* package then ties your analyses into your presentation documents. This includes the code you used, the figures you created, and, with the help of tools such as the `kable()` function in the *knitr* package, tables of results. 
You can even tie multiple presentation documents together. For example, you can access the same figure for use in a LaTeX article and a Markdown-created website with the LaTeX `includegraphics`\index{LaTeX command!includegraphics} function or *knitr*'s `include_graphics()`\index{R function!include\_graphics} function. This helps you maintain a consistent presentation of results across multiple document types. We'll cover these functions in detail throughout the book. See Table \@ref(TableTieFunctions) for an additional overview of some of the *tie functions*. ```{r WorkflowTies, engine = "tikz", fig.cap = "Example Workflow and a Selection of Functions to Tie It Together", cache=TRUE, echo=FALSE, fig.ext=if (knitr:::is_latex_output()) 'pdf' else 'png'} \usetikzlibrary{decorations.pathmorphing} \definecolor{Blue}{HTML}{7BCCC4} \definecolor{LiteBlue}{HTML}{A8DDB5} \definecolor{DarkBlue}{HTML}{08589E} \definecolor{GrayLine}{HTML}{BDBDBD} % Set node styles %% Workflow stage nodes \tikzstyle{Stage} = [draw=Blue, %fill=Blue, rectangle, text width=7em, inner sep=0.5cm, font=\small] % Raw Data nodes \tikzstyle{RawData} = [draw=LiteBlue, %fill=LiteBlue, decorate, decoration={random steps, segment length=2pt, amplitude=2pt}, inner sep=0.25cm, font=\scriptsize] % Separator line style \tikzstyle{sepline} = [draw, very thick, color=GrayLine] % Link function nodes \tikzstyle{Links} = [draw=none, text width=6em, text=DarkBlue, font=\footnotesize] % Begin tikz picture \begin{tikzpicture} % Raw Data Nodes \node (Data1) at (-3, 7) [RawData]{Raw Data}; \node (Data2) at (-3, 5) [RawData]{Raw Data}; \node (Data3) at (-3, 3) [RawData]{Raw Data}; % Workflow stage nodes \node (DataGather) at (0.5, 5) [Stage, text width= 6em]{Data Gather}; \node (Analysis) at (5.5, 5) [Stage, text width= 4em]{Analysis}; \node (Presentation1) at (9, 8) [Stage]{LaTeX Book, \\ Article, \& \\ Slideshow \\ Presentations}; \node (Presentation2) at (9, 2.5) [Stage]{Markdown/ \\ HTML Website \\ Presentations}; % 
Lines \draw [->, very thick] (Data1) -- (DataGather); \draw [->, very thick] (Data2) -- (DataGather); \draw [->, very thick] (Data3) -- (DataGather); \draw [<->, very thick] (DataGather) -- (Analysis); \draw [<->, very thick] (Analysis) -- (Presentation1); \draw [<->, very thick] (Analysis) -- (Presentation2); \draw [<->, very thick] (Presentation1) -- (Presentation2); \path [sepline] (-3.5, 0.75) -- (11, 0.75); \path [sepline] (11.5, 9) -- (11.5, 1.5); % Link function nodes \node (pres) at (13, 5) [Links]{{\emph{knitr}} \\ \texttt{input} \\ \texttt{include} \\ \texttt{includegraphics} \\ \texttt{include\_graphics} \\ Pandoc \\ \texttt{![]()}}; \node (knitr) at (7.5, -1) [Links]{ {\emph{knitr}} \\ \emph{rmarkdown} \\ \texttt{source} \\ \texttt{source\_url} \\ \texttt{kable} \\ \texttt{print(xtable())} \\ \texttt{texreg} }; \node (readData) at (3, -1) [Links]{\texttt{import} \\ \texttt{read.table} \\ \texttt{getURL} }; \node (importData) at (-1, -1.3) [Links]{ \texttt{Make} \\ \texttt{download.file} \\ \texttt{read.table} \\ \texttt{import} \\ \texttt{merge}\\ \texttt{getURL} \\ API-based packages }; \end{tikzpicture} ``` ### Reproducible theory An important part of the research process that I do not discuss in this book is the theoretical stage. If you are using a deductive research design, the bulk of this work will precede and guide the data gathering and analysis stages. Just because I don't cover this stage of the research process doesn't mean that theory building can't and shouldn't be reproducible. It can in fact be "the easiest part to make reproducible" [@vandewalle2007 1254]. Quotes and paraphrases from previous works in the literature obviously need to be fully cited so that others can verify that they accurately reflect the source material. For mathematically based theory, you should give clear and complete descriptions of the proofs. 
Though I don't actively cover theory replication in depth in this book, I do touch on some of the ways to incorporate proofs and citations into your presentation documents. These tools are covered in Part IV. ## Practical Tips for Reproducible Research Before we start learning the details of the reproducible research workflow with R and RStudio, it's useful to cover a few broad tips that will help you organize your research process and put these skills in perspective. The tips are: 1. Document everything! 2. Everything is a (text) file. 3. All files should be human readable. 4. Explicitly tie your files together. 5. Have a plan to organize, store, and make your files available. Using these tips will help make your computational research really reproducible. ### Document everything! In order to reproduce your research, others must be able to know what you did. You have to tell them what you did by documenting as much of your research process as possible. Ideally, you should tell your readers how you gathered your data, analyzed it, and presented the results. Documenting everything is the key to reproducible research and lies behind all of the other tips in this chapter and tools you will learn throughout the book. #### Document your R session info {- #SessionInfoHow} Before discussing the other tips, it's important to learn a key part of documenting with R. You should *record your session info*. Many things in R have stayed the same since it was introduced in the early 1990s. This makes it easy for future researchers to recreate what was done in the past. However, things can change from one version of R to another and especially from one version of an R package to another. Also, the way R functions and how R packages are handled can vary across different operating systems, so it's important to note what system you used. Finally, you may have R set to load packages by default (see Section \@ref(Packages) for information about packages). 
These packages might be necessary to run your code, but other people might not know what packages and what versions of the packages were loaded from just looking at your source code. The `sessionInfo()` function in R prints a record of all of these things. The information from the session I used to create this book is: ```{r Ch2SessionInfoPlain, echo=TRUE, eval=FALSE} # Print R session info sessionInfo() ``` ```{r RemoveLongExtra, echo=FALSE} si <- sessionInfo() # Remove non-relevant information that doesn't fit on the page si$BLAS <- NULL si$LAPACK <- NULL si ``` Chapter \@ref(DirectoriesChapter) gives specific details about how to create files with dynamically included session information. If you used non-R tools you should also record what versions of these tools you used. ### Everything is a (text) file Your documentation is stored in files that include data, analysis code, the write-up of results, and explanations of these files (e.g. data set codebooks, session info files, and so on). Ideally, you should use the simplest file format possible to store this information. Usually the simplest file format is the humble, but versatile, text file.[^chapter2_1] Text files are extremely nimble. They can hold your data in, for example, comma-separated values (CSV) format. They can contain your analysis code in files. And they can be the basis for your presentations written in markup languages such as Markdown and LaTeX. All of these files can be opened by any program that can read text files. One reason reproducible research is best stored in text files is that this helps *future-proof* your research. Other file formats, like those used by Microsoft Word (`.docx`) or Excel (`.xlsx`), change regularly and may not be compatible with future versions of these programs. Text files, on the other hand, can be opened by a very wide range of currently existing programs and, more likely than not, future ones as well. 
Even if future researchers do not have R or a LaTeX distribution, they will still be able to open your text files and, aided by frequent comments (see below), be able to understand how you conducted your research [@bowers2011 3]. Text files are also very easy to search and manipulate with a wide range of programs–such as R and RStudio–that can find and replace characters as well as merge and separate files. Finally, text files are easy to version control. Changes can be tracked using programs such as Git (see Chapter \@ref(Storing)). #### Learn from the text file: keep it simple {-} Text files are simple. Their simplicity increases the probability of baseline usefulness in the future to researchers who will reproduce the work. We can extend the logic of the simple text file to all of the tools we use: keep it simple. Avoid adding dependencies you don't need to actually gather your data, analyze it, and present the results. For example, I have been tempted to make my presentation slides look nicer with custom fonts. I was later burned when I wanted to make minor changes to slides a year after I first presented them (and a day before teaching an upcoming class) only to find that the custom fonts were no longer available. This broke my slides and forced me to spend considerable time reworking my source documents. If I, the creator of the slides, found this time consuming and annoying, an independent researcher would likely find it even more difficult. ### All files should be human readable Treat all of your research files as if someone who has not worked on the project will, in the future, try to understand them. Computer code is a way of communicating with the computer. It is "machine readable" in that the computer is able to use it to understand what you want to do.[^chapter2_2] However, there is a very good chance that other people (or you six months in the future) will not understand what you were telling the computer.
So, you need to make all of your files "human readable". To make them human readable, you should comment on your code with the goal of communicating its design and purpose [@wilson2012]. With this in mind, it is a good idea to *comment frequently* [@bowers2011 3] and *format your code using a style guide* [@nagler1995]. For especially important pieces of code, you should use *literate programming*–where the source code and the presentation text describing its design and purpose appear in the same document.\index{literate programming} Doing this will make it very clear to others how you accomplished a piece of research. #### Commenting {-} In R, everything on a line after a hash character (also known as 'number', 'pound', or 'sharp') is ignored by R, but is readable to people who open the file. The hash character is a comment declaration character. You can use a hash to place comments telling other people what you are doing. Here are some examples: ```{r Ch2CommentHash} # A complete comment line 2 + 2 # A comment after R code ``` On the first line, the hash is placed at the very beginning, so the entire line is treated as a comment. On the second line the hash is placed after the simple equation `2 + 2`. R evaluates the equation and finds the answer `4`, but it ignores all of the words after the hash. Different languages have different comment declaration characters. In LaTeX everything after the percent sign is treated as a comment, and in Markdown/HTML comments are placed inside of `<!-- -->`. The hash character is used for comment declaration in command-line shell scripts as well as many other programming languages such as Python and Julia.\index{Python}\index{Julia} Nagler [-@nagler1995 491] gives some advice on when and how to use comments: - write a comment before a block of code describing what the code does, - comment on any line of code that is ambiguous. In this book, I follow these guidelines when displaying code.
Nagler also suggests that all of your source code files should begin with a comment header. At the least, the header should include: - a description of what the file does, - the date it was last updated, - the name of the file's creator and any contributors. You may also want to include other information in the header such as what files it depends on, what output files it produces, what version of the programming language you are using, sources that may have influenced the code, and how the code is licensed. Here is an example of a minimal file header for an R source code file that creates the third figure in an article titled ‘My Article': ````R ############################ # R Source code file used to create Figure 3 in 'My Article' # Created by Christopher Gandrud # MIT License ############################ ```` Feel free to use things like the long series of hash marks above and below the header, white space, and indentations to make your comments more readable. #### Style guides {-} In natural language writing you don't necessarily have to follow a style guide. People could probably figure out what you are trying to say, but it is a lot easier for your readers if you use consistent rules. The same is true when writing computer code. It's good to follow consistent rules for formatting your code so that it's easier for you and others to understand. There are a number of R style guides. Most of them are similar to the Google R Style Guide.[^chapter2_3] Hadley Wickham also has a nicely presented R style guide.[^chapter2_4] You may want to use the *styler* [@R-styler]\index{R package!styler} package to automatically reformat your code so that it is easier to read. #### Literate programming {-} For particularly important pieces of research code, it may be useful to not only comment on the source file, but also display code in presentation text. 
For example, you may want to include key parts of the code you used for your main statistical models and an explanation of this code in an appendix following your article. This is commonly referred to as literate programming [@knuth1992].\index{literate programming} ### Explicitly tie your files together If everything is just a text file, then research projects can be thought of as individual text files that have a relationship with one another. They are tied together. A data file is used as input for an analysis file. The results of an analysis are shown and discussed in a markup file that is used to create a PDF document. Researchers often do not explicitly document the relationships between files that they used in their research. For example, the results of an analysis–a table or figure–may be copied and pasted into a presentation document. It can be very difficult for future researchers to trace the table or figure back to a particular statistical model and a particular data set without clear documentation. Therefore, it is important to make the links between your files explicit. Tie functions are the most dynamic way to explicitly link your files together. These functions instruct the computer program you are using to use information from another file. In Table \@ref(TableTieFunctions), I have compiled a selection of key tie functions you will learn how to use in this book. We'll discuss many more, but these are some of the most important. ### Have a plan to organize, store, and make your files available Finally, in order for independent researchers to reproduce your work, they need to be able to access the files that instruct them how to do this. Files also need to be organized so that independent researchers can figure out how they fit together. So, from the beginning of your research process, you should have a plan for organizing your files and a way to make them accessible.
One rule of thumb for organizing your research in files is to limit the amount of content any one file has. Files that contain many different operations can be very difficult to navigate, even if they have detailed comments. For example, it would be very difficult to find any particular operation in a file that contained the code used to gather the data, run all of the statistical models, and create the results, figures and tables. If you have a hard time finding things in a file you created, think of the difficulties independent researchers will have! Because we have so many ways to link files together, there is really no need to lump many different operations into one file. So, we can make our files modular. One source code file should be used to complete one or just a few tasks. Breaking your operations into discrete parts will also make it easier for you and others to find errors [@nagler1995 490]. Chapter \@ref(DirectoriesChapter) discusses file organization in much more detail. Chapter \@ref(Storing) teaches you a number of ways to make your files accessible through the cloud computing services like GitHub. \begin{table} \caption{A Selection of Functions/Packages/Programs for Tying Together Your Research Files} \label{TableTieFunctions} \vspace{0.3cm} {\footnotesize{ \begin{tabular}{p{2.5cm} c p{5.25cm} p{2cm}} \hline Function/Package/ Program & Language & Description & Chapters Discussed \\[0.3cm] \hline \hline {\emph{knitr}} & R & R package with commands for tying analysis code into presentation documents including those written in LaTeX and Markdown. & \hfill Throughout \\[0.25cm] \emph{rmarkdown} & R & R package that builds on \emph{knitr}. It allows you to use Markdown to output to HTML, PDFs compiled with LaTeX or Microsoft Word. & \hfill Throughout \\[0.25cm] {\tt{download.file}} & R & Downloads a file from the internet. & \hfill\ref{DataGather} \\[0.25cm] {\tt{read.table}} & R & Reads a table into R. 
You can use this to import a plain-text file formatted data into R. & \hfill\ref{DataGather} \\[0.25cm] {\tt{read.csv}} & R & Same as \texttt{read.table} with default arguments set to import \texttt{.csv} formatted data files. & \hfill\ref{DataGather} \\[0.25cm] {\tt{import}} & R & Reads a table stored locally or on the internet into R. You can use it to import a wide variety of plain-text data formats into R from secure (https) URLs. & \hfill\ref{DataGather} \\[0.25cm] API-based packages & R & Various packages use APIs to gather data from the internet. & \hfill\ref{DataGather} \\[0.25cm] {\tt{merge}} & R & Merges together data frames. & \hfill\ref{DataClean} \\[0.25cm] {\tt{source}} & R & Runs an R source code file. & \hfill\ref{StatsModel} \\[0.25cm] {\tt{source\_url}} & R & From the {\emph{devtools}} package. Runs an R source code file from a secure ({\tt{https}}) url like those used by GitHub. & \hfill\ref{StatsModel} \\[0.25cm] {\tt{kable}} & R & Creates tables from data frames that can be rendered using Markdown or LaTeX. & \hfill\ref{TablesChapter} \\[0.25cm] {\tt{toLaTeX}} & R & Converts R objects to LaTeX. & \hfill\ref{GettingStartedRR} \\[0.25cm] {\tt{includegraphics}} & LaTeX & Inserts a figure into a LaTeX document. & \hfill\ref{FiguresChapter} \\[0.25cm] {\tt{include\_graphics}} & R/R Markdown & Inserts a figure into an R Markdown document. & \hfill\ref{FiguresChapter} \\[0.25cm] \texttt{![]()} & Markdown & Inserts a figure into a Markdown document. & \hfill\ref{MarkdownChapter} \\ [0.25cm] Pandoc & shell & A shell program for converting files from one markup language to another. Allows you to tie presentation documents together. & \ref{MarkdownChapter} \\[0.25cm] Make & shell & A shell program for automatically building many files at the same time. & \hfill\ref{DataGather} \\[0.25cm] \hline \end{tabular} }} \end{table} [^chapter2_1]: Plain text files are usually given the file extension `.txt`. 
Depending on the size of your data set, it may not be feasible to store it as a text file. Nonetheless, text files can still be used for analysis code and presentation files. [^chapter2_2]: Of course, if the computer does not understand, it will usually give an error message. [^chapter2_3]: See: . [^chapter2_4]: You can find it at . ================================================ FILE: rep-res-3rd-edition/05-start-R.Rmd ================================================ # Getting Started with R, RStudio, and knitr/R Markdown {#GettingStartedRKnitr} If you have rarely or never used R before, the first section of this chapter gives you enough information to be able to get started and understand the R code I use throughout the book. For more detailed introductions on how to use R, please refer to the resources mentioned in Chapter \@ref(Intro) (Section \@ref(OtherBooks)). Experienced R users might want to skip the first section. In the second section, I'll give a brief overview of RStudio. I highlight the key features of the main RStudio panel (what appears when you open RStudio) and some of its main tools for reproducible research. Finally, I discuss the basics of the *knitr* and *rmarkdown* packages, how to use them in R, and how they are integrated into RStudio. ## Using R: The Basics To get you started with reproducible research, we'll cover some very basic R syntax---the rules for talking to R. I cover key parts of the R language including: - objects and assignment, - component selection, - functions, - arguments, - the workspace and history, - packages. Before discussing each of these in detail, let's open R and look around.[^chapter3_1] When you open the R GUI program by clicking on the R icon, you should get a window that looks something like Figure \@ref(fig:RBlankMain).[^chapter3_2] This window is the **R console**. 
Below the start-up information---information about what version of R you are using, license details, and so on---you should see a `>` (greater-than sign). This prompt is where you enter R code.[^chapter3_3] To run R code that you have typed after the prompt, press the `Return` or `Enter` key. Now that we have a new R session open, we can get started. ```{r RBlankMain, fig.cap="R Console at Startup", echo=FALSE, out.height="50%"} knitr::include_graphics("images/chapter_3/BlankRConsole.png") ``` ### Objects {#Objects} If you've read a description of the R language before, you will probably have seen it referred to as an 'object-oriented language'. What are objects? Objects are like the R language's nouns. They are things, like a vector of numbers, a data set, a word, a table of results from some analysis, and so on. Saying that R is object-oriented means that R is focused on doing actions to objects. We will talk about the actions, functions, later in this section.[^chapter3_4] Now let's create a few objects. #### Numeric and string objects {- #ObjectNames} Objects can have a number of different types. Let's make two simple objects. The first is a numeric-type object. The other is a character object. We can choose almost any name we want for our objects as long as it begins with an alphabetic character and does not contain spaces.[^chapter3_5] Just because there are relatively few hard restrictions on object names, doesn't mean that you should name your object anything. Your code will be much easier to read if object names are short and meaningful. Give each object a unique name to avoid confusion and conflicts. For example, if you reuse an object name in an R session, you could easily accidentally overwrite it. Let's begin working with numeric objects by creating a new object called *number* with the number 10 in it. 
Use the assignment operator[^chapter3_6] (`<-`)\index{R function!<-} to put something into the object: ```{r Ch3NumericObject, echo=TRUE} number <- 10 ``` To see the contents of our object, type its name into the R console. ```{r Ch3NumberSee, echo=TRUE} number ``` Let's briefly breakdown this output. `10` is clearly the contents of *number*. The double hash (`##`) is included here to tell you that this is output rather than R code.[^chapter3_7] If you run functions in your R console, you will not get the double hash in your output. Finally, `[1]` gives the position in the object that the number 10 is on. Our object only has one position. Creating an object with words and other characters, a character object, is very similar. The only difference is that you enclose the character string (letters in a word for example) inside of single or double quotation marks (`''`, or `""`).[^chapter3_8] Let's create an object called *words* containing the character string `Hello World`: ```{r Ch3CharacterObject, echo=TRUE} words <- "Hello World" ``` An object's type is important to keep in mind. It determines what we can do to the object. For example, you cannot take the mean of a character object like the *words* object: ```{r Ch3ClassError, echo=TRUE} mean(words) ``` Trying to find the mean of our *words* object gives us a warning message and returns the value `r mean(words)`: not applicable. You can also think of `NA` as meaning "missing". To find out an object's type, use the `class()` function.[^chapter_3_types]\index{R function!class} For example: ```{r Ch3ClassCommand, echo=TRUE} class(words) ``` #### Vector and data frame objects {-} So far, we have only looked at objects with a single number or character string.[^chapter3_9] Clearly we often want to use objects that have many strings and numbers. In R these are usually data frame-type objects and are roughly equivalent to the data structures you would be familiar with from using a program such as Microsoft Excel. 
We will be using data frames extensively throughout the book. Before looking at data frames it is useful to first look at the simpler objects that make up data frames. These are called vectors. Vectors are R's "workhorse" [@matloff2011]. Knowing how to use vectors will be especially helpful when you clean up raw data in Chapter \@ref(DataClean) and make tables in Chapter \@ref(TablesChapter).[^chapter3_10] #### Vectors {-} Vectors are the "fundamental data type" in R [@matloff2011]. They are an ordered group of numbers, character strings, and so on.[^chapter3_11] It may be useful to think of most data in R as composed of vectors. For example, data frames\index{R function!data.frame}\index{R!data frame} are basically collections of vectors of the same length, i.e. they have the same number of rows, attached together to form columns. Let's create a simple numeric vector containing the numbers 2.8, 2, and 14.8. To do this, we will use the `c()` (combine)\index{R function!combine} function and separate the numbers with commas (`,`): ```{r Ch3numeric_vectoror, echo=TRUE} numeric_vector <- c(2.8, 2, 14.8) # Show numeric_vector's contents numeric_vector ``` Vectors of character strings are created in a similar way. The only difference is that each character string is enclosed in quotation marks like this: ```{r Ch3CharcterVector, echo=TRUE} character_vector <- c("Albania", "Botswana", "Cambodia") # Show character_vector's contents character_vector ``` #### Matrices {-} To give you a preview of what we are going to do when we start working with real data sets, let's combine the two vectors *numeric_vector* and *character_vector* into a new object with the `cbind()` function. 
This function binds the two vectors together side-by-side as columns.[^chapter3_12]\index{R function!cbind} ```{r Ch3cbind, echo=TRUE} string_num_matrix <- cbind(character_vector, numeric_vector) string_num_matrix ``` By binding these two objects together, we've created a new matrix object.[^chapter3_13] You can see that the numbers in the **numeric_vector** column are between quotation marks. Matrices, like vectors, can only have one data type, so R has converted the numbers to strings. #### Data frames {-} If we want to have an object with rows and columns and allow the columns to contain data with different types, we need to use data frames. Let's use the `data.frame` function to combine the *numeric_vector* and *character_vector* objects.\index{R function!data.frame} ```{r Ch3dataframe, echo=TRUE, tidy=FALSE} string_num_df <- data.frame(character_vector, numeric_vector) string_num_df ``` In this output, you can see the data frame's *names* attribute.[^chapter3_14] It is the column names. You can use the `names()` function\index{R function!names} to see any data frame's names:[^chapter3_15] ```{r Ch10Names, echo=TRUE} names(string_num_df) ``` You will also notice that the first column of the data set has no name and is a series of numbers. This is the *row.names* attribute. Data frame rows can be given any name as long as each row name is unique. We can use the `row.names()` function to set the row names from a vector. For example, ```{r Ch3ReassignRowNames, echo=TRUE} # Reassign row.names row.names(string_num_df) <- c("First", "Second", "Third") # Display new row.names row.names(string_num_df) ``` You can see in this example how `row.names()` can also be used to print the row names.[^chapter3_16] The *row.names* attribute does not behave like a regular data frame column. You cannot, for example, include it as a variable in a regression. You can use the `row.names()` function to assign the *row.names* values to a regular column. 
You will notice in the output for *string_num_df* that the strings in the **character_vector** column are not in quotation marks. This does not mean that they are now numeric data. To prove this, try to find the mean of **character_vector** by running it through the `mean()`\index{R function!mean} function: ```{r Ch3CharcterVectorMean, echo=TRUE} mean(string_num_df$character_vector) ``` #### Component selection {- #ComponentSelect} The last bit of code we just saw will probably be confusing. Why do we have a dollar sign (`$`) between the name of our data frame object name and the `character_vector` variable? The dollar sign is called the component selector.\index{R!component selector}\index{R function!\$} It's also sometimes called the element name operator. Either way, it extracts a part, component, of an object. In the previous example, it extracted the **character_vector** column from the *string_num_df* so that it could be fed to the `mean()` function. We can use the component selector to create new objects with parts of other objects. Imagine that we have *string_num_df* and want an object with only the information in the **numeric_vector** column. Let's use the following code: ```{r Ch3CompSelect, echo=TRUE} # Extract a numeric vector from string_num_df numeric_extract <- string_num_df$numeric_vector numeric_extract ``` Knowing how to use the component selector will be especially useful when we discuss making tables for presentation documents in Chapter \@ref(TablesChapter). #### `attach()` and `with()` {-} Using the component selector can create long repetitive code if you want to select many components. You have to write the object name, a dollar sign, and the component name every time you want to select a component. You can streamline your code by using functions such as `attach()`\index{R function!attach} and `with()`\index{R function!with}. 
`attach()` attaches a database to R's search path.[^chapter3_17] R will then search the database for variables you specify. You don't need to use the component selector to tell R again to look in a particular data frame after you have attached it. For example, let's attach the *cars* data that comes with R. It has two variables, **speed** and **dist**.[^chapter3_18] ```{r Ch3Attach, echo=TRUE} # Attach cars to search path attach(cars) # Display speed head(speed) # Display dist head(dist) ``` We used the `head()`\index{R function!head} function to see just the first few values of each variable. Now that we are done working with the *cars* data set, we should `detach()`\index{R function!detach} it. Not doing so could confuse R later in our session. ```{r Ch3Detach, echo=TRUE} detach(cars) ``` A safer alternative to `attach()` is `with()`. It more clearly delineates when to draw from inside a particular object. For example, we can find the mean of **numeric_vector** `with()` the *string_num_df* data frame: ```{r Ch3With, echo=TRUE, tidy=FALSE} with(string_num_df, { mean(numeric_vector) } ) ``` In the `with()` call the data frame object goes first and then the `mean()` function[^chapter3_19] goes second in curly brackets (`{}`). In this book I avoid using the `attach()` and `with()` functions. Instead, I use the component selector. Though it creates longer code, I find that code written with the component selector is less ambiguous. It's always clear which object we are selecting a component from. #### Subscripts {-} Another way to select parts of an object is to use subscripts. You have already seen subscripts in the output from our examples so far. They are denoted with square braces (`[]`). We can use subscripts to select not only columns from data frames but also rows and individual values. As we began to see in some of the previous output, each part of a data frame has an address captured by its row and column number. 
We can tell R to find a part of an object by putting the row number/name, column number/name, or both in square braces. The first part denotes the rows and separated by a comma (`,`) are the columns. To give you an idea of how this works, let's use the *cars* data set again. Use `head()` to get a sense of what this data looks like. ```{r Ch3HeadSwiss, echo=TRUE} head(cars) ``` We can see a data frame with information on various car speeds (**speed**) and stopping distances (**dist**). If we want to select only the third through seventh rows, we can use the following subscript function call: ```{r Ch3FirstSeventhRows, echo=TRUE} cars[3:7, ] ``` The colon (`:`) creates a sequence of whole numbers from 3 to 7. To select the fourth row of the **dist** column, we can type: ```{r Ch3FourthSecond, echo=TRUE} cars[4, 2] ``` An equivalent way to do this is: ```{r Ch3FourthDist, echo=TRUE} cars[4, "dist"] ``` Finally, we can even include a vector of column names to select: ```{r Ch3FourthBoth, echo=TRUE} cars[4, c("speed", "dist")] ``` ### Functions {#FunctionsCommands} If objects are the nouns of the R language, functions are the verbs. They do things to objects. Let's use the `mean` function as an example. This function takes the mean of a numeric vector object. Remember our *numeric_vector* object from before: ```{r Shownumeric_vector, echo=TRUE} numeric_vector ``` To find the mean of this object, type: ```{r numeric_vector_mean, echo=TRUE} mean(x = numeric_vector) ``` We use the assignment operator to place a function's output into an object. For example: ```{r numeric_vector_meanAssign, echo=TRUE} numeric_vector_mean <- mean(x = numeric_vector) ``` Notice that we typed the function's name then enclosed the object name in parentheses immediately afterwards. This is the basic syntax that all functions use, i.e. `FUNCTION(ARGUMENTS)`. 
Even if you don't want to explicitly include an argument, *you still need to type the parentheses after the function*.[^chapter3_21a] #### Arguments {-} Arguments modify what functions do. In our most recent example, we gave the `mean` function one argument (`x = numeric_vector`) telling it that we wanted to find the mean of *numeric_vector*. Arguments use the `ARGUMENT_LABEL = VALUE` syntax.[^chapter3_21] In this case, **x** is the argument label. To find all of the arguments that a function can accept, look at the **Arguments** section of the function's help file. To access the help file, type: `?FUNCTION`.\index{R function!?} For example: ```{r Ch3HelpMean, echo=TRUE, eval=FALSE, tidy=FALSE} ?mean ``` The help file will also tell you the default values that the arguments are set to. You do not need to explicitly set an argument if you want to use its default value. You do need to be fairly precise with the syntax for your argument's values. Values for logical arguments must be written as `TRUE` or `FALSE`.[^chapter3_22] Arguments that accept character strings require quotation marks. Let's see how to use multiple arguments with the `round()`\index{R function!round} function. This function rounds a vector of numbers. We can use the `digits` argument to specify how many decimal places we want the numbers rounded to. To round the object *numeric_vector_mean* to one decimal place, type: ```{r Ch3Round, echo=TRUE} round(x = numeric_vector_mean, digits = 1) ``` Note that *arguments are always separated by commas*. Some arguments do not need to be explicitly labeled. For example, we could write: ```{r Ch3ArgeNoLabel, echo=TRUE} # Find mean of numeric_vector mean(numeric_vector) ``` R will do its best to figure out what you want and will only give up when it can't. This will generate an error message. However, to avoid any misunderstandings between yourself and R, it is good practice to label your argument values. 
This will also make your code easier for other people to read, i.e. it will be more reproducible. You can stack functions inside of arguments. For example, have R find the mean of *numeric_vector* and round it to one decimal place: ```{r Ch3StackedArgs, echo=TRUE} round(mean(numeric_vector), digits = 1) ``` Stacking functions inside of each other can create code that is difficult to read. Another option that potentially makes more easily understandable code is piping\index{pipe} using the pipe function (`%>%`)\index{R function!\%>\%} that you can access from the *magrittr* [@R-magrittr]\index{R package!magrittr} or *dplyr* [@R-dplyr]\index{R package!dplyr} packages. The basic idea behind the pipe function is that the output of one function is set as the first argument of the next. For example, to find the mean of *numeric_vector* and then round it to one decimal place use: ```{r Ch3Pipe, echo=TRUE} # Load magrittr package library(magrittr) # Find mean of numeric_vector and round to 1 decimal place mean(numeric_vector) %>% round(digits = 1) ``` ### The workspace and history {#RSave} All of the objects you create become part of your workspace, alternatively known as the current working environment. Use the `ls()`\index{R function!ls} function to list all of the objects in your current workspace.[^chapter3_23] ```{r Ch3LS, echo=TRUE} ls() ``` You can remove specific objects from the workspace using the `rm()`\index{R function!rm} function. For example, to remove the objects `character_vector` and `words` type: ```{r Ch3RM, echo=TRUE, eval=FALSE} rm(character_vector, words) ``` To save the entire workspace into a binary, not plain-text, RData file use `save.image()`.\index{R function!save.image} The main argument of `save.image()` is the location and name of the file in which you want to save the workspace. 
If you don't specify the file path it will be saved into your current working directory (see Chapter \@ref(DirectoriesChapter) for information on file paths and working directories). To save the current workspace in a file called *workspace-2019-12-22.RData* in the current working directory type: ```{r Ch3Workspace, echo=TRUE, eval=FALSE} save.image(file = "workspace-2019-12-22.RData") ``` Use `load()`\index{R function!load} to load a saved workspace back into R: ```{r Ch3LoadWS, echo=TRUE, eval=FALSE} load(file = "workspace-2019-12-22.RData") ``` You should generally avoid having R automatically save your workspace when you quit and reload it when you start R again. Instead, when you return to a project, rerun the source code files. This avoids any complications caused when you use an object in your workspace that is left over from running an older version of the source code.[^chapter3_24] In general, I also recommend against saving data in binary RData formatted files. They are not text files. They are not human readable. They are much less future-proof. One of the few times when saving your workspace is useful is when it includes an object that was computationally difficult and took a long time to create. In this case, you can save only the large object with `save()`.[^chapter3_25]\index{R function!save} For example, if we have a very large object called *model_output*, we can save it to a file called *model-output.RData* like this: ```{r Ch3Comp, echo=TRUE, eval=FALSE} save(model_output, file = "model-output.RData") ``` ### R history {#RHistory} When you execute code in the R console, it becomes part of your history. Use the `history()` function\index{R function!history} to see the most recent functions in your history. You can also use the up and down arrows on your keyboard when your cursor is in the R console to scroll through your history.
### Global R options {#ROptions} In R you can set global options with `options()`.\index{R function!options} This lets you set how R runs and outputs functions through an entire R session. For example, to have numbers printed with only one significant digit, set the `digits` argument: ```{r Ch3Options, echo=TRUE, eval=FALSE} options(digits = 1) ``` ### Installing new packages and loading functions {#Packages} Functions are stored in R packages. The functions we have used so far were loaded automatically by default. One of the great things about R is the many user-created packages[^chapter3_26] that expand the number of functions we can use. To install functions that do not come with the basic R installation, you need to install the add-on packages that contain them. To do this, use the `install.packages()`\index{R function!install.packages} function. By default, this function downloads and installs the packages from the Comprehensive R Archive Network (CRAN).\index{CRAN} When you install a package, you will likely be given a list of "mirrors" from which you can download the package. Select the mirror closest to you. Once you have installed a package, you need to load it when you want to use its functions. Use the `library()` function\index{R function!library} to load a package.[^chapter3_27] For example, the following code loads the popular *ggplot2* plotting package: ```{r Ch3Library, echo=TRUE, warning=FALSE, message=FALSE} library(ggplot2) ``` Please note that for the examples in this book I only specify what package a function is from if it is not loaded by default when you start an R session. Finally, if you want to make sure R uses a function from a specific package, you can use the double-colon operator (`::`). For example, to make sure that we use the `qplot()` function from the *ggplot2* package, we type: ```{r Ch3ColonOperator, echo=TRUE, eval=FALSE} ggplot2::qplot(. . .)
``` Using the double-colon ensures that R will use the function from the particular package you want and makes it clear to a source code reader what package a function comes from. If you use the double-colon, you don't need to include `library()` beforehand. Note that it does not load all of the functions in the package, just the one you ask for. ## Using RStudio As I mentioned in Chapter \@ref(Intro), RStudio is an integrated development environment for R. It provides a centralized and well-organized place to do almost anything you want to do with R. As we will see later in this chapter, it is especially well integrated with literate programming tools for reproducible research. Right now, let's take a quick tour of the basic RStudio window. #### The default window {-} When you first open RStudio, you should see a default window that looks like Figure \@ref(fig:BlankMain). In this figure, you see three window panes. The large one on the left is the *Console/Terminal/Jobs* pane. The *Console* pane is an R console and functions exactly the same as the console discussed so far in this chapter. *Terminal* is a command-line terminal where you can run command-line tools like those we discuss in Chapter \@ref(DirectoriesChapter). The *Jobs* pane allows you to run R scripts in the background. This is very useful if you have computationally time consuming jobs that you would like to run while also doing other work in RStudio. The *Environment/History/Connections* panes are in the upper right-hand corner. The *Environment* pane shows you all of the objects in your workspace and some of their characteristics, like how many observations a data frame has. You can click on an object in this pane to see a preview of its contents. This is especially useful for quickly looking at a data set in much the same way that you can visually scan a Microsoft Excel spreadsheet. The *History* pane records all of the functions you have run. 
It also allows you to rerun code and insert it into a source code file. The *Connections* pane allows you to manage connections to databases such as an SQL server.\index{SQL} ```{r BlankMain, echo=FALSE, fig.cap="RStudio at Startup", out.width="100%"} knitr::include_graphics("images/chapter_3/RStudioStartup.png") ``` In the lower right-hand corner, you will see the *Files/Plots/Packages/ Help/Viewer* panes. We will discuss the *Files* pane in more detail in Chapter \@ref(DirectoriesChapter). Basically, it allows you to navigate and organize your files. The *Plots* pane is where figures you create in R appear. This pane allows you to see all of the figures you have created in a session using the right and left arrow icons. It also lets you copy and save the figures in a variety of formats. The *Packages* pane shows the packages you have installed, allows you to load individual packages by clicking on the dialog box next to them, access their help files (just click on the package name), update the packages, and even install new packages. The *Help* pane shows you help files. You can search for help files and search within help files using this pane. Finally, the *Viewer* pane allows you to view local web content like JavaScript graphics and Shiny apps.\index{Shiny} #### The Source pane {-} There is an important pane that does not show up when you open RStudio for the first time. This is the *Source* pane. The *Source* pane is where you create, edit, and run your source code files. It also functions as an editor for your markup files. It is the center of reproducible research in RStudio. Let's first look at how to use the *Source* pane with regular R files. We will then cover how it works with *knitr*/*rmarkdown* in more detail in the next section. R source code files have the file extension `.R`. When you create a new source code document, RStudio will open a new *Source* pane. Do this by going to the menu bar and clicking on `File` `New`. 
In the `New` drop-down menu, you have the option to create a variety of different source code documents. Select the `R Script` option. You should now see a new pane with a bar across the top that looks like Figure \@ref(fig:TopBarFigs). To run the R code you have in your source code file, highlight it[^chapter3_28] and click the `Run` icon on the top bar. This sends the code to the console where it is run. The icon to the right of `Run` runs the code above where you have highlighted. The `Source` icon next to this runs all of the code in the file using R's `source()`\index{R function!source} function. When you click on the last icon on the right (it has a series of stacked lines) you will get a navigable table of contents for your file; very useful for working with longer documents, especially markup documents. ```{r TopBarFigs, echo=FALSE, fig.cap="RStudio Source Code Pane Top Bar", out.width="100%"} knitr::include_graphics("images/chapter_3/RSourceBar.png") ``` ## Using knitr and R Markdown: The Basics To get started with knitr and R Markdown in R or RStudio, we need to learn some of the basic concepts and syntax. The concepts are the same regardless of the markup language we are knitting R code with, but much of the syntax varies by markup language. *rmarkdown* relies on *knitr* and a utility called *Pandoc* to create many different types of presentation documents (HTML, PDF, or MS Word) from one document written largely using knitr's R Markdown syntax. ### What *knitr* does Let's take a quick, abstract look at what the *knitr* package does. As I've mentioned, *knitr* ties together your presentation of results with the creation of those results. The *knitr* process takes three steps (see Figure \@ref(fig:KnitProcess)). First, we create a knittable markup document. This contains both the analysis code and the presentation document's markup which is the text and rules for how to format the text. *knitr* then *knits*: i.e.
it runs the analysis code and converts the output into the markup language you are using according to the rules that you tell it to use. It inserts the marked up results into a document that only contains markup for the presentation document. You *compile* this markup document as you would if you hadn't used *knitr* into your final PDF document or webpage presenting your results. ```{r KnitProcess, fig.cap="Knitr/R Markdown Process", engine = "tikz", cache=TRUE, echo=FALSE, fig.ext=if (knitr:::is_latex_output()) 'pdf' else 'png'} % Define colors for figure %% Color palette (GnBU) chosen using ColorBrewer 2.0 %% See: http://colorbrewer2.org/ %% Not used in the print version \definecolor{Blue}{HTML}{7BCCC4} \definecolor{LiteBlue}{HTML}{A8DDB5} \definecolor{DarkBlue}{HTML}{08589E} \definecolor{GrayLine}{HTML}{BDBDBD} % Set node styles %% Workflow stage nodes \tikzstyle{Docs} = [draw=Blue, rectangle, inner sep=0.3cm, font=\small] % Begin tikz picture \begin{tikzpicture} \node(knit) at (2, 1.75) {{\emph{\textbf{Knit}}}}; \node(compile) at (6, 1.75) {{\emph{\textbf{Compile}}}}; % Document nodes \node (knittable) at (0, 0) [Docs, text width= 6em]{Knittable Document \\ (Markup + Code Chunks)}; \node (Markup) at (4, 0) [Docs, text width= 6em]{Markup Only Document}; \node (Presentation) at (8, 0) [Docs, text width = 6em]{Presentation Document}; % .Rnw LaTeX Example \node(LaTeX) at (1, -2.5) {\textbf{knitr LaTeX Example}}; \node (Rnw) at (0, -3.5) [Docs, text width= 6em]{\emph{Paper.Rnw}}; \node (tex) at (4, -3.5) [Docs, text width= 6em]{\emph{Paper.tex}}; \node (pdf) at (8, -3.5) [Docs, text width = 6em]{\emph{Paper.pdf}}; % Markdown to HTML Example \node(Markdown) at (2, -5) {\textbf{knitr/R Markdown Markdown Example}}; \node (Rmd) at (0, -6) [Docs, text width= 6em]{\emph{Website.Rmd}}; \node (md) at (4, -6) [Docs, text width= 6em]{\emph{Website.md}}; \node (html) at (8, -6) [Docs, text width = 6em]{\emph{Website.html}}; % Lines \draw [->, very thick] (knittable) -- 
(Markup); \draw [->, very thick] (Markup) -- (Presentation); \draw [->, very thick] (Rnw) -- (tex); \draw [->, very thick] (tex) -- (pdf); \draw [->, very thick] (Rmd) -- (md); \draw [->, very thick] (md) -- (html); \end{tikzpicture} ``` ### What *rmarkdown* does {#rmardownHeader} The *rmarkdown* package implements a variation on this process that utilizes a program called Pandoc to create presentation documents in multiple formats from a knittable document written in Markdown. The main difference between pure *knitr* markdown and *rmarkdown* documents is the inclusion of a header specifying how you want to render the document with Pandoc.[^chapter3_29] The header is written in YAML.[^chapter3_30]\index{YAML} The YAML header can include information such as the document's title, author, whether or not to include a table of contents, and a link to a BibTeX bibliography file. YAML is a straightforward data format that organizes information in a simple hierarchy. The header begins and ends with three dashes (`---`). Information keys--like "title" and "author"--are separated from their associated "values" by a colon (`:`). Sub-values of a hierarchy are denoted by being placed on a new line and indented.[^chapter3_31] Here is a basic R Markdown header that indicates the document's title, author, and date, and that it will be turned into a PDF document (via LaTeX). ````yaml --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "2019-12-28" output: pdf_document: toc: true --- ```` The title, author, and date will be placed at the beginning of the output document. The final line (`toc: true`) creates a table of contents near the beginning of the PDF document when we knit it. We will discuss more header options in Chapter \@ref(MarkdownChapter). RStudio can automatically create a basic header for the type of output document that you want when you open a new R Markdown file. Simply select `File` then `R Markdown…`.
A window will appear that looks like Figure \@ref(fig:rmarkdownWindow). In this window select the type of output document you want to create and click `Ok`. In addition to the header, R Markdown differs from basic knitr files in that you can include Pandoc syntax in your R Markdown document. This can be useful for bibliographies as we will discuss in Chapter \@ref(MarkdownChapter). Nonetheless, remember that apart from the header and ability to include Pandoc syntax, at the simplest level R Markdown documents are knitr documents written in R Markdown syntax. They have the same code chunk syntax, as we will see shortly. ```{r rmarkdownWindow, fig.cap="The New R Markdown Options Window", echo=FALSE, out.width="100%"} knitr::include_graphics("images/chapter_3/newRMarkdown.png") ``` ### File extensions When you save a knittable file, use a file extension that indicates (a) that it is knittable and (b) what markup language it is using. You can use a number of file extensions for R Markdown files including: `.Rmd` and `.Rmarkdown`.[^chapter3_32] LaTeX documents that include *knitr* code chunks are generally called R Sweave files and have the file extension `.Rnw`. This terminology is a little confusing.[^chapter3_33] It is a holdover from *knitr*'s main literate programming predecessor *Sweave*. Note that *rmarkdown* documents can compile to LaTeX PDF documents and support pretty much the full capabilities of LaTeX. Because markdown is generally easier to write than raw LaTeX, `.Rnw` markup is much less commonly used. For example, I converted the third edition of this book from `.Rnw` to `.Rmd`. ### Code chunks Use code chunks to include knittable R code into your markup presentation documents. Code chunk syntax differs depending on the markup language we are using to write our documents. Let's see the syntax for R Markdown and R LaTeX files. 
If you are unfamiliar with basic LaTeX or Markdown syntax, you might want to skim Chapters \@ref(LatexChapter) and \@ref(MarkdownChapter) to familiarize yourself with it before reading this section. #### R Markdown {- #RMarkdownChunkBasic} In R Markdown files, we begin a code chunk by writing the head: ` ```{r} `. A code chunk is closed, ended, with: ` ``` `. For example: ````markdown `r ''````{r} # Example of an R Markdown code chunk string_num_matrix <- cbind(character_vector, numeric_vector) ``` ```` The R Markdown code chunk syntax is exactly the same for markdown files you compile with *knitr* or *rmarkdown*. #### R LaTeX (.`Rnw`) {-} Code chunks are delimited in non-R Markdown R LaTeX documents in a way that emulates the long-established *Sweave* syntax. Sweave-style code chunks begin with the following head: `<<>>=`. The code chunk is closed with an at sign (`@`). ```{sh, eval=FALSE} << >>= string_num_matrix <- cbind(character_vector, numeric_vector) @ ``` #### Code chunk labels {-} Each chunk has a label. When a code chunk creates a plot or the output is cached, stored for future use, *knitr* uses the chunk label for the new file's name. If you do not explicitly give the chunk a label it will be assigned one like: `unnamed-chunk-1`. To explicitly assign chunk labels in R Markdown documents, place the label name inside of the braces after the `r`. If we wanted to use the, admittedly not descriptive, label `ex-label` we type: ````markdown `r ''````{r ex-label} # Example chunk label ``` ```` The same general format applies to R LaTeX chunks. In Sweave-style chunks, we type: `<<ex-label>>=`. Try not to use spaces or periods in your label names. Also remember that chunk labels *must* be unique. #### Code chunk options {-} There are many times when we want to change how our code chunks are knitted and presented. Maybe we only want to show the code and not the results. Perhaps we don't want to show the code at all but just a figure that it produces.
Maybe we want the figure to be formatted on a page in a certain way. To make these changes and many others, we can specify code chunk options. Like chunk labels, you specify options in the chunk head. Place them after the chunk label, separated by a comma. Chunk options are written following pretty much the same rules as regular R function arguments. They have a similar `OPTION_LABEL=VALUE` structure as arguments. The option values must be written in the same way that argument values are. Character strings need to be inside of quotation marks. The logical values `TRUE` and `FALSE` cannot be written `"true"` and `"false"`. For example, imagine we have a Markdown code chunk called `ex-label`. If we want to run the code chunk, but not show the code in the final presentation document, we can use the option `echo=FALSE`. ````markdown `r ''````{r ex-label, echo=FALSE} string_num_matrix <- cbind(character_vector, numeric_vector) ``` ```` Note that all labels and code chunk options must be on the same line. Options are separated by commas. The syntax for *knitr* options is the same regardless of the markup language. Throughout this book, we will look at a number of different code chunk options. Many of the chunk options we will use in this book are listed in Table \@ref(ChunkOptionsTable). For the full list of *knitr* options, see the *knitr* chunk options page maintained by *knitr*'s creator Yihui Xie: <https://yihui.org/knitr/options/>. \begin{table} \caption{A Selection of {\emph{knitr}} Code Chunk Options} \begin{center} \label{ChunkOptionsTable} \begin{tabular}{l c p{6cm}} \hline Chunk Option Label & Type & Description \\[0.25cm] \hline\hline \texttt{cache} & Logical & Whether or not to save results from the code chunk in a cache database. Note: cached chunks are only run when they are changed. \\[0.25cm] \texttt{cache.vars} & Character Vector & Specify the variable names to save in the cache database. \\[0.25cm] \texttt{eval} & Logical & Whether or not to run the chunk.
\\[0.25cm] \texttt{echo} & Logical & Whether or not to include the code in the presentation document. \\[0.25cm] \texttt{error} & Logical & Whether or not to include error messages. \\[0.25cm] \texttt{engine} & Character & Set the programming language for {\emph{knitr}} to evaluate the code chunk with. \\[0.25cm] \texttt{fig.align} & Character & Align figures. (Note: does not work with R Markdown documents.) \\[0.25cm] \texttt{fig.path} & Character & Set the directory where figures will be saved. \\[0.25cm] \texttt{include} & Logical & When \texttt{include=FALSE} the chunk is evaluated, but the results are not included in the presentation document. \\[0.25cm] \texttt{message} & Logical & Whether or not to include R messages. \\[0.25cm] \texttt{out.height} & Numeric & Set figures' heights in the presentation document. \\[0.25cm] \texttt{out.width} & Numeric & Set figures' widths in the presentation document. \\[0.25cm] \texttt{results} & Character & How to include results in the presentation document. \\[0.25cm] \texttt{tidy} & Logical & Whether or not to have \emph{knitr} format printed code chunks. \\[0.25cm] \texttt{warning} & Logical & Whether or not to include warnings. \\[0.25cm] \hline \end{tabular} \end{center} {\scriptsize{These functions are discussed in more detail in Chapter \ref{StatsModel}.}} \end{table} ### Global chunk options {#GlobalChunkOptions} So far, we have only looked at how to set local options in *knitr* code chunks, i.e. options for only one specific chunk. If we want an option to apply to all of the chunks in our document, we can set global chunk options. Options are 'global' in the sense that they apply to the entire document. Setting global chunk options helps us create documents that are formatted consistently without having to repetitively specify the same option every time we create a new code chunk. 
For example, rather than using the `fig.align='center'` option in each code chunk that creates a figure, we can center align all figures in a document by setting the option globally. To set a global option, first create a new code chunk at the beginning of your document.[^chapter3_35] You will probably want to set the option `include=FALSE` so that *knitr* doesn't include the code in your presentation document. Inside the code chunk, use `opts_chunk$set`. You can set any chunk option as an argument to `opts_chunk$set`. The option will be applied across your document, unless you set a different local option. Here is an example of how you can center align all of the figures in R Markdown in a chunk placed near the beginning of the document: ````markdown `r ''````{r set-global, include=FALSE} # Center align all knitr generated figures knitr::opts_chunk$set(fig.align='center') ``` ```` If you want to use `opts_chunk` in a document rendered with *rmarkdown*, you will need to either explicitly call it as in the example using the double colon or load the *knitr* package before calling it. ### *knitr* package options *knitr* package options affect how the package itself runs. For example, the `progress` option can be set as either `TRUE` or `FALSE`[^chapter3_36] depending on whether or not you want a progress bar to be displayed when you knit a code chunk. You can use `base.dir` to set the directory where you want all of your figures to be saved (see Chapter \@ref(DirectoriesChapter)). You set package options in a similar way as global chunk options with `opts_knit$set`. For example, include this code at the beginning of a document to turn off the progress bar when it is knitted: ````markdown `r ''````{r set-pkg-opt, include=FALSE} # Don't show progress bars knitr::opts_knit$set(progress=FALSE) ``` ```` ### Hooks You can also set hooks. Hooks come in two types: chunk hooks and output hooks. Chunk hooks run a function before or after a code chunk.
Output hooks change how the raw output is formatted. I don't cover hooks in much detail in this book. For more information on hooks, please see Yihui Xie's webpage: <https://yihui.org/knitr/hooks/>. ### knitr, R Markdown, and RStudio RStudio is highly integrated with knitr/R Markdown and the markup languages that they work with. RStudio is probably the easiest tool for creating and compiling knitr/R Markdown documents. Most of the RStudio/knitr/R Markdown features are accessed in the *Source* pane. The *Source* pane's appearance and capabilities change depending on the type of file you have open in it. RStudio uses a file's extension and, if it is an *rmarkdown* document, its header, to determine what type of file you have open.[^chapter3_37] We have already seen some of the features the *Source* pane has for R source code files. Let's now look at how to use *knitr* and *rmarkdown* with R source code files as well as the markup formats we cover in this book: R Markdown and R LaTeX. #### Compiling R source code Notebooks {- #PublishRPubs} If you want a quick, well-formatted account of the code that you ran and the results that you got, you can use RStudio's "Compile Notebook" capabilities. RStudio uses *rmarkdown* to create a standalone file presenting your source code and results. It will include all of the code from an R source file as well as the output. This can be useful for quickly presenting the steps you took to do an analysis. You can see an example RStudio Notebook in Figure \@ref(fig:NotebookExample). If you want to create a Notebook from an open R source code file, click the `Compile Notebook` icon (![image](images/chapter_3/CompileNotebook.png)) in the *Source* pane's top bar.[^chapter3_38] Then, in the window that pops up, select the output type you would like (HTML, PDF or MS Word) and click the `Compile` button. For this example I selected HTML. In Figure \@ref(fig:NotebookExample) you can see near the top center right a small globe icon next to the word "Publish".
Clicking this allows you to publish your Notebook to RPubs (<https://rpubs.com/>).\index{RPubs} RPubs is a site for sharing your Notebooks over the internet. You can publish not only Notebooks, but also any R Markdown document you compile in RStudio. ```{r NotebookExample, fig.cap="RStudio Notebook Example", echo=FALSE, out.width="100%"} #### Include notebook example image #### knitr::include_graphics("images/chapter_3/NotebookExample.png") ``` In this chapter's appendix we discuss interactive Jupyter notebooks.\index{Jupyter} They are popular in the data science and tech industries and use a somewhat different logic from R Markdown notebooks. My current tech team tends to use either Jupyter notebooks or R Markdown Notebooks to present our detailed analyses. We host and share these via GitHub.\index{GitHub}\index{GitHub!Markdown} GitHub compiles both document types nicely for online access. For R Markdown Notebooks, use `output: github_document` in the header to ensure that the output file is compiled properly on GitHub. ```{r SourcePaneRmarkdown, echo=FALSE, fig.cap="RStudio Source Pane for an RMarkdown File", out.width="100%"} knitr::include_graphics("images/chapter_3/SourcePaneRmarkdown.png") ``` #### R Markdown {- #r-markdown} Figure \@ref(fig:SourcePaneRmarkdown) is what the *Source* pane looks like when you have an R Markdown file open. You'll notice the familiar `Run` button for running R code. It now includes a drop-down menu for running code chunks. It includes options like `Run Current Chunk`, i.e. run the chunk where your cursor is located, `Run Next Chunk`, and `Run All` chunks. In this menu, you can select `Insert Chunk` to insert the basic syntax required for a code chunk. You can navigate to a specific chunk using a drop-down menu on the bottom left-hand side of the *Source* pane. This can be very useful if you are working with a long document. To knit your file, click the `Knit` icon on the left side of the *Source* pane's top bar.
If you click on the downward arrow on the right of this icon, you will be given the opportunity to knit the document to HTML, PDF, or MS Word using *rmarkdown*. Helpfully, the R Markdown *Source* pane's top bar also includes the `ABC` spell check icon. RStudio can properly highlight both the markup language syntax and the R code in the *Source* pane. This makes your source code much easier to read and navigate. RStudio can also fold code chunks. This makes navigating through long documents, with long code chunks, much easier. At line 1014 in Figure \@ref(fig:SourcePaneRmarkdown), you can see a small downward facing arrow. If you were to click this arrow, the code chunk would collapse to look like line 1021 in Figure \@ref(fig:SourcePaneRmarkdown). To unfold the chunk, just click on the arrow again. You may also notice that there is a code folding arrow on line 1015 in Figure \@ref(fig:SourcePaneRmarkdown). This allows us to fold parts of the code chunk. To enable this option, create a comment line with at least one hash before the comment text and at least four after it like this: ```{r CommentFold, echo=TRUE} #### An RStudio Foldable Comment #### ``` You will be able to fold all of the text after this comment up until the next similarly formatted comment (or the end of the chunk). #### R (Sweave) LaTeX {-} Many of the *Source* pane options for R (`.Rnw`) LaTeX files are the same as for R Markdown files, the key difference being that there is a `Compile PDF` icon instead of `Knit`. Clicking this icon knits the file and creates a PDF file in your R LaTeX file's directory. There is also a `Format` icon instead of the question mark icon. This actually inserts LaTeX formatting functions into your document for things such as section headings and bullet lists. These functions can be very tedious to type out by hand otherwise. By default, RStudio may be set up to use *Sweave* for compiling LaTeX documents.
To use *knitr* instead of *Sweave* to knit `.Rnw` files you should click on `Tools` in the RStudio menu bar, then click on `Global Options...`. Once the **Options** window opens, click on the `Sweave` button. Select `knitr` from the drop-down menu for "Weave Rnw files using:". Finally, click `Apply`.[^chapter3_39] In the `Sweave` options menu, you can also set which LaTeX typesetting engine to use. By default, it is set to the more established engine pdfLaTeX.\index{pdfLaTeX} Another option is XeLaTeX.\index{XeLaTeX} XeLaTeX has the ability to use many more characters than pdfLaTeX as it works with UTF-8 encoded input.\index{UTF-8} It can also use any font on your computer. XeLaTeX is especially useful compared to pdfLaTeX if you are using characters that are not found in standard English. ### knitr and R As *knitr* is a regular R package, you can of course, knit documents in R (or using the console in RStudio). All of the *knitr* syntax in your markup document is the same as before, but instead of clicking a `Compile PDF` or `knit HTML` button, use the `knit()` function. To knit a hypothetical Markdown file *example.Rmd* you first use the `setwd()` function to set the working directory (for more details see Chapter \@ref(DirectoriesChapter)) to the folder where the *example.Rmd* file is located. In this example, it is located in the Documents folder.[^chapter3_40] ```{r Ch3RawKnitSetwd, echo=TRUE, eval=FALSE, tidy=FALSE} setwd("/Documents/") ``` Then you `knit()`\index{R function!knit} the file: ```{r Ch3RawKnit, echo=TRUE, eval=FALSE, tidy=FALSE} knit(input = "example.Rmd", output = "example.md") ``` You use the same steps for all other knittable document types. Note that if you do not specify the output file, *knitr* will determine what the file name and extension should be. In this example it would come up with the same name and location as we gave it. 
In this example, using the `knit()` function only creates a Markdown file and not an HTML file, as clicking `Knit` in RStudio did. Likewise, if you use `knit()` on a *.Rnw* file, you will only end up with a basic LaTeX *.tex* file and not a compiled PDF. To convert the Markdown file into HTML, you need to further run the *.md* file through the `markdownToHTML()`\index{R function!markdownToHTML} function from the *markdown* package, i.e.: ```{r Ch3MDtoHTML, eval=FALSE, tidy=FALSE, echo=TRUE} markdownToHTML(file = "example.md", output = "example.html") ``` This is a bit tedious. Luckily, there is a function in the *knitr* package that combines `markdownToHTML()` and `knit()`. It is called `knit2html()`. You use it like this: \index{R function!knit2html} ```{r Ch3RMDtoHTML, echo=TRUE, eval=FALSE, tidy=FALSE} knit2html(file = "example.Rmd", output = "example.html") ``` If we want to compile a *.tex* file in R, we run it through the `texi2pdf()` function\index{R function!texi2pdf} from the *tools* package. This function will run both LaTeX and BibTeX to create a PDF with a bibliography. See Chapter \@ref(LatexChapter) for more details on using BibTeX for bibliographies. Here is a `texi2pdf()` example: ```{r CH3tex2pdf, echo=TRUE, eval=FALSE, tidy=FALSE} # Load tools package library(tools) # Compile pdf texi2pdf(file = "example.tex") ``` Just like with `knit2html()`, you can simplify this process by using the `knit2pdf()` function to compile a PDF file from a `.Rnw` document.\index{R function!knit2html}\index{R function!knit2pdf} ### R Markdown and R {#rmarkdownRender} Just as *knitr* is an R package that you can run from the console, you can also run *rmarkdown* from the console. Instead of the `knit()` function, use `render()`.
Imagine that *example.Rmd* now has an *rmarkdown* header: ````yaml --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "2018-10-28" output: pdf_document: toc: true html_document: toc: false --- ```` This header specifies how the file can be compiled to either PDF or HTML. When compiled to PDF, it will include a table of contents. When compiled to HTML, it won't. Now we use `render()`: \index{R function!render} ```{r Ch3RenderBasic, eval=FALSE, echo=TRUE} render("example.Rmd") ``` This call will compile the document to a PDF in the working directory, because PDF is listed as the first output format in the header. The document will be called *example.pdf*. Alternatively, to compile the R Markdown file to HTML use: ```{r Ch3RenderHTML, eval=FALSE, echo=TRUE} render("example.Rmd", "html_document") ``` We could compile to both formats using: ```{r Ch3RenderBasicAll, eval=FALSE, echo=TRUE} render("example.Rmd", "all") ``` or ```{r Ch3RenderBasicAltAll, eval=FALSE, echo=TRUE} render("example.Rmd", c("pdf_document", "html_document")) ``` In all of these cases, `render()` will create, but not keep, the intermediate *.md* or *.tex* document. You can have these documents saved by adding `keep_md` or `keep_tex` to the header. For example: ````yaml output: pdf_document: toc: true keep_tex: true html_document: keep_md: true toc: false --- ```` Finally, if you want to output to one format with the default rendering style, for example, the HTML document, use `html_document: default`. ### Chapter summary {#chapter-summary .unnumbered} We've covered a lot of ground in this chapter, including R basics, how to use RStudio, and knitr/R Markdown syntax for multiple markup languages. These tools, especially R and knitr/R Markdown, are fundamental to the reproducible research process we will learn in this book. They enable us to create dynamic text-based files that record our research steps in detail.
In the next chapter, we will look at how to organize files created with these types of tools into reproducible research projects. # Appendix: Jupyter Interactive Notebooks {- #jupyter} Jupyter notebooks are a commonly used alternative to R Markdown notebooks and knitr generally for displaying and discussing computational analyses. They are especially prevalent in the data science industry. For example, I never used Jupyter notebooks during my academic life in the quantitative social sciences, but after moving to the tech industry I regularly write and read them. A reason for this is that they are useful for fast prototyping data analyses, as they are interactive. You run the code directly in the notebook and see the results printed in the notebook immediately. Jupyter is often associated with Python, but the name 'Jupyter' actually refers to three languages used in data science **Ju**lia,\index{Julia} **Py**thon,\index{Python} and **R** and can be used with other languages as well. This book is clearly focused on R Markdown, but if you would like to explore launching Jupyter from R, see the *IRkernel* package [@R-IRkernel]. Though personally I have been launching Jupyter from Python or Julia, as the installation is more straightforward. In fact, the Python installation is a prerequisite for *IRkernel*. For more details, see: - Python installation instructions: , - Julia installation instructions: . #### Controversy {-} In mid-2019 there was a major controversy about Jupyter notebooks (well at least a topic heavily discussed on data science Twitter)\index{Twitter}. Joel Grus started the controversy by giving a talk at the main Jupyter conference, JupyterCon, called, 'I Don't Like Notebooks'.[^jupyter_dont_like] His critique was multi-pronged, but one critique that resonated with my strong interest in reproducibility (and personal experience using these notebooks) is that you can execute code in Jupyter notebooks in an arbitrary order. 
Using R Markdown terminology: you could execute the third code chunk before the second and then make changes to and rerun the second chunk without rerunning the third. This is troubling for reproducibility, as it is difficult for a third person (or yourself a few minutes later) to know what order the code was executed in to get the displayed results. Jupyter notebooks do record the order in which code was executed within the same session, but this adds an additional layer of complexity to figuring out results. The order also becomes inconsistent when a notebook is relaunched. #### R Markdown vs. Jupyter {-} A big reason that I personally prefer R Markdown over Jupyter is that it provides the 'best of both worlds'. RStudio allows you to interact with R Markdown documents in a very similar way to Jupyter notebooks (see Figure \@ref(fig:RMarkdownInteractive)). To enable fast prototyping, you can interactively run code chunks in any order and immediately see the results in line with the markup. It also channels you towards running the code in order when you knit the document before you share it with others.[^chapter_3_jupyter_r] ```{r RMarkdownInteractive, fig.cap="R Markdown Interactive Behavior Example in RStudio", echo=FALSE, out.width="100%"} #### Include notebook example image #### knitr::include_graphics("images/chapter_3/RmarkdownInteractive.png") ``` # Appendix: knitr and Lyx {- #LyxAppendix} You may be more comfortable using a what-you-see-is-what-you-get (WYSIWYG) editor, similar to Microsoft Word. Lyx is a WYSIWYG LaTeX editor that can be used with *knitr*. I don't cover Lyx in detail in this book, but here is a little information to get you started. #### Setup {-} To set up Lyx so that it can compile `.Rnw` files, click `Document` in the menu bar, then `Settings`. In the left-hand panel, the second option is `Modules`. Click on `Modules` and select `Rnw (knitr)`. Click `Add`, then `Ok`. Now, compile your LaTeX document in the normal Lyx way. 
#### Code Chunks {-} Enter code chunks into TeX Code blocks within your Lyx documents. To create a new TeX Code block, select `Insert` from the menu bar, then `TeX Code`. [^chapter3_1]: Please see Chapter \@ref(Intro) for instructions on how to install R. [^chapter3_2]: This figure and almost all screenshots in this book were taken on a computer using the macOS 10.14 operating system. [^chapter3_3]: If you are using a Unix-like system such as Linux Ubuntu or macOS, you can also access R via an application called the Terminal. If you have installed R on your computer, you can type `R` into the Terminal. This will begin a new R session. You will know you are in a new R session because the same type of start-up information as in Figure \@ref(fig:RBlankMain) will be printed in your Terminal. [^chapter3_4]: Functions are also objects. In this chapter, I treat them as distinct from other object types to avoid confusion. [^chapter3_5]: @whickham2014book argues that underscores (`_`) should be used to separate words in object names to make the names easier to read. For example: `health_data` rather than `healthdata`. The underscore object naming convention appears to now be the dominant style in the R community. There are other conventions. These include using periods (`.`) or capital letters (referred to as CamelBack) to separate words. For more information on R naming conventions, see @baath2012. [^chapter3_6]: The assignment operator is sometimes also referred to as the 'gets arrow'. [^chapter3_7]: The double hash is generated automatically by *knitr*. Prepending the output with hashes makes it easier to copy and paste code into R from a document created by *knitr*/*rmarkdown* because R will ignore everything after a hash. [^chapter3_8]: Single and double quotation marks are interchangeable in R for this purpose. In this book I always use double quotes, except for *knitr* code chunk options. [^chapter_3_types]: R object types are not fixed. 
They can be implicitly converted by assigning values of a different type to them. Other languages, such as Scala, prohibit implicit type conversions.\index{Scala} These languages are sometimes referred to as 'type safe'. They make it impossible to implicitly change an object's type, which can sometimes produce errors.\index{type safe} [^chapter3_9]: These might be called scalar objects, though in R, scalars are just vectors with a length of 1. [^chapter3_10]: If you want information about other types of R objects such as lists and matrices, Chapter 1 of Norman Matloff's [-@matloff2011] book is a really good place to look. [^chapter3_11]: In a vector, every member of the group must be of the same type. If you want an ordered group of values with different types, you can use lists. [^chapter3_12]: If you want to combine objects as if they were rows of the same column(s), use the `rbind()` function. [^chapter3_13]: Matrices are basically collections of vectors, each represented as a column. [^chapter3_14]: Matrices can also have a *names* attribute. [^chapter3_15]: You can also use `names()` to assign names for the entire data frame. For example, `names(string_num_df) <- c(variable_1, variable_2)` [^chapter3_16]: Note that this is really only useful for data frames with few rows. [^chapter3_17]: You can see what is in your current search path with the `search` function. Just type `search()` \index{R function!search}into your R console. [^chapter3_18]: For more information on this data set, type `?cars` into your R console. [^chapter3_19]: Using R terminology, the second 'argument' value, the code after the comma, of the `with()` function is called an 'expression', because it can contain more than one R function or statement. See Section \@ref(arguments) for a more comprehensive discussion of R function arguments. 
[^chapter3_21a]: If you don't include the parentheses after the function name, R will return the source code for the function just like when you enter an object name into your console returns the contents. This is because in R, functions are actually also objects! [^chapter3_21]: Note: you do not have to put spaces between the argument label and the equals sign or the equals sign and the value. However, having spaces can make your code easier to read. [^chapter3_22]: They can be abbreviated `T` and `F`. [^chapter3_23]: Note: your workspace will probably include different objects than this example. These are objects created to knit the book. [^chapter3_24]: For example, imagine you create an object, then change the source code you used to create the object. However, there is a syntax error in the new version of the source code. The old object won't be overwritten, and you will be mistakenly using the old object in future functions. [^chapter3_25]: `save.image()` is just a special case of `save()`. [^chapter3_26]: For the latest list, see: . [^chapter3_27]: You will probably see R packages referred to as "libraries", though this is a misnomer. [^chapter3_28]: If you are only running one line of code, you don't need to highlight the code; you can put your cursor on that line. [^chapter3_29]: Note that you can also create an *rmarkdown* document without a header. *rmarkdown* will just use the default settings when knitting. [^chapter3_30]: YAML is a recursive acronym that means, "YAML Ain't Markup Language". [^chapter3_31]: It doesn't matter how many spaces you use to indent, as long as all indentations have the same number of spaces. [^chapter3_32]: R Markdown files that you compile with *knitr* or *rmarkdown* have the same `.Rmd` file extension. [^chapter3_33]: The "nw" refers to the noweb simple literate programming tool that Sweave built on [@leisch2002; @ramseynoweb]. [^chapter3_35]: In Markdown, you can put global chunk options at the very top of the document. 
In `.Rnw` documents, they should be placed after the `\begin{document}` function. See Chapter \@ref(LatexChapter) for more information on how LaTeX documents are structured. [^chapter3_36]: It's set as `TRUE` by default. [^chapter3_37]: You can manually set how you want the *Source* pane to act by selecting the file type using the drop-down menu in the lower right-hand corner of the *Source* pane. [^chapter3_38]: Alternatively, `File` `Compile Notebook...` [^chapter3_39]: In the Mac version of RStudio, you can also access the `Options` window via `RStudio` `Preferences` in the menu bar. [^chapter3_40]: Using the directory name is for Mac computers. Please use alternative syntax discussed in Chapter \@ref(DirectoriesChapter) on other types of systems. [^jupyter_dont_like]: The presentation is available here: . For a comprehensive discussion of the 'first notebook war' by Yihui Xie, see: . [^chapter_3_jupyter_r]: See Nathan Stephens' 2017 blog post further making the case for R Notebooks: . ================================================ FILE: rep-res-3rd-edition/06-file-management.Rmd ================================================ # Getting Started with File Management {#DirectoriesChapter} Careful file management is crucial for reproducible research. Remember two of the guidelines from Chapter \@ref(GettingStartedRR): - Explicitly tie your files together. - Have a plan to organize, store, and make your files available. Apart from the times when you have an email exchange (or even meet in person) with someone interested in reproducing your research, the main information independent researchers have about the procedures is what they access in files you make available: data files, analysis files, and presentation files. If these files are well organized and the way they are tied together is clear, replication will be much easier. 
File management is also important for you as a researcher, because if your files are well organized, you will be able to more easily make changes, benefit from work you have already done, and collaborate with others. Using tools such as R, knitr/R Markdown, and markup languages like LaTeX requires fairly detailed knowledge of where files are stored in your computer. Handling files to enable reproducibility may require you to use command-line tools to access and organize your files. R and Unix-like shell programs allow you to control files---creating, deleting, relocating---in powerful and really reproducible ways. By typing these commands you are documenting every step you take. This is a major advantage over graphical user interface-type systems where you organize files by clicking and dragging them with the cursor. However, typed commands require you to know your files' specific addresses, their file paths. In this chapter we discuss how a reproducible research project may be organized and cover the basics of file path naming conventions in Unix-like operating systems, such as macOS and Linux, and Windows. We then learn how to organize them with RStudio Projects. We'll cover some basic R and Unix-like shell commands for manipulating files as well as how to navigate through files in RStudio in the *Files* pane. The skills you will learn in this chapter will be heavily used in the next chapter (Chapter \@ref(Storing)) and throughout the book. In this chapter we work with locally stored files, i.e. files stored on your computer. In the next chapter, we will discuss various ways to store and access files remotely stored in the cloud. ## File Paths and Naming Conventions All of the operating systems covered in this book organize files in hierarchical directories, also known as file trees. 
To a large extent, directories can be thought of as the folders you usually see on your Windows or Mac desktop.[^chapter4_1] They are called hierarchical because directories are located inside of other directories, as in Figure \@ref(fig:ExampleTree).[^chapter_4_tree_cmd] ### Root directories A root directory is the first level in a disk, such as a hard drive. It is the root out of which the file tree 'grows'. All other directories are sub-directories of the root directory.\index{root directory} On Windows computers you can have multiple root directories, one for each storage device or partition of a storage device. The root directory is given a drive letter assignment. If you use Windows regularly, you will most likely be familiar with `C:\` used to denote the C partition of the hard drive. This is a root directory. On Unix-like systems, including Macs and Linux computers, the root directory is denoted by a forward slash (`/`) with nothing before it. ### Sub-directories and parent directories You will probably not store all of your files in the root directory. This would get very messy. Instead, you will store your files in sub-directories of the root directory. Inside of these sub-directories may be further sub-directories and so on. A directory inside of another directory is referred to as a child directory of a parent directory.\index{child directory}\index{parent directory} On Windows computers, separate sub-directories are indicated with a back slash (`\`). For example, if we have a folder called *data* inside of a folder called *example-project* which is located in the C root directory, it has the address `C:\example-project\data`.[^chapter4_2] When you type Windows file paths into R, you need to use two backslashes rather than one: e.g. `C:\\example-project\\data`. This is because the `\` is an escape character in R.[^chapter4_3]\index{escape character} Escape characters tell R to interpret the next character or sequence of characters differently. 
For example, in Section \@ref(TSVEscape) you'll see how `\t` can be interpreted by R as a tab rather than the letter "t". Add another escape character to neutralize the escape character so that R interprets it as a backslash. In other words, use an escape character to escape the escape character. Another option for writing Windows file names in R is to use one forward slash (`/`). On Unix-like systems, including Mac computers, directories are indicated with a forward slash (`/`). The file path of the *data* file on a Unix-like system would be: `/example-project/data`. Remember that a forward slash with nothing before it indicates the root directory. So `/example-project/data` has a different meaning than `example-project/data`. In the former, *example-project* is a sub-directory of the root. In the latter, *example-project* is a sub-directory of the current working directory (see below for details about working directories). This is also true in Windows. In this chapter, I switch between the two file system naming conventions to expose you to both. In subsequent chapters, I use Unix-like file paths. When you use relative paths (see below), these will work across operating systems in R. We'll get to relative paths in a moment. ### Working directories When you use R, markup languages, and many of the other tools covered in this book, it is important to keep in mind what your current working directory is. The working directory is the directory where the program automatically looks for files and other directories, unless you tell it to look elsewhere. It is also where it will save files. 
Later in this chapter, we will cover functions for finding and changing the working directory.\index{working directory} ```{r ExampleTree, engine = "tikz", fig.cap = "Example Research Project File Tree", cache=TRUE, echo=FALSE, fig.ext=if (knitr:::is_latex_output()) 'pdf' else 'png'} \usetikzlibrary{trees} % Set node styles \tikzstyle{DirBox} = [draw=black, rectangle, minimum width=5em, very thick, font=\small] \tikzstyle{every node} = [draw=gray, thin, anchor=west, font=\small] % Begin tikz picture \begin{tikzpicture}[% grow via three points={one child at (0.5,-0.7) and two children at (0.5,-0.7) and (0.5,-1.4)}, edge from parent path={(\tikzparentnode.south) |- (\tikzchildnode.west)}] % Root Directory \node (root) at (5, 10) [DirBox]{Root}; % Project Directory \node (project) at (4.5, 8.5) [DirBox]{example-project} child {node {{\small{paper.Rmd}}}} child {node {{\small{slideshow.Rmd}}}} child {node {{\small{website.Rmd}}}} child {node {{\small{main.bib}}}} ; % Data Directory \node (data) at (0, 4.5) [DirBox]{data} child {node {{\small{main-data.csv}}}} child {node {{\small{Makefile}}}} child {node {{\small{merge-data.R}}}} child {node {{\small{gather-1.R}}}} child {node {{\small{gather-2.R}}}} child {node {{\small{main-data-variable-descriptions.md}}}} child {node {{\small{README.Rmd}}}} ; % Analysis subdirectores/files \node (analysis) at (1.5, 7) [DirBox]{analysis} child {node {{\small{analysis-1.R}}}} child {node {{\small{analysis-2.R}}}} ; % README and .Rproj files \node (readme) at (9, 7) {README.md}; \node (rproj) at (10, 6) {example-project.Rproj}; % Connect boxes that are not explicit children \draw (root) -- (project); \draw (project) -| (analysis); \draw (project) -| (data); \draw (project) -| (readme); \draw (project) -| (rproj); \end{tikzpicture} ``` ### Absolute vs. 
relative paths \index{absolute file path}\index{relative file path} For reproducible research, collaborative research, and even if you ever change the computer you work on, it is a good idea to use relative rather than absolute file paths. Absolute file paths give the entire path of a given file or directory on a specific system. For example, `/example-project/data` is an absolute path. It specifies the path of the *data* child directory all the way back to the root directory. However, if our current working directory is *example-project* and we want to link to the *data* child directory or a file in it, we don't need the absolute path. We could use `data/`, i.e. the path relative to the working directory. It is good practice to use relative paths and organize your files such that using relative paths is easy. This makes your code less dependent on the particular file structure of a particular computer. For example, imagine you use `C:\\example-project\\data` in your source code to link to the *data* directory. If someone---a collaborator, a researcher reproducing your work, or even you---then tries to run the code on a different computer, the code will break if they are, for instance, using a Unix-like system or have placed *example-project* in a different partition of their hard drive. This can be fixed relatively easily by changing the file path in the source. However, this is tedious (often not well documented) and unnecessary if you use relative file paths. Below we'll see how to use RStudio Projects and also the *here* package [@R-here] to automatically set working directories so that your relative file paths will transport even more easily across computers. The *ProjectTemplate* package [@R-ProjectTemplate] provides functions to help set up a well structured research project file tree.
We don't use it in the following examples, but you may find it useful in your own work.\index{R package!ProjectTemplate} ### Spaces in directory and file names It is good practice to avoid putting spaces in your file and directory names. For example, I called the example project parent directory in Figure \@ref(fig:ExampleTree) "example-project" rather than "Example Project". Spaces in file and directory names can sometimes create problems for computer programs trying to read the file path. The program may believe that the space indicates that the path name has ended. To make multi-word names easily readable without using spaces, adopt a consistent naming convention. One approach is to use a convention that contrasts with the R object naming convention you are using. A contrasting convention helps make it clear if something is an R object or a file name. For example, if we adopt the underscore method for R object names used in Chapter \@ref(GettingStartedRKnitr) (e.g. `health_data`) we could use hyphens (`-`) to separate words in file names. For example: `example-source.R`. This is sometimes called kebab-case.\index{kebab-case} ## Organizing Your Research Project Figure \@ref(fig:ExampleTree) gives an example of how the files in a simple reproducible research project could be organized. The project's parent directory is called *example-project*. Inside this directory are the primary knittable documents (*paper.Rmd*, *slideshow.Rmd*, and *website.Rmd*). In addition, there is an *analysis* sub-directory with the R files to run the statistical analyses followed by a further *data* child directory. The nested file structure allows you to use relative file paths. The knittable documents can call *analysis-1.R* with the relative path *analysis/analysis-1.R*.
```{r ProjectMenu, fig.cap="An Example RStudio Project Menu", echo=FALSE, fig.align='center', out.height="30%"} knitr::include_graphics("images/chapter_4/ProjectMenu.png") ``` In addition to the main files and sub-directories in *example-project*, you will notice files called *README.md* and *example-project.Rproj*. We'll discuss the *example-project.Rproj* file in the next section. The *README.md* file is a human readable overview of all the files in the project. It should briefly describe the project including things like its title, author(s), topic, any copyright information, and so on. It should also indicate how the folders in the project are organized and give instructions for how to reproduce the project. The README file should be in the main project folder---in our example this is called *example-project*---so that it is easy to find. If you are storing your project as a GitHub repository (see Chapter \@ref(Storing)) and the file is called *README*, its contents will automatically be displayed on the repository's main page. If the *README* file is written using Markdown (e.g. *README.md*), it will also be properly formatted. Figure \@ref(fig:BookRepository) shows an example of this. It is good practice to dynamically include the system information for the R session you used to create the project. To do this, you can write your README file with R Markdown. Simply include the `sessionInfo()` function in a *knitr* code chunk in the R Markdown document. If you knit this file immediately after knitting your presentation document, it will record the information for that session. You can also dynamically include session info in a LaTeX document. To do this, wrap `sessionInfo()` in the `toLatex()` function in a code chunk. The code chunk should have the option `results='asis'`.
The code is: \index{R function!toLatex}\index{R function!sessionInfo} ```{r Ch4SessionInfoLatex, eval=FALSE, echo=TRUE} toLatex(sessionInfo()) ``` ## Organizing Research with RStudio Projects {#CreateRStudioProject} If you are using RStudio, you may want to organize your files as Projects. You can turn a normal directory into an RStudio Project by clicking on `File` in the RStudio menu bar and selecting `New Project…`. A new window will pop up. Select the option `Existing Directory`. Find the directory you want to turn into an RStudio Project by clicking on the `Browse` button. Finally, select `Create Project`. You will also notice in the Create Project pop-up window that you can build new project directories and create a project from a directory already under version control (we'll do this at the end of Chapter \@ref(Storing)). When you create a new project, you will see that RStudio has put a file with the extension `.Rproj` into the directory, like *example-project.Rproj* in Figure \@ref(fig:ExampleTree). Making your research project directories RStudio Projects is useful for a number of reasons: - The project is listed in RStudio's Project menu where it can be opened easily (see Figure \@ref(fig:ProjectMenu)). - When you open the project in RStudio, it automatically sets the working directory to the project's directory and can load the source code files you were last working on. - You can set project specific options like whether PDF presentation documents should be compiled with *Sweave* or *knitr*. - When you close the project, your R workspace and history are saved in the project directory if you want. However, avoid saving your workspace as this could make reproducibility harder. - It helps you version control your files. - You can build your Project---run the files in a specific way---with makefiles. - It gives you an easy-to-use interface for managing the R packages that your project depends on. 
## R File Manipulation Functions R has a range of functions for handling and navigating through files. Including these functions in your source code files allows you to more easily replicate your actions. #### `getwd()` {-} To find your current working directory use the `getwd()` function: \index{R function!getwd} ```{r Ch4Getwd, eval=TRUE} getwd() ``` #### `list.files()` {-} Use the `list.files()` function to see all of the files and sub-directories in the current working directory. You can list the files in other directories too by adding the directory path as an argument to the function.\index{R function!list.files} Because my current working directory has a lot of files in it, I will shorten the output for illustration by piping\index{pipe}\index{R function!\%>\%} it through `head()`.\index{R function!head} ```{r Ch4ListFiles, echo=TRUE} library(magrittr) list.files() %>% head() ``` #### `setwd()` {-} \index{R function!setwd} The `setwd()` function is the base R way to set the current working directory. For example, if we are on a Mac or other Unix-like computer, we can set the working directory to the *analysis* directory in our Example Project (see Figure \@ref(fig:ExampleTree)) like this: ````r setwd("/example-project/analysis/") ```` Now R will automatically look in the *analysis* folder for files and will save new files into this folder, unless we explicitly tell it to do otherwise. When working with a knittable document, setting the working directory once in a code chunk changes the working directory for all subsequent code chunks. However . . . #### `here::set_here()` {- #sethere} \index{R function!here} It is *not* good practice for reproducibility (and just general convenience when using a source code file across multiple computers) to use `setwd()` in your R source code. You, and anyone reproducing your work, will need to tediously set specific file paths for each computer. 
Instead, use RStudio Projects, which automatically set the working directory to the one with the *.Rproj* file. If you are not using RStudio Projects, include `set_here()` from the *here* package at the top of your source code. This will create a file called *.here* in the current working directory. It functions similarly to *.Rproj* to automatically flag for *here* what should be the current working directory. Remember when you share your source code to also share the *.Rproj*/*.here* file. #### `root.dir` in knittable documents {-} \index{knitr option!root.dir} By default, the root (or working) directory for all of the code chunks in a knittable document is the directory where this document is located. You can reset the directory by feeding a new file path to the `root.dir` option. We can set this globally[^chapter4_4] for all of the chunks in the document by including the following code in the document's first chunk. ````r opts_knit$set(root.dir = "/example-project/analysis") ```` We set the */example-project/analysis* sub-directory as the root directory for all of the chunks in our presentation document. **Note:** In general it is preferable to use the knittable file's default directory and file paths relative to it rather than manually specifying the `root.dir` option. Setting an alternate root directory will make reproducibility more difficult. #### `dir.create()` {-} \index{R function!dir.create} Sometimes you may want to create a new directory. You can use the `dir.create()` function to do this.[^chapter4_5] For example, to create an *example-project* directory in the root *C* directory on a Windows computer, type: ````r dir.create("C:\\example-project") ```` #### `file.create()` {-} \index{R function!file.create} Similarly, you can create a new blank file with the `file.create()` function.
To add a blank R source code file called *source-code.R* to the *example-project* directory on the *C* drive, use: ````r file.create("C:\\example-project\\source-code.R") ```` #### `cat()` {- #catR} \index{R function!cat} If you want to create a new file and put text into it, use the `cat()` (concatenate and print) function. For example, to create a new file in the current working directory called *example-cat.md* that includes the text "Reproducible Research with R and RStudio" type: ````r cat("Reproducible Research with R and RStudio", file = "example-cat.md") ```` In this example we created a markdown formatted file by using the `.md` file extension. We could, of course, change the file extension to `.R` to set it as an R source code file, `.Rnw` to create a *knitr* LaTeX file, and so on. You can use `cat()` to print the contents of one or more objects to a file. **Warning:** the `cat()` function will overwrite existing files with the new contents. To add the text to existing files, use the `append = TRUE` argument. ````r cat("More Text", file = "example-cat.md", append = TRUE) ```` #### `unlink()` {-} \index{R function!unlink} You can use the `unlink` function to delete files and directories. ````r unlink("C:\\example-project\\source-code.R") ```` **Warning:** the `unlink()` function permanently deletes files, so be very careful using it. #### `file.rename()` {-} \index{R function!file.rename} You can use `file.rename()` to, obviously, rename a file. It can also be used to move a file from one directory to another. For example, imagine that we want to move the *example-cat.md* file from the directory *example-project* to one called *markdown-files* that we already created.[^chapter4_6] ````r file.rename(from = "C:\\example-project\\example-cat.md", to = "C:\\markdown-files\\example-cat.md") ```` #### `file.copy()` {-} \index{R function!file.copy} `file.rename()` fully moves a file from one directory to another. 
To copy the file to another directory, use the `file.copy()` function. It has the same syntax as `file.rename()`: ````r file.copy(from = "C:\\example-project\\example-cat.md", to = "C:\\markdown-files\\example-cat.md") ```` ## Unix-like Shell Commands for File Management Though this book is mostly focused on using R for reproducible research, it can be useful to use a Unix-like shell program to manipulate files in large projects. Unix-like shell programs, including Bash on Linux (and Mac before macOS Catalina), Zsh on Mac (from macOS Catalina onwards),\index{Zsh} and Windows PowerShell, give you type-able commands to interact with your computer's operating system.[^chapter4_7] We will especially return to shell commands in the next chapter when we discuss Git version control and makefiles for collecting data in Chapter \@ref(DataGather), as well as the command-line program[^chapter4_8] Pandoc in Chapter \@ref(MarkdownChapter). We don't have enough space to fully introduce shell programs or even all of the commands for manipulating files. We are just going to cover some of the basic and most useful commands for file management. For good introductions for Unix and macOS computers, see William E. Shotts Jr.'s [-@shottsjr2012] book on the Linux command-line. For Windows users, Microsoft maintains a tutorial on Windows PowerShell at . The commands discussed in this chapter should work in both Unix-like shells and Windows PowerShell. It's important at this point to highlight a key difference between R and Unix-like shell syntax. Shell command arguments don't have parentheses. For example, if I want to change my working directory to my Mac Desktop in a shell using the `cd` command, I type:[^chapter4_9] ````bash cd /Users/cgandrud/Desktop ```` In this example `cgandrud` is my user name. #### `cd` {-} \index{shell command!cd} As we just saw, use the `cd` (change directory) command to change the working directory in the shell. 
Here is an example of changing the directory in Windows PowerShell to `C:/`: ````bash cd C:/ ```` If you want to change the working directory back to the previous working directory you were in, type: ````bash cd - ```` If, for example, we changed the working directory from */User/Me* to */User/Me/Desktop* and then typed `cd` followed by a minus sign (`cd -`), the working directory would change back to */User/Me*. Note this will not work in PowerShell. To move up to the parent directory instead, regardless of which directory you were in previously, type `cd ..`. #### `pwd` {-} \index{shell command!pwd} To find your current working directory, use the `pwd` command (print working directory). This is essentially the same as R's `getwd()` function. ```{r Ch4pwdcmd, engine='sh', eval=FALSE} pwd ``` #### `ls` {-} \index{shell command!ls} The `ls` (list) command works very similarly to R's `list.files()` function. It shows you what is in the current working directory. Again, I have a lot of files in my working directory, so I will shorten the output for this example by piping it through the command line's `head` command.\index{shell command!head} The command line pipe is not `%>%`\index{R function!\%>\%}, as in R, but instead `|`.\index{shell command!|} ```{r Ch4lscmd, engine='sh'} ls | head ``` As we saw earlier, R also has an `ls` command. R's `ls()` function lists items in the R workspace. The shell's `ls` command lists files and directories in the working directory. #### `mkdir` {-} \index{shell command!mkdir} Use `mkdir` to create a new directory. For example, if I wanted to create a sub-directory of my Linux root directory called *new-directory* I would type: ````shell mkdir /new-directory ```` #### `echo` {-} \index{shell command!echo} There are a number of ways to create new files in Unix-like shells. One of the simplest is the `echo` command. This command prints its argument to the Terminal.
For example: ```{r Ch4Echo1, engine='sh', echo=TRUE, cache=TRUE} echo Reproducible Research with R and RStudio ``` If you add the greater-than symbol (`>`) after the text you want to print and then a file name, `echo` will create the file (if it doesn't already exist) in the current working directory and then print the text into the file. ```{r Ch4Echo2, eval=FALSE, engine='sh', echo=TRUE, cache=TRUE} echo Reproducible Research with R and RStudio > example-echo.md ``` Using only one greater-than sign will completely erase the *example-echo.md* file's contents and replace them with `Reproducible Research with R and RStudio`. To append the text at the end of an existing file, use two greater-than signs (`>>`). ```{r Ch4Echo3, eval=FALSE, engine='sh', echo=TRUE, cache=TRUE} echo More text. >> example-echo.md ``` There is also a `cat` shell command. It works slightly differently than the R version of the command and I don't cover it here. #### `rm` {-} \index{shell command!rm} The command `rm` removes (deletes) files or directories. ````bash rm example-echo.md ```` Add the `d` (directory) option to delete a directory. Note that options are like arguments in an R function.\index{shell command!options} For example: ````bash rm -d example-dir ```` Again, be careful when using this command, because it permanently deletes the files or directories. As we saw in Chapter \@ref(GettingStartedRKnitr), R also has an `rm()` function. It is different because it removes objects from your R workspace rather than files from your working directory. #### `mv` {-} \index{shell command!mv} To move a file from one directory to another from a shell, use the `mv` (move) command. For example, to move the file *example-echo.md* from *example-project* to *markdown-files*, use the following code and imagine both directories are in the root directory:[^chapter4_10] ````bash mv /example-project/example-echo.md /markdown-files ```` Note that the *markdown-files* directory must already exist.
If it does not exist, the file will just be renamed. This is similar to the R function `file.rename()`. #### `cp` {-} \index{shell command!cp} The `mv` command completely moves a file from one directory to another. To copy a version of the file to a new directory use the `cp` command. The syntax is similar to `mv`: ````bash cp /example-project/example-echo.md /markdown-files ```` #### `system()` (R function) {- #systemRcommand} \index{R function!system} You can run shell commands from within R using R's `system()` function. For example, to run the `echo` command from within R type: ````r system("echo Text to Add > ExampleEcho.md") ```` ## File Navigation in RStudio The RStudio *Files* pane allows us to navigate our file tree and do some basic file manipulations. Figure \@ref(fig:FilesPane) shows us what this pane looks like. The pane allows us to navigate to specific files and folders and delete and rename files. To select a folder as the working directory, tick the check box next to the folder. Then click the `More` button and select `Set As Working Directory`. Under the `More` button, you will also find options to `Move` and `Copy` files (see Figure \@ref(fig:FilesPaneMore)). The *Files* pane is a Graphical User Interface (GUI), so our actions in the *Files* pane are not recorded and, as such, are not as easily reproducible as the commands we learned earlier in this chapter. ```{r FilesPane, fig.cap="The RStudio Files Pane", echo=FALSE, fig.align='center', out.height="30%"} knitr::include_graphics("images/chapter_4/RStudioFiles.png") ``` ```{r FilesPaneMore, fig.cap="More Functionality in the RStudio Files Pane", echo=FALSE, fig.align='center', out.height="10%"} knitr::include_graphics("images/chapter_4/MoreMore.png") ``` ### Chapter summary {-} In this chapter we've learned how to organize our research files to enable dynamic replication.
This included not only how they can be ordered in a computer's file system, but also the file path naming conventions---the addresses---that computers use to locate files. Once we know how these addresses work, we can use R and shell commands to refer to and manipulate our files. This skill is particularly useful because it allows us to place code in text-based files to manipulate our project files in highly reproducible ways. In the next few chapters, we will put these skills into practice. We will learn how to store our files and create data files in reproducible ways. [^chapter_4_tree_cmd]: The command line utility *tree* is very useful for visualizing your file trees. For more information, see .\index{shell command!tree} [^chapter4_1]: To simplify things, I use the terms 'directory' and 'folder' interchangeably in this book. [^chapter4_2]: For more information on Windows file path names, see this helpful website: [^chapter4_3]: As we will see in Part IV, it is also a LaTeX and Markdown escape character. [^chapter4_4]: See the discussion of global chunk options in Chapter \@ref(GettingStartedRKnitr), Section \@ref(GlobalChunkOptions). [^chapter4_5]: Note: you will need the correct system permissions to be able to do this. [^chapter4_6]: The `file.rename()` function won't create new directories. To move a file to a new directory, you will need to create the directory first with `dir.create()`.\index{R function!dir.create} [^chapter4_7]: You can access Bash via the Terminal program on macOS and Linux computers. It is the default shell on Mac (before macOS Catalina) and Linux, so it loads automatically when you open the Terminal. Windows PowerShell comes installed with Windows. [^chapter4_8]: A command-line program is just a program you run from a shell. [^chapter4_9]: Many shell code examples in other sources include the shell prompt, like the `$` in Bash, or `>` in PowerShell. These are like R's `>` prompt. 
I don't include the prompt in code examples in this book because you don't type them. [^chapter4_10]: If they were not in the root directory, we would not place a forward slash at the beginning. ================================================ FILE: rep-res-3rd-edition/07-storage.Rmd ================================================ # (PART) Data Gathering and Storage {-} # Storing, Collaborating, Accessing Files, and Versioning {#Storing} In addition to being well organized, your research files need to be accessible for other researchers to be able to reproduce your findings. A useful way to make your files accessible is to store them on a cloud storage service[^chapter_5_1] [@howe2012]. This chapter describes in detail two different cloud storage services, Dropbox and GitHub, that you can use to make your research files easily accessible to others. Not only do these services enable others to reproduce your research, but they also have a number of benefits for your research workflow. These are certainly not the only services for remote research file storage, but discussing them does cover many common concerns of other services. Researchers often face a number of data management issues that, beyond making their research difficult to reproduce, can make doing the initial research difficult. First, there is the problem of **storing** data so that it is protected against computer failure---virus infections, spilling coffee on your laptop, and so on. Storing data locally on your computer or on a flash drive is generally more prone to loss than on remote servers in the cloud. Second, we may work on a project with different computers and mobile devices. For example, we may use a computer at work to run computationally intensive analysis, while editing our presentation document on a tablet, while riding the train to the office. So, we need to be able to **access** our files from multiple devices in different locations. 
We often need a way for our **collaborators** to access and edit research files as well. Finally, we almost never create a data set or write a paper perfectly all at once. We may make changes and then realize that we liked an earlier version, or parts of an earlier version better. This is a particularly important issue in data management where we may transform our data in unintended ways and want to go back to earlier versions. Also, when working on a collaborative project, one of the authors may accidentally delete something in a file that another author needed. To deal with these issues, we need to store our data in a system that has **version control**. Version control systems keep track of changes we make to our files and allow us to access previous versions if we want to. You can solve all of these problems in a couple of different ways using free or low cost cloud-based storage formats. In this chapter, we will learn how to use Dropbox and Git/GitHub for research files: - storage, - accessing, - collaboration, - version control. ## Saving Data in Reproducible Formats {#TSVEscape} Before getting into the details of cloud-based data storage for all of our research files, let's consider what type of formats you should actually save your data in. A key issue for reproducibility is that others are able to not only get hold of the exact data you used in your analysis, but also be able to understand and use the data now and in the future. Some file formats make this easier than others. In general, for small to moderately sized data sets[^chapter_5_2] plain-text formats like comma-separated values (`.csv`)\index{CSV} or tab-separated values[^chapter_5_3] (`.tsv`)\index{TSV} are good ways to store your data. These formats store a data set as a text file. A row in the data set is a line in the text file. Data is separated into columns with commas or tabs, respectively. These formats are not dependent on a specific program.
Any program that can open text files can open them, including a wide variety of statistical programs other than R as well as spreadsheet programs like Microsoft Excel. Using text file formats helps future-proof your research. Version control systems that track changes to text, like Git, are also very effective version control systems for these types of files. The `write.table()` function\index{R function!write.table} is one way to save data in plain-text formats from R. For example, to save a data frame called *Data* as a comma-separated-value (CSV) file called *main-data.csv* in our example *data* directory (see Figure \@ref(fig:ExampleTree)): ```{r Ch5WriteTable, eval=FALSE, tidy=FALSE} write.table(Data, "/example-project/data/main-data.csv", sep = ",", row.names = FALSE) ``` `row.names = FALSE` prevents R from including the row names in the output file.[^chapter_5_4] The `sep = ","` argument specifies that we want to use commas to separate values into columns. For CSV files, you can use a modified version of this command called `write.csv()`.\index{R function!write.csv} This function makes it so that you don't have to write `sep = ","`.[^chapter_5_5]\index{comma separated file format}\index{CSV} If you want to save your data with values separated by tabs, rather than commas, set the argument `sep = "\t"` and set the file extension to `.tsv`.\index{tab separated file format} R is able to save data in a wide variety of other file formats, mostly through the *foreign* or *rio* [@R-rio] packages (see Chapter \@ref(DataGather)). These formats may be less future-proof than simple text-formatted data files. One advantage of many other statistical program file formats is that they include not only the underlying data, but also other information like variable descriptions. If you are using plain-text files to store your data, you will need to include a separate file, preferably in the same directory as the data file, describing the variables and their sources.
In Chapter \@ref(TablesChapter) we will look at how to automate the creation of variable description files. ## Storing Your Files in the Cloud: Dropbox In this book we'll cover two (largely) free cloud storage services that allow you to store, access, collaborate on, and version control your research files. These services are Dropbox and GitHub.[^chapter_5_6] Though they both meet our basic storage needs, they do so in different ways and require different levels of effort to set up and maintain. These two services are certainly not the only way to make your research files available. Research-oriented services include Zenodo,[^chapter_5_7] the Dataverse Project,[^chapter_5_8] figshare,[^chapter_5_9] and RunMyCode.[^chapter_5_10] These services include good built-in citation systems, unlike Dropbox and GitHub. They also aim to provide persistent URLs for your files. This helps avoid the 'link rot' that threatens reproducibility,\index{link rot} i.e. a hosting service changes its URL structure breaking existing links. These services may be a very good place to store research files once the research is completed or close to completion. Many journals now require replication files be uploaded to these sites. However, these sites' ability to store, access, collaborate on, and version control files *during* the main part of the research process is mixed. Services like Dropbox and GitHub are very capable of being part of the research workflow from the beginning. Zenodo and GitHub have excellent integration, allowing you to actively develop a research project on GitHub then persist it on Zenodo.\index{Zenodo} For details, see . The easiest types of cloud storage for your research are services like Dropbox[^chapter_5_11] and Google Drive.[^chapter_5_12]\index{Google Drive}\index{Dropbox} These services not only store your data in the cloud, but also provide ways to share files. They even include basic version control capabilities. 
I'm going to focus on Dropbox because it currently offers a complete set of features that allow you to store, version, collaborate, and access your data. I will focus on how to use Dropbox on a computer. Some Dropbox functionality may be different on mobile devices. ### Storage When you sign up for Dropbox and install the program,[^chapter_5_13] it creates a directory on your computer's hard drive. When you place new files and folders in this directory and make changes to them, Dropbox automatically syncs the directory with a similar folder on a cloud-based server. Typically when you sign up for the service, you'll receive a limited amount of storage space for free, usually a few gigabytes. This is probably enough storage space for a number of text file-based research projects with smaller data sets. ### Accessing data {#EnablePublicFolder} All files stored on Dropbox have a URL address through which they can be accessed from a computer connected to the internet. To access a Dropbox file or directory's URL so that it can be downloaded, right-click on the file icon in your Dropbox folder on your computer. Then click `Copy Dropbox Link`. This copies the URL into your clipboard. You need to make one small change to the link so that it can be programmatically downloaded. By default, the link will point to the Dropbox website page for the file/directory. To be able to programmatically download it, you need to change the last `0` in the URL to a `1`. For example, change: ```` https://www.dropbox.com/s/1xapw69efofpg3b/public.fin.msm.model. csv?dl=0 ```` to ``` https://www.dropbox.com/s/1xapw69efofpg3b/public.fin.msm.model. csv?dl=1 ``` We changed the download (`dl`) option from false (`dl=0`) to true (`dl=1`). Now you can use the link to download data in your R source code, for example. Once you have the URL, you can load the file directly into R using the `import()`\index{R function!import} function from the *rio* package. 
`import()` works for many different data formats and is generally more robust than `read.table()`. Use the `source_url()` \index{R function!source\_url} function in the *devtools* package [@R-devtools] to download and run R source code files (see Chapter \@ref(StatsModel)). Let's download data directly into R from Dropbox. The data set's URL is: <https://bit.ly/2xlQ2j5>.[^chapter_5_15] ```{r Ch5DropboxDownload, message=TRUE} # Download data on financial regulators stored on Dropbox # Load rio library(rio) # Place the URL into the object fin_url fin_url <- "https://bit.ly/2xlQ2j5" # Download data fin_regulator <- import(fin_url, format = "csv") # Show variables in fin_regulator names(fin_regulator) ``` The argument `format = "csv"` tells `import()` what format the file is in. This isn't necessary if the file path has an informative file extension, e.g. it ends with `.csv`. ### Collaboration Though others can easily access your data and files with Dropbox URL links, you cannot save files through the link. You must save files in the Dropbox folder on your computer or upload them through the website. If you would like collaborators to be able to modify the research files, you will need to 'share' the Dropbox folder with them. Once you create a Dropbox folder, you can share it with your collaborators by right-clicking on the folder's name. Then select `Share`. Enter your collaborator's email address when prompted and select `Can Edit` from the permissions dropdown menu. They will be sent an email that will allow them to accept the share request and, if they don't already have an account, they can sign up for Dropbox. ### Version control Dropbox has a simple version control system. Every time you save a document a new version is created on Dropbox. To view a previous version, navigate to the file on the Dropbox website. Then click on the file. In the upper-right corner, there is a menu where you can select `Version history`.
This will take you to a page listing previous versions of the file, who created the version, and when it was created. A new version of a file is created every time you save a file and it is synced to the Dropbox cloud service. Note that with a free Dropbox account, previous versions of a file are only stored for **30 days**. You need a paid account to save previous versions for more than 30 days.[^chapter_5_16] ## Storing Your Files in the Cloud: GitHub {#GitHubMain} Dropbox minimally meets our four basic criteria for reproducible data storage. It is easy to set up and use. GitHub meets the criteria and more, especially when it comes to version control. It is, however, less straightforward at first. In this section, we will learn enough of the basics to get you started using GitHub to store, access, collaborate on, and version control your research. GitHub is an interface and cloud hosting service built on top of the Git version control system.[^chapter_5_17] Git does the version control. GitHub stores the data remotely, as well as provides a number of other features, some of which we look at below. GitHub was not explicitly designed to host research projects or even data. It was designed to host "socially coded" computer programs---in what Git calls "repositories", repos for short---by making it easy for a number of collaborators to work together to build computer programs. This seems very far from reproducible research.\index{git!repo}\index{git!repository} Remember that as reproducible researchers, we are building projects out of interconnected text files. In important ways, this is exactly the same as building a computer program. Computer programs are also basically large collections of interconnected text files. Like computer programmers, we need ways to store, version control, access, and collaborate on our text files. 
Because GitHub is very actively used by people with similar needs (who are also really good programmers), the interface offers many highly developed and robust features for reproducible researchers. GitHub's extensive features and heart in the computer programming community mean that it takes a longer time than Dropbox for novice users to set up and become familiar with. So we need good reasons to want to invest the time needed to learn GitHub. Here is a list of GitHub's advantages over Dropbox for reproducible research that will hopefully convince you to get started using it:[^chapter_5_18] #### Storage and access {-} - Dropbox creates folders stored in the cloud which you can share with other people. GitHub makes your projects accessible on a fully featured project website (see Figure \@ref(fig:BookRepository)). An example feature is that it automatically renders Markdown files called *README.md*[^chapter_5_19] in a GitHub directory on the repository's website. This makes it easy for independent researchers to find the file and read it. - GitHub can create and host a website for your research project that you could use to present the results, not just the replication files. - Its close integration with Zenodo allows you to easily make your full replication material persistently accessible and citable. #### Collaboration {-} - Dropbox allows multiple people to share files and change them. GitHub does this and more. - GitHub keeps meticulous records of who contributed what to a project. - Each GitHub repository has an "Issues" area where you can note issues and discuss them with your collaborators. Basically, this is an interactive to-do list for your research project. It also stores the issues so you have a full record. - Each repository can also host a wiki that, for example, could explain in detail how certain aspects of a research project were done. - Anyone can suggest changes to files in a public repository.
These changes can be accepted or declined by the project's authors. The changes are recorded by the Git version control system. This could be especially useful if an independent researcher notices an error. #### Version control {-} - Dropbox's version control system only lets you see file names, the times they were created, who created them, and revert back to specific versions. Git tracks every change you make. The GitHub website and GUI programs for Mac and Windows provide nice interfaces for examining specific changes in text files. - Dropbox creates a new version every time you save a file. This can make it difficult to actually find the version you want as the versions quickly multiply. Git's version control system only creates a new version when you tell it to. - All files in Dropbox are version controlled. Git allows you to ignore specific files. This is helpful if you have large binary files (i.e. not text files) that you do not want to version control because doing so will use up considerable storage space. - Unless you have a paid account, previous file versions in Dropbox disappear after 30 days. GitHub stores previous versions indefinitely for all account types. - Dropbox does not merge conflicting versions of a file together. This can be annoying when you are collaborating on a project and more than one author is making changes to documents at the same time. Git identifies conflicts and lets you reconcile them. - Git is directly integrated into RStudio Projects.[^chapter_5_20] ```{r BasicGitRepo, fig.cap="A Basic Git Repository with Hidden .git Folder Revealed", echo=FALSE, fig.align='center', out.width="50%"} knitr::include_graphics("images/chapter_5/BasicGitRepository.png") ``` ### Setting up GitHub: Basic There are at least three ways to use Git/GitHub on your computer. You can use the command-line version of Git. 
It's available for Mac and Linux (in the Terminal) as well as Windows through Git Bash.[^chapter_5_21] You can also use the Graphical User Interface GitHub program. Currently, it's only available for Windows and Mac. RStudio also has GUI-style Git functionality for RStudio Projects. In this section, I focus on how to use the command-line version, because it will help you understand what the GUI versions are doing and will allow you to better explore more advanced Git features not covered in this book. In the next section, I will mention how to use Git with RStudio Projects. The first thing to do to set up Git and GitHub is go to the GitHub website (<https://github.com/>) and sign up for an account. Second, you should go to the following website for instructions on setting up GitHub: . The instructions on that website are very comprehensive, so I'll direct you there for the full setup information. Note that installing the GUI version of GitHub also installs Git and, on Windows, Git Bash. ### Version control with Git Git is primarily a version control system, so we will start our discussion of how to use it by looking at how to version your repositories. #### Setting up Git repositories locally {-} \index{git!repositories} \index{git!repos} You can set up a Git repo on your computer with the command-line.[^chapter_5_22] I keep my repositories in a folder called *git_repositories*,[^chapter_5_23] though you can use Git with almost any directory you like. The *git_repositories* directory has the root folder as its parent. Imagine that we want to set up a repository in this directory for a project called *example-project*. Initially it will have one README file called *README.md*.
To do this, we would first type into the Terminal for Mac and Linux computers: \index{shell command!mkdir}\index{shell command!cd}\index{shell command!echo} ````bash # Make new directory 'example-project' mkdir /git_repositories/example-project # Change to directory 'example-project' cd /git_repositories/example-project # Create new file README.md echo "# An Example Repository" > README.md ```` So far, we have only made the new directory and set it as our working directory (see Chapter \@ref(DirectoriesChapter)). All of the examples in this section assume your current working directory is set to the repo. Then, with the `echo` shell command we created a new file named *README.md* that includes the text `# An Example Repository`. Note that the code is basically the same in Windows PowerShell or Git Bash. Also, you don't have to do these steps in the command-line. You could just create the new folders and files the same way that you normally do with your mouse in your GUI operating system. Now that we have a directory with a file, we can tell Git that we want to treat the directory *example-project* as a repository and that we want to track changes made to the file *README.md*. Use Git's `init` (initialize) command\index{shell command!git init} to set the directory as a repository. See Table \@ref(GitCommandsTable) for the list of Git commands covered in this chapter.[^chapter_5_24] Use Git's `add` command to add a file to the Git repository. For example,\index{shell command!git add} ````bash # Initialize the Git repository git init # Add README to the repository git add README.md ```` You probably noticed that you always need to put `git` before the command. This tells the shell what program the command is from. When you initialize a folder as a Git repository, a hidden folder called *.git* is added to the directory (see Figure \@ref(fig:BasicGitRepo)). This is where all of your changes are kept. 
If you want to add all of the files in the working directory to the Git repository type: ````bash # Add all files to the repository git add . ```` When we want Git to track changes made to files added to the repository we can use the `commit` command. In Git language we are "committing" the changes to the repository.\index{shell command!git commit} ````bash # Commit changes git commit -a -m "First Commit, created README file" ```` Note: the files won't appear on GitHub yet. Later in the chapter, we will learn how to push commits to your remote GitHub repository. The `-a` (all) option commits changes made to all of the files that have been added to the repository. You can include a message with the commit using the `-m` option like: `"First Commit, created README file"`. Messages help you remember general details about individual commits. This is helpful when you want to revert to old versions. **Remember:** Git only tracks changes when you commit them. Finally, you can use the `status` command for details about your repository, including uncommitted changes. Generally it's a good idea to use the `-s` (short) option, so that the output is more readable.\index{shell command!git status} ````bash # Display status git status -s ```` \begin{table} \caption{A Selection of Git Commands} \label{GitCommandsTable} \begin{center} \begin{tabular}{l p{7cm}} \hline Command & Description \\[0.25cm] \hline\hline \texttt{add} & Add a file to a Git repository. \\[0.25cm] \texttt{branch} & Create and delete branches. \\[0.25cm] \texttt{checkout} & Checkout a branch. \\[0.25cm] \texttt{clone} & Clone a repository (for example, the remote GitHub version) into the current working directory. \\[0.25cm] \texttt{commit} & Commit changes to a Git repository. \\[0.25cm] \texttt{fetch} & Download objects from the remote (or another) repository. \\[0.25cm] \texttt{.gitignore} & Not a Git command, but a file you can add to your repository to specify what files/file types Git should ignore. 
\\[0.25cm] \texttt{init} & Initialize a Git repository. \\[0.25cm] \texttt{log} & Show a repo's commit history. \\[0.25cm] \texttt{merge} & Merge two or more commits/branches together. \\[0.25cm] \texttt{pull} & \texttt{fetch} data from a remote repository and try to \texttt{merge} it with your commits. \\[0.25cm] \texttt{push} & Add committed changes to a remote Git repository, i.e. GitHub. \\[0.25cm] \texttt{remote add} & Add a new remote repository to an existing project. \\[0.25cm] \texttt{rm} & Remove files from Git version tracking. \\[0.25cm] \texttt{status} & Show the status of a Git repository including uncommitted changes made to files. \\[0.25cm] \texttt{tag} & Bookmark particularly significant commits. \\[0.25cm] \hline \end{tabular} \end{center} {\scriptsize{Note: when you use these commands in the shell, you will need to precede them with \texttt{git} so the shell knows what program they are from.}} \end{table} ```{r BookRepository, fig.cap="Part of this Book's GitHub Repository Webpage", echo=FALSE, fig.align='center', out.width="90%"} knitr::include_graphics("images/chapter_5/GitHubReadme.png") ``` #### Checkout {-} \index{git!checkout} It is useful to step back for a second and try to understand what Git is doing when you commit your changes. In the hidden *.git* folder, Git is saving all of the information in compressed form from each of your commits into a sub-folder called *objects*. Commit objects[^chapter_5_25] are everything from a particular commit. I mean everything. If you delete all of the files in your repository (except for the *.git* folder), you can completely recover all of the files from your most recent commit with the `checkout` command: \index{shell command!git checkout} ````bash # Checkout latest commit git checkout -- . ```` Note that there is a space between the two dashes and the period. You can also change to any other commit or any committed version of a particular file with `checkout`.
Simply replace the `--` with the commit reference. Note that the period at the end is still very important to include after the commit reference. The commit reference is easy to find and copy from a repository's GitHub webpage (see below for more information on how to create a GitHub webpage).[^chapter_5_26] For an example of a GitHub repo webpage, see Figure \@ref(fig:BookRepository). Click on the link that lists the number of repo commits on the left-hand side of the repo's webpage. This will show you all of the commits. A portion of this book's commit history is shown in Figure \@ref(fig:BookHistory). By clicking on the code icon (**<>**), you can see what the files at any commit looked like. Next to this button is another with a series of numbers and letters. This is the commit's SHA-1 hash.[^chapter_5_27] For our purposes, it is the commit's reference number. Click on the button to the left of the SHA to copy it. You can then paste it as an argument to your command. This will revert you to that particular commit. Also include the file name if you want to revert to a particular version of a particular file. ```{r BookHistory, fig.cap="Part of this Book's GitHub Repository Commit History", echo=FALSE, fig.align='center', out.width="90%"} knitr::include_graphics("images/chapter_5/CommitHistory.png") ``` #### Tags {-} \index{git!tags} \index{shell command!git tag} SHA-1 hashes are a bit cumbersome to use as references. What was the hash number for that one commit? To solve this problem you can add bookmarks, known as "tags", to particularly important commits. Imagine we just committed our first full draft of a project. We want to tag it as version 0.1, i.e. "v0.1". To do this, use Git's `tag` command: ````bash # Tag most recent commit v0.1 git tag -a v0.1 -m "First draft" ```` The `-a` option adds the tag `v0.1` and `-m` lets us add a message. 
Now we can check out this particular commit by using its tag, i.e.: \index{shell command!git checkout} ````bash # Checkout v0.1 git checkout v0.1 ```` This will create a new "branch" with a generic name *(detached from v0.1)* where you can make changes and commit them. If you plan to check out a previous tagged version and make changes to it, it is a good idea to specifically name the branch using the `-b` argument.[^chapter_5_28] For example, to give it the name *v0.1_branch* type: ````bash # Checkout v0.1 as v0.1_branch git checkout v0.1 -b v0.1_branch ```` What is a branch? #### Branches {-} \index{shell command!git branch} \index{git!branches} Sometimes you may want to work on an alternative version of your project and then merge changes made to this version back into the main one. For example, the main version could be the most stable current copy of your research, while the alternative version could be a place where you test out new ideas. Git allows you to create a new *branch* (alternative version of the repo) which can be merged back into the *master* (main) branch. To see what branch you are using, type: ```{r Ch5CheckBranch, engine='sh'} # Show git branch git branch ``` To create a new branch, use the `branch` command. For example, to create a new branch called *test*: ````bash # Create test branch git branch test ```` You can now use `checkout` to switch to this branch.[^chapter_5_29] Here is a shortcut for creating and checking out the branch: ````bash # Create and checkout test branch git checkout -b test ```` The `-b` (branch) option for `checkout` creates the new *test* branch before switching to it. To merge changes you commit in the *test* branch to the *master*, `add` and `commit` your changes, `checkout` the *master* branch, then use the `merge` command.[^chapter_5_30]\index{shell command!git add}\index{shell command!git merge} ````bash # Add files git add . 
# Commit changes to test branch git commit -a -m "commit changes to test" # Checkout master branch git checkout master # Merge master and test branches git merge test ```` Note, when you merge a branch, you may encounter conflicts in the files that make it impossible to smoothly merge the files together. Git will tell you what and where these are; you then need to decide what to keep and what to delete. #### Having Git ignore files {-} \index{git!ignore} \index{git!.gitignore} There may be files in your repository that you do not want to keep under version control. Maybe this is because they are very large files or cached files from *knitr* or other files that are byproducts of compiling a LaTeX document (see Chapter \@ref(StatsModel)). You also want to ignore files that contain private information. Make sure to **never include private information** (e.g. passwords or confidential data) in your Git history. Once they are committed, it will be very difficult to definitively remove them. Once they are on GitHub, they will be publicly accessible. To have Git ignore particular files, create a file called *.gitignore*.[^chapter_5_31] You can either put this file in the repository's root directory to create a *.gitignore* file for the whole repository or in a sub-directory to ignore files in that sub-directory. In the *.gitignore* file, add ignore rules by including the names of the files that you want to have Git ignore.
For example, GitHub has a *.gitignore* file that is useful for ignoring files[^chapter_5_github_r_ignore] that we often don't want to commit to our git history when using R and R Markdown: ```` # History files .Rhistory .Rapp.history # Session Data files .RData # Example code in package build process *-Ex.R # Output files from R CMD build /*.tar.gz # Output files from R CMD check /*.Rcheck/ # RStudio files .Rproj.user/ # produced vignettes vignettes/*.html vignettes/*.pdf # OAuth2 token .httr-oauth # knitr and R markdown default cache directories /*_cache/ /cache/ # Temporary files created by R markdown *.utf8.md *.knit.md ```` The asterisk (`*`) is a "wildcard" and stands for any sequence of characters. In other words, it tells Git to look for files with any name that end with a specified file extension. This is faster than writing out the full name of every file you want to ignore individually. It also makes it easy to copy the rules into new repos. For example, you'll notice the `*-Ex.R` and `/*_cache/` rules. These tell Git to ignore all of the files with a name ending in *-Ex.R* and all files in subdirectories with a name ending in *\_cache*. Git will not ignore files that have already been committed to a repository. To ignore these files, you will first need to remove them from Git with Git's `rm` (remove) command. If you wanted to remove a file called *example-project.tex* from version tracking type: \index{shell command!git rm} ````bash # Remove example-project.tex from Git version tracking git rm --cached example-project.tex ```` Using the `--cached` argument tells Git not to track the file, but not delete it. For more information on *.gitignore* files, see GitHub's reference page on the topic at: . ### Remote storage on GitHub {#NewGitHubRepo} So far we've been using repos stored locally. Let's now look at how to also store a repository remotely on GitHub.
You can either create a new repository on GitHub and download (`clone`) it to your computer or upload (`push`) an existing repository to a new GitHub remote repo. In both cases, you need to create a new repository on GitHub. To create a new repository on GitHub, go to your main GitHub account webpage and click the `New repository` button. On the next page that appears, give the repository a name, brief description, and choose whether to make it public or private. If you want to store an existing repository on GitHub, give it the same name as the one that already exists on your computer. If you already have files in your local repository do not check the boxes for creating *README.md*, *LICENSE*, and *.gitignore* files. When you then click `Create Repository`, you will be directed to the repository's GitHub webpage.[^chapter_5_32] #### Clone a new remote repository {- #GitClone} \index{shell command!git clone} \index{git!clone} If you are working with a new repository and do not have an existing version on your computer, you need to "clone" the GitHub repo to your computer. The repo's GitHub page contains a button called `Clone in Desktop`. Clicking this will open GUI GitHub (if it is installed) and prompt you to specify what directory on your computer you would like to clone the repository into. You can also use the `clone` command in the shell. Imagine that the URL for a repo called *Example Project* is `https://GitHub.com/USER/example-project.git`. 
To clone it into the */git_repositories* directory type:[^chapter_5_33] ````bash # Change working directory cd /git_repositories/ # Clone example-project git clone https://GitHub.com/USER/example-project.git ```` #### Push an existing repository to a new GitHub repo {- #RemoteAdd} \index{git!remote repository} \index{shell command!git remote} If you already have a repository with files in it on your computer and you want to store them remotely in a new GitHub repo, you need to add the remote repository and `push` your files to it. Type Git's `remote add` command. For example, if your repository's GitHub URL is `https://github.com/USER/example-project.git`, then type: ````bash # Change working directory to existing local repo cd /git_repositories/example-project # Add a remote (GitHub) repository to an existing repo git remote add origin https://github.com/USER/example-project.git ```` This will tell your local repository where the remote one is. Finally, push the repository to GitHub: \index{shell command!git push} ````bash # Push local repository to GitHub for the first time git push -u origin master ```` The `-u` (upstream tracking) option adds a tracking reference for the upstream (GitHub) repository branches. #### Pushing commits to a GitHub repo {-} \index{shell command!git push} Once you have your local repository connected to GitHub, you can add new commits with the `push` command. For example, if your current working directory is the Git repo you want to push and you have already added/committed the changes you want to include in the remote repo, type: ````bash # Add changes to the GitHub remote master branch git push origin master ```` The `origin` is the remotely stored repository on GitHub and `master` is the master branch. You can change this to another branch if you'd like. If you have not set up password caching[^chapter_5_34] you will now be prompted to give your GitHub username and password. You can also push your tags to GitHub. 
To push all of the tags to GitHub, type: ````bash git push --tags ```` Now on the repo's GitHub page, there will be a `Tags` section that will allow you to view and download the files in each tagged version of the repository. ### Accessing on GitHub #### Downloading into R {- #GitDownload #RawGitHub} In general, the process of downloading data directly into R is similar to what we saw earlier for loading data from Dropbox Public folders. We can use the `import()` function.\index{R function!import} First, we need to find our plain-text data file's *raw* URL. To do this, go to your repository's GitHub site, navigate to the file you want to load, and click the `Raw` button on the right just above the file preview. I have data in comma-separated values format stored in a GitHub repository.[^chapter_5_35] The URL for the raw (plain-text) version of the data is .[^chapter_5_36] ```{r Ch5URLAddress, message=TRUE, tidy=FALSE} # Place shortened URLinto url object url <- "http://bit.ly/14aSjxB" # Download data disprop_data <- rio::import(url, format = "csv") # Show variable names names(disprop_data) ``` `import()` downloaded the most recent version of the file from the master branch. We can actually use `import()` to download a particular version of a file---from a particular Git commit---directly into R. This makes reproducing a specific result much easier. To do this, you just need to use a file's raw URL from a particular commit. To find a file's particular commit raw URL first click on the file on GitHub's website. Then click the `History` button. This will take you to a page listing all of the file's versions. Click on the git commit hash button next to the version of the file that you want to use. Then click `View file` and finally the `Raw` button to be taken to the text-only version of the file. Copy this page's URL address and use it with `import()`. For example, I have an old version of the disproportionality data. 
To download it, I find this particular version of the file's URL and use it in `import()`: ```{r Ch5sourceDataOld, tidy=FALSE, message=FALSE, warning=FALSE} # Create object containing the file's URL old_url <- paste0("https://raw.githubusercontent.com/", "christophergandrud/", "Disproportionality_Data/", "1a59d360b36eade3b183d6336a", "2262df4f9555d1/", "Disproportionality.csv") # Download old disproportionality data disprop_old <- rio::import(old_url, format = "csv") ``` In this example I did not shorten the URL, but instead used the `paste0()`\index{R function!paste0} function to paste it together.[^chapter_5_37] You do not have to do this. I did it here so that the URL would fit on the printed page. #### Viewing files {-} The GitHub web user interface also allows you, your collaborators (see below) or, if the repo is public, anyone to look at text files from a web browser. Collaborators can actually also create, modify, and commit changes in the web user interface. This can be useful for making small changes, especially from a mobile device without a Git installation. Anyone with a GitHub account can suggest changes to files in a public repository on the repo's website. Simply click the `Edit` button (it looks like a pencil) above the file and make edits. If the person making the edits is not a designated collaborator, their edits will be sent to the repository's owner for approval.[^chapter_5_38] This can be a useful way for independent researchers to fix errors. #### Collaboration with GitHub {-} Repositories can have official collaborators that can make changes to files in the repo. Public repositories can have unlimited collaborators. Anyone with a GitHub account can be a collaborator. To add a collaborator to a repository you created, click on the `Settings` button on the repository's website (see Figure \@ref(fig:BookRepository)). Then click the `Collaborators` button on the left-hand side of the page. 
You will be given a box to enter your collaborator's GitHub username. If your collaborator doesn't have a GitHub account, they will have to create a new one. Once you add someone as a collaborator, they can clone the repository onto their computer as you did earlier and push changes. #### Syncing a repository {-} \index{shell command!git pull} \index{git!pull} If you and your collaborators are both making changes to the files in a repo you might create conflicting changes, i.e. different changes to the same part of a file. To avoid too many conflicts, it is a good idea to sync your local repository with the remote repository **before** you push your commits to GitHub. Use the `pull` command to sync your local and remote repository. First add and commit your changes, then type: ````bash git pull ```` If the files you are pulling conflict with your local files, you will probably want to resolve these in the individual files and commit the changes. When there are merge conflicts, Git adds both versions of a piece of text to the file. You then open the file and decide which version to keep and which one to delete. When the conflicts are resolved and changes committed, push your merged changes up to the remote repository as we did before. ### Summing up the GitHub workflow We've covered a lot of ground in this section. Let's sum up the basic GitHub workflow you will probably follow once your repo is set up. 1. Add any changes you've made with `git add`. 2. `commit` the changes. 3. `pull` your collaborators' changes from the GitHub repo, resolve any merge conflicts, and `commit` the changes. 4. `push` your changes to GitHub. 
```{r NewRStudioProject, fig.cap="Creating RStudio Projects", echo=FALSE, fig.align='center', out.width='50%'} knitr::include_graphics("images/chapter_5/GitNewProject.png") ``` ```{r NewProjectNewDirectory, fig.cap="Creating RStudio Projects in New Directories", echo=FALSE, fig.align='center', out.width='50%'} knitr::include_graphics("images/chapter_5/NewProject_NewDirectory.png") ``` ## RStudio and GitHub When you open a Project with a Git repository in RStudio, you will see a new *Git* tab next to *Environment*, *History*, and *Connections* (see Figure \@ref(fig:GitTab)). From here, you can do many of the things we covered in the previous section. Let's look at how to set up and use Git in RStudio Projects. ### Setting up Git/GitHub with Projects You can Git initialize new RStudio Projects, Git initialize existing projects, and create RStudio Projects from cloned repos. When you do any of these things, RStudio automatically adds a *.gitignore* file telling Git to ignore *.Rproj.user*, *.Rhistory*, and *.RData* files. #### Git with a new project {- #NewProjectGit} To create a new project with Git version control, go to `File` in the RStudio menu bar. Then click `New Project…`. In the box that appears (see Figure \@ref(fig:NewRStudioProject)) select `New Directory` `Empty Project`. Enter the Project's name and desired directory. Make sure to check the dialog box for `Create a git repository` (see Figure \@ref(fig:NewProjectNewDirectory)). #### Git initialize existing projects {-} If you have an existing RStudio Project and want to add Git version control to it, first go to `Tools` in the RStudio menu bar. Then select `Project Options …`. Select the `Git/SVN` icon. Finally, select `Git` from the drop-down menu for `Version Control System:`. #### Clone repository into a new project {-} Again go to `File` in the RStudio menu bar to create a new project from a cloned GitHub repository. Then click `New Project…`. Select the `Version Control` option and then `Git`. 
Finally, paste the repository's URL in the field called `Repository URL:`, enter the directory you would like to locate the cloned repo in, and click `Create Project`. #### Add existing Project repository to GitHub {-} You can push an existing Project repository stored on your computer to a new remote repository on GitHub. To do this, first create a new repo on GitHub with the same name as your RStudio Project (see Section \@ref(NewGitHubRepo)). Then copy the remote repository's URL like we saw before when we cloned a repository from GitHub (see Section \@ref(GitClone)). Open a new shell from within RStudio. To do this, click the `Shell` button in the *Git* tab's `More` drop-down menu (it looks like a gear). Now follow the same steps that we used in Section \@ref(RemoteAdd) to connect a locally stored repository to GitHub for the first time. ```{r GitTab, fig.cap="The Git Repository Tab in RStudio", echo=FALSE, fig.align='center', out.width="50%"} knitr::include_graphics("images/chapter_5/GitTab.png") ``` ```{r GitRstudioAdd, fig.cap="Adding Changes to the Repository", echo=FALSE, fig.align='center', out.width="50%"} knitr::include_graphics("images/chapter_5/GitAdd.png") ``` ### Using Git in RStudio Projects The RStudio *Git* tab allows you to do many of the same things with Git that we covered in the previous section. In Figure \@ref(fig:GitTab) you will see the *Git* tab for a new RStudio Project called *example-project*. It has two files that have not been added or committed to Git. To add and commit the files to the repository, click on the dialog boxes next to the file names. In Figure \@ref(fig:GitRstudioAdd) you can see that I've created a new R file called *example-script.R* and clicked the dialog box next to it, along with the other files. The yellow question marks in the top panel have now become green A's for "add". Clicking `Commit` opens a new window called **Review Changes** where you can commit the changes. 
Simply write a commit message in the box called *Commit Message* in the **Review Changes** window and click `Commit`. If you add file names to the *.gitignore* file, they will not show up in RStudio's *Git* tab. If you are using a Git repo that is associated with a remote repository on GitHub, you can pull and push it with the `Pull Branches` and `Push Branch` buttons in the *Git* tab's menu bar (the down and up arrows respectively). You can use the same buttons in the **Review Changes** window. The *Git* tab also allows you to change branches, revert to previous commits, add files to `.gitignore`, and view your commit history. You can always use the `More -> Shell…` option to open a new shell with the Project set as the working directory to complete any other Git task you might want to do. ### Chapter summary {-} In this chapter we have primarily learned how to store text-based reproducible research files in ways that allow us and others to access them easily from many locations, enable collaboration, and keep a record of previous versions. In the next chapter, we will learn how to use text-based files to reproducibly gather data that we can use in our statistical analyses. [^chapter_5_1]: These services store your data on remote servers. [^chapter_5_2]: I don't cover methods for storing and handling very large data sets, with high hundreds of thousands and more observations. For information on large data and R, not just storage, one place to look is this blog post from RDataMining: (posted 6 May 2012). One popular service for large file storage is Amazon S3 (). [^chapter_5_3]: Sometimes this format is called tab-delimited values. [^chapter_5_4]: Frequently the row names are just the row numbers which may have no substantive meaning. [^chapter_5_5]: `write.csv()` is a 'wrapper' for `write.table()`. [^chapter_5_6]: Dropbox provides a minimum amount of storage for free, above which they charge a fee. 
GitHub lets you create publicly accessible repositories---kind of like project folders---for free, but they charge for private repositories. [^chapter_5_7]: [^chapter_5_8]: [^chapter_5_9]: [^chapter_5_10]: [^chapter_5_11]: [^chapter_5_12]: [^chapter_5_13]: See for downloading and installation instructions. [^chapter_5_15]: This data is from [@gandrud2012]. I've shortened the URL using Bitly () so that it will fit on the page. [^chapter_5_16]: For more details, see . [^chapter_5_17]: I used Git version 2.20.1 for this book. [^chapter_5_18]: Because many of these features apply to any service that relies on Git, much of this list of advantages also applies to alternative Git cloud storage services such as Bitbucket (). [^chapter_5_19]: You can use a variety of other markup languages as well. See . [^chapter_5_20]: RStudio also supports the Subversion version control system, but I don't cover that here. [^chapter_5_21]: The interface for Git Bash looks a lot like the Terminal or Windows PowerShell. [^chapter_5_22]: Much of the discussion of the command-line in this section is inspired by Nick Farina's blog post on Git (see , posted 7 September 2012). [^chapter_5_23]: To follow along with this code, you will first need to create a folder called *git_repositories* in your root directory. Note also that throughout this section I use Unix file path conventions. [^chapter_5_24]: For a comprehensive guide to Git commands, see . [^chapter_5_25]: Other Git objects include trees (sort of like directories), tags (bookmarks for important points in a repo's history), and blobs (individual files). [^chapter_5_26]: You can also search your commit history and roll back to a previous commit using only the command-line. To see the commit history, use the `log` command (more details at ). When a repo has many commits, this can be a very tedious command to use, so I highly recommend the GUI version of GitHub or the repo's GitHub website. [^chapter_5_27]: Secure Hash Algorithm. 
This is a unique identifier for each commit. [^chapter_5_28]: If you don't, then the new branch will have a "detached head" which will create problems using the branch in the future. [^chapter_5_29]: To delete the *test* branch, use the `-d` argument, i.e. `git branch -d test`. [^chapter_5_30]: Any uncommitted changes are merged with a branch when it is checked out. [^chapter_5_31]: Note that like *.git*, *.gitignore* files are hidden. [^chapter_5_32]: Before the repo has any files in it, the webpage will include instructions for how to set it up on your computer. [^chapter_5_33]: If you are on the repo's webpage the URL to copy is under `HTTPS clone URL`. [^chapter_5_34]: See for more details. [^chapter_5_35]: For full information about the disproportionality data set, please see . [^chapter_5_36]: It has been shortened with Bitly in the example. [^chapter_5_37]: `paste0` is the same as `paste`, but has the argument `sep = ""` so that white space is not placed between the pasted elements. [^chapter_5_38]: This is called a `pull` request in Git terminology. See the next section for more details. [^chapter_5_github_r_ignore]: From: as of 26 December 2018. ================================================ FILE: rep-res-3rd-edition/08-gather.Rmd ================================================ # Gathering Data with R {#DataGather} How you gather your data directly impacts how reproducible your research will be. You should try your best to document every step of your data gathering process. Reproduction will be easier if your documentation---especially, variable descriptions and source code---makes it easy for you and others to understand what you have done. If all of your data gathering steps are tied by your source code, then independent researchers (and you) can more easily regather the data. Regathering data will be easiest if running your code allows you to get all the way back to the raw data files, the rawer the better. Of course, this may not always be possible. 
You may need to conduct interviews or compile information from paper-based archives, for example. Data hosted online may disappear when the host ceases operation. The best you can sometimes do is describe your data gathering process in detail or rehost an original data set. Nonetheless, R's automated data gathering capabilities for internet-based information are extensive. Learning how to take full advantage of these capabilities greatly increases reproducibility and can save you considerable time and effort over the long run. In this chapter we'll learn strategies for how to gather quantitative data in a fully reproducible way. We'll start by learning how to use data gathering makefiles to organize your whole data gathering process so that it can be completely reproduced. Then we will learn the details of how to actually load data into R from various sources, both locally on your computer and remotely via the internet. In the next chapter (Chapter \@ref(DataClean)), we'll learn the details of how to clean up raw data so that it can be merged into data frames that can be used for statistical analyses. ## Organize Your Data Gathering: Makefiles Before getting into the details of using R to gather data, let's start by creating a plan to organize the process. Organizing your data gathering process from the beginning of a research project improves the possibility of reproducibility and can save you significant effort over the course of the project by making it easier to add and regather data later on. A key part of reproducible data gathering with R, like reproducible research in general, is segmenting the process into modular files that can all be run in sequence by a common "makefile". In this chapter we'll learn how to create make-like files run exclusively from R as well as GNU Make makefiles,[^chapter_6_1] which you run from a shell.[^chapter_6_2] Learning how to create R make-like files is fairly easy. Using GNU Make does require learning some more new syntax. 
However, it has one very clear advantage: it only runs a source code file that has been updated since the last time you ran the makefile. This is very useful if part of your data-gathering process is very computationally and time intensive. Segmenting your data gathering into modular files and tying them with some sort of makefile allows you to more easily navigate research text and find errors in the source code. The makefile's output is the data set that you'll use in the statistical analyses. There are two types of source code files that the makefile runs: data gathering/cleanup files and merging files. Data cleanup files bring raw individual data sources into R and transform them so that they can be merged with data from the other sources. Many of the R tools for data cleanup and merging will be covered in Chapter \@ref(DataClean). In this chapter, we mostly cover the ways to bring raw data into R. Merging files are executed by the makefile after it runs the data gathering/cleanup files. It's a good idea to have the source code files use very raw data as input. Your source code should avoid directly changing these raw data files. Instead, changes should be put into new objects and data files. Doing this makes it easier to reconstruct the steps you took to create your data set. Also, while cleaning and merging your data you may transform it in unintended ways, for example, accidentally deleting some observations that you wanted to keep. Having the raw data makes it easy to go back and correct your mistakes. The files for the examples used in this section can be downloaded from GitHub at: . ### R Make-like files When you create make-like files in R to organize and run your data gathering, you usually only need one or two functions, `setwd()` and `source()`.\index{R function!setwd}\index{R function!source} As we talked about in Chapter \@ref(DirectoriesChapter), `setwd()` tells R where to look for and place files. 
`source()` tells R to run code in an R source code file.[^chapter_6_3] Let's see what an R data makefile might look like for a project with a file structure similar to the example project in Figure \@ref(fig:ExampleTree). The file paths in this example are for Unix-like systems and the make-like file is called *Makefile.R*. ```{r Ch6ExampleRMake, eval=FALSE} ################ # Example R make-like file # Christopher Gandrud # Updated 12 January 2019 ################ # Set working directory setwd("/example-project/data/") # Gather and cleanup raw data files with a for loop gatherers <- c("gather-1.R", "gather-2.R", "gather-3.R") for (i in gatherers) source(i) # Merge cleaned data frames into data frame object cleaned_data source("merge-data.R") ``` This code first sets the working directory. Then it runs three source code files to gather data from three different sources. These files gather the data and clean it so that it can be merged. The cleaned data frames are available in the current workspace. Next the code runs the *merge-data.R* file that merges the data frames and saves the output data frame as a CSV formatted file. The CSV file could be the main file we use for statistical analysis. *merge-data.R* also creates a Markdown file with a table describing the variables and their sources. We'll come back to how to create tables in Chapter \@ref(TablesChapter). You can run the commands in this file one-by-one or run the make-like file by putting it through the `source()` function\index{R function!source} so that it will run it all at once. ### GNU Make R make-like files are a simple way to tie together a segmented data gathering process. If one or more of the source files that our previous example runs is computationally intensive it is a good idea to run them only when they are updated. However, this can become tedious, especially if there are many segments. 
The well-established GNU Make command-line program[^chapter_6_4] deals with this problem by comparing the output files' time stamps[^chapter_6_5] to time stamps of the source files that created them. If a source file has a time stamp that is newer than its output, Make will run it. If the source's time stamp is older than its output, Make will skip it. In Make terminology the output files are called "targets" and the files that create them are called "prerequisites". You specify a "recipe" to create the targets from the prerequisites. The recipe is basically just the code you want to run to make the target file. The general form is: ```` TARGET ... : PREREQUISITE ... RECIPE ... ... ````` Note that, unlike in R, tabs are important in Make. They indicate what lines are the recipe. Make uses the recipe to ensure that targets are newer than prerequisites. If a target is newer than its prerequisite, Make does not run the prerequisite. The basic idea of reproducible data gathering with Make is similar to what we saw before, with a few twists and some new syntax. Let's see an example that does what we did before: gather data from three sources, clean and merge the data, and save it in CSV format. #### Example makefile {- #AsteriskWildcard} The first thing we need to do is create a new file called *Makefile*[^chapter_6_6] and place it in the same directory as the data gathering files we already have. The makefile we are going to create runs prerequisite files by the alphanumeric order of their file names. So we need to ensure that the files are named in the order that we want to run them. Now let's look at the actual makefile: ````bash ################ # Example Makefile # Christopher Gandrud # Updated 1 July 2013 # Influenced by Rob Hyndman (31 October 2012) # See: http://robjhyndman.com/researchtips/makefiles/ ################ # Key variables to define RDIR = . 
MERGE_OUT = merge-data.Rout # Create list of R source files RSOURCE = $(wildcard $(RDIR)/*.R) # Files to indicate when the RSOURCE file was run OUT_FILES = $(RSOURCE:.R=.Rout) # Default target all: $(OUT_FILES) # Run the RSOURCE files $(RDIR)/%.Rout: $(RDIR)/%.R R CMD BATCH $< # Remove Out Files clean: rm -fv $(OUT_FILES) # Remove merge-data.Rout cleanMerge: rm -fv $(MERGE_OUT) ```` Ok, let's break down the code. The first part of the file defines variables that will be used later on. For example, in the first line of executable code (`RDIR = .`) we create a simple variable[^chapter_6_7] called `RDIR` with a period (`.`) as its value. In Make and Unix-like shells, periods indicate the current directory. The next line allows us to specify a variable for the outfile created by running the *merge-data.R* file. This will be useful later when we create a target for removing this file to ensure that the *merge-data.R* file is always run. The third executed line (`RSOURCE = $(wildcard $(RDIR)/*.R)`) creates a variable containing a list of all the names of files with the extension `.R`, i.e. our data gathering and merge source code files.\index{wildcard} This line has some new syntax, so let's work through it. In Make (and Unix-like shells generally) a dollar sign (`$`) followed by a variable name substitutes the value of the variable in place of the name.[^chapter_6_8] For example, `$(RDIR)` inserts the period `.` that we defined as the value of `RDIR` previously. The parentheses are included to clearly demarcate where the variable name begins and ends.[^chapter_6_9] You may remember the asterisk (`*`) from the previous chapter. It is a "wildcard",\index{wildcard} a special character that allows you to select file names that follow a particular pattern. Using `*.R` selects any file name that ends in `.R`. Why did we also include the actual word `wildcard`? The `wildcard` function is different from the asterisk wildcard character. 
The function creates a list of files that match a pattern. In this case the pattern is `$(RDIR)/*.R`. The general form for writing the `wildcard` function is: `$(wildcard PATTERN)`. The fourth line (`OUT_FILES = $(RSOURCE:.R=.Rout)`) creates a variable for the `.Rout` files that Make will use to tell how recently each R file was run.[^chapter_6_10] `$(RSOURCE:.R=.Rout)` is a variable that uses the same file name as our RSOURCE files, but with the file extension `.Rout`. The second part of the makefile tells Make what we want to create and how to create it. In the line `all: $(OUT_FILES)` we are specifying the makefile's default target. Targets are the files that you instruct Make to make. `all:` sets the default target; it is what Make tries to create when you enter the command `make` in the shell with no arguments. We will see later how to instruct Make to compile different targets. The next two executable lines (`$(RDIR)/%.Rout: $(RDIR)/%.R` and `R CMD BATCH $<`) run the R source code files in the directory. The first line specifies that the `.Rout` files are the targets of the `.R` files. The percent sign (`%`) is another wildcard. Unlike the asterisk, it replaces the selected file names throughout the command used to create the target. The dollar and less-than signs (`$<`) indicate the first prerequisite for the target, i.e. the `.R` files. `R CMD BATCH` is a way to call R from a Unix-like shell, run source files, and output the results to other files.[^chapter_6_11] The out-files it creates have the extension `.Rout`. The next two lines specify another target: `clean`. When you type `make clean` into your shell, Make will follow the recipe: `rm -fv $(OUT_FILES)`. This removes (deletes) the `.Rout` files. The `f` argument (force) ignores files that don't exist and the `v` argument (verbose) instructs `rm` to tell you what it is doing when it runs. When you delete the `.Rout` files, Make will run all of the `.R` files the next time you call it. 
The last two lines help us solve a problem created by the fact that our simple makefile doesn't push changes downstream. For example, if we make a change to *gather-2.R* and run `make`, only *gather-2.R* will be rerun. The new data frame will not be added to the final merged data set. To overcome this problem, the last two lines of code create a target called `cleanMerge`; this removes only the *merge-data.Rout* file. #### Running the Makefile {-} To run the makefile for the first time, change the working directory to where the file is and type `make` into your shell. It will create the CSV final data file and four files with the extension `.Rout`, indicating when the segmented data gathering files were last run.[^chapter_6_12] When you run `make` in the shell for the first time, you should get the output: \index{shell command!make} ````bash ## R CMD BATCH gather-1.R ## R CMD BATCH gather-2.R ## R CMD BATCH gather-3.R ## R CMD BATCH merge-data.R ```` If you run it a second time without changing the R source files, you will get the following output: ```` ## make: Nothing to be done for 'all'. ```` To remove all of the `.Rout` files, set the make target to `clean`: ````bash make clean ## rm -fv ./gather-1.Rout ./gather-2.Rout ./gather-3.Rout ## ./merge-data.Rout ## ./gather-1.Rout ## ./gather-2.Rout ## ./gather-3.Rout ## ./merge-data.Rout ```` If we run the following code: ````bash # Remove merge-data.Rout and make all R source files make cleanMerge all ```` then Make will first remove the *merge-data.Rout* file (if there is one) and then run all of the R source files as need be. *merge-data.R* will always be run. This ensures that changes to the gathered data frames are updated in the final merged data set. #### Makefiles and RStudio Projects {-} You can run makefiles from RStudio's *Build* tab. For the type of makefile we have been using, the main advantage of running it from within RStudio is that you don't have to toggle between RStudio and the shell. 
Everything is in one place. Imagine that the directory with our makefile is an RStudio Project. If a Project already contains a makefile, RStudio will automatically open a *Build* tab on the *Environment/History/Connections* pane, the same place where the *Git* tab appears (see Figure \@ref(fig:Ch6BuildTab)).[^chapter_6_13] The *Build* tab has buttons you can click to `Build All` (this is equivalent to `make all`), and, in the `More` drop-down menu, `Clean all` (i.e., `make clean`) and `Clean and Rebuild` (i.e., `make clean all`). As you can see in Figure \@ref(fig:Ch6BuildTab), the *Build* tab shows you the same output you get in the shell. ```{r Ch6BuildTab, fig.cap="The RStudio Build Pane", echo=FALSE, out.width="50%"} knitr::include_graphics("images/chapter_6/BuildTab.png") ``` #### Other information about makefiles {-} Note that Make relies heavily on commands and syntax of the shell program that you are using. The above example was written and tested on a Mac. It should work on other Unix-like computers without modification. You can use Make to build almost any project from the shell, not just to run R source code files. It was an integral part of early reproducible computational research [@fomel2009; @buckheit1995]. Rob Hyndman more recently posted a description of the makefile he uses to create a project with R and LaTeX.[^chapter_6_14] The complete source of information on GNU Make is the official online manual. It is available at: . ## Importing Locally Stored Data Sets Now that we've covered the big picture, let's learn the different tools you will need to know to gather data from different types of sources. The most straightforward place to load data from is a local file, e.g. one stored on your computer. Though storing your data locally does not really encourage reproducibility, most research projects will involve loading data this way at some point. 
The tools you will learn for importing locally stored data files will also be important for most of the other methods further on. Data stored in plain-text files on your computer can be loaded into R using the `read.table()` function.\index{R function!read.table} For example, imagine we have a CSV file called *test-data.csv* stored in the current working directory. To load the data set into R, type: ```{r Ch6LocalReadTable, eval=FALSE} test_data <- read.table("test-data.csv", sep = ",", header = TRUE) ``` If you are using RStudio, you can do the same thing with drop-down menus. To open a plain-text data file, click on `Environment`, then `Import Dataset…`, then `From Text File…`. In the box that pops up, specify the column separator, whether or not you want the first line to be treated as variable labels, and other options. This is initially easier than using `read.table()`, but it is much less reproducible. If the data is not stored in plain-text format but is instead saved in a format created by another statistical program such as SPSS, SAS, or Stata, we can import it using commands in the *foreign* package. For example, imagine we have a data file called *data-1.dta* stored in our working directory. This file was created by the Stata statistical program. To load the data into an R data frame object called *stata_data*, type: ```{r Ch6Stata, eval=FALSE, tidy=FALSE, echo=TRUE} # Load foreign package library(foreign) # Load Stata formatted data stata_data <- read.dta(file = "data-1.dta") ``` As you can see, functions in the *foreign* package have similar syntax to `read.table()`. To see the full range of commands and file formats that the *foreign* package supports, use the following: \index{R function!read.dta} ```{r Ch6ForeignHelp, eval=FALSE, echo=TRUE} library(help = "foreign") ``` Typically an even simpler solution is to use `import()`\index{R function!import} from the *rio* package. It will automatically try to find the right way to parse whatever data format you give it.
For example: ````r stata_data <- rio::import("data-1.dta") ```` If you have data stored in a spreadsheet format such as Excel's *.xlsx*,\index{Excel} it may be best to first clean up the data in the spreadsheet program by hand and then save the file in plain-text format. When you clean up the data, make sure that the first row has the variable names and that observations are in the following rows. Also, remove any extraneous information such as notes, colors, and so on that will not be part of the data frame. `import()` can also attempt to import *.xlsx* files. This is much easier if they are cleaned up to resemble text files. To aid reproducibility, locally stored data should include careful documentation of where the data came from and how, if at all, it was transformed before it was loaded into R. Ideally, the documentation would be written in a text file saved in the same directory as the raw data file. ## Importing Data Sets from the Internet There are many ways to import data that is stored on the internet directly into R. We have to use different methods depending on where and how the data is stored. ### Data from non-secure (*http*) URLs Importing data into R that is located at a non-secure URL[^chapter_6_15]---ones that start with *http*---is straightforward, provided that: - the data is stored in a simple format, e.g. plain-text, - the file is not embedded in a larger HTML website. We already discussed the first issue in detail. You can determine if the data file is embedded in a website by opening the URL in your web browser. If you only see the raw plain-text data, you are probably good to go. To import the data, include the URL as the file's name in your `read.table()` function.\index{R function!read.table} ### Data from secure (*https*) URLs {#SecureDataDownload} Storing data at non-secure URLs is now very uncommon. Services like Dropbox, GitHub, and Dataverse store their data at secure URLs. 
You can tell if the data is stored at a secure web address if it begins with `https` rather than `http`. We have to use a different function to download data from secure URLs. As we saw last chapter, in Section \@ref(EnablePublicFolder), `import()` has no problem gathering data from secure URLs, e.g.: ```{r Ch6importhttps} # Place the URL into the object fin_url fin_url <- "https://bit.ly/2xlQ2j5" # Download data fin_regulator <- import(fin_url, format = "csv") ``` ### Compressed data stored online Sometimes data files are large, making them difficult to store and download without compressing them. There are a number of compression methods such as Zip and Tar.[^chapter_6_19] Zip files have the extension `.zip` and Tar files use extensions such as `.tar` and `.gz`. In most cases[^chapter_6_20] you can download, decompress, and create data frame objects from these files directly in R. To do this, you need to[^chapter_6_21] - create a temporary file with `tempfile()`\index{R function!tempfile} to store the zipped file, which you will later remove with `unlink()` at the end, - download the file with `download.file()`,\index{R function!download.file} - decompress the file with one of the commands in base R,[^chapter_6_22] - read the file with `read.csv()` or `import()`.\index{R function!read.csv}\index{R function!import} The reason that we have to go through so many extra steps is that compressed files are more than just a single file and contain a number of files as well as metadata. Let's download a file called *uds_summary.csv* from [@pemstein2010]. It is stored in a compressed file called *uds_summary.csv.gz*. At the time of writing, the file's URL address is <http://www.unified-democracy-scores.org/files/20140312/z/uds_summary.csv.gz>. ```{r Ch5ZipDownload, warning=FALSE, tidy=FALSE, eval=FALSE, size='footnotesize'} # For simplicity, store the URL in an object called 'URL' URL <- paste0("http://www.unified-democracy-scores.org/", "files/20140312/z/uds_summary.csv.gz") # Create a temporary file called 'temp' to put the zip file into.
temp <- tempfile() # # Download the compressed file into the temporary file download.file(URL, temp) # # Decompress the file and convert it into a data frame uds_data <- read.csv(gzfile(temp, "uds_summary.csv")) # # Delete the temporary file unlink(temp) # # Show variables in data names(uds_data) ``` Note I used `paste0()`\index{R function!paste0} to split the URL over two lines so I could print the whole URL on this page. ### Data APIs and feeds There are a growing number of packages that can gather data directly from a variety of internet sources and import them into R. Most of these packages use the sources' web application programming interfaces (APIs). Web APIs allow programs to interact with a website. Needless to say, this is great for reproducible research. It not only makes the data gathering process easier as you don't have to download many Excel files and fiddle around with them before even getting the data into R, but it also makes replicating the data gathering process much more straightforward and makes it easy to update data sets when new information becomes available. **Warning**: An R package that downloads data from an API will only work as long as the package maintainer keeps up with changes made to the API and the service the API calls still exists. If one of these conditions doesn't hold, the function call will break. It will not be possible to easily reproduce the data gathering process. Because of these threats to reproducibility, I recommend saving a copy of the data you download and considering making it available for replication. #### API R package example {-} Each of these packages has its own syntax and it isn't possible to go over all of them here. Nonetheless, let's look at an example of accessing World Bank data with the *WDI* package to give you a sense of how these packages work. Imagine that we want to gather data on fertilizer consumption.
We can use *WDI*'s `WDIsearch()` function\index{R function!WDIsearch} to find fertilizer consumption data available at the World Bank: ```{r Ch6WDIsearch} # Load WDI package library(WDI) # Search World Bank for fertilizer consumption data WDIsearch("fertilizer consumption") ``` This call returns a selection of indicator numbers and their names.[^chapter_6_31] Let's gather data on countries' fertilizer consumption in kilograms per hectare of arable land. The indicator number for this variable is: `AG.CON.FERT.ZS`. We can use the function `WDI()` to gather the data and put it in an object called *fert_cons_data*. ```{r Ch6WDIFert, eval=TRUE} fert_cons_data <- WDI(indicator = "AG.CON.FERT.ZS", start = 2010, end = 2016) ``` The `start` and `end` arguments allow us to set the starting and ending year of the data to download. The data we downloaded looks like this: ```{r Ch6HeadFert} head(fert_cons_data) ``` You can see that `WDI` has downloaded data for four variables: **iso2c**,[^chapter_6_32] **country**, **AG.CON.FERT.ZS** and **year**. ## Advanced Automatic Data Gathering: Web Scraping If a package does not already exist to access data from a particular website, there are other ways to automatically "scrape" data with R. This section briefly discusses some of R's web scraping tools and techniques to get you headed in the right direction to do more advanced data gathering. #### The general process {-} Simple web scraping involves downloading a file from the internet, parsing it (i.e. reading it), and extracting the data you are interested in then putting it into a data frame object. We already saw a simple example of this when we downloaded data from a secure HTTPS website. The complexity of this process depends on how structured the data is. If the data is in a CSV file, then all we need is the `import()` function. Less structured data requires more effort to download and parse. 
For example, data may be stored in an HTML formatted table within a more complicated HTML marked up webpage. The *XML* package [@R-XML] has a number of useful functions such as `readHTMLTable()`\index{R function!readHTMLTable} for parsing and extracting this kind of data. The *XML* package also clearly has functions for handling XML---Extensible Markup Language---formatted data. In addition, the helpful *rvest* [@R-rvest] package provides a set of functions with capabilities similar to and often more capable than *XML*. If the data is stored in JSON---JavaScript Object Notation---you can read it with a package like *jsonlite* [@R-jsonlite]. There are more websites with APIs than R packages designed specifically to access each one. If an API is available, the *httr* package [@R-httr] may be useful. #### More tools to learn for web scraping {-} Beyond learning about the various R packages that are useful for R web scraping, an aspiring web scraper should probably invest time learning a number of other skills: - HTML: Obviously you will encounter a lot of HTML markup when web scraping. Having a good understanding of the HTML markup language will be very helpful. W3 Schools (<https://www.w3schools.com/>) is a free resource for learning HTML as well as JSON, JavaScript, XML, and other languages you will likely come across while web scraping. - Regular Expressions: Web scraping often involves finding character patterns. Some of this is done for you by the R packages above that parse text. There are times, however, when you are looking for particular patterns, like tag IDs, that are particular to a given website and change across the site based on a particular pattern. You can use regular expressions to deal with these situations. R has a comprehensive, if bare-bones, introduction to regular expressions. To access it, type `?regex` into your R console. - Looping: Web scraping often involves applying a function to multiple things, e.g. tables or HTML tags.
To do this in an efficient way, you will need to use loops and `apply` functions.\index{R function!apply} @matloff2011 provides a comprehensive overview. The *dplyr* [@R-dplyr] and *purrr* [@R-purrr] packages are useful for data frame and vector manipulation. - Finally, @munzert2015 provide a comprehensive overview of web scraping and text mining with R. ### Chapter summary {-} In this chapter, we learned how to reproducibly gather data from a number of sources. If the data we are using is available online, we may be able to create really reproducible data gathering files. These files have commands that others can execute with makefiles that allow them to actually regather the exact data we used. The techniques we can use to gather online data also make it easy to update our data when new information becomes available. Of course, it may not always be possible to have really reproducible data gathering. Nonetheless, you should always aim to make it clear to others (and yourself) how you gathered your data. In the next chapter, we will learn how to clean and merge multiple data files so that they can easily be used in our statistical analyses. [^chapter_6_1]: GNU stands for "GNU's Not Unix", indicating that it is Unix-like. [^chapter_6_2]: To standardize things, I use the terms "R make-like file" for files created and run in R and the standard "makefile" for files run by Make. [^chapter_6_3]: We use the command more in Chapter \@ref(StatsModel). [^chapter_6_4]: GNU Make was originally developed in 1977 by Stuart Feldman as a way to compile computer programs from a series of files, its primary use to this day. For an overview, see . For installation instructions, please see Section \@ref(InstallMake). [^chapter_6_5]: A file's time stamp records the time and date when it was last changed. [^chapter_6_6]: Alternatively, you can call the file *GNUmakefile* or *makefile*. [^chapter_6_7]: Simple string variables are often referred to as "macros" in GNU Make. 
A common convention in Make and Unix-like shells generally is to use all caps for variable names. [^chapter_6_8]: This is a kind of parameter expansion. For more information about parameter expansion, see @frazier2008. [^chapter_6_9]: Braces (`{}`) are also sometimes used for this. [^chapter_6_10]: The R out-file contains all of the output from the R session used while running the file. These can be a helpful place to look for errors if your makefiles give you an error like `make: *** [gather.Rout] Error 1`. [^chapter_6_11]: You will need to make sure that R is in your PATH. Setting this up is different on different systems. If on Mac and Linux you can load R from the Terminal by typing `R`, R is in your PATH. The R installation usually sets this up correctly. There are different methods for changing the file path on different versions of Windows. [^chapter_6_12]: If you open these files, you will find the output from the R session used when their source file was last run. [^chapter_6_13]: If a project doesn't have a makefile, you can still set up RStudio Build. Click on `Build` in the Menu bar then `Configure Build Tools . . .`. Select `Makefile` from the drop-down menu, then `Ok`. You will still need to manually add a Makefile in the Project's root directory. [^chapter_6_14]: See his blog at: , which was posted 31 October 2012. This method largely replicates what we do in this book with *knitr*. Nonetheless, it has helpful information about Make that can be used in other tasks. It was in fact helpful for writing this section of the book. [^chapter_6_15]: URL stands for "Uniform Resource Locator". [^chapter_6_17]: Remember we placed the file's raw GitHub URL address inside of the object *url_address*. [^chapter_6_19]: Tar archives are sometimes referred to as 'tar balls'. [^chapter_6_20]: Some formats that require the *foreign* package to open are more difficult. 
This is because functions such as `read.dta()` for opening Stata files only accept file names or URLs as arguments, not connections, which you create for unzipped files. [^chapter_6_21]: The description of this process is based on a Stack Overflow comment by Dirk Eddelbuettel (see , posted 10 June 2010.) [^chapter_6_22]: To find a full list of functions, type `?connections` into the R console. [^chapter_6_23]: [^chapter_6_24]: [^chapter_6_25]: [^chapter_6_26]: [^chapter_6_27]: [^chapter_6_28]: [^chapter_6_29]: [^chapter_6_30]: [http://stats.stackexchange.com/questions/12670/data-apis-feeds-available-as-packages-in-r](http://stats.stackexchange.com/questions/12670/data-apis-feeds-available-as-packages-in-r) [^chapter_6_31]: You can also search the World Bank Development Indicators website. The indicator numbers are at the end of each indicator's URL. [^chapter_6_32]: These are the countries' or regions' International Standards Organization's two-letter codes. For more details, see . ================================================ FILE: rep-res-3rd-edition/09-clean.Rmd ================================================ # Preparing Data for Analysis {#DataClean} Once we have gathered the raw data that we want to include in our statistical analyses, we generally need to clean it up so that it can be merged into a single data set that we can easily use for statistical analysis. In this chapter we will learn how to create the data gathering and merging files we saw in the last chapter. This includes recoding and transforming variables in the data set so that the data sets can be easily merged. This will also be useful information in later chapters. If you are very familiar with data transformations in R, you may want to skip to the next chapter. ## Cleaning Data for Merging In order to successfully merge two or more data frames, we need to make sure that they are in the same format.
Let's look at some of the important formatting issues and how to reformat your data frames so that they can be easily merged. ### Get a handle on your data Before doing anything to your data, it is a good idea to 'look at it' to see what needs to be done. Taking a little time to become acquainted with your data will help you avoid many error messages and much frustration. You could type a data frame object's name into the R console. This will print the entire data frame in your console. For data frames with more than a few variables and observations, this is impractical. We have already seen a number of functions that are useful for looking at parts of your data. As we saw in Chapter \@ref(GettingStartedRKnitr), the `names()`\index{R function!names} function shows you the variable names in a data frame object. The `head()`\index{R function!head} function shows the names plus the first few observations in a data frame. `tail()`\index{R function!tail} shows the last few. `str()`\index{R function!str} returns a summary of a data frame, including the number of observations and variables as well as the variable types. Use the `dim()`\index{R function!dim} (dimensions) function to quickly see the number of observations and variables (the number of rows and columns) in a data frame object. For example, let's test out `dim()` with the *fert_cons_data* object we created in Chapter \@ref(DataGather): ```{r Ch7dim} dim(fert_cons_data) ``` The first number is the number of rows in the data frame (`r nrow(fert_cons_data)`), and the second is the number of columns (`r ncol(fert_cons_data)`). You can also use the `nrow()`\index{R function!nrow} function to find just the number of rows and `ncol()`\index{R function!ncol} to see only the columns. The `summary()`\index{R function!summary} function is especially helpful for seeing basic descriptive statistics for all of the variables in a data frame and also the variable types.
Here is an example: ```{r Ch7SummaryExamp} summary(fert_cons_data) ``` We can immediately see that the variables **iso2c** and **country** are character strings. Because `summary()` is able to calculate means, medians, and so on for **AG.CON.FERT.ZS** and **year**, we know they are numeric. Have a look over the summary to see if there is anything unexpected like lots of missing values (**NA's**)\index{NA}\index{missing values} or unusual maximum and minimum values. You can, of course, run `summary()` on a particular variable by using the component selector (`$`): \index{component selector} ```{r Ch7SummarizeCompSelect} # Summarize fertilizer consumption variable from fert_cons_data summary(fert_cons_data$AG.CON.FERT.ZS) ``` We'll come back to why knowing this type of information is important for merging and data analysis later in this chapter. Another important function for quickly summarizing a data frame is `table()`.\index{R function!table} This creates a contingency table\index{contingency table} with counts of the number of observations per combination of factor variables. You can view a portion of a data frame object with `View()`.\index{R function!View} This will open a new window that lets you see a selection of the data frame. If you are using RStudio, you can click on the data frame in the *Environment* tab\index{RStudio!Environment tab} and you will get something similar. Note that neither of these viewers is interactive in that you can't use them to manipulate the data. They are only data viewers. To be able to see similar windows that you can interactively edit, use the `fix()`\index{R function!fix} function in the same way that you use `View()`. This can be useful for small edits, but remember that the edits are not reproducible.
### Tibbles {-} Most of these data summary capabilities come "for free" when you use an alternate type of data frame called a "tibble" [@R-tibble]. For example: \index{R function!tibble} ```{r Ch7Tibble} # Create example tibble data frame tbl_ex <- tibble::tibble(numbers = 1:26, letters = letters) tbl_ex ``` Entering a tibble's object name in the console returns the condensed output, the data dimensions, and the variable types with the first 10 entries. Tibbles are the data structure favored by the tidy data/tidyverse R data paradigm [@wickham2014article]. We will work with other packages of the Tidyverse, e.g. *dplyr* and *ggplot2*, in later chapters.\index{Tidyverse} Note that these packages often work with traditional data frames as well (or will convert data frames to tibbles automatically). ### Reshaping data {#GatherReshape} \index{R!reshaping data} It is often a good idea to keep your data sets in data frame type objects if that is the format you will use for analysis. See Chapter \@ref(GettingStartedRKnitr) for how to convert objects into data frames with the `data.frame()` function.\index{R function!data.frame}\index{R!data frame} Not only do data sets (generally) need to be stored in data frame objects, they also need to have the same layout before they can be merged. Most R statistical analysis tools assume that your data is in "long" format\index{long formatted data}. For an excellent discussion of ideal data formats for statistical analysis, see @wickham2014article. Long formatted data usually has columns that represent variables. Rows contain specific observations. For example: \begin{table}[h!]
\caption{Long-Formatted Data Example} \label{ExampleLong} \begin{tabular}{l c} \\[0.15cm] \hline Subject & Variable1 \\ \hline \\[0.1cm] Subject1 & \\[0.25cm] Subject2 & \\[0.25cm] Subject3 & \\[0.25cm] \ldots & \\[0.25cm] \hline \end{tabular} \end{table} In this chapter we will mostly use examples of time-series cross-sectional data (TSCS)\index{time-series cross-sectional}\index{TSCS} that we want to have in long-format. Long-formatted TSCS data is a data frame where rows identify observations of a particular subject at particular points in time and there are multiple observations per subject (see Table \@ref(ExampleTSCSLong)). In this chapter our TSCS data is specifically going to be countries that are observed in multiple years. \begin{table}[h!] \caption{Long-Formatted Time-Series Cross-Sectional Data Example} \label{ExampleTSCSLong} \begin{tabular}{l c c} \\[0.15cm] \hline Subject & Time & Variable1 \\ \hline \\[0.1cm] Subject1 & 1 & \\[0.25cm] Subject1 & 2 & \\[0.25cm] Subject1 & 3 & \\[0.25cm] Subject2 & 1 & \\[0.25cm] Subject2 & 2 & \\[0.25cm] Subject2 & 3 & \\[0.25cm] \ldots & & \\[0.25cm] \hline \end{tabular} \end{table} If one of our raw data sets is not in this format, then we will need to reshape or, using Wickham's [-@wickham2014article] terminology, "tidy" it.\index{reshape data}\index{tidy data} Some data sets are in "wide" format,\index{wide formatted data} where one of the columns in what would be long formatted data is "widened" to cover multiple columns. This is confusing to imagine without an example. Table \@ref(ExampleWide) shows how Table \@ref(ExampleTSCSLong) looks when we widen the time variable. \begin{table}[h!] \caption{Wide-Formatted Data Example} \label{ExampleWide} \begin{tabular}{l c c c} \\[0.15cm] \hline Subject & Time1 & Time2 & Time3 \\ \hline \\[0.1cm] Subject1 & & & \\[0.25cm] Subject2 & & & \\[0.25cm] \ldots & & & \\[0.25cm] \hline \end{tabular} \end{table} The process of tidying data often causes confusion and frustration. 
Though probably never easy, there are a number of useful R functions for changing data from wide-format to long and vice versa. These include the matrix transpose function (`t()`)[^chapter_7_transpose]\index{matrix transpose} and the `reshape()`\index{R function!reshape} function, both are loaded in R by default. *tidyr* [@R-tidyr] is a very helpful package for reshaping data.\index{R package!tidyr} This package has more general tools for reshaping data and is worth investing some time to learn well. In this section, we will look at *tidyr*'s `pivot_longer()`\index{R function!pivot\_longer} function and use it to reshape a TSCS data frame from wide- to long-format. We will also encounter this function again in Chapter \@ref(FiguresChapter) when we want to transform data so that it can be graphed. Note that if you want to go from long to wide-format, use *tidyr*'s `pivot_wider()`\index{R function!pivot\_wider} function. For illustration, let's imagine that the fertilizer consumption data we previously downloaded from the World Bank is in wide, rather than long, format and is in a data frame object called *fert_wide*. It looks like this: ```{r Ch7WideCreate, include=FALSE} # Reshape wide fert_wide <- tidyr::pivot_wider(fert_cons_data, names_from = year, values_from = AG.CON.FERT.ZS) # Order by country fert_wide <- dplyr::arrange(fert_wide, country) ``` ```{r Ch7ShowWideFert} fert_wide[, 1:4] ``` See the chapter's Appendix for the full code I used to reshape the data from long- to wide-format. Let's think about how we want to tidy the data. We want to create two new columns from the many columns that are now labeled by year. Let's call the new columns **Year** and **Fert**. The **Year** column will clearly contain the year of each observation and **Fert** will contain the fertilizer consumption. **Year** will be what `pivot_longer()` calls the variable's "name" and **Fert** is the "value". 
In our *fert_wide* data, we don't want the **iso2c** and **country** variables to be gathered. These variables identify the data set's subjects. So we can tell `pivot_longer()` that we only want the columns with names between **2016** and **2010** to be used for the long variable. Note that the back ticks in the code below allow us to specify numeric values as column names. ```{r Ch7GatherFert, tidy=FALSE} # Gather fert_wide fert_long <- tidyr::pivot_longer(fert_wide, cols = `2016`:`2010`, names_to = "Year", values_to = "Fert") fert_long ``` ### Renaming variables \index{R!renaming variables} Frequently, in the data cleaning process we want to change the names of our variables. This will make our data easier to understand and may even be necessary to properly combine data sets (see below). In the previous example, for instance, our `fert_long` data frame has two new variables: **Year** and **Fert**. Imagine, for the sake of demonstration, that we want to rename them **year** and **fert_cons**. Renaming data frame variables is straightforward with the `rename()`\index{R function!rename}\index{rename variable} function in the *dplyr* package [@R-dplyr]. To rename both **Year** and **Fert** with the `rename()` function, type: ```{r Ch7Rename, tidy=FALSE} fert_long <- dplyr::rename(fert_long, year = Year, fert_cons = Fert) fert_long ``` ### Ordering data \index{R!ordering data} You may have noticed that as a result of gathering *fert_wide* the data is now ordered by country-year. Imagine that for some substantive reason that makes the data easier to read, we would rather have it ordered by year-country. Though not required for merging in R, some statistical analyses assume that the data is ordered in a specific way.
We can order observations in our data set using the `order()` function.\index{R function!order}\index{R!sort}\index{R!order} For example, to order *fert_long* by year-country, we type: ```{r Ch7Order, tidy=FALSE} # Order fert_long by year-country fert_long <- fert_long[order(fert_long$year, fert_long$country), ] head(fert_long) ``` *dplyr* has a function called `arrange()`\index{R function!arrange} that can also be useful for ordering your data. `arrange()`'s syntax is much cleaner and easier to remember for data frames than the operation we did with `order()`. To arrange the *fert_long* data back to country-year with `arrange()` use: ```{r Ch7Arrange} fert_long <- dplyr::arrange(fert_long, country, year) ``` To arrange a variable in descending order, place it in the `desc()` function from *dplyr*, e.g. `arrange(fert_long, country, desc(year))`.\index{R function!desc} ### Subsetting data \index{R!subsetting data} Sometimes you may want to use only a subset of a data frame. For example, the density plot in the following figure shows us that the *fert_long* data has a few very extreme values (see the chapter's Appendix for the source code to create this figure). ```{r fert_consDens, echo=FALSE, fig.height=3, fig.width=3, warning=FALSE} ggplot2::ggplot(data = fert_long, aes(fert_cons)) + geom_density() + xlab("\n Fertilizer Consumption") + ylab("Density\n") + theme_bw() ``` We can use the `subset()`\index{R!subset}\index{R function!subset} function to examine these outliers,\index{outliers} for example, countries that have fertilizer consumption greater than 1000 kilograms per hectare. ```{r Ch7SubsetOutliers, tidy=FALSE} # Create outlier data frame fert_outliers <- subset(x = fert_long, fert_cons > 1000) fert_outliers ``` If we want to drop these outliers from our data set, we can use `subset()` again: ```{r Ch7SubsetNoOutliers, tidy=FALSE} fert_long_sub <- subset(x = fert_long, fert_cons <= 1000) ``` In this example, non-country units like "Arab World" are included. 
We might also want to drop these units with `subset()`. For example: ```{r Ch7DropString, tidy=FALSE} fert_long_sub <- subset(x = fert_long_sub, country != "Arab World") ``` We can also use `subset()` to remove observations with missing values (`NA`) for **fert_cons**. ```{r Ch7IsNotNA, tidy=FALSE} # Remove observations of fert_cons # with missing values fert_long_sub <- subset(x = fert_long_sub, !is.na(fert_cons)) # Summarize fert_cons summary(fert_long_sub$fert_cons) ``` \begin{table} \caption{R's Logical Operators} \label{LogicalOp} \begin{center} \begin{tabular}{l l} \hline\vspace{0.15cm} Operator & Meaning \\ \hline\hline \\ \verb|<| & less than \\ \verb|>| & greater than \\ \verb|==| & equal to \\ \verb|<=| & less than or equal to \\ \verb|>=| & greater than or equal to \\ \verb|!=| & not equal to \\ \verb+a | b + & a or b \\ \verb|a & b| & a and b \\ \verb|isTRUE(a)| & determine if a is TRUE \\ \hline \\ \verb|is.na| & missing\\ \verb|!is.na| & not missing \\ \verb|duplicated| & duplicated observation \\ \verb|!duplicated| & not a duplicated observation \\ \hline \end{tabular} \end{center} \end{table} Let's step back. I've introduced a number of new logical operators and a new function in the subsetting examples. The first example included the greater than sign (`>`). The second example included the less than or equal to operator: `<=`. The third example included the not equal operator: `!=`. In R, exclamation points (`!`) generally denote 'not'. We used this again in the final example in combination with the `is.na` function. This function indicates if an element is missing, so `!is.na` means "not missing". See Table \@ref(LogicalOp) for a list of R's logical operators. You can use these operators and functions when subsetting data and throughout R. ### Recoding string/numeric variables You may want to recode your variables. In particular, when you merge data sets you need to have **identical** identification values that R can use to match each observation. 
If in one data set observations for the Republic of Korea are referred to as "Korea, Rep." and in another they are labeled "South Korea", R will not know to merge them. We need to recode values in the variables that we want to match our data sets on. For example, in *fert_long_sub* the southern Korean country is labeled "Korea, Rep.". To recode it to "South Korea", type: ```{r Ch7RecodeString, tidy=FALSE} # Recode country == "Korea, Rep." to "South Korea" fert_long_sub$country[fert_long_sub$country == "Korea, Rep."] <- "South Korea" ``` This code assigns "South Korea" to all values of the **country** variable that equal "Korea, Rep.".[^chapter_7_1] You can use a similar technique to recode numeric variables as well. The only difference is that you omit the quotation marks. We will look at how to code factor variables later. ### Creating new variables from old As part of your data cleanup process (or later during statistical analysis), you may want to create new variables based on existing variables. For example, we could create a new variable that is the natural logarithm of **fert_cons**. To do this, we run the variable through the `log()` function and assign the result to a new variable that we'll call **fert_cons_log**. ```{r Ch7LogFertComsump, tidy=FALSE} fert_long_sub$fert_cons_log <- log(fert_long_sub$fert_cons) summary(fert_long_sub$fert_cons_log) ``` Imagine that when we summarized the new log-transformed variable we had a minimum (and mean) value of `-Inf`.\index{infinity}\index{R!inf} This would indicate that by logging the variable we have created observations with the value negative infinity. R calculates the natural logarithm of zero as negative infinity.[^chapter_7_2] We probably don't want negative infinity values. There are a few ways to deal with this. We could drop all observations of **fert_cons** with the value zero before log transforming it. Another common solution is recoding zeros as some small positive number like 0.001.
For example: ```{r Ch7LogFertComsumpAgain, tidy=FALSE, eval=FALSE} # Recode zeros in Fertilizer Consumption fert_long_sub$fert_cons[fert_long_sub$fert_cons == 0] <- 0.001 # Natural log transform Fertilizer Consumption fert_long_sub$fert_cons_log <- log(fert_long_sub$fert_cons) ``` Note that this example is included to demonstrate R syntax rather than to prescribe a certain transformation of skewed data with zeros. The choice of which transformation to make should ultimately be made based on the data, model, and context. See @hyndman2010 for more information on various alternatives including Box-Cox [@box1964analysis] and inverse hyperbolic sine transformations [@burbidge1988]. \begin{table} \caption{Example Factor Levels} \label{ExampleFactorRecode} \begin{center} \begin{tabular}{l l p{4cm}} \hline Number & Label & Value of \textbf{fert\_cons} \\ \hline\hline 1 & low & $< 18$ \\ 2 & medium low & $\ge 18$ and $< 81$ \\ 3 & medium high & $\ge 81$ and $< 158$ \\ 4 & high & $\ge 158$ \\ \hline \end{tabular} \end{center} \end{table} #### Creating factor variables {-} We can create factor variables from numeric or string variables. For example, we may want to turn the continuous numeric **fert_cons** variable into an ordered categorical (i.e. factor) variable. Imagine that we want to create a factor variable called **fert_cons_group** with four levels called 'low', 'medium low', 'medium high', and 'high'. To do this, let's first create a new numeric variable based on the values listed in Table \@ref(ExampleFactorRecode).
Now let's use a procedure that is similar to the variable recoding we did earlier:[^chapter_7_3] ```{r Ch7FactorNumeric, tidy=FALSE, warning=FALSE} # Create numeric factor levels variable # Attach fert_long_sub data frame attach(fert_long_sub) # Create new fert_cons_group variable based on # fert_cons fert_long_sub$fert_cons_group[fert_cons < 18] <- 1 fert_long_sub$fert_cons_group[fert_cons >= 18 & fert_cons < 81] <- 2 fert_long_sub$fert_cons_group[fert_cons >= 81 & fert_cons < 158] <- 3 fert_long_sub$fert_cons_group[fert_cons >= 158] <- 4 fert_long_sub$fert_cons_group[is.na(fert_cons)] <- NA # Detach data frame detach(fert_long_sub) summary(fert_long_sub$fert_cons_group) ``` You'll notice that we don't have a factor variable yet; our new variable is numeric. We can use the `factor()`\index{R function!factor} function to convert **fert_cons_group** into a factor variable with the labels we want. ```{r Ch7ChangetoFactor, tidy=FALSE} # Create vector of factor level labels fc_labels <- c("low", "medium low", "medium high", "high") # Convert fert_cons_group to a factor fert_long_sub$fert_cons_group <- factor(fert_long_sub$fert_cons_group, labels = fc_labels) summary(fert_long_sub$fert_cons_group) ``` We first created a character vector with the factor-level labels and then applied them using `factor`'s `labels` argument. Using `summary()` with a factor variable gives us its level labels as well as the number of observations per level. The `cut()`\index{R function!cut} function provides a less code-intensive way of creating factors from numeric variables and labeling factor levels. For example: ```{r Ch7Cut, tidy=FALSE} # Create a factor variable with the cut function fert_factor <- cut(fert_long_sub$fert_cons, breaks = c(-0.01, 17.99, 80.99, 157.99, 999.99), labels = fc_labels) summary(fert_factor) ``` The `labels` argument lets us specify the factor levels' names. The `breaks` argument lets us specify what values separate the factor levels.
Note that we set the first break as `-0.01`, not because any country had negative fertilizer consumption, but because the intervals created by `cut()` exclude the left value and include the right value.[^chapter_7_4] If we had used `0`, then all of the observations where a country used effectively no fertilizer would be excluded from the "low" category. ### Changing variable types Sometimes a variable will have the wrong type. For example, a numeric variable may be incorrectly made a character string when a data set is imported from Excel. You can change variable types with a number of functions. We already saw how to convert a numeric variable to a factor variable with the `factor()` function. Unsurprisingly, to convert a variable to a character, use `character()` and `numeric()`\index{R function!character}\index{R function!factor}\index{R function!numeric} to convert it to a numeric type variable. We can place `as.` before these functions (e.g. `as.factor()`)\index{R function!as.factor} as a way of coercing a change in type. **Warning:** Though these functions have straightforward names, a word of caution is necessary. Always try to understand why a variable is not of the type you would expect. Often variables have unexpected types because they are coded (or miscoded) in a way that you didn't anticipate. Changing the variable types, especially when using `as.`, can introduce new errors. Make sure that the conversion made the changes you expected. ## Merging Data Sets In the previous section, we learned crucial skills for cleaning up data sets. When your data sets are (a) in the same format and (b) have variables with identically matching ID values, you can merge your data sets. In this section, we'll look at two different ways to merge data sets: binding and the `merge()`\index{R function!merge} function. We'll also look at ways to address a common issue when merging data: duplicated observations and columns.
### Binding As we saw in Chapter \@ref(GettingStartedRKnitr), if your data sets are in the same order---rows in all of the data sets represent the same observation of the same subject---then you can use the `cbind()`\index{R function!cbind} function to bind columns from the data sets together. This situation is unusual when merging real-world data. If your data sets are not in exactly the same order you will create a data set with nonsensical rows that combine data from multiple observations. Therefore, you should avoid using `cbind()` for merging most real-world data. If you have data sets with the exact same columns and variable types and you just want to attach one under the other, you can use the `rbind()` function. It binds the rows in one object to the rows in another.[^chapter_7_5] It has the same syntax as `cbind()`. Again, you should be cautious when using this function, though it is more difficult to accidentally create a nonsensical data set with `rbind()`. R will give you an error if it cannot match your objects' columns. ### Merging data frames Generally, the `merge()` function is the safest and most effective way to merge two data sets.\index{R function!merge} Imagine that we want to merge our *fert_long_sub* data frame with two other data frames we created in Chapter \@ref(DataGather): *fin_regulator* and *disprop_data*. 
The simplest way to do this is to use the `merge()` function twice, i.e.: ```{r Ch7AddIsoCodes, include=FALSE} # Add iso2c codes to fin_regulator and disprop_data # as ID variables for merging # Load countrycode library(countrycode) # fin_regulator fin_regulator$iso2c <- countrycode(fin_regulator$country, origin = "country.name", destination = "iso2c") ``` ```{r Ch7MergeSimple} # Merge fin_regulator and disprop_data merged_data_1 <- merge(x = fin_regulator, y = disprop_data, by = "iso2c", all = TRUE) # Merge combined data set with fert_long_sub merged_data_1 <- merge(x = merged_data_1, y = fert_long_sub, by = "iso2c", all = TRUE) names(merged_data_1) ``` Let's go through this code. The `x` and `y` arguments specify which data frames we want to merge. The `by` argument specifies what variable(s) in the two frames identify the observations so that we can match them. In this example, we are merging by countries' ISO country two-letter codes.[^chapter_7_6] We set the argument `all = TRUE` so that we keep all of the observations from both of the data frames. If the argument is set to `FALSE`, only observations that are common to both data frames will be included in the merged data frame. The others will not be included. You might have noticed that this isn't actually the merge that we want to accomplish with these data frames. Remember that observations are not identified in this time-series cross-section data by one country name or other country code variable. Instead, they are identified by both country and year variables. To merge data frames based on the overlap of two variables (e.g. match Afghanistan-2010 in one data frame with Afghanistan-2010 in the other), we need to add the `union()`\index{R function!union} function to `merge`'s `by` argument.
Here is a full example: ```{r Ch7MergeFull, tidy=FALSE} # Merge fin_regulator and disprop_data merged_data_2 <- merge(fin_regulator, disprop_data, union("iso2c", "year"), all = TRUE) # Merge combined data frame with fert_long_sub merged_data_2 <- merge(merged_data_2, fert_long_sub, union("iso2c", "year"), all = TRUE) names(merged_data_2) ``` After merging data frames, it is always a good idea to look at the result and make sure it is what you expected. Some post-merging cleanup may be required to get the data frame ready for statistical analysis. ### Bigger data {-} Before discussing post-merge cleanup, it is important to highlight ways to handle large data sets. The `merge()` function and many of the other data frame manipulation functions covered so far in this chapter may not perform well with very large data sets. If you are using very large data sets, it might be worth investing time in learning how to use packages like *dbplyr* [@R-dbplyr]\index{R package!dbplyr} and *data.table* [@R-data.table].\index{R package!data.table} They have many capabilities for working efficiently with large data sets. Likely, if you have very large data, you will need to learn SQL (Structured Query Language) or another special-purpose data handling language.[^chapter_7_9] Once you know how these languages work, you can incorporate them into your R workflow with R packages like *dbplyr*. ### Duplicate values {-} Duplicate observations are one thing to look out for after (and before) merging. You can use the `duplicated()`\index{R function!duplicated} function to check for duplicates. Use the function in conjunction with subscripts to remove duplicate observations. For example, let's create a new object called *data_duplicates* from the iso2c-years that are duplicated in *merged_data_2*. Remember that **iso2c** and **year** are in the first and second columns of the data frame.
```{r Ch7Duplicated, tidy=FALSE} # Created a data frame of duplicated country-years data_duplicates <- merged_data_2[ duplicated(merged_data_2[, 1:2]), ] # Show the number of rows in data_duplicates nrow(data_duplicates) ``` In this data frame, there are duplicated iso2c-year observations. We know this because `nrow` tells us that the data frame with the duplicated values has rows, i.e. observations. To create a data set without duplicated observations (if there are duplicates), add an exclamation point (`!`) before `duplicated`, i.e. not duplicated, in the above code. ```{r Ch7NotDuplicated} # Created a data frame of unique country-years data_not_duplicates <- merged_data_2[ !duplicated(merged_data_2[, 1:2]), ] ``` Note that if you do have duplicated values in your data set and you run a similar procedure on it, it will drop duplicated values that have a lower order in the data frame. To keep the lowest ordered value and drop duplicates higher in the data set, use `duplicated`'s `fromLast` argument like this: `fromLast = TRUE`. **Warning:** Look over your data set and the source code that created the data set to try to understand why duplicates occurred. There may be a fundamental problem in the way you are handling your data that resulted in the duplicated observations. ### Duplicate columns Another common post-merge cleanup issue is duplicate columns, i.e. variables. These are variables from the two data frames with the same name that were not included in `merge`'s `by` argument. For example, in our previous merged data examples, there are three country name variables: **country.x**, **country.y**, and **country** to signify which data frame they are from.[^chapter_7_11] You should decide what to do with these variables on a case-by-case basis. But if you decide to drop one of the variables and rename the other, you can use subscripts (as we saw in Chapter \@ref(GettingStartedRKnitr)). 
The *dplyr* package has a useful function called `select()`\index{R function!select} that can also remove variables from data frames. To remove variables, write a minus sign (`-`) and then the variable name without quotes. For example, imagine that we want to keep **country.x** and drop the other country name variables.[^chapter_7_12] Let's also remove the **idn** variable: ```{r Ch7RemoveVars, tidy=FALSE} # Remove country.y, country, and idn final_cleaned <- dplyr::select(data_not_duplicates, -country.y, -country, -idn) # Rename country.x to country final_cleaned <- dplyr::rename(final_cleaned, country = country.x) ``` ```{r Ch7Showfinal_cleanedNames} names(final_cleaned) ``` Alternatively, you can select specific variables to keep with the `select` function by writing the variables' names without a minus sign. **Note**: If you are merging many data sets, it can sometimes be good to clean up duplicate columns between each `merge()` call. ### Chapter summary {-} This chapter has provided you with many tools for cleaning up your data to get it ready for statistical analysis. Before moving on to the next chapter to learn how to incorporate statistical analysis as part of a reproducible workflow with knitr/R Markdown, it's important to reiterate that the functions we've covered in this chapter should usually be embedded in the types of data creation files we saw in Chapter \@ref(DataGather). These files can then be tied together with a makefile into a process that should be able to relatively easily take very raw data and clean it up for use in your analyses. Embedding these functions in data creation source code files, rather than just typing the functions into your R console or manually changing data in Excel, will make your research much more reproducible. It will also make it easier to backtrack and find mistakes that you may have made while transforming the data.
Including new or updated data when it becomes available will also be much easier if you use a series of segmented data creation source code files that are tied together with a makefile. # Appendix {-} R code for turning *fert_cons_data* into year-wide-format: ```{r Ch7WideCreateShow, eval=FALSE, tidy=FALSE} library(WDI) library(tidyr) library(dplyr) # Gather fertilizer consumption data from WDI fert_cons_data <- WDI(indicator = "AG.CON.FERT.ZS") # Reshape fert_cons_data to year wide-format fert_wide <- tidyr::pivot_wider(fert_cons_data, names_from = year, values_from = AG.CON.FERT.ZS) # Order fert_wide by country fert_wide <- arrange(fert_wide, country) ``` R code for creating iso2c country codes with the *countrycode* package: ```{r Ch7CountryCodeShow, eval=FALSE, tidy=FALSE} library(countrycode) fin_regulator$iso2c <- countrycode(fin_regulator$country, origin = "country.name", destination = "iso2c") ``` R code for creating the chapter's density plot: ```{r Chr7FigDensity, eval=FALSE} library(ggplot2) # Set plot theme to "minimal" theme_set(theme_minimal()) # Create density plot ggplot(data = fert_long, aes(fert_cons)) + geom_density() + xlab("Fertilizer Consumption") + ylab("Density") + theme_bw() ``` [^chapter_7_transpose]: See this example by Rob Kabacoff: . Note also that because the matrix transpose function is denoted with `t`, you should not give any object the name *t*. [^chapter_7_1]: The *countrycode* package [@R-countrycode] is very helpful for creating standardized country identification variables. [^chapter_7_2]: R denotes positive infinity with `Inf`. [^chapter_7_3]: In this code, I attached the data frame *fert_long_sub* so that it is easier to read. [^chapter_7_4]: In mathematical notation, the "low" level includes all values in the interval $(-0.01,\:17.99]$. [^chapter_7_5]: Some programming languages and statistical programs refer to this type of action as "appending" one data set to another. 
[^chapter_7_6]: Please see this chapter's Appendix for details on how I created an ISO country two-letter code variable in the *fin_regulator* data frame. [^chapter_7_9]: w3schools has an online SQL tutorial at: . [^chapter_7_10]: See the *dplyr* vignette on using the package with SQL databases at . [^chapter_7_11]: The former two were created in the first merge between *fin_regulator* and *disprop_data*. When the second merge was completed, there were no variables named **country** in the MergeData2 data frame, so **country** did not need to be renamed in the new merged data set. [^chapter_7_12]: This version of the country variable is the most complete. ================================================ FILE: rep-res-3rd-edition/10-modeling.Rmd ================================================ # (PART) Analysis and Results {-} # Statistical Modeling and knitr/R Markdown {#StatsModel} When you have your data cleaned and organized, you will begin to examine it with statistical analyses. In this book we don't look at how to do statistical analysis in R (a subject that would and does take up many other books). Instead, we focus on how to make your analyses really reproducible. You do this by dynamically connecting your data gathering and analysis source code to your presentation documents. When you dynamically connect your data gathering makefiles and analysis source code to your markup document, you will be able to completely rerun your data gathering and analysis and present the results whenever you compile the presentation documents. This makes it very clear how you found the results that you are advertising. It also automatically keeps the presentation of your results, including tables and figures, up-to-date with any changes you make to your data and analyses source code files. You can dynamically tie your data gathering, statistical analyses, and presentation documents together with knitr/R Markdown. 
In Chapter \@ref(GettingStartedRKnitr) you learned basic *knitr*/*rmarkdown* package syntax. For the rest of the chapter, I'll refer to it as "*knitr* syntax", but it applies to R Markdown as well when it is not specific to LaTeX. In this chapter we will begin to learn knitr syntax in more detail, particularly code chunk options for including dynamic code in your presentation documents. This includes code that is run in the background, i.e. not shown in the presentation document, as well as displaying the code and output in your presentation document both as separate blocks and inline with the text. We will also learn how to dynamically include code from languages other than R. We examine how to use knitr with modular source code files. Finally, we will look at how to create reproducible random analyses and how to work with computationally intensive code chunks. The goal of this and the next two chapters, which cover dynamically presenting results in tables and figures, is to show you how to tie data gathering and analyses into your presentation documents so closely that every time the documents are compiled they actually reproduce your analysis and present the results. Please see the next part of this book, Part IV, for details on how to create the LaTeX and Markdown documents that can include *knitr* code chunks. **Reminder:** Before discussing the details of how to incorporate your analysis into your source code, it's important to reiterate something we discussed in Chapter \@ref(GettingStartedRR). The syntax and capabilities of R packages and R itself can change with new versions. Also, as we have seen for file path names, syntax can change depending on what operating system you are using. So it's important to have your R session info available (see Section \@ref(SessionInfoHow) for details) to make your research more reproducible and future-proof. 
If someone reproducing your research has this information, they will be able to download your files and use the exact version of the software that you used. For example, CRAN maintains an archive of previous R package versions that can be downloaded.[^chapter_8_1] Previous versions of R itself can also be downloaded through CRAN.[^chapter_8_2] ## Incorporating Analyses into the Markup For a relatively short piece of code that you don't need to run in multiple presentation documents, it may be simplest to type the code directly into chunks written in your *knitr* markup document. In this section we will learn how to set *knitr* options for handling these code chunks. For a list of many of the chunk options covered here, see Table \@ref(ChunkOptionsTable). ### Full code chunks By default, *knitr* code chunks are run by R, and the code and any text output (including warnings and error messages) are inserted into the text of your presentation documents in blocks. The blocks are positioned in the final presentation document text at the points where the code chunk was written in the knittable markup. Figures are inserted as well. Let's look at the main options for determining how code chunks are handled by *knitr*. #### `include` {-} Use `include=FALSE`\index{knitr option!include} if you don't want to include anything in the text of your presentation document, but you still want to evaluate a code chunk. It is `TRUE` by default. #### `eval` {-} The `eval`\index{knitr option!eval} option determines whether or not the code in a chunk will be run. Set the `eval` option to `FALSE` if you would like to include code in the presentation document text without actually running the code. By default it is set to `TRUE`, i.e. the code is run. You can alternatively use a numerical vector with `eval`. The numbers in the vector tell *knitr* which expressions in the chunk to evaluate. For example, if you only want to evaluate the first two expressions, set `eval=1:2`. 
#### `echo` {-} If you would like to hide a chunk's code from the presentation document, you can set `echo=FALSE`.\index{knitr option!echo} Note that if you also have `eval=TRUE`, then the chunk will still be evaluated and the output will be included in your presentation document. Clearly, if `echo=TRUE`, then source code will be included in the presentation document. As with `eval`, you can alternatively use a numerical vector in `echo`. The numbers in the vector indicate which expressions to echo in your final document. #### `results` {-} We will look at the `results`\index{knitr option!results} option in more detail in the next two chapters (see especially Section \@ref(ResultsOptions)). However, let's briefly discuss the option value `hide`. Setting `results='hide'` is almost the opposite of `echo=FALSE`. Instead of showing the results of the code chunk and hiding the code, `results='hide'` shows the code, but not the results. Warnings, errors, and messages will still be printed. #### `warning`, `message`, `error` {-} If you don't want to include the warnings, messages, and error messages that R outputs in the text of your presentation documents, just set the `warning`, `message`, and `error` options to `FALSE`. They are set to `TRUE` by default.\index{knitr option!warning}\index{knitr option!message}\index{knitr option!error} #### `cache` {-} If you want to run a code chunk once and save the output for when you knit the document again, rather than running the code chunk every time, set the option `cache=TRUE`.\index{knitr option!cache} When you do this the first time the document is knitted, the chunk will be run and the output stored in a sub-directory of the working directory called *cache*. When the document is subsequently knitted, the chunk will only be run if the code in the chunk changes or its options change. This is very handy if you have a code chunk that is computationally intensive to run. The `cache` option is set to `FALSE` by default. 
Later in this chapter (Section \@ref(CacheVars)), we will see how to use the `cache.vars` option to cache only certain variables created by a code chunk. #### `dependson` {-} Cached chunks are only rerun when their code changes. Sometimes one chunk will depend on the results from a prior chunk. In these cases, it is good to rerun the chunk if the prior chunk is also rerun. The `dependson`\index{knitr option!dependson} option allows you to do this automatically. You can specify either a vector of the labels for the chunks depended on or their numbers in order from the start of the document. For example, `dependson=c(2, 3)` specifies that if the second or third chunks are rerun, then the current chunk will also be rerun. #### `cache.extra` {-} Sometimes to ensure reproducibility, it may be useful to rerun a chunk when some other condition changes, such as when a new version of R is installed or a dependent file changes. You can feed a list of conditions to `cache.extra`\index{knitr option!cache.extra} to do this. For instance: ```{r Ch8CacheExtra, eval=FALSE, echo=TRUE} cache.extra=list(file.info("data.csv")$mtime, R.version) ``` Here we set two conditions under which the chunk will be rerun. The first specifies that the chunk should be rerun whenever the *data.csv* file is modified. The `file.info` function extracts information about the file and `mtime` gives the last time that the file was modified. If this differs from when the chunk was last run, then it will be run again. This is very useful for keeping your cached chunks and the files they rely on in sync. The second condition enabled by `R.version` reruns the chunk whenever the R version or even the operating system changes. If you only want to rerun the chunk when the version of R is different, then use `R.version.string`. #### `size` {-} If you do want to print part or all of your code chunk into a LaTeX document, you may also want to resize the text.
To do this, use the `size`\index{knitr option!size} option. By default, it is set to `size='normalsize'`. You can use any of the LaTeX font sizes listed in Chapter \@ref(LatexChapter). ### Showing code and results inline Sometimes you may want to have R code or output show up inline with the rest of your presentation document's text. For example, you may want to include a small chunk of stylized code in your text when you discuss how you did an analysis. Or you may want to dynamically report the mean of some variable in your text so that the text will change when you change the data. The *knitr* syntax for including inline code is different for the LaTeX and Markdown languages. We'll cover both in turn. #### LaTeX {-} #### *Inline static code* {-} There are a number of ways to include a code snippet inline with your text in LaTeX. You can use the LaTeX function `\texttt`\index{LaTeX!texttt} to have text show up in the `typewriter` font commonly used in LaTeX-produced documents to indicate that some text is code (I use typewriter font for this purpose in this book, as you have probably noticed). For example, using `\texttt{2 + 2}` will give you `2 + 2` in your text. Note that in LaTeX curly brackets (`{}`) work exactly like parentheses in R, i.e. they enclose a function's arguments. However, the `\texttt` function isn't always ideal, because your LaTeX compiler will still try to run the code inside of the function as if it were LaTeX markup. This can be problematic if you include characters like the backslash `\` or curly brackets `{}`. They have special meanings for LaTeX. The hard way to solve this problem is to use escape characters (see Chapter \@ref(DirectoriesChapter)). The backslash is an escape character in LaTeX. Probably the better option is to use the `\verb` function.\index{LaTeX!verb} It is equivalent to the `eval=FALSE` option for full *knitr* code chunks. To use the `\verb` function, pick some character you will not use in the inline code. 
For example, you could use the vertical bar (`|`). This will be the `\verb` delimiter. Imagine that we want to actually include `\texttt` in the text. We would type: ``` \verb|\texttt| ``` The LaTeX compiler will ignore almost anything from the first vertical bar up until the second bar following `\verb`. All of the text in-between the delimiter characters is put in typewriter font.[^chapter_8_3] #### Inline dynamic code {-} If you want to dynamically show the results of some R code in your *knitr* LaTeX-produced text you can use `\Sexpr`.\index{LaTeX!Sexpr} This is a pseudo-LaTeX function; it looks like LaTeX, but it is actually *knitr* syntax.[^chapter_8_4] Its structure is more like a LaTeX function's structure than *knitr*'s in that you enclose your R code in curly brackets (`{}`) rather than the `<<>>= . . . @` syntax you use for block code chunks. For example, imagine that you wanted to include the mean of a vector of river lengths, `r round(mean(rivers), digits = 0)`, in the text of your document. The *rivers* numeric vector, loaded by default in R, has the lengths of 141 major rivers recorded in miles. You can use the `mean()`\index{R function!mean} function to find the mean and the `round()`\index{R function!round} function to round the result to the nearest whole number: ```{r Ch8MeanRivers} round(mean(rivers), digits = 0) ``` To have just the output show up inline with the text of your document, you would type something like: ```{r Ch8SexprExample, eval=FALSE} The mean length of 141 major rivers in North America is \Sexpr{round(mean(rivers), digits = 0)} miles. ``` R code included inline with `Sexpr` is evaluated using current R options. 
So if you want all of the output from `Sexpr` to be rounded to the same number of digits, for example, it might be a good idea to set this in a code chunk with R's `options()` function.\index{R function!options} #### Markdown {-} #### *Inline static code* {-} To include static code inline in an R Markdown (and regular Markdown) document, enclose the code in single backticks (`` ` . . . ` ``). For example: ```{r Ch8MarkdownInline, eval=FALSE, tidy=FALSE} This is example R code: `MeanRiver <- mean(rivers)`. ``` produces:[^chapter_8_5] ![](images/chapter_8/MeanRiverMarkdown.png) #### *Inline dynamic code* {-} Including dynamic code in the body of your R Markdown text is similar to including static code. The only difference is that you put the letter `r` after the first single backtick. ### Dynamically including non-R code in code chunks You are not limited to dynamically including just R code in your presentation documents. *knitr* can run code from a variety of other languages including: Python, Ruby, Bash, Julia, and Stan.\index{Python}\index{Ruby}\index{Bash}\index{Julia}\index{Stan} All you have to do to dynamically include code from one of these languages is use the `engine`\index{knitr option!engine} code chunk option to tell *knitr* which language you are using. For example, to dynamically include a simple line of Python code in an R Markdown document type: ````r `r ''````{r engine='python'} print "Reproducible Research" ``` ```` In the final HTML file you will get output that looks like Figure \@ref(fig:Ch8PythonOutput).[^chapter_8_6] ```{r Ch8PythonOutput, fig.cap="Output from Python Engine in HTML Markdown", echo=FALSE, out.width="25%"} knitr::include_graphics("images/chapter_8/PythonRR.png") ``` Many of the programming language values `engine` can take are listed in Table \@ref(EngineOptions). 
\begin{table}[ht] \caption{A Selection of \emph{knitr} \texttt{engine} Values} \label{EngineOptions} \begin{center} \begin{tabular}{l p{4cm}} \hline\vspace{0.15cm} Value & Programming Language \\ \hline\hline \texttt{awk} & Awk\index{Awk} \\ \texttt{bash} & Bash shell \index{Bash} \\ \texttt{gawk} & Gawk\index{Gawk} \\ \texttt{haskell} & Haskell\index{Haskell} \\ \texttt{julia} & Julia\index{Julia} \\ \texttt{python} & Python\index{Python} \\ \texttt{R} & R (default) \\[0.25cm] \texttt{ruby} & Ruby\index{Ruby} \\ \texttt{sas} & SAS\index{SAS} \\ \texttt{sh} & Bourne shell\index{Bourne shell} \\ \texttt{stan} & Stan probabilistic programming language\index{Stan} \\ \hline \end{tabular} \end{center} \end{table} ## Dynamically Including Modular Analysis Files There are a number of reasons why you might want to have your R source code located in separate files from your markup documents even if you compile them together with *knitr*. First, it can be unwieldy to edit both your markup and long R source code chunks in the same document, even with RStudio's handy *knitr* code folding and chunk management options. There are just too many things going on in one document. Second, you may want to use the same code in multiple documents, for example an article and slide show presentation. It is nice to not have to copy and paste the same code into multiple places. Instead, it is easier to have multiple documents link to the same source code file. When you make changes to this source code file, the changes will automatically be made across all of your presentation documents. You don't need to make the same changes multiple times. Third, other researchers trying to replicate your work might only be interested in specific parts of your analysis. If you have the analysis broken into separate and clearly labeled modular files that are explicitly tied together in the markup file with *knitr*, it is easy for them to find the specific bits of code that they are interested in.
### Source from a local file Usually, in the early stages of your research, you may want to run code stored in analysis files located on your computer. Doing this is simple. The *knitr* syntax is the same as for block code chunks. The only change is that instead of writing all of your code in the chunk, you save it to its own file and use the `source()`\index{R function!source} function to access it.[^chapter_8_7] For example, in an R Markdown file we could run the R code in a file called *main-analysis.R* from our *example-project* like this: ````markdown `r ''````{r, include=FALSE} # Run main analysis source("/example-project/analysis/main-analysis.R") ``` ```` Notice that we set the option `include=FALSE`. This will run the analysis and produce objects created by the analysis code that can be used by other code chunks, but the output will not show up in the presentation document's text. #### Sourcing a makefile in a code chunk {-} In Chapter \@ref(DataGather) we created a GNU Makefile to organize our data gathering. You can run makefiles every time you compile your presentation document. This can keep your data, analyses, figures, and tables up-to-date. One way to do this is to run the GNU makefile in an R code chunk with the `system()`\index{R function!system} function. Perhaps a better way to run makefiles from *knitr* presentation documents is to include the functions in a code chunk using the Bash engine. For example, a Sweave-style code chunk for running the makefiles in our example project would look like this: ````markdown <<engine='bash'>>= # Change working directory to /example-project/analysis/Data cd /example-project/analysis/Data/ # Run makefile make cleanMerge all # Change working directory to /example-project/analysis/ cd /example-project/analysis/ @ ```` Please see Chapter \@ref(DataGather) for details on the `make` command arguments used here. You can also use R's `source()`\index{R function!source} function to run an R make-like data gathering file.
Unlike GNU Make, this will rerun all of the data gathering files, even if they have not been updated. This may become very time consuming depending on the size of your data sets and how they are manipulated. One final note on including makefiles in your knitr presentation document source code: it is important to place the code chunk with the makefile before code chunks containing statistical analyses that depend on the data file it creates. Placing the makefile first will keep the others up-to-date. ### Source from a URL {#SourceSecureURL} If you are using GitHub or another service that uses secure URLs to host your analysis source code files, you need to use the `source_url()`\index{R function!source\_url} function in the *devtools* package.[^chapter_8_8] For GitHub-based source code, we find the file's URL the same way we did in Chapter \@ref(Storing). Remember to use the URL for the *raw* version of the file. I have a short script hosted on GitHub for creating a scatterplot from data in R's *cars* data set. The script's shortened URL is <http://bit.ly/1D5p1w6>.[^chapter_8_9] To run this code and create the scatterplot using `source_url()`\index{R function!source\_url}, type: ```{r Ch8SourceURLExample, message=TRUE, warning=FALSE, cache=TRUE, out.width='8cm', out.height='8cm'} library(devtools) # Run the source code to create the scatter plot source_url("http://bit.ly/1D5p1w6") ``` You can also use the *devtools* function `source_gist()` \index{R function!source\_gist} in a similar way to source GitHub Gists. Gists are a handy way to share code over the internet. For more details, see: <https://gist.github.com/>. Similar to what we saw in Chapter \@ref(Storing), if you would like to use a particular version of a file stored on GitHub, include that version's URL in the `source_url()` call. This can be useful for replicating particular results. Linking to a particular version of a source code file will enable replication even if you later make changes to the file.
To access the URL for a particular version of a file, first click on the file on GitHub's website, then click the `History` button. This will take you to a page listing all of the file's versions. Click on the `Browse Code` button next to the version of the file that you want to use. Finally, click on the `Raw` button to be taken to the text-only version of the file. Copy this page's URL and use it in `source_url()`. ## Reproducibly Random: `set.seed()` \index{R function!set.seed} If you include simulations in your analysis it is often a good idea to specify the random number generator state you used. This will allow others to exactly replicate your 'randomly'---really pseudo-randomly---generated simulation results. Use the `set.seed()`\index{R function!set.seed} function in your source code files or code chunks to do this. For example, use the following code to set the random number generator state[^chapter_8_10] and randomly draw 1,000 numbers from a normal distribution with a mean of 0 and a standard deviation of 2. ```{r Ch8SetSeed1} # Set seed as 125 set.seed(125) # Draw 1000 numbers draw_1 <- rnorm(1000, mean = 0, sd = 2) summary(draw_1) ``` The `rnorm()`\index{R function!rnorm} function draws the 1,000 simulations. The `mean` argument allows us to set the normal distribution's mean and `sd` sets its standard deviation. Just to show you that we will draw the same numbers if we use the same seed, let's run the code again: ```{r Ch8SetSeed2} # Set seed as 125 set.seed(125) # Draw 1000 numbers draw_2 <- rnorm(1000, mean = 0, sd = 2) summary(draw_2) ``` ## Computationally Intensive Analyses Sometimes you may want to include computationally intensive analyses that take a long time to run as part of a knitr document. This can make writing the document frustrating because it will take a long time to knit it each time you make changes. There are at least two solutions to this problem: the `cache` chunk option and makefiles.
We discussed makefiles in Chapter \@ref(DataGather), so let's look at how to work with the `cache` option.\index{knitr option!cache} When you set `cache=TRUE` for the code chunk that contains the analysis, the code chunk will only be run when the chunk's contents change[^chapter_8_11] or the chunk options change. This is a very easy solution to the problem. It does have a major drawback: other chunks can't access objects created by the chunk or use functions from packages loaded in it. Solve these problems by (a) having packages loaded in a separate chunk and (b) saving objects created by the cached chunk to a separate RData file that can be loaded in later chunks (see Section \@ref(RSave) for information on saving to RData files).[^chapter_8_12] Imagine that in a cached code chunk we create an object called *Sample*. Then in a later code chunk we want to use the `hist()`\index{R function!hist} function to create a histogram of the sample. In the cached code chunk, we save *Sample* to a file called *sample.RData*. ````markdown <<Sample, cache=TRUE>>= Sample <- rnorm(n = 1000, mean = 5, sd = 2) save(Sample, file = "sample.RData") @ ```` The later code chunk for creating the histogram would go something like this:[^chapter_8_13] ````markdown <<Histogram>>= load(file = "sample.RData") hist(Sample) @ ```` #### `cache.vars` {- #CacheVars} If the code chunk you want to cache creates many objects, but you only want to save a few of them, you can use *knitr*'s `cache.vars` chunk option. Simply give it a character vector of the objects' names that you want to save. ### Chapter summary {-} In this chapter we covered in more detail key *knitr* syntax for including code chunks in our presentation documents. This and other tools we learned in this chapter are important for tying our statistical analyses directly to their advertising, i.e. our presentation documents.
In the next two chapters, we will learn how to take the output from our statistical analysis and, using *knitr*, present the results with dynamically created tables and figures. [^chapter_8_1]: See: . [^chapter_8_2]: See: . [^chapter_8_3]: For more details, see the LaTeX Wikibooks page: (accessed 21 September 2019). Also, for help troubleshooting, see the UK List of Frequently Asked Questions: (accessed 21 September 2019). [^chapter_8_4]: The function directly descends from *Sweave*. [^chapter_8_5]: The exact look of the text depends on the Cascading Style Sheets (CSS) style file you are using. The example here was created with RStudio's default style file. [^chapter_8_6]: Again, this was created using RStudio's default CSS style file. [^chapter_8_7]: We used the `source()` function in Chapter \@ref(DataGather) in our make-like data gathering file. [^chapter_8_8]: You can also make the replication code accessible for download and either instruct others to change the working directory to the replication file or have them change the directory information as necessary. You will need to do this with GNU makefiles like those included with this book. [^chapter_8_9]: The original URL is at . This is very long, so I shortened it using bitly. You may notice that the shortened URL is not secure. However, it does link to the original secure URL. [^chapter_8_10]: See the `Random` help file for detailed information on R's random number generation capabilities by typing `?Random` into your console. [^chapter_8_11]: Note that the chunk will not be run if only the contents of a file that the chunk sources are changed. Use the `dependson` option in cases where it is important to rerun a chunk when a prior chunk changes. [^chapter_8_12]: It's true that when *knitr* caches a code chunk it saves the chunk's objects to an `.RData` file. However, it is difficult to load this file directly because the file name changes every time the cached chunk is rerun. 
[^chapter_8_13]: For reference, *Sample* was created by using the `rnorm()` function to take a random sample of size 1,000 from a normal distribution with a mean of five and standard deviation of two. ================================================ FILE: rep-res-3rd-edition/11-tables.Rmd ================================================ # Showing Results with Tables {#TablesChapter} Graphs and other visual methods, discussed in the next chapter, can often be more effective ways to present descriptive and inferential statistics than tables.[^chapter_9_1] Nonetheless, tables of parameter estimates, descriptive statistics, and so on can sometimes be important tools for describing your data and presenting research findings. See @ehrenberg1977 and @gelman2011tables for information on creating tables for effective communication. Learning how to dynamically connect statistical results with tables in your presentation documents aids reproducibility and can ultimately save you a lot of time. Manually typing results into tables by hand is tedious, not very reproducible, and can introduce errors.[^chapter_9_2] It's especially tedious to retype tables to reflect changes you made to your data and models. Fortunately, you don't actually need to create tables by hand. There are many ways to have R do the work for you. The goal of this chapter is for you to learn how to dynamically create tables for your presentation documents written in LaTeX and Markdown. We will first learn the simple knitr/R Markdown syntax we need to dynamically include tables created from R objects. Then we will learn how to actually create the tables. There are a number of ways to turn R objects into tables that can be dynamically included in LaTeX or Markdown/HTML markup. In this chapter we mostly focus on three tools for creating tables: the `kable()`\index{R function!kable} function from *knitr*, the *xtable* package, and the *texreg* package [@R-texreg]. 
`kable()` can create tables from data frames for both LaTeX and Markdown/HTML documents. *xtable* does the same, but is much more customizable. *texreg* produces publication-quality tables from objects containing statistical model results, or model objects. It allows you to combine results from multiple models into one table. Unfortunately *texreg* is less flexible with objects of classes it does not support.[^chapter_9_3] **Warning:** Automating table creation removes the possibility of adding errors to the presentation of your analyses by incorrectly copying output, a big potential problem in hand-created tables. However, it is not error-free. You could easily create inaccurate tables with coding errors. So, as always, it is important to 'eyeball' the output. Does it make sense? If you select a couple values in the R output, do they match what is in the presentation document's table? If not, you need to go back to the code and see where things have gone wrong. With that caveat, let's start making tables. ## Basic *knitr* Syntax for Tables {#ResultsOptions} The most important *knitr*/*rmarkdown* chunk option for showing tables is `results`. The `results` option can have one of four values: \index{knitr option!results} - `'hide'`, - `'asis'`, - `'markup'`, - `'hold'`. The value `hide` clearly hides the results of your code chunk from your presentation document. `hold` collects all of the output and prints it at the end of the chunk. To include tables created from R objects in your LaTeX or Markdown output you should set `results='asis'` or `results='markup'`. `asis` is the simplest option as it writes the raw markup form of the table into the presentation document, not as a highlighted code chunk, but as markup. It is then compiled as table markup with the rest of the document. `markup` uses an output hook to mark up the results in a predefined way. In this chapter, we will work with examples using the `asis` option. 
## Table Basics Before getting into the details of how to create tables from R objects, it is useful to first learn how generic tables are created in LaTeX and Markdown/HTML. If you are not familiar with basic LaTeX or Markdown syntax, you might want to skip ahead to Chapters \@ref(LatexChapter) and \@ref(MarkdownChapter), respectively, before coming back to learn about making tables in these languages. ### Tables in LaTeX {#LaTeXTables} Tables in LaTeX are usually embedded in two environments: the `table`\index{LaTeX!table} and `tabular`\index{LaTeX!tabular} environments. What is a LaTeX environment in general? A LaTeX environment is a part of the markup where special commands are\index{LaTeX!environment} executed. A simple environment is the `center`\index{LaTeX!centre} environment.[^chapter_9_5] Everything placed in a center environment is, unsurprisingly, centered. Typing: ````latex \begin{center} This is a center environment. \end{center} ```` creates the following text in the PDF output: \begin{center} This is a center environment. \end{center} LaTeX environments all follow the same general syntax: ````latex \begin{ENVIRONMENT_NAME} ... ... \end{ENVIRONMENT_NAME} ```` You do not have to indent the contents of an environment. Indentations neither affect how the document is compiled nor show up in the final PDF.[^chapter_9_6] It is conventional to indent them, however, because it makes the markup easier to read. In this chapter we will learn about two types of environments you need for tables in LaTeX. The `tabular` environment allows you to format the content of a table. The `table` environment allows you to format a table's location in the text and its caption. #### The `tabular` environment {-} The `tabular`\index{LaTeX!tabular} environment allows you to create tables in LaTeX. Let's work through the basic syntax for a simple table.[^chapter_9_7] To begin a simple tabular environment type `\begin{tabular}{TABLE_SPEC}`. 
The `TABLE_SPEC` argument allows you to specify the number of columns in a table and the alignment of text in each column. For example, to create a table with three columns, the first of which is left-justified and the latter two center-justified we type: ````latex \begin{tabular}{l c c} ```` The `l` argument creates a left-justified column, `c` creates a centered one. If we wanted a right-justified column we would use `r`.[^chapter_9_8] Finally, we can add a vertical line between columns by adding a vertical bar `|` between the column arguments.[^chapter_9_9] For example, to place a vertical line between the first and second columns in our example table, we would type: ````latex \begin{tabular}{l | c c} ```` Now let’s enter content into our table. We saw earlier how CSV files delimit individual columns with commas. In LaTeX’s `tabular` environment, columns are delimited with ampersands (`&`).[^chapter_9_10]\index{LaTeX!ampersand} In CSV tables, rows are delimited by starting a new line. In LaTeX tables you use two backslashes (`\\`).[^chapter_9_11] Here is a simple example of the first two lines of a table: ````latex \begin{tabular}{l | c c} Observation & Variable1 & Variable2 \\ Subject1 & a & b \\ ```` It is common to demarcate the row with a table’s column names, the first row, with horizontal lines. A horizontal line also often visually demarcates a table’s end. You can add horizontal lines in the `tabular` environment with the `\hline` command. ````latex \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ \hline ```` Finally, we close the `tabular` environment with `\end{tabular}`.
The full code (with a few extra rows added) is: ````latex \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} ```` This produces the following table: \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} #### The `table` float environment {- #POSITIONSPEC} \index{LaTeX!table} You might notice that the table we created so far lacks a title and is bunched very closely to the surrounding text. In LaTeX we can create a `table` float environment to solve this problem. Float environments allow us to separate a table from the text, specify its location, and give it a caption.[^chapter_9_16] To begin a `table` float environment, use `\begin{table}[POSITION_SPEC]`. The argument allows us to determine the location of the table. It can be set to `h` for here, i.e. where the table is written in the text. It can also be `t` to place it on the top of a page or `b` for the bottom of the page. To set a title for the table, use the `\caption` command.\index{LaTeX!caption} LaTeX automatically determines the table's number, so you only need to enter the text. You can also declare a cross-reference key for the table with the `\label` command.[^chapter_9_17] A `table` environment is closed with `\end{table}`. Let's see a full example. 
````latex \begin{table}[t] \caption{Example Simple LaTeX Table} \label{ExLaTeXTable} \begin{center} \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} \end{center} \end{table} ```` \begin{table}[t] \caption{Example Simple LaTeX Table} \label{ExLaTeXTable} \begin{center} \begin{tabular}{l | c c} \hline Observation & Variable1 & Variable2 \\ \hline \hline Subject1 & a & b \\ Subject2 & c & d \\ Subject3 & e & f \\ Subject4 & g & h \\ \hline \end{tabular} \end{center} \end{table} Notice that the `tabular` environment is further nested in the `center` environment.\index{LaTeX!center} This centers the table, while leaving the table's title left-justified. The final result is Table \@ref(ExLaTeXTable). One final tip: to have the caption placed at the bottom rather than the top of the table in the final document, simply put the `caption` command after the `tabular` environment is closed. You can see how typing out a table in LaTeX gets very tedious very fast. For all but the simplest tables, it is best to try to have R do the table-making work for you. ### Tables in Markdown/HTML Now we will briefly look at the syntax for creating simple Markdown and HTML tables before turning to learn how to have R create these tables for us. #### Markdown tables {-} Markdown table syntax, as with all Markdown syntax, is generally much simpler than LaTeX's tabular syntax. The markup is much more human readable. Nonetheless, larger tables can still be tedious to create. You do not need to declare any new environments to start creating a Markdown table. Just start typing in the content. Columns are delimited in Markdown tables with a vertical bar (`|`). Rows are started with a new line. To indicate the head of the table, usually the row(s) containing the column names, separate it from the body of the table with a row of dashes (e.g. `-----`). 
Here is an example based on the table we created in the previous section: ````markdown Observation | Variable1 | Variable2 ----------- | ---------- | --------- Subject1 | a | b ```` Note that it is not necessary to line up the vertical bars. You just need to have the same number of them on each row. You can specify each column's text justification using colons on the dashed row. For example, this code will create the left-center-center justified formatted table we made earlier: ````markdown Observation | Variable1 | Variable2 :---------- | :-------: | :-------: Subject1 | a | b Subject2 | c | d Subject3 | e | f Subject4 | g | c ```` To create a left-justified column, use a colon on only the left side of the dashes. The ultimate look of a Markdown table is highly dependent on the CSS style file you are using (see Chapter \@ref(MarkdownChapter) for how to change your CSS style file). The default RStudio CSS style as of late 2019 formats our table to look like this: ![](images/chapter_9/RStudioDefaultTableExample.png) Using a different CSS style file,[^chapter_9_18] we can get something like this: ![](images/chapter_9/MarkedTableExample.png) In basic Markdown, you can add a caption with the heading syntax (see Section \@ref(MarkdownHeader)). In this example the three hashes (`###`) create the header: ````markdown ### Example Simple Markdown Table Observation | Variable1 | Variable2 :---------- | :-------: | :-------: Subject1 | a | b ```` producing something like this: ![](images/chapter_9/MarkedCaptionTableExample.png) #### HTML tables {-} The `texreg()`\index{R function!texreg} function that we will learn in the next section doesn't create tables formatted with Markdown syntax. It can create tables with HTML syntax. This is useful for us because virtually any HTML markup can be incorporated into a Markdown document. In fact, Markdown table syntax is only a stepping stone for more easily producing tables with HTML syntax. 
So it is useful to also understand the basic syntax for HTML tables. HTML uses element "tags"\index{HTML!tags} to begin and end tables. The main element we use to create tables is, well, the `table` element. This is very similar to LaTeX's `tabular` environment. An HTML element generally begins with a start tag and ends with an end tag. This is similar to LaTeX's `\begin{}` and `\end{}` commands. Start tags are enclosed in less-than and greater-than signs and include the element tag name (`<table>`). End tags are similar, but include a forward slash like this `</table>`. The content of the element goes between the start and end tags. For example: ````html <table> . . . . . . </table>
```` As in LaTeX, you are not required to tab the content of a table element; however, it does make the markup document easier to read and, as the number of tags proliferates, easier to write. You can specify element attributes inside of start tags.[^chapter_9_19] For example, to add a border to the table, use: `<table border="1">`.[^chapter_9_20]\index{HTML!table border} Table rows are put inside of `tr` (table row) element tags. Individual cells are delimited with `td` (standard cell) tags. Here is what the first row of our example table looks like in basic HTML: ````html <tr> <td>Observation</td> <td>Variable1</td> <td>Variable2</td> </tr>
```` We can further delimit a table's header row(s) from its body with the `thead`\index{HTML!thead} and `tbody`\index{HTML!tbody} tags. Finally, before making a full table it's useful to mention that table captions can be included with `caption` tags.\index{HTML!caption} Let's put this all together: ````html
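<table>
    <caption>Example Simple HTML Table</caption>
    <thead>
        <tr>
            <td>Observation</td> <td>Variable1</td> <td>Variable2</td>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>Subject1</td> <td>a</td> <td>b</td>
        </tr>
        <tr>
            <td>Subject2</td> <td>c</td> <td>d</td>
        </tr>
        <tr>
            <td>Subject3</td> <td>e</td> <td>e</td>
        </tr>
        <tr>
            <td>Subject4</td> <td>f</td> <td>f</td>
        </tr>
    </tbody>
</table>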
```` As with Markdown tables, the ultimate appearance of the table is highly dependent on the style files you use. ## Creating Tables from Supported Class R Objects Just as the `write.csv()`\index{R function!write.csv} function turns an R data frame into a CSV formatted text file, there are a number of methods in R to take an object, e.g. a matrix, data frame, the output from a statistical analysis, and so on, and turn them into LaTeX and HTML tables. `kable()`, *xtable*, and *texreg* each work most easily with specific object classes that their designers explicitly supported. ### `kable` for Markdown and LaTeX \index{R function!kable|(} `kable()` easily converts matrices and data frames into tables for Markdown, HTML, and LaTeX among others. Let's create a simple data frame: ```{r Ch9kable_df} library(knitr) kable_ex <- data.frame( Observation = c("Subject1", "Subject2", "Subject3", "Subject4"), Variable1 = c("a", "c", "e", "g"), Variable2 = c("b", "d", "f", "c") ) ``` Then place this data frame into a `kable()` call: ```{r Ch9kableBasic, eval=FALSE} kable(kable_ex, caption = "Example kable Table") ``` Beyond setting the table's caption with `caption`, there are a few other alterations that can be made with `kable` arguments. You can specify new column and row names by passing character vectors to `col.names` and `row.names`, respectively. These are very useful, as it can be difficult, or at least irritating, for your readers to try to decode the names you give to your data frame rows and columns in R. Another useful argument is `digits`. This will round numbers in the table to a specified number of digits after the decimal place. To effectively convey your results, you should *at least* only include digits that are significant in that they meaningfully vary in the data [@ehrenberg1977 281]. You can also change the markup language that the table is created in using the `format` argument. For example, to create a LaTeX formatted table, use `format = 'latex'`. 
In general, you do not need to specify the format if you are using *knitr* or *rmarkdown* to include the table in a presentation document. This will be done automatically. \index{R function!kable|)} ### *xtable* for LaTeX and HTML \index{R function!xtable|(} While `kable()` allows you to quickly create simple tables, it can only do so from matrices and data frames. It also has limited customizability. The *xtable* package can create more customizable tables from a wider variety of R objects, including statistical model objects. Different R statistical model estimation commands can produce model objects of different classes. For example, the `lm()`\index{R function!lm} (linear model) function creates model summaries of the `lm` class. Let's create a simple linear regression using the *swiss* data frame and `lm()`. This data frame is included with R by default. The simple linear regression model we are going to make has the *swiss* variable **Examination** as the dependent variable and **Education** as the only independent variable.[^chapter_9_21] ```{r Ch9SimpleSwissRegression} # Fit simple linear regression model M1 <- lm(Examination ~ Education, data = swiss) # Return class class(M1) ``` By using the `class` function, we can see that *M1* is of the `lm` class. *M1* contains items estimated by the linear regression model[^chapter_9_22] such as the coefficient estimates and their standard errors. To get a summary of a model object's contents, use the `summary()`\index{R function!summary} function like this: ```{r Ch9SimpleSwissSummary} summary(M1) ``` To find a full list of object classes that *xtable* supports, type `methods(xtable)`\index{R function!methods} into the R Console after you have loaded the package. #### *xtable* for LaTeX {-} Let's look at how to create LaTeX tables with *xtable* by creating a table summarizing the estimates from the *M1* model object. 
````latex <<results='asis', echo=FALSE>>= library(xtable) # Create LaTeX table from M1 and show the output markup xtable(M1, caption = "Linear Regression, DV: Exam Score", label = "BasicXtableSummary", digits = 1) @ ```` When included in an R Sweave-style LaTeX document, this code will create a table exactly like Table \@ref(BasicXtableSummary). Let's go through this code, working from the outside in. First you'll notice that we've set two *knitr* code chunk options. As we discussed earlier, `results='asis'`\index{knitr option!results} allows us to include the LaTeX formatted table created by *xtable*. The next option `echo=FALSE`\index{knitr option!echo} hides the code from being shown in our final document. The `xtable()` function creates the summary table of our *M1* model object. Not only does it produce both complete `tabular` and `table` environments,\index{LaTeX!tabular}\index{LaTeX!table} but also through the `caption` and `label` arguments it automatically adds in the table's title and cross-reference label, respectively. Finally, notice that I added the `digits = 1` argument. As in `kable()`, this specifies that I want numbers in the table to be rounded to one decimal digit. ```{r Ch9xtableSummaryPrint, results='asis', echo=FALSE, message=FALSE} library(xtable) options(xtable.comment = FALSE) # Create LaTeX table from M1 and show the output markup xtable(M1, caption = "Linear Regression, DV: Exam Score", label = "BasicXtableSummary", digits = 1) ``` The caption is printed below the table by default. #### *xtable* for Markdown/HTML {-} We can use *xtable* and the `print.xtable()` function[^chapter_9_23] to also create tables for Markdown and HTML documents. The `xtable()` function produces, unsurprisingly, `xtable` class objects. We can run these through the `print()`\index{R function!print} function and add arguments to customize how the table is formatted. By default, `print.xtable()`'s\index{R function!print.xtable} `type` argument is set to `"latex"`.
To create an HTML table that can be inserted into Markdown and HTML documents, set the `type` argument from `"latex"` to `"html"`. For example, to create an HTML version of the table summarizing *M1* and include it in an R Markdown document, type: ````r `r ''````{r results='asis', echo=FALSE} library(xtable) # Create an xtable object from M1 m1_table <- xtable(M1, caption = "Linear Regression, DV: Exam Score", label = "BasicXtableSummary", digits = 1) # Create HTML summary table of m1_table print.xtable(m1_table, type = "html", caption.placement = "top") ``` ```` If you intend to include multiple tables in your R Markdown document, you will want to set all of the tables to be printed in HTML. You can place `options("xtable.type" = "html")` in a code chunk near the beginning of your document.[^chapter_9_24] This makes it so that you don't need to include `type = "html"` every time you use `print`. Notice in the previous code example that we also added the `caption.placement = "top"` argument. This will move the caption from the bottom of the table, as it is in Table \@ref(BasicXtableSummary), to the top. See the *xtable* package documentation[^chapter_9_25] for the full list of `print.xtable()` options. \index{R function!xtable|)} ### *texreg* for LaTeX and HTML `kable()` and *xtable* are limited when it comes to creating tables from statistical model objects. `kable` only works with matrices and data frames. *xtable* is easiest when working with only one model object at a time. Furthermore, by default these tools do not create output tables that present estimates from multiple statistical models in the style used by many prominent academic journals. The *texreg* package is very useful for creating these types of tables. It also supports more model object types than *xtable*. #### *texreg* for LaTeX {-} Imagine we want to show the estimates from a number of nested regression models in LaTeX as the next table. 
For example, to estimate nested regression models from the remaining variables in the *swiss* data set, we type: ```{r Ch9EstimateNested} # Estimate nested regression models M2 <- lm(Examination ~ Education + Agriculture, data = swiss) M3 <- lm(Examination ~ Education + Agriculture + Catholic, data = swiss) M4 <- lm(Examination ~ Education + Agriculture + Catholic + Infant.Mortality, data = swiss) M5 <- lm(Examination ~ Education + Agriculture + Catholic + Infant.Mortality + Fertility, data = swiss) ``` We can now include these model objects in one LaTeX table with *texreg*. Remember to include `results='asis'` in the code chunk head. ```{r Ch9BasictexregTable1Display, echo=TRUE, eval=FALSE, message=FALSE} library(texreg) # Create custom coefficient names cust_coef <- c('(Intercept)', 'Education', 'Agriculture', 'Catholic', 'Infant Mortality', 'Fertility') # Create nested regression model table texreg(list(M1, M2, M3, M4, M5), caption = 'Nested Estimates Table with \\emph{texreg}', caption.above = TRUE, label = 'Basic_texregTable', custom.coef.names = cust_coef) ``` ```{r Ch9BasictexregTable1Create, echo=FALSE, results='asis', message=FALSE} library(texreg) # Create custom coefficient names cust_coef <- c('(Intercept)', 'Education', 'Agriculture', 'Catholic', 'Infant Mortality', 'Fertility') # Create nested regression model table texreg(list(M1, M2, M3, M4, M5), caption = 'Nested Estimates Table with \\emph{texreg}', caption.above = TRUE, label = 'Basic_texregTable', custom.coef.names = cust_coef) ``` Notice that we placed the model objects in a list when we called `texreg()`. `texreg()` automatically created the `table` and `tabular` environments\index{LaTeX!tabular}\index{LaTeX!table} and by default centers the table.[^chapter_9_26] We added a caption and reference label with the `caption` and `label` arguments, respectively. By default, the caption is placed below the table, so we used `caption.above = TRUE` to place it on top. 
Finally, we created custom coefficient names with `custom.coef.names` that are a bit tidier than the variable names in our R data set. Your readers will appreciate easily discernible coefficient names. In the LaTeX caption, you'll notice `\\emph{texreg}`. In LaTeX the `emph` command italicizes text (we'll see this again in Chapter \@ref(LatexChapter)). We added an additional escape character `\` so that R would not try to interpret `\e` as an escape sequence and would instead pass `\emph` on to LaTeX. By default, `texreg()` uses `stars = c(0.001, 0.01, 0.05)` to determine at what p-values to display statistical significance stars. This is the same as the `lm` model summary default showing three sets of statistical significance stars. You can define the significance levels by assigning a different numeric vector to the `stars` argument. There are many other changes you can make to tables created with *texreg*. You can change the column and coefficient names, determine what type of standard errors to show, and so on. For the full list of arguments, see the help file by typing `?texreg` into your R Console. #### *texreg* for HTML {-} You can also use the *texreg* package to create tables in Markdown/HTML documents. Instead of the `texreg` function, use `htmlreg`. The syntax is largely similar, though arguments relating to LaTeX are not available, while others relating to HTML are. Here is a simple example: ```{r Ch9htmlreg, eval=FALSE, echo=TRUE} htmlreg(list(M1, M2, M3, M4, M5), caption = 'Nested Estimates Table in HTML Document', caption.above = TRUE, custom.coef.names = cust_coef) ``` Notice that we did not include the `label` argument as this is not available in HTML. The resulting table looks like this: ![](images/chapter_9/htmlregExample.png) ### Fitting large tables in LaTeX Sometimes you may have large tables that are difficult to fit onto a page in LaTeX. There are a number of ways to adjust tables so that they fit on the page.
#### LaTeX landscape tables {-} If your LaTeX table is very wide, e.g. because it shows results from many estimation models, you can use LaTeX's `lscape` package to create `landscape`\index{LaTeX!landscape} formatting environments. Rather than orienting the text of a page so that it is in profile (a long page), a `landscape` environment turns it 90 degrees so that it has a landscape orientation (a wide page). To use the *lscape*\index{LaTeX package!lscape} package, first place `\usepackage{lscape}` in your LaTeX document's preamble. Then begin a `landscape` environment with `\begin{landscape}` where you would like it located in the text. Then place the `table` environment information and *knitr* code for creating the table. Finally, close the `landscape` environment with `\end{landscape}`. #### LaTeX scalebox for tables {-} In addition, the `scalebox` command from the *graphics*\index{LaTeX package!graphics} package could be useful for fitting large tables onto a PDF page. This command expands or shrinks the text in the table. `texreg` actually has a `scalebox` argument. If you use `scalebox = 0.5`, it will halve the size of the table; `scalebox = 2` doubles it. More generally, to rescale a table use: `\scalebox{HORIZONTAL_SCALE}[VERTICAL_SCALE]{TABLE}` `HORIZONTAL_SCALE` is how much to scale the table horizontally. `VERTICAL_SCALE` is how much to scale vertically and `TABLE` is the table or R code chunk to create the table. ### *xtable* with non-supported class objects {#NonSupportedClasses} The `kable`, *texreg*, and *xtable* packages are very convenient for model objects they know how to handle. With supported class objects, the functions in these packages know where to look for the vectors containing the things---coefficient names, standard errors, and so on---that they need to create tables. With unsupported classes, however, they don't know where to look for these things. Luckily, there is a work-around. 
You tell `xtable()` where to find elements you want to include in your table. `xtable()` can handle matrix and data frame class objects. The rows of these objects become the table rows and the columns become the table columns. So, to create tables with non-supported class objects you need to: 1. find and extract the information from the unsupported class object that you want in the table, 2. convert this information into a matrix or data frame where the rows and columns of the object correspond to the rows and columns of the table that you want to create, 3. use *xtable* with this object to create the table. Imagine that you want to create a results table showing the covariate names, coefficient means, and quantiles for marginal posterior distributions estimated from a linear regression using the *brms* package [@R-brms] and data from the *swiss* data frame. Let's fit the model: \index{R package!brms}\index{R function!brm} ```{r Ch9MCMC, message=FALSE, warning=FALSE, tidy=FALSE, echo=TRUE, cache=TRUE} library(brms) # Fit model linear_brms <- brm(Examination ~ Education, data = swiss, family = gaussian(link = "identity"), refresh = 0) # Find linear_brms's class class(linear_brms) ``` Note: I included `refresh = 0` to suppress output about the model fitting process. Using the `class()` function, we see that the model output object in *linear_brms* is of the `brmsfit` class. This class is not supported by *xtable*. If you try to create a table summarizing the estimates in *linear_brms* by passing it directly to `xtable()`, you will get an error telling you the object's class is not supported. With unsupported class objects, you have to create the summary yourself and extract the elements that you want from it manually. A good knowledge of vectors, matrices, and component selection is very handy for this (see Chapter \@ref(GettingStartedRKnitr)).
First, create a summary of your output object *linear_brms*: ```{r Ch9MCMCSummary, echo=TRUE, dependson=-2} linear_brms_summary <- summary(linear_brms) ``` This creates a new object of the class `brmssummary`. We're still not there yet as this object contains not just the covariate names and so on, but also information we don't want to include in the results table, like the estimation formula. The second step is to extract a matrix from inside *linear_brms_summary* called *fixed* with the component selector (`$`). Remember that to find the components of an object, use the `names()` function. ```{r Ch9MCMCSummaryNames, echo=TRUE} names(linear_brms_summary) ``` The *fixed* matrix is where the things we want in our table are located. I find it easier to work with data frames, so let's also convert the matrix into a data frame. ```{r Ch9NBSum, echo=TRUE} linear_brms_summary_df <- data.frame(linear_brms_summary$fixed) ``` Here is what the model summary data frame looks like: ```{r Ch9NBSumDisplay, echo=TRUE} linear_brms_summary_df ``` Now we have a data frame object *xtable* can handle. After a little cleaning up (see the chapter's Appendix for more details) you can use *xtable* as before to create Table \@ref(CoefEstTable). ```{r Ch9NBTable, echo=FALSE, message=FALSE, results='asis'} library(dplyr) # Change posterior summary variable names linear_brms_summary_df <- rename(linear_brms_summary_df, `2.5%` = `l.95..CI`) linear_brms_summary_df <- rename(linear_brms_summary_df, `50%` = Estimate) linear_brms_summary_df <- rename(linear_brms_summary_df, `97.5%` = `u.95..CI`) # Reorder variables and remove the Est.
Error linear_brms_summary_df <- linear_brms_summary_df[, c("2.5%", "50%", "97.5%")] # Create table brms_table <- xtable(linear_brms_summary_df, caption = "Coefficient Estimates Predicting Examination Scores in Swiss Cantons (1888) Found Using Bayesian Linear Regression", label = "CoefEstTable") print(brms_table, caption.placement = "top") ``` It may take some hunting to find what you want, but a similar process can be used to create tables from objects of virtually any class.[^chapter_9_27] Hunting for what you want can be easier if you look inside of objects by clicking on them in RStudio's *Environment* tab. ### Creating variable description documents with *xtable* {#Vardescript_tables} You can use *xtable* to create a table describing variables in your data set and insert these into Markdown documents created with the concatenate and print (`cat`) command (see Section \@ref(catR)). This is useful because our data so far has been stored in plain-text files. Unlike binary Stata or SAS data files, plain-text data files do not include variable descriptions. Imagine that we want to create a Markdown file with a table describing the variables from the *swiss* data frame. First we will create two vectors: one for the variable names and the other for the variable descriptions. ```{r Ch9VarDescriptVect, tidy=FALSE, echo=TRUE} # Create variable vector from column names Variable <- names(swiss) # Create variable description vector Description <- c("common standardized fertility measures", "% of males involved in agriculture as occupation", "% draftees receiving highest mark on army examination", "% education beyond primary school for draftees", "% catholic", "% live births who live less-than 1 year" ) ``` In the first line we use the `names()` function to create a vector of the *swiss* data frame's column names. 
Then we create a vector of descriptions with the combine function (`c()`).\index{R function!c} Now we can combine these vectors into a matrix and use it to create an HTML table. ```{r Ch9CbindDescript, results='hide'} # Combine Variable and Description variables into a matrix descriptions_bound <- cbind(Variable, Description) # Create an xtable object from descriptions_bound descriptions_table <- xtable(descriptions_bound) # Format table in HTML descript_table <- print.xtable(descriptions_table, type = "html") ``` Finally, we can use `cat()`\index{R function!cat} to create our Markdown variable description file. ```{r Ch9Cat, tidy=FALSE, eval=FALSE} # Create variable description file cat("# Swiss Data Variable Descriptions \n", "### Source: Mosteller and Tukey, (1977) \n", descript_table, file = "swiss-variable-descriptions.md" ) ``` The first part of the `cat()` function\index{R function!cat} here is the title of the document. As we will see in Chapter \@ref(MarkdownChapter), hashes (`#`) create headers. `\n` creates a new line in the Markdown document. The next line is information on the *swiss* data frame's source. We then include the HTML table in the *descript_table* object and save it to a file called *swiss-variable-descriptions.md*. It is convenient to include the creation of this table in your data gathering makefiles and have it saved into the same directory as your data. This way it will be easy to update as you update your data and easy to find. If you are storing your data on GitHub, it will automatically render the variable description Markdown file and make it easy for others to read. See this book's makefile example for more information: .[^chapter_9_28] ### Chapter summary {-} In this chapter, we have learned how to take the results from our statistical analyses and other information from our data and dynamically present it in LaTeX and Markdown documents with knitr/R Markdown. In the next chapter, we will do the same thing with figures. 
## Appendix {-} Source code for cleaning *linear_brms_summary_df* and using it to create a LaTeX table: ```{r Ch9NBSumClean, eval=FALSE, tidy=FALSE} library(dplyr) library(xtable) # Change posterior summary variable names linear_brms_summary_df <- rename(linear_brms_summary_df, `2.5%` = `l.95..CI`) linear_brms_summary_df <- rename(linear_brms_summary_df, `50%` = Estimate) linear_brms_summary_df <- rename(linear_brms_summary_df, `97.5%` = `u.95..CI`) # Reorder variables and remove the Est. Error linear_brms_summary_df <- linear_brms_summary_df[, c("2.5%", "50%", "97.5%")] # Create table xtable(linear_brms_summary_df, caption = "Coefficient Estimates Predicting Examination Scores in Swiss Cantons (1888) Found Using Bayesian Linear Regression", label = "CoefEstTable") # Create table xtable(linear_brms_summary_df, caption = "Coefficient Estimates Predicting Examination Scores in Swiss Cantons (1888) Found Using Bayesian Normal Linear Regression") ``` Note that the new variable names are enclosed in backticks, in contrast to the example from Chapter \@ref(DataClean). The backticks allow us to specify a name that begins with a number and has special characters like the percent sign. [^chapter_9_1]: This is especially true of the small-print, high-density coefficient estimate tables that are sometimes descriptively called 'train schedule' tables. [^chapter_9_2]: For example, in a replication of Reinhart and Rogoff's [-@rr2010] much cited study of economic growth and public debt, @herndon2014 found a number of apparent transcription errors. Analysis results in the original spreadsheets appear to not have been entered into the paper's tables accurately. [^chapter_9_3]: These are not the only packages available in R for creating presentation document tables from R objects. I personally really like the *stargazer* package [@R-stargazer].
It has a similar syntax to *texreg* and is particularly good for showing results from multiple models estimated using different model types in one table. [^chapter_9_5]: For a comprehensive list of LaTeX environments, see . [^chapter_9_6]: An aside: the `tabbing` environment is a useful way to create tabbed text in LaTeX. We don't cover this here though. [^chapter_9_7]: For a comprehensive overview, see the LaTeX Wiki page on tables: . [^chapter_9_8]: You can also specify a column's width by using `m{WIDTH}` instead. Be sure to load the *array* package\index{LaTeX package!array} in the preamble for this to work. Using `m` will create a column of a specified width that is vertically justified in the middle. For example, `m{3cm}` would create a column with a width of 3 centimeters. Text in the column would automatically be wrapped onto multiple lines if need be. You can replace the `m` with either `p` or `b`. `p` vertically aligns the text at the top, `b` aligns it at the bottom. [^chapter_9_9]: If you add two vertical bars (`||`), you will get two lines. [^chapter_9_10]: If you want to include an ampersand in the text of your LaTeX document, you need to escape it like this: `\&`. [^chapter_9_11]: You can use two backslashes outside of the `tabular` environment as well to force a new line. Also, to increase the space between the lines, you can add a vertical width argument to the double backslashes. For example, `\\[3cm]` will give you a 3-centimeter gap between the current line and the next one. [^chapter_9_12]: You can also specify a column's width by using `m{WIDTH}` instead. Be sure to load the *array* package in the preamble for this to work. Using `m` will create a column of a specified width that is vertically justified in the middle. For example, `m{3cm}` would create a column with a width of 3 centimeters. Text in the column would automatically be wrapped onto multiple lines if need be. You can replace the `m` with either `p` or `b`.
`p` vertically aligns the text at the top, `b` aligns it at the bottom. [^chapter_9_13]: If you add two vertical bars (`||`) you will get two lines. [^chapter_9_14]: If you want to include an ampersand in the text of your LaTeX document you need to escape it like this: `\&`. [^chapter_9_15]: You can use two backslashes outside of the `tabular` environment as well to force a new line. Also, to increase the space between the lines you can add a vertical width argument to the double backslashes. For example, `\\[3cm]` will give you a 3-centimeter gap between the current line and the next one. [^chapter_9_16]: We will see in the next chapter how to use `figure` floats as well. [^chapter_9_17]: This command works throughout LaTeX. To reference the table, type `\ref{KEY}` in the text of your document, where `KEY` is what you set with the `\label` command. Use `\pageref` to reference the page number. [^chapter_9_18]: The table was created using the Upstanding Citizen style from the program Marked. [^chapter_9_19]: These work like arguments in R in that they change how the element is evaluated. [^chapter_9_20]: Whether or not a border appears is determined by whether or not the style sheet you are using includes borders. [^chapter_9_21]: For a description of these variables, type `?swiss` into the console. [^chapter_9_22]: If you are unfamiliar with the syntax of R statistical estimation models, the previous code might be confusing. In general 'response' ($Y$) variables are written first and are separated from the 'explanatory' ($X$) variables by a tilde (`~`). Crawley [-@crawley2005 107] notes that you can read $Y \sim X$ as '$Y$ is modeled as a function of $X$'. In later examples we will see that individual explanatory variables are generally separated by plus signs (`+`), indicating that they are included in the model, not that they are added. For more information, see Crawley [-@crawley2005 Ch. 7].
[^chapter_9_23]: Note: you can abbreviate `print.xtable()` as `print()`. [^chapter_9_24]: Of course, you will probably want to use the `include=FALSE` *knitr* option with this code chunk. [^chapter_9_25]: [^chapter_9_26]: Use the `center = FALSE` argument to override centering. If you would like to only create the `tabular` environment, use the argument `table = FALSE`. Creating your own `table` environment can be useful in situations where you want more customizability. [^chapter_9_27]: This process can also be useful for creating graphics as we will see in Chapter \@ref(FiguresChapter). [^chapter_9_28]: The long URL is: . ================================================ FILE: rep-res-3rd-edition/12-figures.Rmd ================================================ # Showing Results with Figures {#FiguresChapter} One of the main reasons that many people use R is to take advantage of its comprehensive and powerful set of data visualization tools. Visually displaying information with graphics is often a much more effective way of presenting both descriptive statistics and analysis results than the tables we covered in the last chapter.[^chapter_10_1] Nonetheless, dynamically incorporating figures with knitr/R Markdown has many of the same benefits as dynamically including tables, especially the ability to have data set or analysis changes automatically cascade into your presentation documents. The basic process for including figures in knitted presentation documents is also very similar to including tables, though there are some important extra considerations we need to make to properly size the figures and be able to include interactive visualizations in our presentation documents. In this chapter we will first learn how to include non-knitted graphics in LaTeX and Markdown documents before turning to dynamically knit R graphics into presentation documents. 
In the remainder of the chapter, we will look at how to actually create graphics with R including some of the fundamentals of R's default graphics package, as well as the *ggplot2* [@R-ggplot2] and *googleVis* [@R-googleVis] packages. In each case we will focus on how to include the figures created by these packages in knitted presentation documents. ## Including Non-knitted Graphics Understanding how *knitr*/*rmarkdown* dynamically include figures is easier if you understand how figures are normally included in LaTeX and Markdown. Unlike a word processing program like Microsoft Word, in LaTeX, Markdown, HTML, and other markup languages you don't copy and paste figures into your document. Instead, you link to an image file outside of your markup document. Typically these image files are in formats such as *PDF*, *PNG*, and *JPEG*.[^chapter_10_2]\index{PNG}\index{JPEG} While you lose the flexibility of drag and drop, there are advantages to this method of including graphics. The first is that whenever the image files are changed, the changes are updated in the final presentation document when it is compiled, no recopying and pasting. The second advantage is that the images are sized and placed with the markup code rather than pointing and clicking. This is tedious at first, but saves considerable time and frustration when a document becomes larger. It also makes it easy to consistently format multiple images in a document. If the image files are in the same directory as the markup document, we don't need to specify the image's full file path, only its name. If they are in another directory, we need to include additional file path information. Remember to use relative paths when possible. In this section we will learn how to include graphics files in documents created with LaTeX and Markdown. 
### Including graphics in LaTeX The main way to include graphics (graphs, photos, and so on) in LaTeX documents is to use the `includegraphics` function\index{LaTeX!includegraphics} to link to image files. To have the full range of features for `includegraphics`, make sure to load the *graphicx* package in your document's preamble.\index{LaTeX package!graphicx} Imagine that we wanted to include an image of butterflies stored in a file called *HeliconiusMimicry.png* in a LaTeX-produced document.[^chapter_10_3] We type: ````latex \includegraphics[scale=0.8]{HeliconiusMimicry.png} ```` In the square brackets, you'll notice `scale=0.8`. This formats the image to be included at 80 percent of its actual size. You can use other options such as `height` to specify the height, `width` to specify the width, and `angle` to specify the angle at which to rotate the image. You can add more than one option if they are separated by commas. Rather than hard coding the width in exact centimeters, you can determine its width as a proportion of the text width using `\textwidth`.[^chapter_10_4]\index{LaTeX!textwidth} For example, to set our image at 80 percent of the text width we can type: ````latex \includegraphics[width=0.8\textwidth]{HeliconiusMimicry.png} ```` #### `figure` float environment {-} \index{LaTeX!float|(} Most often you will want to include LaTeX figures in a `figure` float environment.\index{LaTeX!figure environment} The *figure* environment works almost exactly the same way as the `table` environment we saw in the last chapter. It allows you to separate the figure from the text, add a caption, and label the figure. We begin the environment with `\begin{figure}[POSITION_SPEC]`. `POSITION_SPEC` can have the same values as we saw earlier with tables in Chapter \@ref(TablesChapter). We can then include a `caption` and `label` function.\index{LaTeX!label}\index{LaTeX!caption} The environment is closed with `\end{figure}`.
For example, to create Figure \@ref(fig:ExampleLaTeXFigure) exactly as is, I used the following code:[^chapter_10_5] ````latex \begin{figure}[ht] \begin{center} \includegraphics{HeliconiusMimicry.png} \end{center} \caption{An Example Figure in LaTeX} {\scriptsize{Image source: \cite{meyer2006}}} \label{ExampleLaTeXFigure} \end{figure} ```` Notice that after the call to end the `center` environment we include `{\scriptsize{Image source: \cite{meyer2006}}}`. This includes a note in the figure environment giving the image's source. The note moves with the figure and is separate from the text. The `scriptsize` function\index{LaTeX!scriptsize} transforms the text to smaller than normal size font. See Chapter \@ref(LatexChapter) for more details on LaTeX font sizes. The function `\cite{meyer2006}` inserts a citation from the bibliography for @meyer2006. We will also discuss bibliographies in more detail in Chapter \@ref(LatexChapter).\index{LaTeX!cite} \begin{figure}[ht] \begin{center} \includegraphics{images/chapter_10/HeliconiusMimicry.png} \end{center} \caption{An Example Figure in LaTeX} {\scriptsize{Image source: \cite{meyer2006}}} \label{fig:ExampleLaTeXFigure} \end{figure} \index{LaTeX!float|)} ### Including graphics in Markdown/HTML Markdown has a function similar to LaTeX's `includegraphics`. It goes like this: `![ALT_TEXT](FILE_PATH)`. This syntax may seem strange now, but it will hopefully make more sense when we cover Markdown hyperlinks in Chapter \@ref(MarkdownChapter). This is what it is intended to imitate. `ALT_TEXT` refers to HTML's `alt` (alternative text) attribute.\index{HTML!alt} This should be a very short description of the image that will appear if it fails to load in a web browser. `FILE_PATH` specifies the image's file path.[^chapter_10_6] Here is an example using the image we worked with before. ````markdown ![ButterflyImage](HeliconiusMimicry.png) ```` Note that the file path can be a URL.
You may, for example, store an image on GitHub and use its raw URL to link to it in the Markdown document.[^chapter_10_7] Markdown does not easily include ways to resize or reposition an image. If you want to resize or reposition your image, it is often most straightforward to use HTML markup. Probably the simplest way to include images with HTML is by using the `img` (image)\index{HTML!img} element tag.\index{HTML!tag} To create the equivalent of what we just did in Markdown with HTML, type: ````html <img src="HeliconiusMimicry.png" alt="ButterflyImage"> ```` The `src` (source)\index{HTML!src} attribute specifies the file path. To change the width and height of the image, use the `width` and `height` attributes.\index{HTML!width}\index{HTML!height} For example: ````html <img src="HeliconiusMimicry.png" alt="ButterflyImage" width="100px" height="100px"> ```` creates an image that is 100 pixels (`px`) wide by 100 pixels high.[^chapter_10_8]\index{HTML!px} It is also possible to specify the alignment of figures in Markdown with a custom CSS\index{CSS} style file. I don't cover how to do that here. ### Non-knitted graphics with *knitr*/*rmarkdown* Now that we've seen how LaTeX, Markdown, and HTML include non-dynamically generated graphics, it raises a question: how do we include these graphics in a document that we intend to use *R Markdown* to compile to more than one of these formats? *knitr* includes the `include_graphics()` function just for this purpose.\index{R function!include\_graphics} For example, in a code chunk place: ````r `r ''````{r fig.cap="An Example Figure"} knitr::include_graphics('HeliconiusMimicry.png') ``` ```` Now the figure will be included regardless of which markup language we compile to. Notice the code chunk option `fig.cap`. In the next section, we discuss these kinds of *knitr* options in detail.
## Basic *knitr*/*rmarkdown* Figure Options In addition to including precompiled images with the `include_graphics()` function, *knitr*, and by extension *rmarkdown*, allows us to combine a figure's creation by R with its inclusion in a presentation document. They are tied together and update together. We use *knitr* chunk options to specify how the figure will look in the presentation document and where it will be saved. We can also use them to specify captions. Let's learn some of the more important chunk options for figures. ### Chunk options #### `fig.path` {-} \index{knitr option!fig.path|(} When you use *knitr* to create and include figures in your presentation documents, it (1) runs the code you give it to create the figure, (2) automatically saves it into a particular directory,[^chapter_10_9] and (3) includes the necessary LaTeX or Markdown code to include the figure in the final presentation document. By default, *knitr* saves images into a folder (it creates) called *figure* located in the working directory.[^chapter_10_10] You can tell *knitr* where to save the images with the `fig.path` option. Simply use the file path naming conventions suitable for your system and include the new path in quotation marks. Note that if you use *rmarkdown* to compile to HTML, by default the graphic will not be saved in a separate file, but instead converted to a format that is embedded directly in the HTML markup document. \index{knitr option!fig.path|)} #### `out.height` {-} \index{knitr option!out.height} To set the height that a figure will be in the final presentation document, use the `out.height` option. In R LaTeX documents, you can set the height using centimeters, inches, or as a proportion of a page element. In R Markdown documents, you use pixels to set the height. For example, to set a figure's height in an R Markdown document to 200 pixels, use `out.height='200px'`.
#### `out.width` {-} \index{knitr option!out.width} Similarly, we can set the width of a *knitr* created figure using the `out.width` option. The same rules apply as with `out.height`. For example, to have a figure show up at 80 percent of the text width in an R LaTeX document, use: `out.width='0.8\\textwidth'`.\index{LaTeX!textwidth} Notice that there are two backslashes before `textwidth`. As we saw earlier, the LaTeX function only has one. However, all *knitr* code chunk options must be written as they would be in R. We need to escape the backslash with the backslash escape character, i.e. use two backslashes. #### `fig.align` {-} \index{knitr option!fig.align} You can set a knitted figure's alignment using `fig.align`. The option can be set to `left`, `center`, or `right`. To center a figure, add `fig.align='center'`. #### `fig.cap` {-} \index{knitr option!fig.cap} If your document compiles to LaTeX, you can use the `fig.cap` option to set the figure's caption as we did in the example above. #### Other figure chunk options {-} The previous options are probably the most commonly used ways of adjusting figures with *knitr*. However, *knitr* has many other chunk options to help you adjust your figures so that they are incorporated into your presentation documents the way that you want. For example, the option `fig.lb` allows you to set the label.[^chapter_10_11] As we will see below, you can use the `dev` option to choose the figure's output file format, e.g. PDF, PNG, JPEG. Please see the official *knitr* code chunk options webpage for more information on figure chunk options: <https://yihui.org/knitr/options/>. ### Global options If you want all of your figures to share the same options---e.g. same height and alignment---you can set global figure options at the beginning of your document with `opts_chunk$set`. Imagine that we are making an R LaTeX Sweave-style document and want all of our figures to be center aligned and 80 percent of the text width.
We type: ````r opts_chunk$set(fig.align = "center", out.width = "0.8\\textwidth") ```` You can also set some global figure options, such as `fig_height` and `fig_width` in your *rmarkdown* YAML header.\index{YAML} ## Knitting R's Default Graphics R's *graphics* package, loaded by default, includes functions to create numerous plot types. These include `hist()`\index{R function!hist} for histograms, `pairs()`\index{R function!pairs} for scatterplot matrices, `boxplot()`\index{R function!boxplot} for creating boxplots, and the versatile `plot()`\index{R function!plot} for creating x-y plots, including scatterplots and bar charts depending on the data's type. There are many useful resources for learning how to fully utilize R's default graphics capabilities. These include Paul Murrell's [-@murrell2011] comprehensive *R Graphics* book. The Cookbook for R[^chapter_10_12] and Quick-R[^chapter_10_13] websites are also helpful. Winston Chang [-@chang2012], the maintainer of the Cookbook for R, also has a full book devoted to creating R graphics. Kieran Healy [-@healy2018data] is a strong introduction to data visualisation in general with R examples. In this section we are going to see how to include R's default graphics in our LaTeX and Markdown presentation documents. We will also see an example of how to source the creation of a graph from a segmented analysis file. Most of R's default graphics capabilities create static graphics. They are not animations or interactive. The discussion in this section is exclusively about using static graphics with *knitr*/*rmarkdown*. Later in the chapter, we will discuss how to knit interactive graphics. Let's look at an example we first saw at the end of Chapter \@ref(StatsModel). Remember that we accessed an R source code file stored on GitHub to create a simple scatterplot of cars' speed and stopping distances using R's *cars* data set, which is loaded by default.
We haven't yet seen the code in the R source file that created the plot. In the *cars* data frame, the variable **speed** contains the stopping speed, and **dist** contains the stopping distances. Here is the code to create the plot: ```{r Ch10CarsPlotCode, eval=FALSE, tidy=FALSE} # Create simple scatterplot of cars' speed and stopping distance plot(x = cars$speed, y = cars$dist, xlab = "Speed (mph)", ylab = "Stopping Distance (ft)", cex.lab = 1.5) ``` We select the variables from *cars* to plot on the $x$- and $y$-axes of our graph with the component selector (`$`).\index{component selector} Then we use the `xlab` and `ylab` arguments to specify the $x$- and $y$-axes labels. We could have added a title for the plot using the `main` argument. We didn't do this because we will give the plot a title in the LaTeX `figure` environment.\index{LaTeX!figure} The `cex.lab` argument increased the labels' font size. The argument specifically determines how to scale the labels relative to the default size: 1.5 means 50 percent larger than the default. Now let's see how to create this plot with *knitr* and include it in a LaTeX `figure` environment. ````latex \begin{figure}[ht] `r ''````{r echo=FALSE, fig.align='center', out.width='8cm'} plot(x = cars$speed, y = cars$dist, xlab = "Speed (mph)", ylab = "Stopping Distance (ft)", cex.lab = 1.5) ``` \caption{Example Simple Scatterplot Using \texttt{plot}} \label{BasicFigureExample} \end{figure} ```` \begin{figure}[ht] ```{r echo=FALSE, fig.align='center', out.width='8cm'} plot(x = cars$speed, y = cars$dist, xlab = "Speed (mph)", ylab = "Stopping Distance (ft)", cex.lab = 1.5) ``` \caption{Example Simple Scatter Plot Using \texttt{plot}} \label{fig:BasicFigureExample} \end{figure} This code produces Figure \@ref(fig:BasicFigureExample).[^chapter_10_14] If you are familiar with R graphics, you will notice that we did not need to tell *knitr* to save the file in a particular format. 
Instead, behind the scenes it automatically saves the plot as a PDF file in a folder called *figure* that is a child of the current working directory. You can choose the figure file's format with the `dev` (graphical device) chunk option.\index{knitr option!dev} For example, to save the figure in a PNG formatted file, add the chunk option `dev='png'`. You can choose any graphical device format supported by R. For a full list of R's graphical devices, type `?Devices` into your console. One reason you might want to change the format is to reduce your presentation document's file size. Using a bitmap format like PNG will create smaller files than PDFs, though lower-quality images. We could, of course, link to the original R source code file stored on GitHub with the `source_url()` function.\index{R function!source\_url} Let's look at an example of this with a different source code file. Remember in Chapter \@ref(DataGather) we used a makefile to gather data from three different sources on the internet. The CSV is called *main-data.csv* and is stored on GitHub at: <http://bit.ly/V0ldsf>.[^chapter_10_15] We can download this data into R and make the following scatterplot matrix (Figure \@ref(fig:Ch10ScatterPlotMatrix)) with this code: ```{r Ch10ScatterPlotMatrix, tidy=FALSE, fig.cap='Example of a Scatterplot Matrix', fig.lb='ScatterMatrix'} # Download data main_data <- rio::import("http://bit.ly/V0ldsf", format = "csv" ) # Subset main_data so that it only includes the year 2003 data_sub <- subset(main_data, year == 2003) # Remove iso2c, country, year variables # Keep reg_4state, disproportionality, FertilizerConsumption data_sub <- data_sub[, c("reg_4state", "disproportionality", "FertilizerConsumption")] # Create a scatterplot matrix pairs(x = data_sub) ``` This is a lot of code, but you should be familiar with most of it.
You will notice that after downloading the data we cleaned it up in preparation for plotting with the `pairs()`\index{R function!pairs} function by removing data from all years other than 2003 and all of the country-year identifying variables. Finally, we created the scatterplot matrix with `pairs()`. To dynamically include the plot in our final document, we don't need to include all of this code in a code chunk in our markup document. A file containing the code is available on GitHub.[^chapter_10_16] So we only need to use `source_url()`\index{R function!source\_url} to link to it. I've shortened the raw source code file's URL to: . Let's look at the syntax for knitting this into an R Markdown file: ````r `r ''````{r echo=FALSE, fig.cap='Example of a Scatterplot Matrix'} # Create scatterplot matrix from main-data.csv devtools::source_url("http://bit.ly/TE0gTc") ``` ```` Because we have linked all the way back to the original data set *main-data*, any time it is updated by the makefile, the update will automatically cascade all the way through to our final presentation document the next time we knit it. ## Including *ggplot2* Graphics \index{R package!ggplot2|(}\index{R function!ggplot|(} The *ggplot2* package[^chapter_10_17] [@R-ggplot2] is probably one of the most popular packages for making graphics with R. It greatly expands the aesthetic and substantive tools R has for displaying quantitative information. Figures created with *ggplot2* are (generally) static,[^chapter_10_18] so they are included in knitted documents the same way as most of R's default graphics. There are a number of very good resources for learning how to use *ggplot2*. These include Hadley Wickham's *ggplot2* book [-@whickham2009book] and article [-@whickham2010journal]. The official *ggplot2* website[^chapter_10_19] has up-to-date information. 
I've also found the Cookbook for R website helpful.[^chapter_10_20] Given that there is already extensive good documentation on *ggplot2*, we are not going to learn the full details of how to use the package here. Instead, let's look at some examples of how to manipulate a data frame and a regression results object so that they can be graphed with *ggplot2*. First we will create a multi-line time series plot. Then we will create a caterpillar plot of regression results. Along with giving you a general sense of how *ggplot2* works, the examples illuminate how *ggplot2* can be made part of a fully reproducible research workflow.[^chapter_10_21] Sometimes we may want to show how multiple variables change together over time. For example, imagine we have data on inflation in the United States along with inflation forecasts made by the US Federal Reserve two quarters beforehand. The data is stored on GitHub at: <https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/InflationData.csv>.[^chapter_10_22] I've loaded the data into R and put it into an object called *inflation_data*. It looks like this: ```{r Ch10Loadinflation_data, include=FALSE, message=FALSE, error=FALSE} # Create URL object inflation_url <- "https://raw.githubusercontent.com/christophergandrud/Rep-Res-Examples/master/Graphs/InflationData.csv" # Load data inflation_data <- rio::import(inflation_url, format = "csv") ``` ```{r Ch10Headinflation_data} names(inflation_data) ``` We want to create a plot with **Quarter** as the $x$-axis, inflation as the $y$-axis, and two lines. One line will represent **ActualInflation** and the other **EstimatedInflation**.
To do this, we need to reshape our data so that the inflation variables are in long format like this: \begin{tabular}{l l l} \hline Quarter & Variable & Value \\[0.25cm] \hline\hline 1969.1 & ActualInflation & \\ 1969.1 & EstimatedInflation & \\ 1969.2 & ActualInflation & \\ 1969.2 & EstimatedInflation & \\ \ldots & & \\ \hline \end{tabular} \vspace{0.5cm} We can use the `pivot_longer` function from *tidyr* that we first saw in Chapter \@ref(DataClean) to reshape the data. The variable identifying the observations in this case is `Quarter`. The **ActualInflation** and **EstimatedInflation** variables (in columns two and three) are the variables that we want to pivot. So let's pivot the data: ```{r Ch10GatherInflation, message=FALSE} library(tidyr) # Pivot inflation_data inflation_long <- pivot_longer(inflation_data, cols = 2:3, names_to = "variable") inflation_long ``` Now we have a data set we can use to create our line graph with *ggplot2*. Let's cover a few basic *ggplot2* ideas that will help us understand the following code better. First, plots are composed of layers including the coordinate system, points, labels, and so on. Each layer has aesthetics, including the variables plotted on the $x$- and $y$-axes, label sizes, colors, and shapes. Aesthetic elements are defined by the `aes()` argument. Finally, the main layer types are called geometrics, including lines, points, bars, and text. Functions that set geometrics usually begin with `geom`. 
For example, the geometric to create lines is `geom_line()`.\index{R function!geom\_line}\index{R function!ggplot} ```{r Ch10ggplot2Lines, eval=FALSE, tidy=FALSE} library(ggplot2) # Create plot line_plot <- ggplot(data = inflation_long, aes(x = Quarter, y = value, color = variable, linetype = variable)) + geom_line() + scale_color_discrete(name = "", labels = c("Actual", "Estimated")) + scale_linetype(name = "", labels = c("Actual", "Estimated")) + xlab("Quarter") + ylab("Inflation") + theme_bw(base_size = 15) ``` \index{R function!scale\_color\_discrete}\index{R function!scale\_linetype}\index{R function!xlab}\index{R function!ylab}\index{R function!aes} You can see we set the $x$- and $y$-axes using the **Quarter** and **value** variables. We told *ggplot* that elements in the geometric layer should have lines with different colors and line types (dashed, dotted, and so on) based on the value of **variable** that they represent. `geom_line` specifies that we want to add a line geometric layer.[^chapter_10_23] `scale_color_discrete()` and `scale_linetype()` are used here to hide the plot's legend title with `name = ""` and customize the legend's labels with `labels = . . .`. You can also use them to determine the specific colors and line types you would like to use. `xlab()` and `ylab()` set the axes' labels. You can add a title with `ggtitle`. Finally, I added `theme_bw()` so that the plot would use a simple black-and-white theme. We added the argument `base_size = 15` to increase the plot's font size. 
All of the code required to create this graph is on GitHub at: <https://bit.ly/2FaMkOJ>.[^chapter_10_24] To knit the graph into a LaTeX document manually specifying the figure environment, type: \index{LaTeX environment!figure} ````latex \begin{figure}[ht] \caption{Example Multi-line Time Series Plot Created with \emph{ggplot2}} \label{ggplot2Line} \begin{center} `r ''````{r echo=FALSE, out.width='10cm', out.height='8cm'} # Create plot devtools::source_url("https://bit.ly/2FaMkOJ") ``` \end{center} \end{figure} ```` ```{r Ch10MultiLines, echo=FALSE, message = FALSE, warning=FALSE, out.width='10cm', out.height='8cm', fig.cap='\\emph{ggplot2} Time Series Line Plot'} # Create plot devtools::source_url("https://bit.ly/2FaMkOJ") ``` The syntax for including this and other *ggplot2* figures in an R Markdown document is the same as we saw for default R graphics. ### Showing regression results with caterpillar plots Many packages that estimate statistical models from data in R have built-in plotting capabilities. For example, the *survival* package [@R-survival] has the `plot.survfit()`\index{R function!plot.survfit} function for plotting survival curves created using event history analysis. These plots can be knitted into presentation documents like the plots we have seen already. However, sometimes either a package doesn't have built-in functions for plotting model results the way you want to and/or you want to use *ggplot2* to improve the aesthetic quality of the plots they do create by default. In either case, you can almost always create the plot that you want by first breaking into the model results object, extracting what you want, then plotting it with *ggplot2*. The process is very similar to what we did in Chapter \@ref(TablesChapter) to create custom tables.
To illustrate how this can work, let's create a caterpillar plot, like the following figure, showing the mean coefficient estimates and the uncertainty surrounding them from a Bayesian normal linear regression model using the *swiss* data frame. Here is our model: ```{r Ch10SwissModel, message=FALSE, error=FALSE, warning=FALSE, cache=TRUE} # Fit model linear_brms_2 <- brm(Examination ~ Education + Agriculture + Catholic + Infant.Mortality, data = swiss, family = gaussian(link = "identity"), refresh = 0) ``` Remember from Chapter \@ref(TablesChapter) that we can create an object summarizing our estimation results like this: ```{r Ch10ModelSummaryResults, tidy=FALSE, dependson=-1} # Create summary object linear_brms_2_sum <- summary(linear_brms_2) # Create summary data frame linear_brms_2_sum_df <- data.frame(linear_brms_2_sum$fixed) # Show data frame linear_brms_2_sum_df ``` We want to use *ggplot2* to create credibility intervals for each variable with **l.95..CI** as the minimum value and **u.95..CI** as the maximum value. These are the lower and upper bounds of the middle 95 percent of the estimates' marginal posterior distributions, i.e. the 95 percent credibility intervals.[^chapter_10_25] We will also create a point at the **mean** of each estimate. To do this, we will use *ggplot2*'s `geom_pointrange`\index{R function!geom\_pointrange} function. First we need to do a little tidying up. ```{r Ch10SubsetBayes, tidy=FALSE} # Convert row.names to column linear_brms_2_sum_df$Variable <- row.names(linear_brms_2_sum_df) # Keep only coefficient estimates ## This allows for a more interpretable scale linear_brms_2_sum_df <- subset(linear_brms_2_sum_df, Variable != "Intercept") ``` The first line of executable code creates a proper variable out of the data frame's row.names attribute. In this case, `row.names` contains the names of the variables included in the regression. The second executable line removes the *Intercept* estimates. 
This allows the variables' coefficient estimates to be plotted on a scale that enables easier interpretation. Now we can create our caterpillar plot (Figure \@ref(fig:Ch10CatPlot)). ```{r Ch10CatPlot, tidy=FALSE, message=FALSE, fig.cap='Example Caterpiller Plot'} library(dplyr) # Rename variables to make them easier for ggplot2 to work with linear_brms_2_sum_df <- rename(linear_brms_2_sum_df, lower = `l.95..CI`) linear_brms_2_sum_df <- rename(linear_brms_2_sum_df, upper = `u.95..CI`) # Make caterpillar plot ggplot(data = linear_brms_2_sum_df, aes(x = reorder(Variable, lower), y = Estimate, ymin = lower, ymax = upper)) + geom_pointrange(size = 1.4) + geom_hline(yintercept = 0, linetype = "dotted") + xlab("Variable") + ylab("Coefficient Estimate") + coord_flip() + theme_bw(base_size = 20) ``` There are some new pieces of code in here, so let's take a look. First, the data frame is reordered from the highest to lowest value of **l.95..CI** using the `reorder()` function.\index{R function!reorder} We renamed this column "lower" so that it would be easier to work with in `ggplot()`.\index{R function!ggplot} Reordering by these values makes the plot easier to read. The middle point of the point range is set with `y` and the lower and upper bounds with `ymin` and `ymax`. The `geom_hline()`\index{R function!geom\_hline} function used here creates a dotted horizontal line at 0, i.e. no effect. `coord_flip()`\index{R function!coord\_flip} flips the plot's coordinates so that the variable names are on the $y$-axis. We can include this plot in a knitted document the same way as before. Note that we created this example to help you understand the power of *ggplot2* to create new graphics from complex objects. This particular task---creating caterpillar plots from *brms* objects---has been packaged into the `stanplot()`\index{R function!stanplot} function that comes with *brms*.
\index{R package!ggplot2|)}\index{R function!ggplot|)} ## JavaScript Graphs with *googleVis* Markus Gesmann and Diego de Castillo's [-@R-googleVis] *googleVis* package allows us to use Google's Visualization API from within R to create interactive tables, plots, and maps with Google Chart Tools. Because the visualizations are written in JavaScript, they can be included in HTML presentation documents created by R Markdown. Unfortunately, they cannot be directly[^chapter_10_26] included in LaTeX-produced PDFs. The *animation* package [@R-animation] does have some limited features for including interactive visualizations in PDFs (as well as HTML documents) and is worth investigating if you want to do this. The *gganimate* package allows you to animate *ggplot2* graphics as GIFs.\index{GIF} However, these cannot be included in PDFs. ### Basic googleVis figures Let's briefly look at how to make one type of figure with *googleVis*: a choropleth map. This is created with the `gvisGeoChart()`\index{R function!gvisGeoChart} function. We will use this example to illustrate how to incorporate *googleVis* figures into R Markdown.[^chapter_10_27] Imagine that we want to map global fertilizer consumption in 2011 using the World Bank data we gathered in Chapter \@ref(DataGather). Remember that the data was highly right skewed, so we will actually map the natural logarithm of the **fert_cons** variable.[^chapter_10_28] Assuming that we have already loaded the *main-data* data set, here is the code: ```{r Ch10GeoMap, eval=FALSE, tidy=FALSE} # Load googleVis library(googleVis) # Subset main_data so that it only includes 2011 data_sub <- subset(main_data, year == 2011) # Keep values of fert_cons greater-than 0.1 data_sub <- subset(data_sub, fert_cons > 0.1) # Find the natural logarithm of fert_cons ## Round the results to one decimal digit.
data_sub$fert_cons_log <- round(log(data_sub$fert_cons), digits = 1) # Make a map of Fertilizer Consumption fc_map <- gvisGeoChart(data = data_sub, locationvar = "iso2c", colorvar = "fert_cons_log", options = list( colors = "['#ECE7F2', '#A6BDDB', '#2B8CBE']", width = "780px", height = "500px") ) ``` The `locationvar` argument specifies the variable with information on each observation's location. Google Chart Tools can use ISO two-letter country codes to determine each country's location. `colorvar` specifies the variable with the values to map for each country. We can determine other options by creating a list-type object with arguments specifying characteristics such as the map's width, height, and colors. The colors here are written using hexadecimal values. This is a commonly used format for specifying colors on websites.[^chapter_10_29] To view the figure on your computer, use *googleVis*'s `plot()` function. For example, to view our map, type: ```{r Ch10ViewGoogleVisMapPlot, eval=FALSE} plot(fc_map) ``` Note that you need to be connected to the internet to view figures created by *googleVis*; otherwise, your image will not be able to access the required JavaScript files from the Google Visualization API. \begin{figure} \begin{center} \includegraphics[width=\textwidth]{images/chapter_10/GeoChartScreenShot.png} \end{center} \caption{Screenshot of a \emph{googleVis} Geo Chart} \label{GeoMapImage} \end{figure} ### Including *googleVis* in knitted documents Typing `print(fc_map, tag = "chart")`\index{R function!print} in a knittable document would print the entire JavaScript code needed to create the map. Much like we saw with tables produced with *xtable* and *texreg* in Chapter \@ref(TablesChapter), we need to change the code chunk `results` option to include the map as a map rather than as JavaScript markup.
To have the visualization show up in your HTML output, rather than the code block, set the code chunk option to `results='asis'`.[^chapter_10_30] For example, the full code needed to create and print *fc_map* is available at: <https://bit.ly/2CfXWOs>.[^chapter_10_31] To knit the map into an R Markdown document, type: ````r `r ''````{r echo=FALSE, message=FALSE, results='asis'} # Create and print geo map devtools::source_url("https://bit.ly/2CfXWOs") ``` ```` ### JavaScript Graphs with *htmlwidgets*-based packages The number of tools for creating JavaScript graphs from R that can be knitted into HTML files is growing rapidly. The *htmlwidgets* [@R-htmlwidgets] framework is especially making the development of these tools easier. There are tools built on *htmlwidgets* for creating maps, network graphs, time series graphs, and interactive tables, among others. Though the syntax of each of these tools differs, they can all easily be included into R Markdown documents. Often you run their core functions in a code chunk, without needing to use an additional call to `print` or `plot`. ### Chapter summary {-} In this chapter we have learned how to take results from our statistical analyses and other information from our data and dynamically present them in figures. In the next chapters, we will learn the details of how to create the LaTeX and Markdown presentation documents we use to present the tables we created in Chapter \@ref(TablesChapter) and the figures we created in this chapter. [^chapter_10_1]: There are, of course, a number of exceptions to this rule of thumb. @vanbelle2008 [Ch. 9] argues that a few numbers should be listed in a sentence, many numbers shown in tables, and relationships between numbers are best shown with graphs. Similarly, @tufte2001 argues that tables tend to outperform graphics for displaying 20 or fewer numbers. Graphics often outperform tables for showing larger data sets and relationships within the data.
[^chapter_10_2]: PDF: Portable Document Format, PNG: Portable Network Graphic, JPEG: Joint Photographic Experts Group.\ A quick note about file formats: By default, *knitr* creates PDF-formatted figure files when knitting R LaTeX documents. These figures, generally built with vector graphics, allow you to zoom in on them by any amount without them becoming pixelated. This means that your images will be crisp in PDF presentation documents. For Markdown documents, *knitr* creates PNG images. PNG images are usually relatively high quality and can be rendered directly on websites, unlike PDFs. JPEG formatted files usually take up less disk space than PDF and PNG files. However, their quality is also worse and can often look very pixelated. For more information, Wikipedia has a comprehensive comparison of graphics file formats at: . [^chapter_10_3]: The image used here is from @meyer2006. [^chapter_10_4]: Note there are a number of other ways to set the size of a figure relative to a page element. See the LaTeX Wiki Book for more details: . [^chapter_10_5]: For simplicity, this code does not include the full image's actual file path. [^chapter_10_6]: You can also include a title in quotation marks after the file path. This specifies the HTML `title` attribute. However, this attribute does not create a title for the image in the way that `caption` does for LaTeX float figures. Instead, it creates a tooltip, a small box that appears when you place your cursor over the image. Specifying descriptive alt text is very useful for screen readers that help visually impaired people access web content. [^chapter_10_7]: Use the URL for the raw version of the file for images stored on GitHub. [^chapter_10_8]: A pixel is the smallest discrete part of images displayed on a screen. See the "pixel" Wikipedia page for more details: . [^chapter_10_9]: If a code chunk creates more than one figure, *knitr* automatically saves each into its own file in the same directory. 
[^chapter_10_10]: File names are based on the code chunk label where they were created. [^chapter_10_11]: In this chapter we will set this option in the markup rather than the code chunk. I prefer doing this because *knitr* options need to be on the same line and so they can sometimes result in very long lists of options that are difficult to read. [^chapter_10_12]: [^chapter_10_13]: [^chapter_10_14]: Note that I did not specify the center environment. This is because it is specified in a *knitr* global chunk option. [^chapter_10_15]: The full version of the URL is: [^chapter_10_16]: See: . [^chapter_10_17]: "GG" stands for grammar of graphics and "2" indicates that it is the second major version of the package. [^chapter_10_18]: It is possible to combine a series of figures created with *ggplot2* into an animation. For a nice example of an animation using *ggplot2*, see Jerzy Wieczorek's animation of 2012 US presidential campaigning: . [^chapter_10_19]: [^chapter_10_20]: [^chapter_10_21]: Note that everything we do here with *ggplot2* can also be done with R's default graphics, though the appearance will be different. [^chapter_10_22]: This data is from @gandrudgrafstrom2012. The example here partially recreates Figure 1 from that paper. [^chapter_10_23]: Remember from Chapter \@ref(GettingStartedRKnitr) that functions must be followed by parentheses. These layers are functions so they need to be followed by parentheses. [^chapter_10_24]: The full URL is: . [^chapter_10_25]: The procedures used here are also generally applicable for graphing frequentist confidence intervals once you have calculated the confidence intervals. One useful function for doing this is `confint`. [^chapter_10_26]: The example in this chapter is from a screenshot. [^chapter_10_27]: For demonstrations of the full range of plotting functions available, visit the *googleVis* website: . [^chapter_10_28]: You'll notice in the code below that we remove all values of **fert_cons** less than 0.1. 
This is so that we can calculate integer values with the natural logarithm. [^chapter_10_29]: You can also use hexadecimal values in *ggplot2*. The Color Brewer 2 website (<http://colorbrewer2.org/>) is very helpful for picking hexadecimal color palettes, among others. [^chapter_10_30]: You can use `results='asis'` to include almost any type of JavaScript graphics. For an example using the D3 JavaScript library and *knitr* see this page by Yihui Xie: . [^chapter_10_31]: The full URL is: . [^chapter_10_32]: You can use the `gvisMotionChart()` function to make these. [^chapter_10_33]: This is because motion charts and annotated time line charts rely on Flash, unlike the other Google visualizations. For more information see Markus Gesmann's blog post at: . ================================================ FILE: rep-res-3rd-edition/13-latex.Rmd ================================================ # (PART) Presentation Documents {-} # Presenting with LaTeX {#LatexChapter} We have already begun to see how LaTeX works for presenting research results. This chapter gives you a more detailed and comprehensive introduction to basic LaTeX document structures and commands. It is not a complete introduction to all that LaTeX is capable of, but we will cover enough that you will be able to create an entire well-formatted article and slideshow with LaTeX that you can use to dynamically present your results. For basic LaTeX documents, such as short articles or simple presentations, it may often be quicker and simpler to write the markup using an R Markdown document and compile it to PDF with the *rmarkdown* package. Markdown syntax is much simpler than normal LaTeX. However, there are at least two reasons why it is useful to become familiar with LaTeX syntax. First, understanding LaTeX syntax will help you debug issues you might encounter when using *rmarkdown* with LaTeX that would otherwise be mysterious if you were only familiar with Markdown.
Second, R Markdown has limited capabilities for creating more complex documents such as books and documents with highly customizable formatting needs.[^chapter_11_bookdown] Using *knitr* LaTeX or including LaTeX syntax directly in R Markdown documents can be useful in these situations. In this chapter we will learn about basic LaTeX document structures and syntax as well as how to dynamically create LaTeX bibliographies with BibTeX, R, and *knitr*. Finally, we will look at how to create PDF beamer slideshows. **Note:** This chapter and the following chapter are unusual for this book in that they do not refer to knitr and R Markdown interchangeably. Remember you can almost always include LaTeX syntax in an R Markdown document, though typically this will only impact the document when it is compiled to PDF. ## The Basics In this section we look at how to create a LaTeX article including what editor programs to use, the basic structure of a LaTeX document, including preamble and body, LaTeX syntax for creating headings, paragraphs, lines, text formatting, math, lists, footnotes, and cross-references. I will assume that you already have a fully functioning TeX distribution installed on your computer. See Section \@ref(InstallMarkup) for information on how to install TeX. ### Getting started with LaTeX editors RStudio is a fully functional LaTeX editor in addition to being an integrated development environment for R. If you want to create a new LaTeX document, you can click `File` in the menu bar, then `New File` and `R Sweave`. ```{r echo=FALSE, fig.cap='RStudio TeX Format Options', fig.lb='TeXFormat', out.width='2cm', out.height='5cm'} knitr::include_graphics("images/chapter_11/TeXFormat.png") ``` Remember from Chapter \@ref(GettingStartedRKnitr) that R Sweave files are basically LaTeX files that can include *knitr* code chunks. You can use RStudio to knit and compile a document with the click of one button: `Compile PDF`. 
You can use this button to compile R Sweave files like regular LaTeX files in RStudio even if they do not have code chunks. If you use another program to compile them, you might need to change the file extension from *.Rnw* to *.tex*. You can also insert many of the items we will cover in this section into your documents with RStudio's LaTeX `TeX Format` button. See the figure above. There are many other LaTeX editors[^chapter_11_1] and many text editors that can be modified to compile LaTeX documents. For example, alongside writing this book in RStudio, I typed much of the LaTeX markup in the Atom[^chapter_11_2] text editor because it was easier to work with a large number of files simultaneously. However, RStudio has by far the best integration with *knitr*. ### Basic LaTeX command syntax As you probably noticed in Part III's examples, LaTeX commands start with a backslash (`\`). For example, to create a section heading you use the `\section` command. The arguments for LaTeX commands are written inside of curly braces (`{}`) like this: ````latex \section{My Section Name} ```` Probably one of the biggest sources of errors that occur when compiling a LaTeX document to PDF are caused by curly brackets that aren't closed, i.e. an open bracket (`{`) is not matched with a subsequent closed bracket (`}`). Watch out for this and use an editor (like RStudio) that highlights brackets' matching pairs. As we will see, unlike in R with parentheses, if your LaTeX command does not have an argument, you do not need to include the curly brackets at all. There are a number of places to find comprehensive lists of LaTeX commands. The Netherlands TeX users group has compiled one: . ### The LaTeX preamble and body {#LaTeXPreamble} All LaTeX documents require a preamble. The preamble goes at the very beginning of the document. 
The preamble usually starts with the `documentclass` command.\index{LaTeX command!documentclass} This specifies what type of presentation document you are creating, e.g. an article, a book, a slideshow,[^chapter_11_4] and so on. LaTeX refers to these as classes. Classes specify a document's formatting. You can add options to `documentclass` to change the format of the entire document. For example, if we wanted to create an article class document with two columns, we would type: ````latex \documentclass[twocolumn]{article} ```` In the preamble you can also specify other style options and load extra packages\index{LaTeX!packages}. The command to load a package in LaTeX is `\usepackage`.\index{LaTeX command!usepackage} For example, if you include `\usepackage{url}` in the preamble of your document, you will be able to specify URL links in the body with the command `\url{SOMEURL}`.\index{LaTeX package!url} The preamble is often followed by the body of your document. It is specified with the `document` environment.\index{LaTeX environment!document} See Chapter \@ref(TablesChapter) for more details about LaTeX environments. You tell LaTeX where the body\index{LaTeX!begin document} of your document starts by typing `\begin{document}`. The very last line of your document is usually `\end{document}`, indicating that your document has ended. When you open a new R Sweave file in RStudio, it creates an article class document with a very simple preamble and body like this: ````latex \documentclass{article} \begin{document} \SweaveOpts{concordance=TRUE} \end{document} ```` This is all you need to get a very basic article class document working.[^ch11_concordance] If you want the document to be of another class, change `article` to something else, a `book` for example. Let's begin to modify the markup. First we will include in the preamble the `hyperref` package for clickable hyperlinks and the `natbib` package for bibliography formatting. We will discuss `natbib` in more detail below.
Note that in general, and unlike in R, almost all of the LaTeX packages you will use are installed on your computer when you installed the TeX distribution. Next, it's often a good idea to include *knitr* code chunks that specify features of the document as a whole. These can include global chunk options as well as loading data and packages used throughout the document. Then you likely want to specify title information just after the `document` environment begins. Use the `title` command to add a title, the `author` command to add author information, and `date` to specify the date.[^chapter_11_5]\index{LaTeX command!title}\index{LaTeX command!author}\index{LaTeX command!date} Then include the `maketitle` command.\index{LaTeX command!maketitle} This will place your title and author information in the body of the document. If you are writing an article you may also want to follow `maketitle` with an abstract. Unsurprisingly, you can use the `abstract` environment to include this.\index{LaTeX environment!abstract} Here is a full LaTeX article class document with all of these changes added: ````latex %%%%%%%%%%%%%% Article Preamble %%%%%%%%%%%%%% \documentclass{article} %% Load LaTeX packages \usepackage{url} \usepackage{hyperref} \usepackage[authoryear]{natbib} %% Set knitr global options and gather data <>= #### Set chunk options #### knitr::opts_chunk$set(fig.align='center') #### Load and cite R packages #### # Create list of packages packages_used <- c("knitr", "ggplot2", "knitr") # Load PackagesUsed and create .bib BibTeX file. # Load packages lapply(packages_used, library, character.only = TRUE) # Create package BibTeX file knitr::write_bib(packages_used, file = "packages.bib") #### Gather Democracy data from Pemstein et al. (2010) #### # For simplicity, store the URL in an object called 'url'. url <- "http://www.unified-democracy-scores.org/files/20140312/ z/uds_summary.csv.gz" # Create a temporary file called 'temp' to put the zip file into. 
temp <- tempfile() # Download the compressed file into the temporary file. download.file(url, temp) # Decompress the file and convert it into a data frame # class object called 'uds_data'. uds_data <- read.csv(gzfile(temp, "uds_summary.csv")) # Delete the temporary file. unlink(temp) @ %% Start document body \begin{document} \SweaveOpts{concordance=TRUE} %%%%%%%%%%%%% Create title %%%%%%%%%%%%%%%%% \title{An Example knitr LaTeX Article} \author{Christopher Gandrud \\ Zalando SE\thanks{Email: \href{mailto:christopher.gandrud@zalando .de}{christopher.gandrud@zalando.de}}} \date{August 2019} \maketitle %%%%%%%%%%%%% Abstract %%%%%%%%%%%%%%%%%%%% \begin{abstract} Here is an example of a knittable article class LaTeX document. \end{abstract} %%%%%%%%%%% Article Main Text %%%%%%%%%%%%% \section{The Graph} I gathered data from \cite{Pemstein2010} on countries' democracy level. They call their democracy measure the Unified Democracy Score (UDS). Figure \ref{DemPlot} shows the mean UDS scores over time for all of the countries in their sample. \begin{figure} \caption{Mean UDS Scores} \label{DemPlot} <>= # Graph UDS scores ggplot(uds_data, aes(x = year, y = mean)) + geom_point(alpha = I(0.1)) + stat_smooth(size = 2) + ylab("Democracy Score") + xlab("") + theme_bw() @ \end{figure} %%%%%%%%%%% Reproducing the Document %%%%% \section*{Appendix: Reproducing the Document} This document was created using R version \Sexpr{paste0(version$major, ".", version$minor)} and the R package \emph{knitr} \citep{R-knitr}. It also relied on the R package \emph{ggplot2} \citep{R-ggplot2}. The document can be completely reproduced from source files available on GitHub at: \url{https://github.com/christophergandrud/ rep-res-book-v3-examples}.
%%%%%%%%% Bibliography %%%%%%%%%%%%%%%%%%%% \bibliographystyle{apa} \bibliography{main.bib, packages.bib} \end{document} ```` The *knitr* code chunk syntax should be familiar to you from previous chapters, so let's unpack the LaTeX syntax from just after the first code chunk, including the "Create title" and "Abstract" parts. New syntax shown in later parts of this example is discussed in the remainder of this section and the next section on bibliographies. First, remember that the percent sign (%) is LaTeX's comment character. Using it to comment your markup can make it easier to read. Second, as we saw in Chapter \@ref(TablesChapter), double backslashes (`\\`), like those after the author's name, force a new line in LaTeX. We will discuss the `emph` command in a moment.\index{LaTeX command!emph} Third, using the `thanks`\index{LaTeX command!thanks} command allows us to create a footnote for author contact information[^chapter_11_6] that is not numbered like the other footnotes (see below). Finally, you'll notice `\href{mailto: . . . .org}}`. This creates an email address in the final document that will open the reader's default email program when clicked.\index{LaTeX command!href} You may have noticed the following line: `\Sexpr{paste0(version$major, ".", version$minor)}` This code finds the current version of R being used and prints the version number into the presentation document.\index{LaTeX command!Sexpr} ### Headings Earlier in the chapter, we briefly saw how to create section-level headings with `section`. There are a number of other sub section-level headings including `subsection`, `subsubsection`, `paragraph`, and `subparagraph`. Headers are numbered automatically by LaTeX.[^chapter_11_7] To have an unnumbered section, place an asterisk in it like this: `\section*{UNNUMBERED_SECTION}`. 
In book class documents, you can also use `chapter` to create new chapters and `part` for collections of chapters.\index{LaTeX command!section}\index{LaTeX command!subsection}\index{LaTeX command!paragraph}\index{LaTeX command!subparagraph}\index{LaTeX command!chapter}\index{LaTeX command!part} ### Paragraphs and spacing In LaTeX, paragraphs are created by adding a blank line between lines. It will format all of the tabs for the beginning of paragraphs based on the document's class rules. As we discussed before, writing tabs in the markup version of your document does nothing in the compiled document. They are generally used just to make the markup easier for people to read. Note that adding more blank lines between paragraphs will not add extra space between the paragraphs in the final document. To specify the space following paragraphs (or almost any line) use the `vspace` (vertical space) command. For example, to add three centimeters of vertical space on a page type: `\vspace{3cm}`.\index{LaTeX command!vspace} Similarly, adding extra spaces between words in your LaTeX markup won't create extra spaces between words in the compiled document. To add horizontal space use the `hspace` command in the same way as `vspace`.\index{LaTeX command!hspace} ### Horizontal lines Use the `hrulefill` command to create horizontal lines in the text of your document. For example, `\hrulefill` creates: \index{LaTeX command!hrulefill} --- Inside of a `tabular` environment, use the `hline` command rather than `hrulefill`.\index{LaTeX environment!tabular}\index{LaTeX command!hline} ### Text formatting Let's briefly look at how to do some of the more common types of text formatting in LaTeX and how to create some commonly used diacritics and special characters. #### Italics and Bold {-} To italicize a word in LaTeX, use the `emph` (emphasis) command. For bold, use `textbf`. You can nest commands inside of one another to combine their effect. 
For example, to ***italicize and bold*** a word, use: `\emph{\textbf{italicize and bold}}`.\index{LaTeX command!emph}\index{LaTeX command!textbf} #### Font size {- #FontSize} You can specify the base font size of an entire document with a `documentclass` option. For example, to create an article with 12-point font, use: `\documentclass[12pt]{article}`. There are a number of commands to set the size of specific pieces of text relative to the base size. See the following table for the full list. Usually a slightly different syntax is used for these commands that goes like this: `{\SIZE_COMMAND . . . }`. For example, to use the tiny size in your text use: `{\tiny tiny size}`. You can change the size of code chunks that *knitr* places in presentation documents using these commands. Just place the code chunk inside of `{\SIZE_COMMAND . . . }`. This is similar to using the `size` code chunk option. \begin{table} \caption{LaTeX Font Size Commands} \label{LaTeXFontSize} \begin{center} \vspace{0.2cm} \begin{tabular}{c} {\Huge \texttt{Huge}} \\ {\huge \texttt{huge}} \\ {\LARGE \texttt{LARGE}} \\ {\Large \texttt{Large}} \\ {\large \texttt{large}} \\ {\normalsize \texttt{normalsize}} \\ {\small \texttt{small}} \\ {\footnotesize \texttt{footnotesize}} \\ {\scriptsize \texttt{scriptsize}} \\ {\tiny \texttt{tiny}} \vspace{0.2cm} \end{tabular} \end{center} \end{table} #### Diacritics {-} You cannot directly enter letters with diacritics---e.g. accent marks---into LaTeX. For example, to create a letter c with a cedilla (ç) you need to type `\c{c}`. To create an 'a' with an acute accent (á), type: `\'{a}`. There are obviously many types of diacritics and commands to include them within LaTeX-produced documents. For a comprehensive discussion of the issue and a list of commands see the LaTeX Wikibook page on the topic: . If you regularly use non-English alphabets, you might also be interested in reading the LaTeX Wikibook page on internationalization: . 
#### Quotation marks {-} To specify double left quotation marks ("), use two back ticks (``` `` ```). For double right quotes ("), use two apostrophes (`''`). Single quotes follow the same format (`` `' ``). ### Math {#MathLaTeX} LaTeX is particularly popular among quantitative researchers and mathematicians because it is very good at rendering mathematical notation. A complete listing of every math command would take up quite a bit of space.[^chapter_11_8] I am briefly going to discuss how to include math in a LaTeX document. This discussion includes a few math syntax examples. To include math inline with your text, place the math syntax in between backslashes and parentheses, i.e. `\( . . . \)`. For example, `\( s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \)` produces $s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1}$ in our final document.[^chapter_11_9] We can display math separately from the text by placing the math commands inside of backslashes and square brackets: `\[ . . . \]`.[^chapter_11_10] For example, ````latex \[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \] ```` gives us: $$s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1}$$ ### Lists To create bullet lists in LaTeX, use the `itemize` environment. Each list item is delimited with the `item` command. For example: \index{LaTeX environment!itemize}\index{LaTeX command!item} ````latex \begin{itemize} \item The first item. \item The second item. \item The third item. \end{itemize} ```` gives us: - The first item. - The second item. - The third item. To create a numbered list, use the `enumerate` environment instead of `itemize`. You can create sublists by nesting lists inside of lists like this: \index{LaTeX environment!enumerate} ````latex \begin{itemize} \item The first item. \item The second item. \begin{itemize} \item A sublist item \end{itemize} \item The third item. \end{itemize} ```` which gives us: - The first item. - The second item. - A sublist item - The third item. 
### Footnotes \index{LaTeX!footnotes} To create plain, non-bibliographic footnotes, place `\footnote{` where you would like the footnote number to appear in the text. Then type the footnote's text. Remember to close the footnote with a `}`. LaTeX does the rest, including formatting and numbering. ### Cross-references \index{LaTeX!cross-references} LaTeX will also automatically format cross-references. We were already partially introduced to cross-references in Chapters \@ref(TablesChapter) and \@ref(FiguresChapter). At the place that you would like to reference, add a `label` such as `\label{ACrossRefLabel}`.\index{LaTeX command!label} It doesn't really matter what label name you choose, though make sure they are not duplicated in the document. Then place a `ref` command (e.g. `\ref{ACrossRefLabel}`) at the place in the text where you want the cross-reference to be.\index{LaTeX command!ref} If you place the `label` on the same line as a heading command, `ref` will place the heading number. If `label` is in a `table` or `figure`\index{LaTeX environment!table}\index{LaTeX environment!figure} environment, you will get the table or figure number. You can also use `pageref` instead of `ref` to include the page number.\index{LaTeX command!pageref} Finally, loading the *hyperref* package makes cross-references (and footnotes) clickable.\index{LaTeX package!hyperref} Clicking on them will take you to the items they refer to. ## Bibliographies with BibTeX {#BibTeXBib} LaTeX can take advantage of very comprehensive bibliography-making capabilities. All major TeX distributions come with BibTeX. BibTeX is basically a tool for creating databases of citation information. In this section, we are going to see how to incorporate a BibTeX bibliography into your LaTeX documents. Then we will learn how to use R to automatically generate a bibliography of packages used to create a knitted document.
For more information on BibTeX syntax, see the LaTeX Wikibook page on Bibliography management: . ### The *.bib* file BibTeX bibliographies are stored in plain-text files with the extension `.bib`. These files are databases of citations.[^chapter_11_11] The syntax for each citation goes like this: ````latex @DOCUMENT_TYPE{CITE_KEY, title = {TITLE}, author = {AUTHOR}, . . . = {. . .} } ```` `DOCUMENT_TYPE` specifies what type of document---article, book, webpage, and so on---the citation is for. This determines what items the citation can and must include. Then we have the `CITE_KEY`. This is the reference's label that you will use to include the citation in your presentation documents. We'll look more at this later in the section. Each citation must have a unique `CITE_KEY`. A common way to write these keys is to use the author's surname and the publication year, e.g. `donoho2009`. The cite key is followed by the other citation attributes such as `author`, `title`, and `year`. These attributes all follow the same syntax: `ATTRIBUTE = {. . .}`. It's worth taking a moment to discuss the syntax for the BibTeX author attribute. First, multiple author names are separated by `and`. Second, BibTeX assumes that the last word for each author is their surname. If you would like multiple words to be taken as the "surname", then enclose these words in curly brackets. If we wanted to cite the World Bank as an author, we would write `{World Bank}`; otherwise, it will be formatted "Bank, World" in the presentation document. 
Here is a complete BibTeX entry for @donoho2009: ````latex @article{donoho2009, author = {David L Donoho and Arian Maleki and Morteza Shahram and Inam Ur Rahman and Victoria Stodden}, title = {Reproducible research in computational harmonic analysis}, journal = {Computing in Science \& Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {8--18} } ```` Each item of the entry must end in a comma, except the last one.[^chapter_11_12] ### Including citations in LaTeX documents When you want to include citations from a BibTeX file in your LaTeX document, you first use the `bibliography` command.\index{LaTeX command!bibliography} For example, if the BibTeX file is called *main.bib* and it is in the same directory as your markup document, then type: `\bibliography{main.bib}`. You can use a bibliography stored in another directory; just include the appropriate file path information. Usually `bibliography` is placed right before `\end{document}` so that it appears at the end of the compiled presentation document. You can also specify how you would like the references to be formatted using the `bibliographystyle` command.\index{LaTeX command!bibliographystyle} For example, this book uses the American Psychological Association (APA) style for references. To set this, I included `\bibliographystyle{apa}` directly before `bibliography`. The default style[^chapter_11_13] is to number citations (e.g. `[1]`) rather than include author-year information[^chapter_11_14] used by the APA. You will need to include the LaTeX package *natbib* in your preamble to be able to use author-year citation styles. This book includes `\usepackage[authoryear]{natbib}` in its preamble.\index{author-year citations}\index{Harvard style citations} Place the `cite` command\index{LaTeX command!cite} in your document's text where you want to place a reference. You include the `CITE_KEY` for the reference in this command, e.g. `\cite{donoho2009}`.
You can include multiple citations in `cite`, just separate the `CITE_KEY`s with commas. You can add options such as the page numbers or other text to a citation using square brackets (`[]`). For example, if we wanted to cite the tenth page of @donoho2009, we type: `\cite[10]{donoho2009}`. The author-year style in-text citation that this produces looks like this: [@donoho2009 10]. You can add text at the beginning of a citation with another set of square brackets. Typing `\cite[see][10]{donoho2009}` gives us: [see @donoho2009 10]. If you are using an author-year style, you can use a variety of *natbib* commands to change what information is included in the parentheses. Table \@ref(NatbibTable) contains a selection of these commands and examples. \begin{table} \caption{A Selection of \emph{natbib} In-text Citation Style Commands} \label{NatbibTable} \begin{center} \begin{tabular}{l r} \hline Command Example & Output \\[0.25cm] \hline\hline \verb|\cite{donoho2009}| & \cite{donoho2009} \\ \verb|\citep{donoho2009}| & \citep{donoho2009} \\ \verb|\citeauthor{donoho2009}| & \citeauthor{donoho2009} \\ \verb|\citeyear{donoho2009}| & \citeyear{donoho2009} \\ \verb|\citeyearpar{donoho2009}| & \citeyearpar{donoho2009} \\ \hline \end{tabular} \end{center} \end{table} ### Generating a BibTeX file of R package citations Researchers are pretty good about citing others' articles and data. However, citations of R packages used in analyses is very inconsistent. This is unfortunate not only because correct attribution is not being given to those who worked to create the packages, but also because it makes reproducibility harder. Not citing packages obscures important steps that were taken in the research process, primarily which package versions were used. Fortunately, there are R tools for quickly and dynamically generating package BibTeX files, including the versions of the packages you are using. 
They will automatically update the citations each time you compile your document to reflect any changes made to the packages. You can automatically create citations for R packages using the `citation()` function inside of a code chunk.\index{R function!citation} For example, if you want the citation information for the *knitr* package, you type: ```{r Ch11IntroCite} citation("knitr") ``` This gives you both the plain citation and the BibTeX version. If you only want the BibTeX version of the citation, use the `toBibtex()` function.\index{R function!toBibtex} ````r toBibtex(citation("knitr")) ```` The *knitr* package creates BibTeX bibliographies for R packages with the `write_bib()` function.\index{R function!write\_bib} Let's make a BibTeX file called *packages.bib* containing citation information for the *knitr* package. ````r # Create package BibTeX file knitr::write_bib("knitr", file = "packages.bib") ```` `write_bib` automatically assigns each entry a cite key using the format `R-PACKAGE_NAME`, e.g. `R-knitr`. **Warning:** *knitr*'s `write_bib()` function currently does not have the ability to append package citations to an existing file, but instead writes them to a new file. If there is already a file with the same name, it will overwrite the file. So, be very careful using this function to avoid accidental deletions. It is a good idea to have `write_bib()` always write to a file specifically for automatically generated package citations. You can include more than one bibliography in LaTeX's `bibliography` command. All you need to do is separate them with a comma. ````latex \bibliography{main.bib, packages.bib} ```` We can use these techniques to automatically create a BibTeX file with citation information for all of the packages used in a research project. Simply make a character vector of the names of packages that you would like to include in your bibliography. Then run this through `write_bib()`.
You can make sure you are citing all of the key packages used in a knitted document by (a) creating a vector of all of the packages and then (b) using this in the following code to both load the packages and write the bibliography: ````r # Package list packages_used <- c("ggplot2", "knitr", "xtable") # Load packages lapply(packages_used, library, character.only = TRUE) # Create package BibTeX file knitr::write_bib(packages_used, file = "packages.bib") ```` In the first executable line, we create our list of packages to load and cite. The next function is `lapply()` (list apply).\index{R function!lapply} This applies the function `library()`\index{R function!library} to all of the items in *packages_used*. `character.only = TRUE` is a `library` argument that allows us to use character string versions of the package names as R sees them in the *packages_used* vector, rather than as objects (how we have used `library` up until now). If you include these functions in a code chunk at the beginning of your knitted document, then you can be sure that you will have a BibTeX file with all of your packages. ## Presentations with LaTeX Beamer {#latexBeamer} You can make slideshow presentations with LaTeX. Creating a presentation with a markup language can take a bit more effort than using a WYSIWYG program like Microsoft PowerPoint or Apple's Keynote. However, combining LaTeX and *knitr* can make fully reproducible presentations that dynamically create and present results. I have found this particularly useful in my teaching. Dynamically produced presentations allow me to provide my students with fully replicable examples of how I created a figure on a slide. *knitr* also makes it easy to beautifully present code examples. One of the most popular LaTeX tools for slideshows is the beamer class. When you compile a beamer class document, a PDF will be created where every page is a different slide. 
All major PDF viewer programs have some sort of "View Full Screen" option to view beamer PDFs as full screen slideshows. Usually you can navigate through the slides with the forward and back arrows on the keyboard. In this section we will take a brief look at the basics of creating slideshows with beamer, highlighting special considerations that need to be made when working with beamer and *knitr*. In Chapter \@ref(MarkdownChapter) we will see how to use the *rmarkdown* package to create beamer presentations with the much simpler Markdown syntax. ```{r echo=FALSE, fig.cap='Knitted Beamer PDF Example', fig.lb='BeamerExample'} knitr::include_graphics("images/chapter_11/BeamerExample.png") ``` ### Beamer basics *knitr* largely works the same way in LaTeX slideshows as it does in article or book class documents. Nonetheless, there are a few differences to look out for. #### The Beamer preamble {-} You use `documentclass`\index{LaTeX command!documentclass} to set a LaTeX document as a `beamer` slideshow. You can also include global style information in the preamble by using the commands `usetheme`, `usecolortheme`, `useinnertheme`, `useoutertheme`.\index{LaTeX command!usetheme}\index{LaTeX command!usecolortheme}\index{LaTeX command!useinnertheme}\index{LaTeX command!useoutertheme} For a fairly comprehensive compilation of beamer themes, see the Hartwork's Beamer theme matrix: . #### Slide frames {-} \index{LaTeX!beamer slides} After the preamble, you start your document as usual by beginning the `document` environment.\index{LaTeX environment!document} Then you need to start creating slides. Individual beamer slides are created using the `frame` environments.\index{LaTeX command!frame}\index{LaTeX environment!frame} Create a frame title using `frametitle`.\index{LaTeX command!frametitle} ````latex \frame{ \frametitle{An example frame} } ```` Note that you can also use the usual `\begin{frame} . . \end{frame}` syntax. 
Unlike in a WYSIWYG slide show program, you will not be able to tell if you have tried to put more information on one slide than it can handle until after you compile the document.[^chapter_11_16] #### Title frames {-} One important difference from a regular LaTeX article is that instead of using `maketitle` to place your title information, in beamer you place the `titlepage` inside of a frame by itself.\index{LaTeX command!titlepage} #### Sections and outlines {-} \index{LaTeX command!section} We can use section commands in much the same way as we do in other types of LaTeX documents. Section commands do not need to be placed inside of frames. After the title slide, many slideshows have a presentation outline. You can automatically create one from your section headings using the `tableofcontents` command.\index{LaTeX command!tableofcontents} Like the `titlepage` command, `tableofcontents` can go on its own frame: \index{LaTeX!table of contents}\index{LaTeX!outlines} ```latex %%% Title slide \frame{ \titlepage } %% Table of contents slide \frame{ \frametitle{Outline} \tableofcontents } ```` #### Make list items appear {-} \index{LaTeX!list appear} Lists work the same way in beamer as they do in other LaTeX document classes. They do have an added feature in that you can have each item appear as you progress through the slide show. After `\item`, place the number of the order in which the item should appear. Enclose the number in `< ->`. For example, ````latex \begin{itemize} \item<1-> The first item. \item<2-> The second item. \item<2-> The third item. \end{itemize} ```` In this example the first item will appear before the next two. These two will appear at the same time. ### *knitr* with LaTeX slideshows *knitr* code chunks have the same syntax in LaTeX slideshows as in other LaTeX documents. You do need to make one change to the `frame` options, however, to include highlighted *knitr* code chunks on your slides. 
You should add the `fragile` option to the `frame` command.[^chapter_11_17] Here is an example: \index{LaTeX command!frame} ````latex \begin{frame}[fragile] \frametitle{An example fragile frame.} \end{frame} ```` ### Chapter summary {-} In this chapter we have learned the nitty-gritty of how to create simple LaTeX documents, articles and slideshows, that we can embed our reproducible research in using *knitr*. In the next chapter we look at how to use R Markdown to expand the type of presentation documents we can create reproducibly. [^chapter_11_bookdown]: The *bookdown* [@R-bookdown] R package greatly improved the ability to create book-like documents with R Markdown. The third edition of this book is made with *bookdown*. [^chapter_11_1]: Wikipedia has collated a table that comprehensively compares many of these editors: . [^chapter_11_2]: [^ch11_concordance]: `\SweaveOpts{concordance=TRUE}` maps the line numbers in your `.Rnw` file to the `.tex` file it generates. This especially helps with debugging. See: (accessed 3 October 2019). [^chapter_11_4]: "Slideshow" is not a valid class. One slideshow class that we discuss later is called "beamer". [^chapter_11_5]: In some document classes the current date will automatically be included if you don't specify the date. [^chapter_11_6]: Frequently it also includes thank yous to people who have helped with the research. [^chapter_11_7]: The `paragraph` level does not have numbers. [^chapter_11_8]: See the Netherlands TeX user group list mentioned earlier for an extensive compilation of math commands. [^chapter_11_9]: Instead of backslashes and parentheses, you can also use a pair of dollar signs (`$. . .$`). [^chapter_11_10]: Equivalently, use two pairs of dollar signs (`$$…$$`) or the `displaymath` environment. Though it will still work in most cases, the double dollar sign math syntax may cause errors. You can also number display equations using the `equation` environment.
[^chapter_11_11]: The order of the citations does not matter. [^chapter_11_12]: This is very similar to how we create vectors in R, though in BibTeX you can actually have a comma after the last attribute. [^chapter_11_13]: It is referred to in LaTeX as the plain style. [^chapter_11_14]: This is sometimes referred to as the "Harvard" style. [^chapter_11_15]: It can also install the packages if the option `install = TRUE`. You can have it install specific package versions by entering the version numbers with the `versions` argument. This is very useful for enabling the replication of analyses that rely on specific package versions. [^chapter_11_16]: One way to deal with frames that span multiple slides is to use the `allowframebreaks` command,\index{LaTeX command!allowframebreaks} `\begin{frame}[allowframebreaks].` [^chapter_11_17]: For a detailed discussion of why you need to use the `fragile` option with the `verbatim` environment that *knitr* uses to display \index{LaTeX environment!verbatim} highlighted text in LaTeX documents, see this blog post by Pieter Belmans: (posted 20 February 2011). ================================================ FILE: rep-res-3rd-edition/14-web.Rmd ================================================ # Presenting in a Variety of Formats with R Markdown {#MarkdownChapter} While Markdown started as a simple way to write HTML documents for the web, R Markdown (and the programs it relies on in the background, particularly Pandoc) dramatically expands our ability to take advantage of simple markdown syntax for creating documents in many formats. In this chapter we will learn about Markdown editors and the basic Markdown syntax for creating simple reproducible documents, including many of the things we covered for *knitr*/LaTeX documents such as headings and text formatting. 
Please refer back to previous chapters for syntax used to display code and code chunks (Chapter \@ref(StatsModel)), tables (Chapter \@ref(TablesChapter)), and figures (Chapter \@ref(FiguresChapter)) with R Markdown documents. In this chapter we will also briefly look at some more advanced features for including math with MathJax, footnotes and bibliographies with Pandoc, and customizing styles with CSS. Then we will learn how to create slideshows. We'll finish up the chapter by looking at options for publishing Markdown-created HTML documents, including locally on your computer and GitHub Pages. ## The Basics Markdown was created specifically to make it easy to write HTML (or XHTML[^chapter_web_1]) using a syntax that is human readable and possibly publishable without compiling. For example, compare the Markdown table syntax in Chapter \@ref(TablesChapter) to the HTML syntax for virtually the same table.[^chapter_web_2] That being said, to make Markdown simple, it does not have as many capabilities as HTML. To get around this problem, you can still use HTML in Markdown, though note that Markdown syntax cannot be used between HTML element tags. Pandoc and R Markdown extend Markdown so that it can be used to create reproducible PDF and MS Word documents. **Note:** If you are using *rmarkdown* to compile a document to PDF or Word, using raw HTML syntax will often not work as intended, if at all. As a rule, syntax specific to LaTeX or HTML that is included in an R Markdown document can only be properly compiled to a PDF or HTML document, respectively. Similarly, you are only able to include graphics that are of types supported by the output format. You are not able to include a JavaScript plot directly in a PDF. R Markdown has been continuously improving its ability to interoperate between the different formats. For example, the `kable()` function\index{R function!kable} creates tables without having to worry too much about the output format. 
The *knitr* code chunk option `fig.ext` (figure extension)\index{knitr option!fig.ext} allows you to set the file format of a dynamically created figure so that the document can be compiled to multiple formats. ### Getting started with Markdown editors RStudio functions as a very good editor for R Markdown documents and regular non-knittable Markdown documents as well. To create a new R Markdown document in RStudio, click `File` in the menu bar, then `New` `R Markdown`. You will then be able to select what output format you would like. RStudio has full syntax highlighting for code chunks and can compile *.Rmd* files into *.md*, then render them in *.html*, for example, with one click of the `Knit HTML` button. As we saw in Chapter \@ref(GettingStartedRKnitr), when you knit a Markdown document in RStudio, it will preview the HTML document for you. You can always view HTML documents by opening them with your web browser. You can do this directly from RStudio's **Preview HTML** window by clicking the `Open in Browser` button. ```{r echo=FALSE, fig.cap='R Markdown Compile Dropdown Menu', fig.lb='Rmarkdrop', out.width='3cm', out.height='7cm'} knitr::include_graphics("images/chapter_web/rmarkdownOutputOptions.png") ``` If you click on the downward arrow next to `Knit HTML`, you will see the above dropdown menu. This allows you to also compile the document to PDF or MS Word, regardless of which format you originally chose when you created the document. As with HTML, you will be given a preview of the PDF or Word document when it is compiled. Being plain-text, you can also use any other text editor to modify Markdown documents, though they will lack the level of integration with knitr/R Markdown that RStudio has. ### Preamble and document structure That was kind of a trick subsection title. Unlike LaTeX documents, plain Markdown documents do not have a preamble.
R Markdown documents can have a header, basically another name for a preamble, but we will get to that later. There is also no need to start a body environment or anything like that. HTML head elements (HTML's preamble equivalent) are added automatically when you render Markdown documents into HTML. With Markdown, you can just start typing the content of your document. Here is an example of an R Markdown document that creates the map we saw in Chapter \@ref(FiguresChapter). We'll go through all of the code below. ````markdown --- title: "Fertilizer Consumption" author: "Christopher Gandrud" date: "12/29/2018" output: html_document --- ## Fertilizer Consumption (kilograms per hectare of arable land) in 2003 Note: Data is from the [World Bank](https://data.worldbank.org/ indicator/AG.CON.FERT.ZS) `r ''````{r CreategvisGeoMap, echo=FALSE, message=FALSE, results='asis'} source("analysis/googlevis-map.R") ``` ----------- ## R Session Info `r ''````{r echo=FALSE} sessionInfo() ``` ```` ### Headings {#MarkdownHeader} Headings in Markdown are simple. Note that Markdown *headings* and R Markdown *headers* are not the same thing. The latter gives instructions for how to render the document, the former are section titles in the text. To create a line in the topmost heading style---maybe a title---just place one hash mark (`#`) at the beginning of the line. The second-tier heading gets two hashes (`##`) and so on. You can also put the hash mark(s) at the end of the heading, but this is not necessary. Here is an example of the three headings: ````markdown # A level one heading ## A level two heading ### A level three heading ```` There are six heading levels in Markdown. You can also create a level one heading by following a line of text with equal signs. 
Level two headings can be created by following a line of text with dashes: ````markdown A level one heading =================== A level two heading ------------------- ```` ### Horizontal lines If you would like to create horizontal lines that run the width of the page in Markdown, place three or more equal signs or dashes separated from the text above by one blank line: ````markdown Create a horizontal line. ========= ```` ### Paragraphs and new lines \index{Markdown!lines} Just like in LaTeX, new paragraphs are created by putting text on a new line separated from previous text with a blank line. For example: ````markdown This is the first paragraph. This is the second paragraph. ```` Separating lines with a blank line places a blank line\index{Markdown!new line} in the final document. End a line with two or more white spaces ( ) to create a new line that is not separated by a blank line. ### Italics and bold \index{Markdown!italics}\index{Markdown!bold} To *italicize* a word in Markdown, place it between two asterisks, e.g. `*italicize these words*`. To make words **bold**, place them between four asterisks, two on either side: `**make these words bold**`. ### Links {#MarkdownLinks} To create hyperlinks in Markdown, use the `[LINK_TEXT](URL)` syntax.[^chapter_web_4] `LINK_TEXT` is the text that you would like to show up as the hyperlink text. When you click on this text, it will take you to the linked site specified by `URL`. If you want to show only a URL as the text, type it in both the square brackets and parentheses. This is a little tedious, so in RStudio you can just type the URL and it will be hyperlinked. In regular Markdown, place the URL between less than and greater than signs (`<URL>`). ### Special characters and font customization {-} \index{Markdown!special characters} Unlike LaTeX rendered with pdfLaTeX,\index{pdfLaTeX} Markdown can include almost any letters and characters included in your system.
The main exceptions are characters used by Markdown syntax (e.g. `*`, `#`, `\` and so on). You will have to escape these (see below). Font sizes and typefaces cannot be set directly with Markdown syntax. You need to set these with HTML or CSS, which I don't cover here, though below we will look at how to use a custom CSS file. ### Lists To create itemized lists in Markdown, place the items after one dash: ````markdown - Item 1 - Another item - Item 3 ```` To create a numbered list, use numbers and periods rather than dashes. ````markdown 1. Item 1 2. Another item 3. Item 3 ```` ### Escape characters {-} Markdown, like LaTeX and R, uses a backslash (`\`) as an escape character. For example, if you want to have an asterisk in the text of your document (rather than start to italicize your text, e.g. `*some italicized text*`), type: `\*`. Two characters---ampersand (`&`) and the less-than sign (`<`)---have special meanings in HTML.[^chapter_web_5] So, to have them printed literally in your text, you have to use the HTML code for the characters. Ampersands are created with `&amp;`. Less-than signs are created with `&lt;`. ### Math with MathJax Markdown by itself can't format mathematical equations. We can create LaTeX-style equations in HTML documents by adding on the MathJax JavaScript engine. MathJax syntax is the same as LaTeX syntax (see Section \@ref(MathLaTeX)), especially when used from RStudio or when rendered with *rmarkdown*. Markdown-HTML documents rendered in RStudio automatically link to the MathJax engine online.[^chapter_web_6] If you want to use another program to render Markdown documents with MathJax equations, you may need to take extra steps to link to MathJax. For more details, see . Because backslashes are Markdown escape characters, in many Markdown editors you will have to use two backslashes to create math environments with MathJax.
For example, in LaTeX and RStudio's Markdown, you can create a display equation like this: $$s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1}$$ by typing:[^chapter_web_7] ````markdown $$s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1}$$ ```` But, in other Markdown programs, you may have to use: ````markdown \\[ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \\] ```` To make inline equations, use parentheses instead of square brackets as in LaTeX, e.g. `\( s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} \)`. You can also use single dollar signs, e.g. `$ s^{2} = \frac{\sum(x - \bar{x})^2}{n - 1} $` ## Further Customizability with *rmarkdown* Markdown is simple and easy to use. But being simple means that it lacks important functionality for presenting research results, such as footnotes and bibliographies, and custom formatting. In this section we will learn how to overcome these limitations with Pandoc via the *rmarkdown* package. ### More on *rmarkdown* Headers {-} In Chapter \@ref(GettingStartedRKnitr) we first saw an R Markdown header written in YAML. Just as a refresher, here is the basic header we looked at: ````yaml --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "29 August 2019" output: pdf_document: toc: true --- ```` This header provides instructions for what to do when the document is rendered, gives instructions to render the document as a PDF (via LaTeX), and inserts a title, author, date, and table of contents at the beginning. We also have the option to include other formatting options, many of which we would include in a *knitr* LaTeX document's preamble.\index{LaTeX!preamble} You include these at the top level, i.e. without being tabbed. R Markdown refers to these options as "metadata". 
For example, to change the font size to 11 point we could use: ````yaml --- title: "A Basic PDF Presentation Document" author: "Christopher Gandrud" date: "30 November 2019" output: pdf_document: toc: true fontsize: 11pt --- ```` We could double-space the PDF document with a similar top-level entry: `linestretch: 2`.[^chapter_web_8] To find more options for PDF documents, type `?pdf_document` into your R console. Note that these options will only affect your PDF document, not a rendered HTML file. Remember from Chapter \@ref(GettingStartedRKnitr) that we can specify rendering instructions for multiple output formats in the same header. Here is a longer header, building on what we just saw: ````yaml --- title: "An Example rmarkdown Article" author: "Christopher Gandrud" date: "15 January 2019" output: pdf_document: latex_engine: xelatex number_sections: yes toc: yes html_document: toc: no theme: "flatly" linestretch: 2 fontsize: 11pt bibliography: - main.bib - packages.bib --- ```` Ok, let's go through this in detail. We have already seen the `title`, `author`, `date`, `linestretch`, and `fontsize` options. Notice that we used `latex_engine` to set the LaTeX engine to XeLaTeX, which is useful for documents that include non-standard English characters.\index{XeLaTeX} We also specified with `number_sections` that the PDF document should have numbered section headings. For the HTML version of the document we do not want a table of contents as we set `toc: no`. We specified a CSS theme called Flatly for our HTML document using `theme: "flatly"`. As of this writing, *rmarkdown* has a built-in ability to use a range of themes from Bootswatch\index{Bootswatch}\index{Bootswatch!Flatly} (). Alternatively, you can link to a custom CSS\index{CSS} file with the `css` option. Type `?html_document` into your R console to see other options. Notice that we can use `no` and `yes` instead of `false` and `true`, respectively. We linked to two BibTeX files with the `bibliography` option.
Using Pandoc syntax, the references will apply to both the PDF and HTML documents. If you want to also enable the creation of a Microsoft Word document, include `output: word_document` in the header. #### Bibliographies with Pandoc {-} \index{bibliography} Pandoc via *rmarkdown* allows us to insert citations from normal BibTeX files (see Chapter \@ref(LatexChapter)) specified in the header with `bibliography`. The main difference is that Pandoc has a different syntax from LaTeX for making in-text citations. Basic Pandoc citations begin with `@` followed by the BibTeX citation key. Square brackets (`[]`) create parentheses around the citation. Here is an example: ````markdown This is a citation [@donoho2009]. ```` Pandoc uses *natbib*\index{LaTeX package!natbib}\index{bibliography}\index{in-text citation} by default, so the citation `[@donoho2009]` will appear as (Donoho et al., 2009). To add text before and after the citation inside of the parentheses, use something like this: `[see @donoho2009, 10]`; which creates: (see Donoho et al. 2009, 10). If you do not want the parentheses around the entire citation (only the year) then omit the square brackets. To include only the year, and not the authors' surnames, add a minus sign, e.g. `[-@donoho2009]`. See the table below for more options. Full bibliographic information for each item that is cited in the text will be produced at the end of the output document. I suggest placing a heading like `# References` at the very end of your document so that the bibliography will be differentiated from the document's text.
Markup Result ---------------------------- --------------------------- `[@donoho2009]` (Donoho 2009) `[-@donoho2009]` (2009) `[see @donoho2009]` (see Donoho 2009) `[see @donoho2009, 10-11]` (see Donoho 2009, 10--11) `[@donoho2009; @Box1973]` (Donoho 2009; Box 1973) `@donoho2009 [10-11]` Donoho (2009, 10--11) : A Selection of Pandoc In-Text Citations #### Footnotes with Pandoc {-} \index{Pandoc!footnotes}\index{Markdown!footnotes} You can also include footnotes in documents rendered with *rmarkdown* by using Pandoc's footnote syntax. In the text where you would like a footnote to be located, use: `[^NOTE_KEY]`. Then at the end of your document, place `[^NOTE_KEY]: The footnote text`.[^chapter_web_9] `NOTE_KEY`s generally follow the same rules as BibTeX citation keys, so no spaces. The footnotes will be numbered sequentially when rendered. To sum up, here is an example of a document that can be rendered in HTML or PDF using R Markdown. It includes footnotes and a bibliography. ````markdown --- title: "Minimal rmarkdown Example" output: pdf_document: toc: true html_document: toc: false bibliography: main.bib --- This is some text.[^FirstNote] This is a *knitr* code chunk: `r ''````{r} plot(cars$speed, cars$dist) ``` This is a citation [see @donoho2009, 10]. [^FirstNote]: This is a footnote. # References ```` We have only covered a small proportion of Pandoc's capabilities that you can take advantage of with *rmarkdown*. For full range of Pandoc's abilities, see . ### CSS style files and Markdown You can customize the formatting of HTML documents created with Markdown files using custom CSS style sheets. CSS files allow you to specify the way a rendered Markdown file looks in a web browser including fonts, margins, background color, and so on. We don't have space to cover CSS syntax here. There are numerous online resources for learning CSS. One of the best ways may be to just copy a CSS style sheet into a new file and play around with it to see how things change. 
A really good resource for this is Google Chrome's Developer Tools.\index{Google Chrome!Developer Tools} The Developer Tools allow you to edit your webpages, including their CSS, and see a live preview. It is a really nice way to experiment with CSS (and HTML and JavaScript).\index{HTML}\index{JavaScript}[^chapter_web_10] There are also numerous pre-made style sheets available online.[^chapter_web_11] #### Rendering R Markdown files to HTML using custom CSS {-} The simplest way to use a custom CSS style sheet is to include the file\index{CSS} path to the CSS file in an *rmarkdown* header. As mentioned earlier, *rmarkdown* has a number of built-in CSS file options that you can access with `theme`. If you want to use another custom CSS file, use the `css` option. If our custom CSS file is called *custom_style.css* in the same directory as the R Markdown document, then a basic header would be: ````yaml --- output: html_document: css: custom_style.css --- ```` If you are using the *knitr* package to render an R Markdown document to HTML, you can also include a custom CSS file. First use `knit` to knit the document to a plain Markdown file. Then use the `markdownToHTML()`\index{R function!markdownToHTML} function from the *markdown* package [@R-markdown] to render the plain Markdown document in HTML, including the `stylesheet` argument with the path to the CSS file. ## Slideshows with Markdown, R Markdown, and HTML Because R Markdown documents can be compiled into HTML files, it is possible to use them to create HTML5\index{HTML5} slideshows.[^chapter_web_12] There are a number of advantages to creating HTML presentations with Markdown: - You can use the relatively simple Markdown syntax. - HTML presentations are a nice native way to show content on the web. - HTML presentations can incorporate virtually any content that can be included in a webpage.
This includes interactive content, like motion charts created by *googleVis*\index{R package!googleVis} (see Chapter \@ref(FiguresChapter)). Let's look at how to create HTML slideshows from Markdown documents using (a) the *rmarkdown* package and (b) RStudio's built-in slideshow files, called R Presentations. You can also use *rmarkdown* to create beamer presentations. #### HTML5 frameworks {-} Before getting into the details of how to use R Markdown for presentations and R Presentations, let's briefly look more into what an HTML5\index{HTML5} slideshow is and the frameworks that make it possible. HTML5 slideshows rely on a number of web technologies in addition to HTML5, including CSS,\index{CSS} and JavaScript\index{JavaScript} to create a website that behaves like a LaTeX beamer\index{beamer} or PowerPoint\index{PowerPoint} presentation. They run in your web browser and you may need to be connected to the internet for them to work properly, as key components are often located remotely. Most browsers have `Full Screen` mode you can use to view presentations. There are a number of different HTML5 slideshow frameworks that let you create and style your slideshows. In all of the frameworks, you view the slideshow in your web browser and advance through slides with the forward arrow key on your keyboard. You can go back with the back arrow. Despite these similarities, the frameworks have different looks and capabilities. ### HTML slideshows with *rmarkdown* It is very easy to create an HTML presentation using *rmarkdown* and the IO Slides[^chapter_web_13] or Slidy[^chapter_web_14] HTML5 frameworks. The syntax for IO Slides and Slidy presentations with *rmarkdown* presentations is almost exactly the same as the syntax we have seen throughout this chapter. There are two main differences from the syntax we have seen so far. First, `ioslides_presentation` for IO Slides or `slidy_presentation` for Slidy presentations is the output type to set in the header. 
Second, two hashes (`##`) set a frame's header.[^chapter_web_15] For example, ````markdown --- title: "Simple rmarkdown Presentation Example" author: "Christopher Gandrud" date: "26 December 2015" output: ioslides_presentation: incremental: true --- ## Access the code The code to create the following figure is available online. ```` This code creates a slide show that begins with the slide in the following figure. Bullet points will be brought in incrementally because we used `incremental: true` under `output: ioslides_presentation`. Bullets are created using Markdown list syntax. ![R Markdown/IO Slides Example Title Slide[]{label="BasicIO"}](images/chapter_web/rmarkdownIo_slidesExample.png){#BasicIO} Use three dashes (`---`) to delineate a new slide without a header. You can style the presentation further using the `css` option in the header to link to a custom CSS file. You can create a new IO Slides or Slidy *rmarkdown* presentation in RStudio by selecting `File` `R Markdown...` then `Presentation` in the menu on the left of the window (see figure below). Finally, click `HTML (ioslides)` or `HTML (Slidy)`. ![Create New R Markdown Presentation in RStudio[]{label="rmarkdownPresRStudio"}](images/chapter_web/rmarkdownPresRStudio.png){#rmarkdownPresRStudio} ### LaTeX Beamer slideshows with *rmarkdown* {#rmarkdownBeamer} As we saw in Chapter \@ref(LatexChapter), creating a presentation with LaTeX beamer involves rather convoluted syntax. Luckily, we can use *rmarkdown* to create beamer presentations using much cleaner Markdown syntax. \index{beamer|(}\index{LaTeXbeamer|(} An R Markdown beamer presentation uses the same syntax that we just saw with HTML presentations. The main difference is in the header where we use `output: beamer_presentation`. You create a new R Markdown beamer document in RStudio in a similar way as IO Slides or Slidy. The only difference is that we select `PDF (Beamer)`. As before, frame titles are delineated with two hashes (`##`). 
You can mark sections in much the same way with one hash. In the header you can switch the beamer theme, color theme, and font theme with `theme`, `colortheme`, and `fonttheme`, respectively. For example: ````yaml output: beamer_presentation: incremental: true theme: "Bergen" colortheme: "crane" fonttheme: "structurebold" ```` Note that themes are placed in quotation marks. You can also include a custom template with the `template` option followed by the path to the custom template file. ![*rmarkdown*/Beamer Example Title Slide[]{label="rmarkdownBeamerExample"}](images/chapter_web/rmarkdownBeamerExample.png) \index{beamer|)}\index{LaTeXbeamer|)} ### Slideshows with Markdown and RStudio's R Presentations Another easy, but less customizable way to create HTML slideshows is with RStudio's R Presentation documents.\index{R Presentation}\index{RStudio!R Presentation} To get started, open RStudio and click `File`, `New`, then `R Presentation`. RStudio will then ask you to give the presentation a name and save it in a particular file. The reason RStudio does this is because an R Presentation is not just one file. Instead, it includes: - A *.Rpres* file, which is very similar to a *knitr* Markdown *.Rmd* file. - A *.md* Markdown file created from the *.Rpres* file. - *knitr* cache and figure folders, also created from the *.Rpres* file. #### Editing and compiling the presentation {-} You change the presentation's content by editing the *.Rpres* file using the normal *knitr* Markdown syntax we've covered. The only difference is how you create new slides. Luckily, the syntax for this is very simple. Type the slide's title, then at least three equal signs (`===`).
For example, ````markdown This is an Example .Rpres Slide Title === ```` The very first slide is automatically the title slide and will be formatted differently from the rest.[^chapter_web_16] Here is an example of a complete *.Rpres* file: ````markdown Example R Presentation === ## Christopher Gandrud ## 1 July 2019 Access the Code === The code to create the following figure is available online. To access it we type: `r ''````{r, eval=FALSE} # Access and run the code to create a caterpillar plot devtools::source_url("http://bit.ly/VRKphr") ``` Caterpillar Plot === `r ''````{r, echo=FALSE, message=FALSE} # Access and run the code to create a caterpillar plot devtools::source_url("http://bit.ly/VRKphr") ``` Fertilizer Consumption Map (2003) === `r ''````{r CreategvisGeoMap, echo=FALSE, message=FALSE, results='asis'} # Create geo map of global fertilizer consumption for 2003 devtools::source_url("http://bit.ly/VNnZxS") ``` ```` This example includes four slides and three code chunks. The last code chunk uses the *googleVis* package to create the global map of fertilizer consumption we saw earlier. Because the slideshow we are creating is in HTML, the map will be fully dynamic. Note that, like before, you will not be able to see the map in the RStudio preview, only in a web browser. To compile the slideshow, either click the `Preview` button or save the *.Rpres* document. When you do this, you can view your updated slideshow in the *Presentation* pane.\index{RStudio!Presentation pane} You can navigate through the slideshow using the arrow buttons at the bottom right of the *Presentation* pane. If you click the magnifying glass icon at the top of the *Presentation* pane, you will get a much larger view of the slideshow. You can also view the slideshow in your web browser by clicking on the `More` icon, then `View in Browser`. #### Publishing slideshows {-} You can, of course, view your slideshows locally.
To share your presentation with others, you probably want to either publish the presentation to a standalone HTML file and host it (e.g. with a service like Netlify \index{Netlify}) or publish it directly to RPubs. For R Presentations, create a standalone HTML file by clicking the `More` button in the *Presentation* pane, then `Save as Webpage...`. Under the `More` button, you can also choose the option `Publish to RPubs...`. ![RStudio R Presentation Pane[]{label="PresentPane"}](images/chapter_web/PresentationPane.png){#PresentPane width="\\textwidth"} ## Publishing HTML Documents Created with R Markdown In Chapter \@ref(GettingStartedRKnitr) we saw how to publish other R Markdown documents compiled with RStudio to RPubs. The *knitr* function `knit2wp()`\index{R function!knit2wp} can be used to post a knitted Markdown file to WordPress[^chapter_web_17] sites, which are often used for blogging. In this section we will look at how to publish R Markdown documents using GitHub. ### Standalone HTML files {-} You can open the HTML file rendered from any R Markdown document in your web browser. If the HTML file contains the full information for the page as it generally does when created by *rmarkdown*, e.g. the file does not depend on any auxiliary files, you can share this file via email or other means and anyone with a web browser can open it. We can of course, also send auxiliary files if need be, but this can get unwieldy. ### GitHub Pages {-} \index{GitHub!Pages}\index{GitHub} GitHub also offers a free hosting service for webpages. These can be much more complex than a single HTML file. The simplest way to create one of these pages is to create a repository with a file called *README.Rmd*. You can `knit` this file and then create your GitHub Page with it. To do this, go to the `Settings`, then `GitHub Pages` on your repository's main GitHub website. Then click `Automatic Page Generator`. 
This places the contents of your *README.md* file in the page and provides you with formatting options. Click `Publish` and you will have a new website. Clicking `Publish` creates a new orphan branch[^chapter_web_19] called *gh-pages*. When these branches are pushed to GitHub, it will create a website based on a file called *index.html* that you include in the branch. This will be the website's main page. If you want to create more customized and larger websites with GitHub Pages, you can manually create a GitHub Pages orphan branch\index{git!orphan branch} and push it to GitHub.\index{GitHub!gh-pages branch} This is essentially what *slidify* did for us with its `publish` function.\index{R function!publish} Imagine we have our working directory set as a repository containing an R Markdown file that we have rendered into an HTML file called *index.html*. Let's create a new orphan branch: ````sh # Create orphan gh-pages branch git checkout --orphan gh-pages ```` Now `add` the files, `commit` the changes and `push` it to GitHub. Push it to the *gh-pages* branch like this\index{git!add}\index{git!commit}\index{git!push} ````sh # Add files git add . # Commit changes git commit -am "First gh-pages commit" # Push branch to GitHub Pages git push origin gh-pages ```` A new webpage will be created at: *USERNAME.github.io/REPO\_NAME*. You can also add custom domain names. For details, see . ### Further information on R Markdown We have covered many of the core capabilities of R Markdown for creating reproducible research documents. Please see RStudio's R Markdown documentation () for even more information. Another tool to look into for interactive results presentation is the *shiny* package [@R-shiny].\index{R package!shiny} It gives R the capability to create interactive web applications, not just the static websites that we have covered in this chapter. This package is well integrated with RStudio. For more information, please see . 
### Chapter summary {-} In this chapter we learned a number of tools for dynamically presenting our reproducible research on the web, as well as how to create PDFs with the simple R Markdown syntax. Though LaTeX and PDFs will likely remain the main tools for presenting research in published journals and books for some time to come, choosing to also make your research available in online native formats can make it more accessible to general readers. It also allows you to take advantage of interactive tools for presenting your research. R Markdown also makes it easy to create documents in a variety of formats. [^chapter_web_1]: Extensible HyperText Markup Language. [^chapter_web_2]: For more information, see John Gruber's website: . [^chapter_web_4]: You can also include a `title` attribute after the URL, though this is generally not very useful. [^chapter_web_5]: Ampersands declare the beginning of a special HTML character. Less-than signs begin HTML tags. [^chapter_web_6]: You will not be able to render equations when you are not online. [^chapter_web_7]: In RStudio you can also use dollar signs to delimit MathJax equations as in LaTeX. [^chapter_web_8]: 1 would be for single space and 1.5 would be for one and a half spacing. [^chapter_web_9]: You can actually put this almost anywhere and it will be placed and numbered correctly in the output document, but I find it easier to organize the footnotes when they are placed at the end. [^chapter_web_10]: For more information on how to access and use Developer Tools in Chrome see: . [^chapter_web_11]: One small note: when you create a new style sheet or copy an old one, make sure the final line is blank. Otherwise you may get an "incomplete final line" error when you render the document. [^chapter_web_12]: The slideshows created by the tools in this section use features introduced in the 5th version of HTML, i.e. HTML5. In this section I often refer to HTML5 as just HTML for simplicity. 
[^chapter_web_13]: [^chapter_web_14]: [^chapter_web_15]: You can create sections with one hash. [^chapter_web_16]: As of this writing, it is a blue slide with white letters. [^chapter_web_17]: [^chapter_web_19]: An orphan branch is a branch with a different root from other repository branches. Another way of thinking about this is that orphan branches have their own history. ================================================ FILE: rep-res-3rd-edition/16-conclusion.Rmd ================================================ # Conclusion {#bookconclusion} > *Well, we have completed our journey. The only thing left to do now is > practice, practice, practice.* [@shottsjr2012 432] In this book we learned a workflow for highly reproducible computational research and many of the tools needed to actually do it. Hopefully, if you haven't already, you will begin using and benefiting from these tools in your own work. Though we've covered enough material in this book to get you well on your way, there is still a lot more to learn. With most things computational (possibly most things in general), one of the best ways to continue learning is to practice and try new things. Inevitably you will hit walls, but there are almost always solutions that can be found with curiosity and patience. The R and reproducible research community is extremely helpful when it comes to finding and sharing solutions. I highly recommend getting involved in and eventually contributing to this community to get the most out of reproducible research.[^chapter_14_1] Before ending the book, I want to briefly address five issues we have not covered so far that are important for reproducible research: citing reproducible research, licensing this research, sharing your code with R packages, whether or not to make your research files public before publishing the results, and whether or not it is possible to completely future-proof your research. 
## Citing Reproducible Research There are a number of well-established methods for citing presentation documents, especially published articles and books. However, as we discussed in the beginning, these documents are just the advertising for research findings rather than the actual research [@buckheit1995; @donoho2010 385]. If other researchers are going to use the data and source code used to create the findings in their own work, they need a way of actually citing the particular data and source code they used. Citing data and source code presents unique problems. Data and source code can change and be updated over time in a way that published articles and books generally are not. As such, we have a much less developed, or at least less commonly used, set of standards for citing these types of materials. One possibility is a standard for citing quantitative data sets laid out by @altman2007 [see also @king2007]. They argue that quantitative data set citations should: - allow a reader to quickly understand the nature of the cited data set, - unambiguously identify a particular version of the data set, and - enable reliable location, retrieval, and verification of the data set. The first issue can be solved by having a citation that includes the author, the date the data set was made public, and its title. However, these things do not unambiguously identify the data set, as it may be updated or changed, and they do not enable its location and retrieval. To solve this problem, @altman2007 suggest that these citations also include: - a unique global identifier (UGI), - a universal numeric fingerprint (UNF), and - a bridge service. A UGI uniquely identifies the data set. Examples include Digital Object Identifiers (DOI) and the Handle System.[^chapter_14_2] UGIs by themselves do not uniquely identify a particular version of a data set. This is where UNFs come in. They uniquely identify each version of a data set. 
Finally, a bridge service links the UGI and UNF to an actual document, usually posted online, so that it can be retrieved. There are many ways to register DOIs and Handle UGIs. Most of these also include means for creating UNFs and a bridge service. Examples of services that store your work and assign it DOIs are figshare[^chapter_14_3] and Zenodo.[^chapter_14_4] Zenodo can be integrated with GitHub so that it will store and create citations for a specific commit of a GitHub repository whenever you create a tag. For more information about integrating GitHub and Zenodo, see . Please see @altman2007 for details of other services.[^chapter_14_5] Though @altman2007 are interested in data sets, their system could easily be applied to source code as well. UGIs could identify a source code file or collection of files. The UNF could identify a particular version and a bridge service would create a link to the actual files. ## Licensing Your Reproducible Research In the United States and many other countries, research, including computer code made available via the internet, is automatically given copyright protection. However, copyright protection works against the scientific goals of reproducible research, because work derived from the research falls under the original copyright protections [@stodden2009 36]. To solve this problem, some authors have suggested placing code under an open source software license like the GNU General Public License (GPL) [@vandewalle2007]. @stodden2009 argues that this type of license is not really adequate for making available the data, code, and other material needed to reproduce research findings in a way that enables scientific validation and knowledge growth. I don't want to explore the intricacies of these issues here. Nonetheless, they are important for computational researchers to think about, especially if their data and source code are publicly available. 
Two good places to go for more information are @stodden2009 and @creativecommons2012. ## Sharing Your Code in Packages Developing R functions and putting them into packages is a good way to enable cumulative knowledge development. Many researchers spend a considerable amount of time writing code to solve problems that no one has addressed yet, or haven't addressed in a way that they believe is adequate. It is very useful if they make this code publicly accessible so that others can perhaps adopt and use it in their own work without having to duplicate the effort used to create the original functions. Abstracting your code into functions so that they can be applied to many problems and distributing them in easily installed packages makes it much easier for other researchers to adopt and use your code to help solve their research problems. The active community of researcher/package developers is one of the main reasons that R has become such a widely used and useful statistical language. Many of the tools we have covered in this book provide a good basis to start making and distributing functions. We have discussed many of the R commands and concepts that are important for creating functions. We have also looked at Git and GitHub, which are very helpful for developing and distributing packages. Learning about Hadley Wickham's *devtools* package is probably the best next step for you to take to be able to develop and distribute functions in packages. He has an excellent introduction to *devtools* and R package development in general at . RStudio Projects have excellent *devtools* integration and are certainly worth using. To begin creating a new package in RStudio, start a new project, preferably with Git version control (see Section \@ref(NewProjectGit)). In the **New Project** window, select `Package`. 
Now you will have a new Project with all of the files and directories you need to get started making packages that will hopefully be directly useful for the computational research community. ## Project Development: Public or Private? Hopefully I have made a convincing case in this book that research results, especially in academia, should almost always be highly reproducible. The files used to create the results need to be publicly available for the research to be really reproducible.[^chapter_14_6] During the development of a research project, however, should files be public or private? On the one hand, openness encourages transparency and feedback. Other researchers may alert you to mistakes before a result is published. On the other hand, there are worries that you may be "scooped". Another researcher might see your files, take your idea, and publish it before you have a chance to. In general, this worry may be a bit overblown. Especially if you use a version control system that clearly dates all of your file versions, it would be very easy to make the case that someone has stolen your work. Hopefully this possibility would discourage any malfeasance. That being said, unlike the clear need to make research files available after publication, during research development there are good reasons for both making files public and keeping them private. Researchers should probably make this decision on a case-by-case basis. In general, I choose to make my research repositories public to increase transparency and encourage feedback. The community of researchers in my field is relatively small and close-knit. It would be hard for someone to take my work and pass it off as their own. This is especially true if many people already know that they are my ideas, because I have made my research files publicly available. Regardless, cloud storage systems like GitHub make it easy to choose whether or not to make your files public or private. 
You can easily keep a repository private while you create a piece of research and then make it public once the results are published. ## Is it Possible to Completely Future-Proof Your Research? In this book we've looked at a number of ways to help future-proof your research so that future researchers (and you) are able to actually reproduce it. These included storing your research in text files, clearly commenting on your code, and recording information about the software environment you used by, for example, recording your session info. Are these steps enough to completely ensure that your research will always be reproducible? The simple answer is probably no. Software changes, but it is difficult to foresee what these changes will be. Nonetheless, beyond what we have discussed so far there are other steps we can take to make our reproducible research as future-proof as possible. One of the main obstacles to completely future-proofing your research is that no (or at least very few) pieces of software are complete. R packages are updated. R is updated. Your operating system is updated. These and other software programs discussed in this book may not only be updated, but also discontinued. Changes to the software you used to find your results may change the results someone gets reproducing your research. This problem becomes larger as you use more pieces of software in your research. That being said, many of the software tools we have learned about in this book have future-proofing at their heart. TeX, the typesetting system that underlies LaTeX, is probably the best example. TeX was created in 1978 and has since been maintained with future-proofing in mind [@knuth1990]. Though changes and new versions continue to be made, we are still able to use TeX to recreate documents in their original intended form even if they were written over thirty years ago. 
We also saw that, though R and especially R packages change rapidly, the Comprehensive R Archive Network stores and makes accessible old versions (as the name suggests). Old versions can be downloaded by anyone wishing to reproduce a piece of research, provided the original researcher has recorded which versions they used. One approach is to use the *packrat* R package [@R-packrat] for managing the packages your project depends on. Some of the other technologies discussed in this book may be less reliable over time, so some caution should be taken if you intend to use them to create fully reproducible research. In addition to documenting what software you used and using software that archives old versions, some have suggested another step to future-proof reproducible research: encapsulate it in a virtual machine that is available on a cloud storage system. See in particular @howe2012. A virtual reproducible research machine would store a "snapshot [of] a researcher's entire working environment, including data, software, dependencies, notes, logs, scripts, and more". If the virtual machine is stored on a cloud server, then anyone wanting to reproduce the research could access the full computing environment used to create a piece of research [@howe2012 36]. As long as others could run the virtual machine and access the cloud storage system, you would not have to worry about changing software, because the exact versions of the software you used would be available in one place. We don't have space to cover the specifics of how to create a virtual machine in this book. However, using a virtual machine is a tool that can be added to the workflow discussed in this book, rather than being a replacement for it. Carefully documenting your steps, clearly organizing your files, and dynamically tying together your data gathering, analysis, and presentation files helps you and others understand how you created a result after a research project's results have been published. 
Being able to understand your research will give it higher research impact, as others can more easily build on it. The steps covered in this book will still encourage you to have better work habits from the beginning of your research projects even if you will be using a virtual machine. The tools and workflow will also continue to facilitate collaboration and make it easier to dynamically update your research documents when you make changes. Now, get started with reproducible research! [^chapter_14_1]: A good point of entry into the R reproducible research community is R-bloggers (). The site aggregates many different blogs on R-related topics from both advanced and relatively new R users. I have found that beyond just consuming other people's insights, contributing to R-bloggers---having to clearly write down my steps---has sharpened my understanding of the reproducible research process and enabled me to get great feedback. Other really useful resources are the R Stack Overflow () and Cross Validated () sites. [^chapter_14_2]: See: . [^chapter_14_3]: [^chapter_14_4]: [^chapter_14_5]: The Dataverse Project () offers a free service to host files that also uses the Handle System to assign UGIs and UNFs, and provides a bridge service. See @gandrud2013 for a comparison of Dataverse with GitHub and Dropbox for data storage. [^chapter_14_6]: There are obvious exceptions, such as when a study's participants' identities need to remain confidential. [^chapter_14_7]: Do this by entering specific package version numbers in the `versions` argument. 
================================================ FILE: rep-res-3rd-edition/99-references.Rmd ================================================
`r if (knitr::is_html_output()) '# References {-}'`

```{r include=FALSE}
# Build packages.bib: a BibTeX database for the R packages cited in the book.
# The chunk runs with include=FALSE, so it is executed only for its side
# effects (installing missing packages and writing the .bib file).

# Additional packages to cite, on top of whatever is currently attached
# (.packages() returns the names of the attached packages)
pkg_additional <- c(
  .packages(), "animation", "bookdown", "countrycode", "data.table",
  "dbplyr", "dplyr", "formatR", "ggplot2", "here", "htmlwidgets", "httr",
  "IRkernel", "jsonlite", "magick", "markdown", "packrat", "pacman",
  "pdftools", "ProjectTemplate", "purrr", "RCurl", "rvest", "shiny",
  "stargazer", "styler", "survival", "tibble", "tidyr", "tinytex", "XML"
)

# Check if the packages are installed, if not install.
# system.file() returns "" when the package cannot be found. A for loop is
# used because install.packages() is called purely for its side effect, and
# the download goes over HTTPS rather than plain HTTP.
for (pkg in pkg_additional) {
  if (!nzchar(system.file(package = pkg))) {
    install.packages(pkg, repos = "https://cran.us.r-project.org")
  }
}

# pkg_to_install is created in 00-setup.Rmd
pkg_to_cite <- c(pkg_to_install, pkg_additional)

# generate a BibTeX database automatically for some R packages
knitr::write_bib(pkg_to_cite, "packages.bib")
```

================================================ FILE: rep-res-3rd-edition/LICENSE ================================================
MIT License Copyright (c) 2016 Yihui Xie Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: rep-res-3rd-edition/README.md ================================================ # Reproducible Research with R and RStudio (Third Edition) This version of the book is built on top of the [Bookdown example](https://github.com/yihui/bookdown-crc) for CRC Press books helpfully created by Yihui Xie. --- Note: On macOS I had to install R from CRAN, rather than from Homebrew, in order to compile the source. ================================================ FILE: rep-res-3rd-edition/_bookdown.yml ================================================ book_filename: bookdown clean: [packages.bib, bookdown.bbl] delete_merged_file: true language: label: fig: "FIGURE " tab: "TABLE " ui: edit: "Edit" chapter_name: "Chapter " ================================================ FILE: rep-res-3rd-edition/_output.yml ================================================ bookdown::gitbook: css: css/style.css config: toc: collapse: none before: |
  • A Book Example
  • after: |
  • Published with bookdown
  • download: [pdf, epub] edit: https://github.com/yihui/bookdown-crc/edit/master/%s sharing: github: true facebook: false bookdown::pdf_book: includes: in_header: latex/preamble.tex before_body: latex/before_body.tex after_body: latex/after_body.tex keep_tex: true dev: "cairo_pdf" latex_engine: xelatex citation_package: natbib template: null pandoc_args: --top-level-division=chapter toc_depth: 3 toc_unnumbered: false toc_appendix: true quote_footer: ["\\VA{", "}{}"] highlight_bw: true bookdown::epub_book: stylesheet: css/style.css ================================================ FILE: rep-res-3rd-edition/book.bib ================================================ @book{healy2018data, title={Data Visualization: A Practical Introduction}, author={Healy, Kieran}, year={2018}, publisher={Princeton University Press} } @incollection{ rokem2018, author = {Ariel Rokem and Ben Marwick and Valentina Staneva}, title = {Assessing Reproducibility}, pages = {3--18}, booktitle = {The Practice of Reproducible Research: Case Studies and Lessons from the Data-Intensive Sciences}, year = {2018}, editor = {Justin Kitzes and Daniel Turek and Fatma Deniz}, publisher = {University of California Press}, address = {Oakland, CA} } @Article{ altman2007, author = {Micah Altman and Gary King}, title = {A Proposed Standard for the Scholarly Citation of Quantitative Data}, year = {2007}, journal = {D-Lib Magazine}, volume = {13}, number = {3/4} } @Article{ baath2012, author = {B{\aa}{\aa}th, Rasmus}, title = {{The state of naming conventions in {R}}}, journal = {The R Journal}, year = {2012}, volume = {4}, number = {2}, pages = {74--75} } @Book{ bacon1267, author = {Fr. Rogeri Bacon}, title = {Opera quaedam hactenus inedita. Vol. I. containing I.--Opus tertium. II.--Opus minus. 
III.--Compendium philosophiae.}, publisher = {Google eBook}, year = {1859}, note = {Retrieved from \url{http://books.google.com/books?id=wMUKAAAAYAAJ}} } @Article{ ball2012, author = {Ball, Richard and Medeiros, Norm}, title = {{Teaching integrity in empirical research: A protocol for documenting data management and analysis}}, journal = {The Journal of Economic Education}, year = {2012}, volume = {43}, number = {2}, pages = {182--189} } @Article{ barr2012, author = {Barr, Christopher D}, title = {{Establishing a culture of reproducibility and openness in medical research with an emphasis on the training years}}, journal = {Chance}, year = {2012}, volume = {25}, number = {3}, pages = {8--10} } @Article{ boettiger2012, author = {Boettiger, Carl and Temple Lang, Duncan}, title = {{Treebase: An R package for discovery, access and manipulation of online phylogenies}}, journal = {Methods in Ecology and Evolution}, year = {2012}, volume = {3}, number = {6}, pages = {1060--1066} } @Article{ bowers2011, author = {Jake Bowers}, title = {Six Steps to a Better Relationship with Your Future Self}, journal = {The Political Methodologist}, year = {2011}, volume = {18}, number = {2}, pages = {2-8} } @Article{ box1964analysis, title = {An analysis of transformations}, author = {Box, George EP and Cox, David R}, journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, pages = {211--252}, volume = {26}, year = {1964} } @Book{ braude1979, author = {S.E. Braude}, year = {1979}, title = {ESP and Psychokinesis. A Philosophical Examination}, publisher = {Temple University Press}, address = {Philadelphia, PA} } @InCollection{ buckheit1995, author = {Jonathan B. Buckheit and David L. Donoho}, title = {Wavelab and Reproducible Research}, booktitle = {Wavelets and Statistics}, editor = {A. Antoniadis}, publisher = {Springer}, address = {New York}, pages = {55-81}, year = {1995} } @Article{ burbidge1988, author = {John B. 
Burbidge and Leslie Robb}, title = {Alternative Transformations to Handle Extreme Values of the Dependent Variable}, journal = {Journal of the American Statistical Association}, volume = {83}, number = {401}, year = {1988}, pages = {123--127} } @Book{ chang2012, author = {Winston Chang}, title = {R Graphics Cookbook: Practical Recipes for Visualizing Data}, year = {2012}, publisher = {O'Reilly Media, Inc.}, address = {Sebastopol, CA} } @InCollection{ cortez2007, author = {Paulo Cortez and An\'{i}bal Morais}, title = {A Data Mining Approach to Predict Forest Fires Using Meteorological Data}, editor = {J. Neves and M.F. Santos and J. Machado}, booktitle = {New Trends in Artificial Intelligence, Proceedings of the 13th EPIA}, year = {2007}, pages = {512-523}, note = {\url{http://archive.ics.uci.edu/ml/datasets/Forest+Fires}} } @Book{ crawley2005, author = {Michael J. Crawley}, title = {Statistics: An Introduction Using R}, publisher = {John Wiley \& Sons Ltd.}, address = {Chichester}, year = {2005} } @Book{ crawley2013, author = {Michael J. 
Crawley}, title = {The R Book}, edition = {2nd}, publisher = {John Wiley \& Sons Ltd.}, address = {Chichester}, year = {2013} } @Unpublished{ creativecommons2012, author = {{Creative Commons}}, title = {Data}, year = {2012}, journal = {Creative Commons Wiki}, number = {11 December}, note = {\url{http://wiki.creativecommons.org/Data}} } @Article{ donoho2002, author = {Donoho, David L}, title = {How to be a highly cited author in mathematical sciences}, journal = {in-cites}, year = {2002}, note = {\url{http://www.in-cites.com/scientists/DrDavidDonoho.html}} } @Article{ donoho2009, author = {Donoho, David L and Maleki, Arian and Shahram, Morteza and Rahman, Inam Ur and Stodden, Victoria}, title = {{Reproducible research in computational harmonic analysis}}, journal = {Computing in Science {\&} Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {8--18} } @Article{ donoho2010, author = {Donoho, David L}, title = {{An invitation to reproducible computational research}}, journal = {Biostatistics}, year = {2010}, volume = {11}, number = {3}, pages = {385--388} } @Unpublished{ drummond2012, month = {September}, title = {Reproducible Research: a Dissenting Opinion}, author = {Chris Drummond}, year = {2012}, url = {http://cogprints.org/8675/} } @Article{ ehrenberg1977, author = {Ehrenberg, A S C}, title = {{Rudiments of numeracy}}, journal = {Journal of the Royal Statistical Society. 
Series A General}, year = {1977}, volume = {140}, number = {3}, pages = {277--297} } @Article{ fomel2009, author = {Fomel, Sergey and Claerbout, Jon F}, title = {{Reproducible Research}}, journal = {Computing in Science {\&} Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {5--7} } @Article{ frazier2008, author = {Mitch Frazier}, title = {Bash Parameter Expansion}, journal = {The Linux Journal}, year = {2008}, note = {Available at: \url{http://www.linuxjournal.com/content/bash-parameter-expansion}} } @Manual{ galili2018, title = {installr: Using R to Install Stuff (Such As: R, 'Rtools', 'RStudio', 'Git', and More!)}, author = {Tal Galili and Barry Rowlingson and Boris Hejblum and Dason and G. Grothendieck and Gergely Daroczi and Heuristic Andrew and James and Thomas Leeper and VitoshKa and Yihui Xie and Michael Friendly and Kornelius Rohmeyer and Dieter Menne and Tyler Hunt and Takekatsu Hiramura and Berry Boessenkool and Jonathan Godfrey and Tom Allard and ChingChuan Chen and Jonathan Hill and Chan-Yub Park}, year = {2018}, note = {R package version 0.20.0}, url = {https://cran.r-project.org/package=installr} } @Article{ gandrud2012, author = {Gandrud, Christopher}, title = {{The diffusion of financial supervisory governance ideas}}, journal = {Review of International Political Economy}, year = {2013}, volume = {20}, number = {4}, pages = {881--916} } @Article{ gandrud2013, author = {Christopher Gandrud}, title = {GitHub: A Tool for Social Data Set Development and Verification in the Cloud}, year = {2013}, journal = {The Political Methodologist}, volume = {20}, number = {2}, pages = {2-7} } @Article{ gandrudgrafstrom2012, author = {Christopher Gandrud and Cassandra Grafstr\"{o}m}, title = {Inflated Expectations: How government partisanship shapes bureaucrats' inflation forecasts}, year = {2015}, journal = {Political Science Research and Methods}, note = {Available at: \url{http://dx.doi.org/10.1017/psrm.2014.34}} } @Article{ gelman2011tables, 
author = {Gelman, Andrew}, title = {{Tables as graphs: The Ramanujan principle}}, journal = {Significance}, year = {2011}, volume = {8}, number = {4}, pages = {183} } @Article{ gentleman2004, author = {Gentleman, Robert and Lang, Duncan Temple}, title = {{Statistical Analyses and Reproducible Research}}, journal = {Bioconductor Project Working Papers}, year = {2004} } @Article{ goodrich2007, author = {Ben Goodrich and Ying Lu}, year = {2007}, title = {normal.bayes: Bayesian Normal Linear Regression}, journal = {Zelig Everyone's Statistical Software}, note = {Available at: \url{http://gking.harvard.edu/zelig}} } @Article{ herndon2014, title = {Does high public debt consistently stifle economic growth? A critique of {R}einhart and {R}ogoff}, author = {Herndon, Thomas and Ash, Michael and Pollin, Robert}, journal = {Cambridge Journal of Economics}, volume = {38}, number = {2}, pages = {257--279}, year = {2014} } @Article{ howe2012, author = {Howe, Bill}, title = {{Virtual appliances, cloud computing, and reproducible research}}, journal = {Computing in Science {\&} Engineering}, year = {2012}, volume = {14}, number = {4}, pages = {36--41} } @Misc{ hyndman2010, author = {Rob J. Hyndman}, title = {Transforming Data with Zeros}, year = {2010}, note = {Available at: \url{http://robjhyndman.com/hyndsight/transformations/}. Accessed March 2015.} } @Book{ kabacoff2012, author = {Robert I. Kabacoff}, title = {R in Action: Data Analysis and Graphics with R}, publisher = {Manning Publications Co.}, address = {Shelter Island, NY}, year = {2011} } @Article{ kelly2006, author = {Kelly, Clint D}, title = {{Replicating empirical research in behavioral ecology: How and why it should be done but rarely ever is}}, journal = {The Quarterly Review of Biology}, year = {2006}, volume = {81}, number = {3}, pages = {221--236} } @Book{ king1994, author = {King, Gary. 
and Keohane, Robert and Verba, S.}, title = {{Designing Social Inquiry}}, publisher = {Princeton University Press}, year = {1994}, address = {Princeton} } @Article{ king1995, author = {King, Gary}, title = {{Replication, replication}}, journal = {PS: Political Science and Politics}, year = {1995}, volume = {28}, number = {3}, pages = {444--452} } @Article{ king2007, author = {King, Gary}, title = {An Introduction to the Dataverse Network as an Infrastructure for Data Sharing}, journal = {Sociological Methods {\&} Research}, year = {2007}, volume = {36}, number = {2}, pages = {173--199} } @Book{ kitzes2018, title = {The Practice of Reproducible Research: Case Studies and Lessons from the Data-Intensive Sciences}, year = {2018}, editor = {Justin Kitzes and Daniel Turek and Fatma Deniz}, publisher = {University of California Press}, address = {Oakland, CA} } @Article{ knuth1990, author = {Donald E. Knuth}, title = {The Future of TeX and MetaFont}, year = {1990}, journal = {NTG: Maps}, volume = {5}, issue = {November}, pages = {145} } @Book{ knuth1992, title = {Literate Programming}, author = {Donald E. Knuth}, year = {1992}, publisher = {Center for the Study of Language and Information}, address = {Stanford, CA}, series = {CSLI Lecture Notes} } @Book{ kross2018, title = {The Unix Workbench}, author = {Sean Kross}, publisher = {self published}, year = {2018}, note = {Accessible at: \url{https://seankross.com/the-unix-workbench/}} } @InProceedings{ leisch2002, author = {Friedrich Leisch}, title = {Sweave: Dynamic Generation of Statistical Reports Using Literate Data Analysis}, booktitle = {Compstat 2002: Proceedings in Computational Statistics}, pages = {575--580}, year = 2002, editor = {Wolfgang H{\"a}rdle and Bernd R{\"o}nz}, publisher = {Physica Verlag, Heidelberg}, note = {\url{http://www.stat.uni-muenchen.de/~leisch/Sweave}} } @Article{ lykken1968, author = {David T. 
Lykken}, title = {Statistical Significance in Psychological Research}, year = {1968}, journal = {Psychological Bulletin}, volume = {70}, pages = {151--159} } @Article{ makel2014, author = {Makel, M C and Plucker, J A}, title = {{Facts are more important than novelty: Replication in the education sciences}}, journal = {Educational Researcher}, year = {2014}, volume = {43}, number = {6}, pages = {304--316} } @Book{ matloff2011, author = {Norman Matloff}, title = {The Art of R Programming: A Tour of Statistical Software Design}, publisher = {No Starch Press}, address = {San Francisco}, year = {2011} } @Article{ mccullough2008, author = {McCullough, B D and McGeary, Kerry Anne and Harrison, Teresa D}, title = {{Do Economics Journal Archives Promote Replicable Research?}}, journal = {Canadian Journal of Economics}, year = {2008}, volume = {41}, number = {4}, pages = {1406--1420} } @Article{ mesirov2010, author = {Mesirov, Jill P.}, title = {{Accessible reproducible research}}, journal = {Science}, year = {2010}, volume = {327}, number = {5964}, pages = {415--416} } @Article{ meyer2006, author = {Axel Meyer}, title = {Repeating Patterns of Mimicry}, journal = {PLoS Biol}, volume = {4}, number = {10}, year = {2006} } @Book{ munzert2015, title = {Automated Data Collection with R: A Practical Guide to Web Scraping and Text Mining}, author = {Simon Munzert and Christian Rubba and Peter Mei{\ss}ner and Dominic Nyhuis}, year = {2015}, publisher = {Wiley}, address = {Chichester} } @Book{ murrell2011, author = {Paul Murrell}, title = {R Graphics}, publisher = {Chapman \& Hall/CRC Press}, address = {Boca Raton, FL}, year = {2011}, edition = {2nd} } @Article{ nagler1995, author = {Nagler, Jonathan}, title = {{Coding style and good computing practices}}, journal = {PS: Political Science and Politics}, year = {1995}, volume = {28}, number = {3}, pages = {488--492} } @Article{ nosek2012, author = {Nosek, Brian A and Spies, Jeffrey R and Motyl, Matt}, title = {{Scientific utopia:
II. Restructuring incentives and practices to promote truth over publishability}}, journal = {Perspectives on Psychological Science}, year = {2012}, volume = {7}, number = {6}, pages = {615--631} } @Book{ oneil2013, author = {Cathy O'Neil and Rachel Schutt}, title = {Doing Data Science: Straight Talk from the Frontline}, year = {2013}, address = {Sebastopol, CA}, publisher = {O'Reilly Media Inc.} } @Manual{ pandoc2014, title = {Pandoc: A Universal Document Converter}, author = {John MacFarlane}, year = {2019}, note = {Version 2.7.3}, url = {http://pandoc.org/index.html} } @Article{ pemstein2010, author = {Daniel Pemstein and Stephen A. Meserve and James Melton}, title = {Democratic Compromise: A Latent Variable Analysis of Ten Measures of Regime Type}, journal = {Political Analysis}, year = {2010}, volume = {18}, pages = {426--449}, number = {4} } @Article{ peng2009, author = {Peng, Roger D}, title = {{Reproducible research and biostatistics}}, journal = {Biostatistics}, year = {2009}, volume = {10}, number = {3}, pages = {405--408} } @Article{ peng2011, author = {Peng, Roger D}, title = {{Reproducible research in computational science}}, journal = {Science}, year = {2011}, volume = {334}, pages = {1226--1227} } @Article{ peng2014, author = {Roger D.
Peng}, title = {The Real Reason Reproducible Research is Important}, journal = {Simply Statistics}, year = {2014}, note = {\url{http://simplystatistics.org/2014/06/06/the-real-reason-reproducible-research-is-important/}} } @Article{ piwowar2007, author = {Piwowar, Heather A and Day, Roger S and Fridsma, Douglas B}, title = {{Sharing detailed research data is associated with increased citation rate}}, journal = {PLoS ONE}, year = {2007}, volume = {2}, number = {3}, pages = {1-5} } @Misc{ ramseynoweb, author = {Norman Ramsey}, title = {Noweb: {A} Simple, Extensible Tool for Literate Programming}, year = {2011}, howpublished = {\url{http://www.cs.tufts.edu/~nr/noweb/}} } @Manual{ rlanguage, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, organization = {R Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2019}, note = {\url{http://www.R-project.org/}} } @Article{ rr2010, author = {C.M. Reinhart and K.S. Rogoff}, title = {Growth in a Time of Debt}, journal = {American Economic Review: Papers \& Proceedings}, volume = {100}, year = {2010} } @Manual{ rstudiocite, title = {RStudio: Integrated development environment for R}, author = {{RStudio,}{ Inc.}}, address = {Boston, MA}, year = {2019}, note = {Version 1.2.1572}, url = {\url{https://www.rstudio.com/}} } @Manual{ rtools, title = {Rtools: Building R for Windows}, author = {Brian Ripley and Duncan Murdoch}, year = {2012}, note = {\url{http://cran.r-project.org/bin/windows/Rtools/}} } @Book{ shottsjr2012, author = {Shotts Jr., William E}, title = {The Linux Command-line: A Complete Introduction}, publisher = {No Starch Press}, year = {2012}, address = {San Francisco} } @Article{ stodden2009, author = {Stodden, Victoria}, title = {{The legal framework for reproducible scientific research}}, journal = {Computing in Science {\&} Engineering}, year = {2009}, volume = {11}, number = {1}, pages = {35--40} } @InProceedings{ stodden2009b, author = {Stodden, 
Victoria}, year = {2009}, title = {The Reproducible Research Standard: Reducing Legal Barriers to Scientific Knowledge and Innovation}, booktitle = {Communia: Global Science \& Economics of Knowledge-Sharing Institutions Torino, Italy June 30}, note = {\url{http://www.stanford.edu/~vcs/talks/VictoriaStoddenCommuniaJune2009-2.pdf}} } @Article{ stodden2010, author = {Stodden, Victoria}, title = {{The Scientific Method in Practice: Reproducibility in the Computational Sciences}}, journal = {MIT Sloan School Working Paper, 4773-10}, year = {2010} } @Book{ tufte2001, author = {Edward R. Tufte}, title = {The Visual Display of Quantitative Information}, publisher = {Graphics Press}, address = {Cheshire, CT}, year = {2001}, edition = {2nd} } @Book{ vanbelle2008, author = {Gerald van Belle}, title = {Statistical Rules of Thumb}, publisher = {John Wiley \& Sons}, address = {Hoboken, NJ}, edition = {2nd}, year = {2008} } @Article{ vandewalle2007, author = {Vandewalle, P and Barrenetxea, G and Jovanovic, I and Ridolfi, A and Vetterli, M}, title = {{Experiences with reproducible research in various facets of signal processing research}}, journal = {Acoustics, Speech and Signal Processing}, year = {2007}, volume = {4}, pages = {1253--1256} } @Article{ vandewalle2012, author = {Vandewalle, Patrick}, title = {{Code sharing is associated with research impact in image processing}}, journal = {Computing in Science {\&} Engineering}, year = {2012}, volume = {14}, number = {4}, pages = {42--47} } @Book{ whickham2009book, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, year = {2009}, publisher = {Springer}, address = {New York}, edition = {2nd} } @Article{ whickham2010journal, author = {Hadley Wickham}, title = {A Layered Grammar of Graphics}, journal = {Journal of Computational and Graphical Statistics}, volume = {19}, number = {1}, year = {2010}, pages = {3-28} } @Book{ whickham2014book, author = {Hadley Wickham}, title = {Advanced R}, publisher = 
{Chapman \& Hall/CRC Press}, address = {Boca Raton, FL}, year = {2014} } @Article{ wickham2014article, author = {Wickham, Hadley}, title = {{Tidy Data}}, journal = {Journal of Statistical Software}, year = {2014}, volume = {59}, number = {10}, pages = {1--23} } @Article{ wilson2012, author = {Wilson, Greg and Aruliah, D A and Brown, C Titus and Chue Hong, Neil P and Davis, Matt and Guy, Richard T and Haddock, Steven H D and Huff, Katy and Mitchell, Ian M and Plumbley, Mark D and Ben Waugh and White, Ethan P and Wilson, Paul}, title = {{Best practices for scientific computing}}, journal = {arXiv}, note = {Available at: \url{http://arxiv.org/pdf/1210.0530v3}}, year = {2012}, volume = {29 November 2012}, pages = {1--6}, month = nov } @Manual{ worldbank2013, author = {{World Bank}}, title = {World Development Indicators}, year = {2018}, url = {\url{https://datacatalog.worldbank.org/dataset/world-development-indicators}} } @Book{ xie2013, author = {Yihui Xie}, title = {Dynamic Documents with R and knitr}, publisher = {Chapman \& Hall/CRC Press}, address = {Boca Raton, FL}, year = {2013} } @Book{ xie2015, title = {Dynamic Documents with {R} and knitr}, author = {Yihui Xie}, publisher = {Chapman \& Hall/CRC}, address = {Boca Raton, Florida}, year = {2015}, edition = {2nd}, note = {ISBN 978-1498716963}, url = {http://yihui.name/knitr/} } @Book{ xie2018, title = {R Markdown: The Definitive Guide}, author = {Yihui Xie and J. J. Allaire and Garrett Grolemund}, publisher = {Chapman \& Hall/CRC}, address = {Boca Raton, Florida}, year = {2018} } ================================================ FILE: rep-res-3rd-edition/css/style.css ================================================ p.caption { color: #777; margin-top: 10px; } p code { white-space: inherit; } pre { word-break: normal; word-wrap: normal; } pre code { white-space: inherit; } p.flushright { text-align: right; } blockquote > p:last-child { text-align: right; } blockquote > p:first-child { text-align: inherit; }
================================================ FILE: rep-res-3rd-edition/index.Rmd ================================================ --- title: "Reproducible Research with R and RStudio (Third Edition)" author: "Christopher Gandrud" date: "`r Sys.Date()`" documentclass: krantz bibliography: [book.bib, packages.bib] biblio-style: apalike link-citations: yes colorlinks: yes site: bookdown::bookdown_site chapter_name: "Chapter " description: "Reproducible Research with R and RStudio (Third Edition) brings together the skills and tools needed for doing and presenting computational research. Using straightforward examples, the book takes you through an entire reproducible research workflow. This practical workflow enables you to gather and analyze data as well as dynamically present results in print and on the web." github-repo: christophergandrud/Rep-Res-Book graphics: yes #cover-image: images/cover.jpg --- ```{r setup, include=FALSE} options( htmltools.dir.version = FALSE, formatR.indent = 2, width = 55, digits = 4 ) knitr::opts_chunk$set( fig.align = 'center' ) ``` # Preface {-} ## Motivation {-} This book has its genesis in my PhD research at the London School of Economics. I started the degree with questions about the 2008/09 financial crisis and planned to spend most of my time researching capital adequacy requirements. But I quickly realized that I would actually spend a large proportion of my time learning the day-to-day tasks of data gathering, analysis, and results presentation. After plodding through for a while with Word, Excel, and Stata, my breaking point came while reentering results into a regression table after I had tweaked one of my statistical models, yet again. Surely there was a better way to *do* research that would allow me to spend more time answering my research questions. Making research reproducible for others also means making it better organized and efficient for yourself. 
My search for a better way led me straight to the tools for reproducible computational research. The reproducible research community is very active, knowledgeable, and helpful. Nonetheless, I often encountered holes in this collective knowledge, or at least found no single resource that organized it all into a coherent whole. That is my intention for this book: to bring together the skills I have picked up for actually doing and presenting computational research. Hopefully, the book, along with making reproducible research more widely used, will save researchers hours of googling, so they can spend more time addressing their research questions. ## Changes to the Third Edition {-} - Spring cleaning: updated package recommendations, examples, and URLs. Removed technologies no longer in regular use. - More advanced R Markdown and less LaTeX in discussions of markup languages and examples. - Stronger focus on reproducible working directory tools. - Updated discussion of cloud storage services and persistently citing reproducible material. - Added discussion of Jupyter notebooks and reproducible practices in industry. - Examples of data manipulation with Tidyverse tibbles (in addition to standard data frames) and the `pivot_longer()` and `pivot_wider()` functions for pivoting data. - Naming conventions follow current R-Tidyverse best practice. A detailed list of changes for the third edition is available at . ## Changes to the Second Edition {-} The tools of reproducible research have developed rapidly since the first edition of this book was published just two years ago. The second edition has been updated to incorporate the most important of these advancements, including discussions of: - The *rmarkdown* package, which allows you to create reproducible research documents in PDF, HTML, and Microsoft Word formats using the simple and intuitive Markdown syntax. - Improvements and changes to RStudio's interface and capabilities, such as its new tools for handling R Markdown documents.
- Expanded *knitr* R code chunk capabilities. - The `kable()` function in the *knitr* package and the *texreg* package for dynamically creating tables to present your data and statistical results. - An improved discussion of file organization allowing you to take full advantage of relative file paths so that your documents are more easily reproducible across computers and systems. - The *dplyr*, *magrittr*, and *tidyr* packages for fast data manipulation. - Numerous changes to R syntax in user-created packages. - Changes to GitHub's and Dropbox's interfaces. \newpage ## Acknowledgments {-} I would not have been able to write this book without many people's advice and support. Foremost is John Kimmel, acquisitions editor at Chapman & Hall. He approached me in Spring 2012 with the general idea and opportunity for this book. Other editors at Chapman & Hall and Taylor & Francis have greatly contributed to this project, including Marcus Fontaine. I would also like to thank all of the book's reviewers whose helpful comments have greatly improved it. The first edition's reviewers include: - Jeromy Anglim, Deakin University - Karl Broman, University of Wisconsin, Madison - Jake Bowers, University of Illinois, Urbana-Champaign - Corey Chivers, McGill University - Mark M. Fredrickson, University of Illinois, Urbana-Champaign - Benjamin Lauderdale, London School of Economics - Ramnath Vaidyanathan, McGill University Many other anonymous reviewers also gave great feedback over the years. The developer and blogging community has also been incredibly important for making this book possible. Foremost among these people is Yihui Xie. He is the main developer behind the *knitr* package, co-developer of *rmarkdown*, and also an avid blog writer and commenter. Without him, the ability to do reproducible research would be much harder and the blogging community that spreads knowledge about how to do these things would be poorer. 
Other great contributors to the reproducible research community include Carl Boettiger, Karl Broman, Markus Gesmann (who developed *googleVis*), Rob Hyndman, and Hadley Wickham (who has developed numerous very useful R packages). Thank you also to Victoria Stodden and Michael Malecki for helpful suggestions. And, of course, thank you to everyone at RStudio (especially JJ Allaire) for creating an increasingly useful program for reproducible research. The second edition has benefited immensely from first edition readers' comments and suggestions. For a list of their valuable contributions, please see the book's GitHub Issues page and the first edition's Errata page . My students at Yonsei University were an important part of making the first edition. One of the reasons that I got interested in using many of the tools covered in this book, like using *knitr* in slideshows, was to improve a course I taught there: Introduction to Social Science Data Analysis. I tested many of the explanations and examples in this book on my students. Their feedback has been very helpful for making the book clearer and more useful. Their experience with using these tools on Microsoft Windows computers was also important for improving the book's Windows documentation. Similarly, my students at the Hertie School of Governance inspired and tested key sections of the second edition. The vibrant communities at Stack Overflow and Stack Exchange are always very helpful for finding answers to problems that plague any computational researcher. Importantly, the sites make it easy for others to find the answers to questions that have already been asked. The library at the University of California, San Francisco, was a great home for writing the third edition. Kristina Gandrud has been immensely supportive and patient with me throughout the writing of this book (and my entire career).
================================================
FILE: rep-res-3rd-edition/krantz.cls
================================================
%% This is file `Krantz.cls'
%%% Created by Shashi Kumar / ITC [August 2008]
%
% Class identification: requires LaTeX2e and registers the class as `krantz'.
\NeedsTeXFormat{LaTeX2e}[1995/12/01]
\ProvidesClass{krantz} [2005/09/16 v1.4f Standard LaTeX document class]
% \@ptsize starts empty; the 10pt/11pt/12pt options below redefine it
% to 0, 1 or 2.
\newcommand\@ptsize{}
% Class-wide switches.  Title pages and main matter are switched on here.
\newif\if@restonecol
\newif\if@titlepage
\@titlepagetrue
\newif\if@openright
\newif\if@mainmatter
\@mainmattertrue
% Paper-size options: each one sets \paperheight and \paperwidth.
% They are only declared when \if@compatibility is false.
\if@compatibility\else
\DeclareOption{a4paper}
   {\setlength\paperheight {297mm}%
    \setlength\paperwidth {210mm}}
\DeclareOption{a5paper}
   {\setlength\paperheight {210mm}%
    \setlength\paperwidth {148mm}}
\DeclareOption{b5paper}
   {\setlength\paperheight {250mm}%
    \setlength\paperwidth {176mm}}
\DeclareOption{letterpaper}
   {\setlength\paperheight {11in}%
    \setlength\paperwidth {8.5in}}
\DeclareOption{legalpaper}
   {\setlength\paperheight {14in}%
    \setlength\paperwidth {8.5in}}
\DeclareOption{executivepaper}
   {\setlength\paperheight {10.5in}%
    \setlength\paperwidth {7.25in}}
% landscape swaps the two paper dimensions, using \@tempdima as scratch.
\DeclareOption{landscape}
   {\setlength\@tempdima {\paperheight}%
    \setlength\paperheight {\paperwidth}%
    \setlength\paperwidth {\@tempdima}}
\fi
% Point-size options.  In compatibility mode \@ptsize is fixed at 0 and the
% 10pt option is not declared.
\if@compatibility
  \renewcommand\@ptsize{0}
\else
\DeclareOption{10pt}{\renewcommand\@ptsize{0}}
\fi
\DeclareOption{11pt}{\renewcommand\@ptsize{1}}
\DeclareOption{12pt}{\renewcommand\@ptsize{2}}
% One-sided / two-sided layout; both also set the margin-par switch.
\if@compatibility\else
\DeclareOption{oneside}{\@twosidefalse \@mparswitchfalse}
\fi
\DeclareOption{twoside}{\@twosidetrue \@mparswitchtrue}
% draft/final only differ in \overfullrule (5pt versus 0pt).
\DeclareOption{draft}{\setlength\overfullrule{5pt}}
\if@compatibility\else
\DeclareOption{final}{\setlength\overfullrule{0pt}}
\fi
% Title-page switch.
\DeclareOption{titlepage}{\@titlepagetrue}
\if@compatibility\else
\DeclareOption{notitlepage}{\@titlepagefalse}
\fi
% Open-right switch; compatibility mode forces it on and declares neither
% openright nor openany.
\if@compatibility
\@openrighttrue
\else
\DeclareOption{openright}{\@openrighttrue}
\DeclareOption{openany}{\@openrightfalse}
\fi
% Column layout.
\if@compatibility\else
\DeclareOption{onecolumn}{\@twocolumnfalse}
\fi
\DeclareOption{twocolumn}{\@twocolumntrue}
% Equation-layout options load the corresponding .clo option files.
\DeclareOption{leqno}{\input{leqno.clo}}
\DeclareOption{fleqn}{\input{fleqn.clo}}
% openbib: at end of package, redefine \@openbib@code (indents and zero
% \parsep) and make \newblock start a new paragraph.
\DeclareOption{openbib}{%
  \AtEndOfPackage{%
   \renewcommand\@openbib@code{%
      \advance\leftmargin\bibindent
      \itemindent -\bibindent
      \listparindent \itemindent
      \parsep \z@
      }%
   \renewcommand\newblock{\par}}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Class-specific switches.  Each option below does nothing except set the
% matching \if@... flag to true.
\newif\if@numbysec
\DeclareOption{numbysec}{\@numbysectrue}
\newif\if@numberinsequence
\DeclareOption{numberinsequence}{\@numberinsequencetrue}
\newif\if@nocaptionbreak
\DeclareOption{NoCaptionBreak}{\@nocaptionbreaktrue}
\newif\if@sevenbyten
\DeclareOption{sevenbyten}{\@sevenbytentrue}
\newif\if@cip
\DeclareOption{cip}{\@ciptrue}
\newif\if@times
\DeclareOption{times}{\@timestrue}
\newif\if@chapnumonly
\DeclareOption{chapnumonly}{\@chapnumonlytrue}
\newif\if@ChapterResetsPage
\DeclareOption{ChapterResetsPage}{\@ChapterResetsPagetrue}
\newif\if@ChapterTOCs
\DeclareOption{ChapterTOCs}{\@ChapterTOCstrue}
\newif\if@EOCRefs
\DeclareOption{EOCRefs}{\@EOCRefstrue}%
\newif\if@SuperscriptCites
\DeclareOption{SuperscriptCites}{\@SuperscriptCitestrue}%
\newif\if@UnnumberedReferences
\DeclareOption{UnnumberedReferences}{\@UnnumberedReferencestrue}%
\newif\if@pdf
\DeclareOption{pdf}{\@pdftrue}
% krantz1/krantz2 are declared before their \newif.  The option code is
% stored here and only run by \ProcessOptions below, after both switches
% exist.
\DeclareOption{krantz1}{\@krantzatrue}
\newif\if@krantza
\DeclareOption{krantz2}{\@krantzbtrue}
\newif\if@krantzb
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Defaults, then process the options the user actually passed.
\ExecuteOptions{letterpaper,10pt,twoside,onecolumn,final,openright}
\ProcessOptions
%%%%%%%%%%%%%%%%%%%
% Scale factor used in the s*[...] size function of every `helvetica'
% font shape declared below.
\def\helv@scale{.82}
%
% T1-encoded `helvetica' family, mapped onto the phv*8t fonts.
\DeclareFontFamily{T1}{helvetica}{}%
\DeclareFontShape{T1}{helvetica}{m}{n}{<->s*[\helv@scale]phvr8t}{}%
\DeclareFontShape{T1}{helvetica}{m}{it}{<->s*[\helv@scale]phvro8t}{}%
\DeclareFontShape{T1}{helvetica}{m}{sc}{<->s*[\helv@scale]phvrc8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{n}{<->s*[\helv@scale]phvb8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{it}{<->s*[\helv@scale]phvbo8t}{}%
% Remaining T1-encoded `helvetica' shapes (phv*8t fonts).  Slanted shapes
% reuse the oblique fonts; bx/sl is silently substituted by b/it.
\DeclareFontShape{T1}{helvetica}{m}{sl}{<->s*[\helv@scale]phvro8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{sc}{<->s*[\helv@scale]phvbc8t}{}%
\DeclareFontShape{T1}{helvetica}{b}{sl}{<->s*[\helv@scale]phvbo8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{n}{<->s*[\helv@scale]phvb8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{it}{<->s*[\helv@scale]phvbo8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{sc}{<->s*[\helv@scale]phvbc8t}{}%
\DeclareFontShape{T1}{helvetica}{bx}{sl}{<->ssub * helvetica/b/it}{}%
% OT1-encoded `helvetica' family.  Every shape must load an OT1-encoded
% (phv*7t) font.  The bold small-caps shapes (b/sc and bx/sc) previously
% loaded the T1-encoded phvbc8t, which puts glyphs in the wrong slots under
% OT1; they now use phvbc7t like the other OT1 shapes.
\DeclareFontFamily{OT1}{helvetica}{}%
\DeclareFontShape{OT1}{helvetica}{m}{n}{<->s*[\helv@scale]phvr7t}{}%
\DeclareFontShape{OT1}{helvetica}{m}{it}{<->s*[\helv@scale]phvro7t}{}%
\DeclareFontShape{OT1}{helvetica}{m}{sc}{<->s*[\helv@scale]phvrc7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{n}{<->s*[\helv@scale]phvb7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{it}{<->s*[\helv@scale]phvbo7t}{}%
\DeclareFontShape{OT1}{helvetica}{m}{sl}{<->s*[\helv@scale]phvro7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{sc}{<->s*[\helv@scale]phvbc7t}{}%
\DeclareFontShape{OT1}{helvetica}{b}{sl}{<->s*[\helv@scale]phvbo7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{n}{<->s*[\helv@scale]phvb7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{it}{<->s*[\helv@scale]phvbo7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{sc}{<->s*[\helv@scale]phvbc7t}{}%
\DeclareFontShape{OT1}{helvetica}{bx}{sl}{<->s*[\helv@scale]phvbo7t}{}%
%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%% Font Defined %%%%%%%%%%%%%%%%%
% Numeric size shorthands.
\def\@xipt{11}
\def\@xviiipt{18}
\def\@xxivpt{24}
% Named font switches: each selects a fixed size/leading (in pt) and shape.
% Those starting with \reset@font do not inherit the surrounding font.
\newcommand\ContributorAffiliationFont{\reset@font\fontsize{10}{12}\raggedright\selectfont}
\newcommand\ContributorNameFont{\reset@font\fontsize{10}{12}\bfseries\raggedright\selectfont}
\newcommand\TitlePageTitleFont{\fontsize{24}{28}\slshape\bfseries\selectfont}
\newcommand\PageNumFont{\reset@font\fontsize{10}{12}\selectfont}
\newcommand\ChapNumFont{\reset@font\fontsize{24}{24}\bfseries\selectfont}
\newcommand\ChapTitleFont{\reset@font\fontsize{18}{20}\slshape\selectfont}
% Named font switches for headings, running heads, captions, tables and the
% table of contents.  Each selects a fixed size/leading (in pt) and shape.
\newcommand\SectionHeadFont{\fontsize{12}{14}\bfseries\selectfont}
\newcommand\SubsectionHeadFont{\fontsize{11}{13}\bfseries\selectfont}
\newcommand\SubsubsectionHeadFont{\fontsize{10}{12}\bfseries\selectfont}
\newcommand\ParagraphHeadFont{\fontsize{10}{12}\itshape\selectfont}
\newcommand\SubParagraphHeadFont{\fontsize{10}{12}\itshape\selectfont}
\newcommand\FMHeadFont{\reset@font\fontsize{18}{20}\slshape\bfseries\selectfont}
\newcommand\RunningHeadFont{\fontsize{10}{12}\itshape\selectfont}
\newcommand\NameFont{\fontsize{10}{12}\itshape\selectfont}
\newcommand\AffiliationFont{\fontsize{8}{10}\selectfont}
\newcommand\FigCapFont{\fontsize{10}{12}\bfseries\selectfont}
\newcommand\FigCapBIFont{\fontsize{10}{12}\bfseries\itshape\selectfont}
\newcommand\TableColHeadFont{\fontsize{10}{12}\bfseries\selectfont}
\newcommand\TableTitleFont{\fontsize{10}{12}\selectfont}
\newcommand\TableNumberFont{\fontsize{11}{13}\bfseries\selectfont}
\newcommand\TableBodyFont{\reset@font\fontsize{9}{11}\selectfont}
\newcommand\TableSubheadFont{\reset@font\fontsize{9}{11}\selectfont}
\newcommand\TableFootnoteFont{\reset@font\fontsize{8}{10}\selectfont}
\newcommand\CAPlusOneFont{\fontsize{10}{12}\bfseries\selectfont}
\newcommand\CAAPlusOneFont{\fontsize{10}{12}\itshape\selectfont}
\newcommand\tocfont{\fontsize{10}{12}\selectfont}
\newcommand\extraFont{\fontsize{24}{28}\selectfont}
\newcommand\VfFont{\fontsize{10}{12}\selectfont}
%%%%%%%%%%%%%%%%%
% Load the size option file chosen by \@ptsize: bk10.clo, bk11.clo or
% bk12.clo.
\input{bk1\@ptsize.clo}
% Basic line and paragraph spacing.
\setlength\lineskip{1\p@}
\setlength\normallineskip{1\p@}
\renewcommand\baselinestretch{}
\setlength\parskip{0\p@ \@plus \p@}
% Page-break penalties; the three list-related penalties are all set to
% minus \@lowpenalty.
\@lowpenalty 51
\@medpenalty 151
\@highpenalty 301
\@beginparpenalty -\@lowpenalty
\@endparpenalty -\@lowpenalty
\@itempenalty -\@lowpenalty
%
\clubpenalty=0 % 'Club line' at bottom of page.
\widowpenalty=10000 % 'Widow line' at top of page.
% Float placement limits: maximum number of floats and maximum page
% fractions for the top area, bottom area and float pages.
\setcounter{topnumber}{2}
\renewcommand\topfraction{.7}
\setcounter{bottomnumber}{1}
\renewcommand\bottomfraction{.3}
\setcounter{totalnumber}{3}
\renewcommand\textfraction{.2}
\renewcommand\floatpagefraction{.5}
\setcounter{dbltopnumber}{2}
\renewcommand\dbltopfraction{.7}
\renewcommand\dblfloatpagefraction{.5}
% ****************************************
% * PAGE LAYOUT *
% ****************************************
%
% All margin dimensions measured from a point one inch from top and side
% of page.
%
% SIDE MARGINS:
%
% NOTE: \oddsidemargin, \evensidemargin and \topmargin are set again in
% every branch of the \if@krantza/\if@krantzb conditional below.
\oddsidemargin 6pc %5pc
\evensidemargin 5.7pc %5pc
\marginparwidth 4pc
\marginparsep 1pc
\topmargin 12pt %0pt
\headheight 12pt
\headsep 12pt
\footskip 2pc
%
% DIMENSION OF TEXT:
\newdimen\trimheight
\newdimen\trimwidth
\newdimen\normaltextheight
\newdimen\tempa
\newdimen\tempdimen
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Parameter Initialization %%%%%%%%%%%%%%%%%%%%%%%%%%
%
% \htrim, \vtrimtop and \vtrimbot are the offsets \top@cornermarks uses to
% place the trim marks relative to the text block.
\newdimen\htrim
\newdimen\vtrimtop
\newdimen\vtrimbot
\setlength\trimheight{9in}
\setlength\trimwidth{6in}
%
%
% Text block geometry.  Three cases: option krantz1, option krantz2, and the
% default (which uses the same values as krantz1).
\if@krantza
\textheight = 45pc
%\advance\textheight by \topskip
\addtolength\textheight{3pt}
\textwidth 28pc
\addtolength\textwidth{.5pt}
\topmargin0in
\oddsidemargin1.1875in
\evensidemargin1.1875in
\htrim.7365in
\vtrimtop1.068in
\vtrimbot1.068in
\hoffset-15pt
\voffset39pt
\let\normaltextheight\textheight
\else\if@krantzb
\textheight = 51pc
% \advance\textheight by \topskip
\textwidth 33pc
\topmargin0in
\oddsidemargin.5in
\evensidemargin.5in
\htrim.75in
\vtrimtop.8607in
\vtrimbot1.027in
\hoffset-.1in
\voffset-.15in%.04in
\let\normaltextheight\textheight
\else
%%%Uncomment to get 6x9 trim
%%%%\textheight = 43pc
%%%% %\advance\textheight by \topskip
%%%%\addtolength\textheight{3pt}
%%%% \textwidth 26pc
%%%%\addtolength\textwidth{.5pt}
%%%% \topmargin0in
%%%% \oddsidemargin1.1875in
%%%% \evensidemargin1.1875in
%%%% \htrim5.05pc
%%%% \vtrimtop7.7pc
%%%% \vtrimbot5.44pc
%%%%% \hoffset-5pt
%%%% \voffset45pt
%%%%\let\normaltextheight\textheight
\textheight = 45pc
%\advance\textheight by \topskip
\addtolength\textheight{3pt}
\textwidth 28pc
\addtolength\textwidth{.5pt}
\topmargin0in
\oddsidemargin1.1875in
\evensidemargin1.1875in
\htrim.7365in
\vtrimtop1.068in
\vtrimbot1.068in
\hoffset-15pt
\voffset39pt
\let\normaltextheight\textheight
\fi
\fi
%
\columnsep 1pc
\columnseprule 0pt
%
% FOOTNOTES
%
\footnotesep 6.65pt
\skip\footins 12pt plus 3pt minus 1.5pt
%
%%%% Trim marks %%%%%%%%%%%
% Four boxes hold the upper-left, upper-right, lower-left and lower-right
% corner marks.
\newsavebox\ul@box
\newsavebox\ur@box
\newsavebox\ll@box
\newsavebox\lr@box
% \top@cornermarks: places copies of the four corner boxes in zero-height
% vboxes, offset by \htrim/\vtrimtop/\vtrimbot around the text block, with
% no net horizontal movement (\hskip-\htrim ... \hskip\htrim).
\def\top@cornermarks{%
  \hskip-\htrim
  \vbox to 0\p@{\vskip-\vtrimtop\llap{\copy\ul@box}\vss}%
  \vbox to 0\p@{\vskip-\vtrimtop\rlap{\hskip\textwidth\hskip2\htrim\copy\ur@box}\vss}%
  \vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\llap{\copy\ll@box}\vss}%
  \vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\rlap{\hskip\textwidth\hskip2\htrim\copy\lr@box}\vss}%
  \hskip\htrim}
% \make@cornermarks: fills the four boxes with one horizontal and one
% vertical hairline rule each (18pt long, .25pt thick).
\def\make@cornermarks{%
  \sbox\ul@box{\rule{18\p@}{.25\p@}\hskip8\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}%
  \sbox\ur@box{\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}%
  \sbox\ll@box{\rule{18\p@}{.25\p@}\hskip8\p@\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}%
  \sbox\lr@box{\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}}
%%%%%%%%%%%%%%%%%%%% End Trim Marks %%%%%%%%%%%%
% Page style `plain': corner marks in the odd head, empty even head,
% centred page number in both feet.
\def\ps@plain{\let\@mkboth\@gobbletwo
  \let\@oddhead\top@cornermarks%\@empty
  \def\@oddfoot{\reset@font\hfil\thepage
  \hfil}\let\@evenhead\@empty\let\@evenfoot\@oddfoot}
% Even-page running head: page number at the left, then (in main matter)
% the chapter number followed by \leftmark at the right.
\def\even@head{%
  \top@cornermarks
  {\@the@page\RunningHeadFont \hfill \if@mainmatter\thechapter\enspace\fi\leftmark }}
% Odd-page running head: (in main matter) the section number followed by
% \rightmark, then the page number at the right.
\def\odd@head{%
  \top@cornermarks
  \hfil{\RunningHeadFont \if@mainmatter\thesection\enspace\else\fi\rightmark }
  \hfill \@the@page
  }
% Page number set in \PageNumFont.
\def\@the@page{{\PageNumFont\thepage}}
% Page style `headings'.  With the pdf option: empty heads and \@cip in the
% feet.  Otherwise: running heads from \even@head/\odd@head and empty feet.
% The two-sided and one-sided definitions have the same body.
\if@twoside
\def\ps@headings{%
  \let\@mkboth\@gobbletwo
  \if@pdf
    \let\@evenhead\@empty
    \let\@oddhead\@empty
    \def\@oddfoot{\@cip\hfil}%
    \def\@evenfoot{\@cip\hfil}%
  \else
    \let\@oddfoot\@empty
    \let\@evenfoot\@empty
    \let\@evenhead\even@head
    \let\@oddhead\odd@head
  \fi
  }
\else
\def\ps@headings{\let\@mkboth\@gobbletwo%
  \if@pdf
    \let\@evenhead\@empty
    \let\@oddhead\@empty
    \def\@oddfoot{\@cip\hfil}%
    \def\@evenfoot{\@cip\hfil}%
  \else
    \let\@oddfoot\@empty
    \let\@evenfoot\@empty
    \let\@evenhead\even@head
    \let\@oddhead\odd@head
  \fi
  }
\fi
% Page style `myheadings': slanted marks with page numbers in the heads,
% empty feet; \chaptermark and \sectionmark are disabled.
\def\ps@myheadings{%
  \let\@oddfoot\@empty\let\@evenfoot\@empty
  \def\@evenhead{\thepage\hfil\slshape\leftmark}%
  \def\@oddhead{{\slshape\rightmark}\hfil\thepage}%
  \let\@mkboth\@gobbletwo
  \let\chaptermark\@gobble
  \let\sectionmark\@gobble
  }
% Page style `empty': with the pdf option only \@cip in the feet; otherwise
% the corner-mark boxes are (re)built and shown in both heads, empty feet.
\def\ps@empty{%
  \let\@mkboth\@gobbletwo
  \if@pdf
    \let\@evenhead\@empty
    \let\@oddhead\@empty
    \def\@oddfoot{\@cip\hfil}%
    \def\@evenfoot{\@cip\hfil}%
  \else
    \make@cornermarks
    \let\@oddhead\top@cornermarks
    \let\@evenhead\top@cornermarks
    \let\@oddfoot\@empty
    \let\@evenfoot\@empty
  \fi
  }
% Page style `folio': with the pdf option only \@cip in the feet; otherwise
% corner marks in the heads and a foot holding the two CIP boxes under a
% hairline rule (only when \if@cip is true) plus the page number at the
% right.
\def\ps@folio{%
  \let\@mkboth\@gobbletwo
  \if@pdf
    \let\@evenhead\@empty
    \let\@oddhead\@empty
    \def\@oddfoot{\@cip\hfil}%
    \def\@evenfoot{\@cip\hfil}%
  \else
    \let\@oddhead\top@cornermarks
    \def\@oddfoot{%
      \parindent\z@
      \baselineskip7\p@
      \hbox{%
        \textwidth\@ciprulewidth
        \vbox{%
          \if@cip\rule{\@ciprulewidth}{.25pt}\par
          \hbox{\vbox{\noindent\copy\@cipboxa\par\noindent\copy\@cipboxb}}\fi}}
      \hfill\@the@page}
    \let\@evenhead\top@cornermarks%\odd@head
    \let\@evenfoot\@oddfoot
  \fi
  }
% Running-head schemes.  \HeadingsBookChapter: book title (\@title) as the
% left mark and chapter title as the right mark; sections set no mark.
\newcommand\HeadingsBookChapter{%
  \def\chaptermark##1{%
    \markboth{\@title}{%
      ##1}}%
  \def\sectionmark##1{}}
% \HeadingsChapterSection: chapter title as the left mark, section title as
% the right mark.
\def\HeadingsChapterSection{%
  \def\chaptermark##1{%
    \markboth{%
      ##1}{}}%
  \def\sectionmark##1{%
    \markright{%
      ##1}}}
% User-level toggles for the pdf switch.
\def\pdfon{\@pdftrue}
\def\pdfoff{\@pdffalse}
% \@cip.  With the pdf option it takes no arguments and typesets a fixed
% copyright line.  Otherwise it takes two arguments, stores them in
% \@cipboxa/\@cipboxb, and sets \@ciprulewidth to the wider box's width.
\if@pdf
  \def\@cip{{\fontsize{6\p@}{8\p@}\selectfont\copyright 2001 by CRC Press LLC}}
\else
  \newsavebox\@cipboxa
  \newsavebox\@cipboxb
  \newdimen\@ciprulewidth
  \def\@cip#1#2{%
    \sbox\@cipboxa{\fontsize{6\p@}{8\p@}\selectfont #1}%
    \sbox\@cipboxb{\fontsize{6\p@}{8\p@}\selectfont #2}%
    \@ciprulewidth\wd\@cipboxa
    \ifnum\@ciprulewidth<\wd\@cipboxb\@ciprulewidth\wd\@cipboxb\fi}%
\fi
% Without the pdf option, fill the two CIP boxes at \begin{document}.
\if@pdf
\else
  \AtBeginDocument{%
    \@cip{\rule{0pt}{9pt}0-8493-0052-5/00/\$0.00+\$.50}%
         {\copyright\ \ 2001 by CRC Press LLC}}%
\fi
% \maketitle, title-page variant: author (uppercased), affiliation, a rule
% (\crcrule) and the title inside a titlepage environment.  Afterwards the
% title commands are disabled, but \@title is deliberately kept (its reset
% is commented out); \HeadingsBookChapter above reads \@title.
% NOTE(review): \@affiliation and \crcrule are not defined in this part of
% the file; presumably they are defined elsewhere in the class.
\if@titlepage
  \newcommand\maketitle{\begin{titlepage}%
  \let\footnotesize\small
  \let\footnoterule\relax
  \let \footnote \thanks
  {\parindent \z@
  \raggedright
  \baselineskip \z@
  \lineskip \z@
  \parskip \z@
  \vbox{
  \vskip -7bp
  {\baselineskip 10bp\lineskip 10bp\NameFont\uppercase{\@author}\par}
  \vskip 6bp
  \AffiliationFont \@affiliation
  \vskip -2bp
  \crcrule
  \vskip 22bp
  {\baselineskip 24bp\lineskip 24bp\TitlePageTitleFont\@title\par}}}
  \@thanks
  \vfil\null
  \end{titlepage}%
  \setcounter{footnote}{0}%
  \global\let\thanks\relax
  \global\let\maketitle\relax
  \global\let\@thanks\@empty
  \global\let\@author\@empty
  \global\let\@date\@empty
%  \global\let\@title\@empty
  \global\let\title\relax
  \global\let\author\relax
  \global\let\date\relax
  \global\let\and\relax
}
\else
% \maketitle, no-title-page variant: footnote marks use symbols, the title
% block comes from \@maketitle (spanning both columns in two-column mode),
% the page style is `empty', and all title data including \@title is
% cleared afterwards.
\newcommand\maketitle{\par
  \begingroup
    \renewcommand\thefootnote{\@fnsymbol\c@footnote}%
    \def\@makefnmark{\rlap{\@textsuperscript{\normalfont\@thefnmark}}}%
    \long\def\@makefntext##1{\parindent 1em\noindent
            \hb@xt@1.8em{%
                \hss\@textsuperscript{\normalfont\@thefnmark}}##1}%
    \if@twocolumn
      \ifnum \col@number=\@ne
        \@maketitle
      \else
        \twocolumn[\@maketitle]%
      \fi
    \else
      \newpage
      \global\@topnum\z@ % Prevents figures from going at top of page.
      \@maketitle
    \fi
    \thispagestyle{empty}\@thanks
  \endgroup
  \setcounter{footnote}{0}%
  \global\let\thanks\relax
  \global\let\maketitle\relax
  \global\let\@maketitle\relax
  \global\let\@thanks\@empty
  \global\let\@author\@empty
  \global\let\@date\@empty
  \global\let\@title\@empty
  \global\let\title\relax
  \global\let\author\relax
  \global\let\date\relax
  \global\let\and\relax
}
% \@maketitle: the title block itself (author, affiliation, rule, title).
\def\@maketitle{%
  \newpage
  \null
  \vskip 2em%
  {\parindent \z@
  \raggedright
  \baselineskip \z@
  \lineskip \z@
  \parskip \z@
  \vbox{
  \vskip -7bp
  {\baselineskip 10bp\lineskip 10bp\NameFont\uppercase{\@author}\par}
  \vskip 6bp
  \AffiliationFont \@affiliation
  \vskip 10bp
  \crcrule
  \vskip 26bp
  {\baselineskip 24bp\lineskip 24bp\TitlePageTitleFont\@title\par}}}
  \par
  \vskip 1.5em}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Sectioning counters.  \chaptermark defaults to doing nothing; sections
% are numbered down to level 3 (subsubsection).
\newcommand*\chaptermark[1]{}
\setcounter{secnumdepth}{3}
\newcounter {part}
\newcounter {chapter}
\newcounter {section}[chapter]
\newcounter {subsection}[section]
\newcounter {subsubsection}[subsection]
\newcounter {paragraph}[subsubsection]
\newcounter {subparagraph}[paragraph]
% Printed forms: parts in upper-case roman, everything else arabic and
% prefixed by the parent number.
\renewcommand \thepart {\@Roman\c@part}
\renewcommand \thechapter {\@arabic\c@chapter}
\renewcommand \thesection {\thechapter.\@arabic\c@section}
\renewcommand\thesubsection {\thesection.\@arabic\c@subsection}
\renewcommand\thesubsubsection{\thesubsection .\@arabic\c@subsubsection}
\renewcommand\theparagraph {\thesubsubsection.\@arabic\c@paragraph}
\renewcommand\thesubparagraph {\theparagraph.\@arabic\c@subparagraph}
\newcommand\@chapapp{\chaptername}
% Front matter: new right-hand page, main-matter flag off, roman page
% numbers.
\newcommand\frontmatter{%
    \cleardoublepage
  \@mainmatterfalse
  \pagenumbering{roman}}
% Main matter: new right-hand page, main-matter flag on, arabic page
% numbers.
\newcommand\mainmatter{%
    \cleardoublepage
  \@mainmattertrue
  \pagenumbering{arabic}}
% Back matter: page break (to a right-hand page if openright), main-matter
% flag off; page numbering is left unchanged.
\newcommand\backmatter{%
  \if@openright
    \cleardoublepage
  \else
    \clearpage
  \fi
  \@mainmatterfalse}
% \part: rebuilds the corner marks, starts a new page with the `empty' page
% style, switches to one column (recording in \if@tempswa whether two-column
% mode was active), then dispatches to \@part or \@spart via \secdef.
\newcommand\part{\make@cornermarks%
  \if@openright
    \cleardoublepage
  \else
    \clearpage
  \fi
  \thispagestyle{empty}%
  \if@twocolumn
    \onecolumn
    \@tempswatrue
  \else
    \@tempswafalse
  \fi
  \null\vfil
  \secdef\@part\@spart}
% ---------------------------------------------------------------------
% Sectioning: \part internals, \chapter, \section ... \subparagraph,
% followed by the list margins and the enumerate labels.
% ---------------------------------------------------------------------
%
% \@part[<toc title>]{<title>}: numbered part heading.  When secnumdepth
% is greater than -2 the part counter is stepped and the TOC entry carries
% the part number; otherwise only the title is written to the TOC.
\def\@part[#1]#2{%
    \ifnum \c@secnumdepth >-2\relax
      \refstepcounter{part}%
      \addcontentsline{toc}{part}{\thepart\hspace{1em}#1}%
    \else
      \addcontentsline{toc}{part}{#1}%
    \fi
    \markboth{}{}%
    {\centering
     \interlinepenalty \@M
     \normalfont
     \ifnum \c@secnumdepth >-2\relax
       \huge\bfseries \partname\nobreakspace\thepart
       \par
       \vskip 20\p@
     \fi
     \Huge \bfseries #2\par}%
    \@endpart}
% \@spart{<title>}: unnumbered part heading; writes no TOC entry.
\def\@spart#1{%
    {\centering
     \interlinepenalty \@M
     \normalfont
     \Huge \bfseries #1\par}%
    \@endpart}
% \@endpart: end the part page.  In two-sided, open-right layouts an extra
% empty page (page style `empty') follows.  Two-column mode is switched
% back on when \if@tempswa is true.
\def\@endpart{\vfil\newpage
              \if@twoside
               \if@openright
                \null
                \thispagestyle{empty}%
                \newpage
               \fi
              \fi
              \if@tempswa
                \twocolumn
              \fi}
% Per-chapter TOC support: the output stream and the indent macros are
% only created when the ChapterTOCs switch is true.
\if@ChapterTOCs
  \newwrite\@chaptoc
  \def\secnumwidth{21pt}\def\subsecnumwidth{30pt}\def\ssubsecnumwidth{36pt}\def\subsubsecnumwidth{66pt}\fi
% \@trplarg{<cmd>}: argument parser that always calls <cmd>[a][b]{c}.
%   <cmd>{c}        -> <cmd>[c][c]{c}
%   <cmd>[a]{c}     -> <cmd>[a][a]{c}
%   <cmd>[a][b]{c}  -> passed through unchanged
\long\def\@trplarg#1{\@ifnextchar[{\@xtrplarg{#1}}{\@ztrplarg{#1}}}
\long\def\@xtrplarg#1[#2]{\@ifnextchar[{#1[#2]}{\@ytrplarg{#1}[{#2}]}}
\long\def\@ytrplarg#1[#2]#3{#1[{#2}][{#2}]{#3}}
\long\def\@ztrplarg#1#2{#1[{#2}][{#2}]{#2}}
% \chapter: start a new page, close the previous chapter's TOC file (only
% with ChapterTOCs), select the page styles, optionally reset the page
% number, and dispatch to \@chapter / \@schapter through \secdef.
% NOTE(review): the second \cleardoublepage below is unconditional, so it
% is issued even when \if@openright is false -- confirm this is intended.
\newcommand\chapter{\if@openright\cleardoublepage\else\clearpage\fi
                    \make@cornermarks
                    \cleardoublepage
                    \if@ChapterTOCs\if@filesw\immediate\closeout\@chaptoc\fi\fi
                    \pagestyle{headings}%
                    \thispagestyle{folio}%
                    \if@ChapterResetsPage\global\c@page\@ne\fi
                    \global\@topnum\z@
                    \gdef\chapterauthor{\@ca}%
                    \gdef\endchapterauthors{\end@cas}%
                    \@afterindentfalse
                    \secdef\@chapter\@schapter
%%%                    \@ifstar{\@schapter}{\@trplarg{\@chapter}}
}
% \@chapter[<toc title>]{<title>}: numbered chapter.  The counter is only
% stepped in the main matter with secnumdepth > -1; in every other case an
% unnumbered TOC entry is written.  With ChapterTOCs the per-chapter TOC
% file <thechapter>.toc is opened for writing at the end.
\def\@chapter[#1]#2{%
  \ifnum\c@secnumdepth>\m@ne
    \if@mainmatter
      \refstepcounter{chapter}%
      \typeout{\@chapapp\space\thechapter.}%
      \addcontentsline{toc}{chapter}{\protect\numberline{\thechapter}#1}%
    \else
      \addcontentsline{toc}{chapter}{#1}\fi
  \else
    \addcontentsline{toc}{chapter}{#1}\fi
  \chaptermark{%
#2}%
  \addtocontents{lof}{\protect\addvspace{10\p@}}%
  \addtocontents{lot}{\protect\addvspace{10\p@}}%
  \if@twocolumn
    \@topnewpage[\@makechapterhead{#2}]%
  \else
    \@makechapterhead{#2}%
    \@afterheading\fi
  \if@ChapterTOCs\if@filesw\immediate\openout\@chaptoc\thechapter.toc\fi\fi
}
% \@makechapterhead{<title>}: typeset chapter number, rule, title and the
% short 84pt rule; with ChapterTOCs the chapter TOC follows.  It also
% redefines \theequation as <chapter>.<equation>.
\def\@makechapterhead#1{%
  {\parindent \z@ \raggedright
   \baselineskip \z@ \lineskip \z@ \parskip \z@
   \vbox{
   \vskip -2\p@
   \ChapNumFont
   %Remove comment if "Chapter" word required before Number
   %\if@chapnumonly\else
   %  \@chapapp\ 
   %\fi
   \thechapter
   \vskip -15\p@
   \chap@rule
   \vskip 6\p@
   {\baselineskip 20\p@\lineskip 20\p@\ChapTitleFont #1\par\vskip-15pt}%
   \noindent\hbox{\vrule height.5pt width84pt}
   \vskip28\p@}
   \if@ChapterTOCs
     \make@chaptoc
   \else
   \fi
   \vskip 19.3\p@}
   \def\theequation{\thechapter.\arabic{equation}}}%
% \@schapter{<title>}: starred chapter.  In one-column mode it also adds a
% TOC entry of type `fm' and sets both running heads to the title.
\def\@schapter#1{\if@twocolumn
                   \@topnewpage[\@makeschapterhead{#1}]%
                 \else
                   \@makeschapterhead{#1}%
                   \addcontentsline{toc}{fm}{#1}
                   \markboth{#1}{#1}
                   \@afterheading
                 \fi}
% \@makeschapterhead{<title>}: heading of an unnumbered chapter.
\def\@makeschapterhead#1{%
  {\parindent \z@ \raggedright
   \baselineskip 6\p@ \lineskip \z@ \parskip \z@
   \vbox{
   \vskip 22\p@
   \unnumchap@rule
   \vskip 5\p@
   \FMHeadFont #1\par\vskip-12pt
   \noindent\hbox{\vrule height.5pt width84pt}
   \vskip 41\p@}}%
   \def\theequation{\thechapter.\arabic{equation}}}
% Disabled code: a variant of \@startsection/\@ssect/\@sect that took the
% three-argument form produced by \@trplarg.  Kept for reference only.
%%%\def\@startsection#1#2#3#4#5#6{%
%%%  \if@noskipsec\leavevmode\fi
%%%  \par
%%%  \@tempskipa #4\relax
%%%  \@afterindenttrue
%%%  \ifdim \@tempskipa <\z@
%%%    \@tempskipa -\@tempskipa \@afterindentfalse
%%%  \fi
%%%  \if@nobreak
%%%    \everypar{}%
%%%  \else
%%%    \addpenalty\@secpenalty\addvspace\@tempskipa
%%%  \fi
%%%  \@ifstar
%%%    {\@ssect{#1}{#3}{#4}{#5}{#6}}%
%%%    {\@trplarg{\@sect{#1}{#2}{#3}{#4}{#5}{#6}}}}
%%%\def\@ssect#1#2#3#4#5#6{%
%%%  \@tempskipa #4\relax
%%%  \ifdim \@tempskipa>\z@
%%%    \begingroup
%%%      #5{%
%%%        \@hangfrom{\hskip #2}%
%%%          \interlinepenalty \@M #6\@@par}%
%%%    \endgroup
%%%    \csname #1mark\endcsname{#6}%
%%%  \else
%%%    \def\@svsechd{#5{\hskip #2\relax #6}\csname #1mark\endcsname{#6}}%
%%%  \fi
%%%  \@xsect{#4}}
%%%\def\@sect#1#2#3#4#5#6[#7][#8]#9{%
%%%  \ifnum #2>\c@secnumdepth
%%%    \let\@svsec\@empty
%%%  \else
%%%    \refstepcounter{#1}%
%%%\protected@edef\@svsec{\@seccntformat{#1}\relax}%
%%%  \fi
%%%  \@tempskipa #5\relax
%%%  \ifdim \@tempskipa>\z@
%%%    \begingroup
%%%      #6{%
%%%\@hangfrom{\hskip #3\relax\@svsec}\interlinepenalty \@M %
%%%          #9\@@par}%
%%%    \endgroup
%%%    \csname #1mark\endcsname{%
%%%      #8}%
%%%    \addcontentsline{toc}{#1}{%
%%%      \ifnum #2>\c@secnumdepth \else
%%%        \protect\numberline{\csname the#1\endcsname}%
%%%      \fi
%%%      #7}%
%%%  \else
%%%    \def\@svsechd{%
%%%      #6{\hskip #3\relax
%%%      \@svsec #9}%
%%%      \csname #1mark\endcsname{%
%%%        #8}%
%%%      \addcontentsline{toc}{#1}{%
%%%        \ifnum #2>\c@secnumdepth \else
%%%          \protect\numberline{\csname the#1\endcsname}%
%%%        \fi
%%%        #7}}%
%%%  \fi
%%%  \@xsect{#5}}
%%Change mydotted also
% Widths of the number boxes in the per-chapter TOC entries.
% NOTE(review): the three \newdimen allocations are shadowed right away by
% the \def's, so \secwd, \subsecwd and \subsubsecwd end up as macros that
% expand to a length, not as dimen registers.
\newdimen\secwd
\newdimen\subsecwd
\newdimen\subsubsecwd
\def\secwd{31pt}
\def\subsecwd{36pt}
\def\subsubsecwd{46pt}
% Number boxes (with hanging indentation) for the three chapter-TOC levels.
\def\ssubnumberline#1{\@hangfrom{\hbox to \secwd{#1\hfill}}}
\def\subnumberline#1{\@hangfrom{\hskip\subsecnumwidth\hbox to \subsecwd{#1\hfill}}}
\def\subsubnumberline#1{\@hangfrom{\hskip\subsubsecnumwidth\hbox to \subsubsecwd{#1\hfill}}}
% \section: after the first section, \chapterauthor and \endchapterauthors
% are globally switched to their "plus one" variants.
\newcommand\section{%
  \gdef\chapterauthor{\@caplusone}%
  \gdef\endchapterauthors{\end@casplusone}%
  \@ifstar{\@ssection}{\@trplarg{\@section}}}
% Starred section: unnumbered chapter-TOC line (ChapterTOCs only), then the
% starred form of \@startsection.
\def\@ssection#1{%
  \if@ChapterTOCs
    \myaddcontentsline{\@chaptoc}{chapsection}{\string\makebox[\secnumwidth][l]{}#1}\fi
  \@startsection{section}{1}{\z@}{-30\p@}{6\p@}{\sec@rule\nopagebreak\vskip9.5\p@\nopagebreak\SectionHeadFont}*{#1}}
% \@section[#1][#2]{#3}: #1 goes to the chapter TOC, #2 is handed to
% \@startsection as its optional argument, #3 is the heading.  The counter
% is bumped and restored around the write, presumably so that \thesection
% already shows the number the section is about to receive.
\def\@section[#1][#2]#3{%
  \if@ChapterTOCs
    \addtocounter{section}{1}%
    \myaddcontentsline{\@chaptoc}{chapsection}{\protect\ssubnumberline{\thesection}#1}%
    \addtocounter{section}{-1}\fi
  \@startsection{section}{1}{\z@}{-30\p@}{6\p@}{\sec@rule\nopagebreak\vskip9.5\p@\nopagebreak\SectionHeadFont}[#2]{#3}}
% \sectionauthor{<name>}: author name pushed to the right with \hfill.
\def\sectionauthor#1{\hfill{\ChapTOCAuthorFont #1}}
% \subsection and \subsubsection follow the same pattern as \section.
\newcommand\subsection{\@ifstar{\@ssubsection}{\@trplarg{\@subsection}}}
\def\@ssubsection#1{%
  \if@ChapterTOCs
    \myaddcontentsline{\@chaptoc}{chapsubsection}{\string\makebox[\subsecnumwidth][l]{}#1}\fi
  \@startsection{subsection}{2}{\z@}{-18\p@}{6\p@}{%
    \SubsectionHeadFont}*{#1}}
\def\@subsection[#1][#2]#3{%
  \if@ChapterTOCs
    \addtocounter{subsection}{1}%
    \myaddcontentsline{\@chaptoc}{chapsubsection}{\protect\subnumberline{\thesubsection}#1}%
    \addtocounter{subsection}{-1}\fi
  \@startsection{subsection}{2}{\z@}{-18\p@}{6\p@}{%
    \SubsectionHeadFont}[#2]{#3}}
\newcommand\subsubsection{\@ifstar{\@ssubsubsection}{\@trplarg{\@subsubsection}}}
% NOTE(review): the starred sub-subsection reserves \subsecnumwidth (the
% subsection value), not \subsubsecnumwidth -- confirm this is intended.
\def\@ssubsubsection#1{%
  \if@ChapterTOCs
    \myaddcontentsline{\@chaptoc}{chapsubsubsection}{\string\makebox[\subsecnumwidth][l]{}#1}\fi
  \@startsection{subsubsection}{3}{\z@}{-12\p@}{6\p@}{%
    \SubsubsectionHeadFont}*{#1}}
\def\@subsubsection[#1][#2]#3{%
  \if@ChapterTOCs
    \addtocounter{subsubsection}{1}%
    \myaddcontentsline{\@chaptoc}{chapsubsubsection}{\protect\subsubnumberline{\thesubsubsection}#1}%
    \addtocounter{subsubsection}{-1}\fi
  \@startsection{subsubsection}{3}{\z@}{-12\p@}{6\p@}{%
    \SubsubsectionHeadFont}[#2]{#3}}
% The two lowest levels call \@startsection directly.
\newcommand\paragraph{\@startsection{paragraph}{4}{\z@}%
  {-12\p@}{6\p@}{\ParagraphHeadFont}}
\newcommand\subparagraph{\@startsection{subparagraph}{5}{\parindent}%
  {-12\p@}{6\p@}{\SubParagraphHeadFont}}
% List indentation per nesting level (narrower in two-column mode).
\if@twocolumn
  \setlength\leftmargini {2em}
\else
  \setlength\leftmargini {2.5em}
\fi
\leftmargin \leftmargini
\setlength\leftmarginii {2.2em}
\setlength\leftmarginiii {1.87em}
\setlength\leftmarginiv {1.7em}
\if@twocolumn
  \setlength\leftmarginv {.5em}
  \setlength\leftmarginvi {.5em}
\else
  \setlength\leftmarginv {1em}
  \setlength\leftmarginvi {1em}
\fi
\setlength \labelsep {.5em}
\setlength \labelwidth{\leftmargini}
\addtolength\labelwidth{-\labelsep}
% Penalties before/after lists and between items.
\@beginparpenalty -\@lowpenalty
\@endparpenalty -\@lowpenalty
\@itempenalty -\@lowpenalty
% enumerate: 1. / (a) / i. / A. and the prefixes used in cross-references.
\renewcommand\theenumi{\@arabic\c@enumi}
\renewcommand\theenumii{\@alph\c@enumii}
\renewcommand\theenumiii{\@roman\c@enumiii}
\renewcommand\theenumiv{\@Alph\c@enumiv}
\newcommand\labelenumi{\theenumi.}
\newcommand\labelenumii{(\theenumii)}
\newcommand\labelenumiii{\theenumiii.}
\newcommand\labelenumiv{\theenumiv.}
\renewcommand\p@enumii{\theenumi}
\renewcommand\p@enumiii{\theenumi(\theenumii)}
\renewcommand\p@enumiv{\p@enumiii\theenumiii}
% Labels of the four itemize levels.
\newcommand\labelitemi{\textbullet}
\newcommand\labelitemii{\normalfont\bfseries \textendash}
\newcommand\labelitemiii{\textasteriskcentered}
\newcommand\labelitemiv{\textperiodcentered}
% description: list with zero label width whose items are outdented by
% \leftmargin; labels are formatted by \descriptionlabel (bold, preceded
% by \labelsep).
\newenvironment{description}
               {\list{}{\labelwidth\z@ \itemindent-\leftmargin
                        \let\makelabel\descriptionlabel}}
               {\endlist}
\newcommand*\descriptionlabel[1]{\hspace\labelsep
                                \normalfont\bfseries #1}
% verse: \\ is mapped to \@centercr; lines are outdented by 1.5em inside a
% left margin enlarged by the same amount.
\newenvironment{verse}
               {\let\\\@centercr
                \list{}{\itemsep \z@
                        \itemindent -1.5em%
                        \listparindent\itemindent
                        \rightmargin \leftmargin
                        \advance\leftmargin 1.5em}%
                \item\relax}
               {\endlist}
% quotation: symmetric margins, paragraphs indented by 1.5em.
\newenvironment{quotation}
               {\list{}{\listparindent 1.5em%
                        \itemindent \listparindent
                        \rightmargin \leftmargin
                        \parsep \z@ \@plus\p@}%
                \item\relax}
               {\endlist}
% quote: symmetric margins, no paragraph indentation set here.
\newenvironment{quote}
               {\list{}{\rightmargin\leftmargin}%
                \item\relax}
               {\endlist}
% titlepage: one-column page with page style `empty'.  The compatibility
% variant numbers it page 0; the native variant numbers it page 1 and, in
% one-sided documents, resets the page counter to 1 again afterwards.
\if@compatibility
\newenvironment{titlepage}
    {%
      \cleardoublepage
      \if@twocolumn
        \@restonecoltrue\onecolumn
      \else
        \@restonecolfalse\newpage
      \fi
      \thispagestyle{empty}%
      \setcounter{page}\z@
    }%
    {\if@restonecol\twocolumn \else \newpage \fi
    }
\else
\newenvironment{titlepage}
    {%
      \cleardoublepage
      \if@twocolumn
        \@restonecoltrue\onecolumn
      \else
        \@restonecolfalse\newpage
      \fi
      \thispagestyle{empty}%
      \setcounter{page}\@ne
    }%
    {\if@restonecol\twocolumn \else \newpage \fi
     \if@twoside\else
        \setcounter{page}\@ne
     \fi
    }
\fi
% \appendix: restart chapter/section numbering and switch the chapter
% label to \appendixname with upper-case letters.
\newcommand\appendix{\par
  \setcounter{chapter}{0}%
  \setcounter{section}{0}%
  \gdef\@chapapp{\appendixname}%
  \gdef\thechapter{\@Alph\c@chapter}}
% Array, tabular, tabbing and framed-box parameters.
\setlength\arraycolsep{5\p@}
\setlength\tabcolsep{6\p@}
\setlength\arrayrulewidth{.4\p@}
\setlength\doublerulesep{2\p@}
\setlength\tabbingsep{\labelsep}
% Footnote separation inside minipages equals the page-level value.
\skip\@mpfootins = \skip\footins
\setlength\fboxsep{3\p@}
\setlength\fboxrule{.4\p@}
% Equations and figures are numbered per chapter; the chapter prefix is
% omitted while the chapter counter is still zero.
\@addtoreset {equation}{chapter}
\renewcommand\theequation
  {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@equation}
\newcounter{figure}[chapter]
\renewcommand \thefigure
     {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@figure}
% Float parameters for figures: default placement, float type number and
% the extension of the list-of-figures file.
\def\fps@figure{tbp}
\def\ftype@figure{1}
\def\ext@figure{lof}
% ---------------------------------------------------------------------
% Figure/table floats, captions, old font commands and the main table of
% contents with its deferred-drawing machinery.
% ---------------------------------------------------------------------
\def\fnum@figure{\figurename\nobreakspace\thefigure}
\newenvironment{figure}
               {\@float{figure}}
               {\end@float}
\newenvironment{figure*}
               {\@dblfloat{figure}}
               {\end@dblfloat}
% Tables: numbered per chapter, float type 2, list file extension `lot'.
\newcounter{table}[chapter]
\renewcommand \thetable
     {\ifnum \c@chapter>\z@ \thechapter.\fi \@arabic\c@table}
\def\fps@table{tbp}
\def\ftype@table{2}
\def\ext@table{lot}
\def\fnum@table{\tablename\nobreakspace\thetable}
\newenvironment{table}
               {\@float{table}}
               {\end@float}
\newenvironment{table*}
               {\@dblfloat{table}}
               {\end@dblfloat}
\newlength\abovecaptionskip
\newlength\belowcaptionskip
\setlength\abovecaptionskip{10\p@}
\setlength\belowcaptionskip{0\p@}
% \@makecaption{<label>}{<text>}: the label is set in \FigCapFont followed
% by the text.  Both branches typeset the caption identically; the
% measuring box only decides whether \@minipagefalse is set.  The centred
% one-line variant is commented out.
\long\def\@makecaption#1#2{%
  \vskip\abovecaptionskip
  \sbox\@tempboxa{#1: #2}%
  \ifdim \wd\@tempboxa >\hsize
    {\FigCapFont #1} #2\par
  \else
    \global \@minipagefalse
%    \hb@xt@\hsize{\hfil\box\@tempboxa\hfil}%
    {\FigCapFont #1} #2\par
  \fi
  \vskip\belowcaptionskip}
% Two-letter font commands (\rm, \bf, ...) kept for old documents.
\DeclareOldFontCommand{\rm}{\normalfont\rmfamily}{\mathrm}
\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf}
\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt}
\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf}
\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit}
\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl}
\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc}
\DeclareRobustCommand*\cal{\@fontswitch\relax\mathcal}
\DeclareRobustCommand*\mit{\@fontswitch\relax\mathnormal}
% TOC layout parameters.  \@pnumwidth is redefined to 1.8em further down.
\newcommand\@pnumwidth{1.55em}
\newcommand\@tocrmarg{2.55em}
\newcommand\@dotsep{4.5}
\setcounter{tocdepth}{3}
% State used for collecting chapter authors.
\newcounter{numauthors}
\newif\if@break
\newif\if@firstauthor
% \tableofcontents: typesets the "Contents" heading by hand (the
% \chapter* call is commented out) and reads the .toc file inside a group
% where \author means \toc@author and \break means \space.
%
% TOC entries are drawn one step late: every \l@<level> first executes the
% pending \toc@draw and then stores its own entry in \toc@draw.  The flush
% after \@starttoc is commented out, so the entry stored last is not drawn
% by this macro; nothing visible in this part of the file draws it later.
\newcommand\tableofcontents{\cleardoublepage\markboth{Contents}{Contents}%
    \make@cornermarks
    \gdef\chapterauthor{\@caplusone}%
    \gdef\endchapterauthors{\end@casplusone}%
    \if@twocolumn
      \@restonecoltrue\onecolumn
    \else
      \@restonecolfalse
    \fi
  {\parindent \z@ \raggedright
   \baselineskip 6\p@ \lineskip \z@ \parskip \z@
   \vbox{
   \vskip 22\p@
   \unnumchap@rule
   \vskip 5\p@
   \FMHeadFont \contentsname\par\vskip-12pt
   \noindent\hbox{\vrule height.5pt width84pt}
   \vskip 41\p@}}
%%%    \chapter*{\contentsname
%%%        \@mkboth{%
%%%           \MakeUppercase\contentsname}{\MakeUppercase\contentsname}}%
   \pagestyle{headings}\thispagestyle{folio}
   {\let\break\space
    \let\author\toc@author
    \reset@authors
    \let\toc@draw\relax
    \@starttoc{toc}
%%    \toc@draw
   }
    \if@restonecol\twocolumn\fi
    }
% \draw@part{<title>}{<page>}: bold entry; the page number is omitted when
% \if@pdf is true.  \draw@fm and \draw@chapter below share this layout.
\def\draw@part#1#2{%
   \addpenalty{-\@highpenalty}%
   \vskip1em plus\p@
   \@tempdima1.5em
   \begingroup
     \parindent\z@\rightskip\@pnumwidth
     \parfillskip-\rightskip
     \bfseries
     \leavevmode
     \advance\leftskip\@tempdima
     \hskip-\leftskip
     {#1\hfil}\nobreak
     \if@pdf
     \else
       \hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}%
     \fi
     \par
     \penalty\@highpenalty\endgroup}
% Nothing is pending before the first entry.
\let\toc@draw\relax
%
% \l@part / \l@fm / \l@chapter: draw the pending entry, then queue this one.
\def\l@part#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@part{\large #1}{\large #2}}}
\def\l@fm#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@fm{#1}{#2}}}
\def\@pnumwidth{1.8em}
\def\draw@fm#1#2{%
   \addpenalty{-\@highpenalty}%
   \vskip1em plus\p@
   \@tempdima1.5em
   \begingroup
     \parindent\z@\rightskip\@pnumwidth
     \parfillskip-\rightskip
     \bfseries
     \leavevmode
     \advance\leftskip\@tempdima
     \hskip-\leftskip
     {#1\hfil}\nobreak
     \if@pdf
     \else
       \hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}%
     \fi
     \par
     \penalty\@highpenalty\endgroup}
\def\l@chapter#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@chapter{#1}{#2}}}
\def\@pnumwidth{1.8em}
% \draw@chapter: as \draw@fm, followed by the collected author names in
% italics (\draw@authors).
\def\draw@chapter#1#2{%
   \addpenalty{-\@highpenalty}%
   \vskip1em plus\p@
   \@tempdima1.5em
   \begingroup
     \parindent\z@\rightskip\@pnumwidth
     \parfillskip-\rightskip
     \bfseries
     \leavevmode
     \advance\leftskip\@tempdima
     \hskip-\leftskip
     {#1\hfil}\nobreak
     \if@pdf
     \else
       \hfil\nobreak\hb@xt@\@pnumwidth{\hss #2}%
     \fi
     \par
     {\it\draw@authors}%
     \penalty\@highpenalty\endgroup}
% \toc@author{<name>}{<affiliation>}: meaning of \author while the TOC is
% read.  The previously seen name is appended to the comma-separated list
% \@authors and the new one is remembered in \last@author.  The second
% argument is ignored by the live definition.
\def\toc@author#1#2{%
   \if@firstauthor
      \@firstauthorfalse
   \else
      \ifx\@authors\@empty
         \xdef\@authors{\last@author}%
      \else
         \@cons{\@authors}{, \last@author}\fi\fi
   \stepcounter{numauthors}%
%%%%%%% commented and deleted below the second part to avoid inaccessible error
% shashi
% September-2008
%%  \gdef\last@author{#1 {\rm\fontsize{9\p@}{11\p@}\selectfont #2}}
   \gdef\last@author{#1}
}
% \draw@authors: print "<list>[,] and <last>" (the comma only with more
% than two authors), or just the single name, then clear the collected
% state.
\def\draw@authors{%
   \let\@t\@authors
   \ifx\@t\@empty \let\@t\last@author\fi
   \ifx\@t\@empty\else
      \hskip\leftskip
      \ifx\@authors\@empty
      \else
         \@authors
         \ifnum\c@numauthors>2,\fi
         \if@break\break\fi
         \ and
      \fi
      \last@author\break\fi
   \reset@authors}
% \reset@authors: empty both lists, mark "first author" and zero the count.
\def\reset@authors{%
   \gdef\@authors{}%
   \gdef\last@author{}%
   \@firstauthortrue
   \setcounter{numauthors}{0}}
% Lower TOC levels: indent = indent of the level above + its number width.
% Each \l@<level> again draws the pending entry and queues its own.
\newlength\section@toc@skip
\section@toc@skip1.5em
\newlength\SectionTOCWidth
\SectionTOCWidth2.3em
\def\l@section#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@section{#1}{#2}}}
\def\draw@section#1#2{%
  \@dottedtocline{1}{\section@toc@skip}{\SectionTOCWidth}{#1 }{{ \tocfont #2}}}
\newlength\subsection@toc@skip
\subsection@toc@skip\section@toc@skip
\advance\subsection@toc@skip\SectionTOCWidth
\newlength\SubSectionTOCWidth
\SubSectionTOCWidth3.2em
\def\l@subsection#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subsection{#1}{#2}}}
\def\draw@subsection#1#2{%
  \@dottedtocline{2}{\subsection@toc@skip}{\SubSectionTOCWidth}{#1}{{ \tocfont #2}}}
\newlength\subsubsection@toc@skip
\subsubsection@toc@skip\subsection@toc@skip
\advance\subsubsection@toc@skip\SubSectionTOCWidth
\newlength\SubSubSectionTOCWidth
\SubSubSectionTOCWidth4.1em
\def\l@subsubsection#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subsubsection{#1}{#2}}}
\def\draw@subsubsection#1#2{%
  \@dottedtocline{3}{\subsubsection@toc@skip}{\SubSubSectionTOCWidth}{#1}{{ \tocfont #2}}}
\newlength\paragraph@toc@skip
\paragraph@toc@skip\subsubsection@toc@skip
\advance\paragraph@toc@skip\SubSubSectionTOCWidth
\newlength\ParagraphTOCWidth
\ParagraphTOCWidth4.1em
\def\l@paragraph#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@paragraph{#1}{#2}}}
\def\draw@paragraph#1#2{%
  \@dottedtocline{4}{\paragraph@toc@skip}{\ParagraphTOCWidth}{#1}{{ \tocfont #2}}}
\newlength\subparagraph@toc@skip
\subparagraph@toc@skip\paragraph@toc@skip
\advance\subparagraph@toc@skip\ParagraphTOCWidth
\def\l@subparagraph#1#2{%
  \toc@draw
  \gdef\toc@draw{\draw@subparagraph{#1}{#2}}}
% ---------------------------------------------------------------------
% Dotted TOC lines, the per-chapter table of contents (\make@chaptoc) and
% the chapter-author machinery (\@ca and friends).
% ---------------------------------------------------------------------
\def\draw@subparagraph#1#2{%
  \@dottedtocline{5}{\subparagraph@toc@skip}{6em}{#1}{{ \tocfont #2}}}
% \@dottedtocline{<level>}{<indent>}{<numwidth>}{<title>}{<page>}:
% nothing is typeset for levels deeper than tocdepth.  Leaders and page
% number are omitted when \if@pdf is true.
\def\@dottedtocline#1#2#3#4#5{%
  \ifnum #1>\c@tocdepth \else
    \vskip \z@ \@plus.2\p@
    {\leftskip #2\relax\rightskip\@tocrmarg\parfillskip-\rightskip
     \parindent #2\relax\@afterindenttrue
     \interlinepenalty\@M
     \leavevmode
     \@tempdima #3\relax
     \advance\leftskip\@tempdima\null\hskip-\leftskip
     {#4\hfil}\nobreak
     \if@pdf
     \else
       \leaders\hbox{$\m@th\mkern\@dotsep mu\hbox{.}\mkern\@dotsep mu$}\hfill
       \nobreak
       \hb@xt@\@pnumwidth{\hfil\normalfont\normalcolor #5}%
     \fi
     \par}\fi}
% \chapterauthors: redefine \break and \protect so that they are written
% out as strings.
\newcommand\chapterauthors{%
  \def\break{\string\break\ }%
  \def\protect##1{\string ##1 }}
\def\end@cas{}
\def\end@casplusone{\vskip4pt\@doendpe}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \make@chaptoc: typeset the stored author/affiliation boxes (one to four
% authors, selected by the numauthors counter), clear them, then print the
% "CONTENTS" heading and input <thechapter>.toc.
\def\make@chaptoc{% chapter author
  {\parindent\z@
   \newcommand\FolioBoldFont{}%
   \let\@b\bullet
   \def\bullet{\raisebox{2pt}{$\scriptscriptstyle\@b$}}%
   \let\SubsectionItalicFont\it
   %\ifx\chapter@author\@empty\else
   {\rm\fontsize{10\p@}{10\p@}\bfseries\selectfont
   %\the\c@numauthors
   \ifnum\c@numauthors=1
     \chapter@authorone\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
   \fi
   \ifnum\c@numauthors=2
     \chapter@authorone\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
     \chapter@authortwo\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}
   \fi
   \ifnum\c@numauthors=3
     \chapter@authorone\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
     \chapter@authortwo\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}\vskip12\p@
     \chapter@authorthree\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationthree}
   \fi
   \ifnum\c@numauthors=4
     \chapter@authorone\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationone}\vskip12\p@
     \chapter@authortwo\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationtwo}\vskip12\p@
     \chapter@authorthree\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationthree}\vskip12\p@
     \chapter@authorfour\vskip6\p@
     {\it\fontsize{10\p@}{10\p@}\selectfont\chapter@affiliationfour}
   \fi
   }
   \gdef\chapter@authorone{}\gdef\chapter@affiliationone{}%
   \gdef\chapter@authortwo{}\gdef\chapter@affiliationtwo{}%
   \gdef\chapter@authorthree{}\gdef\chapter@affiliationthree{}%
   \gdef\chapter@authorfour{}\gdef\chapter@affiliationfour{}%
   \vskip 14.6\p@
   {\leftskip\secnumwidth\def\author##1##2{}\vskip14pt\hbox{\leftskip0pt\SubsectionHeadFont CONTENTS}\vskip6pt\par\@input{\thechapter.toc}\par}%
  }
  \reset@authors}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Switches set by \singleauthorchapter ... \fourauthorchapter.
\newif\iffinishedfromone
\global\finishedfromonefalse
%
\newif\iffinishedfromtwo
\global\finishedfromtwofalse
%
\newif\iffinishedfromthree
\global\finishedfromthreefalse
%
\newif\iffinishedfromfour
\global\finishedfromfourfalse
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\newcommand\singleauthorchapter{\finishedfromonetrue}
\newcommand\twoauthorchapter{\finishedfromtwotrue}
\newcommand\threeauthorchapter{\finishedfromthreetrue}
\newcommand\fourauthorchapter{\finishedfromfourtrue}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newif\iffinish
\global\finishfalse
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Boxes holding up to four author names, their affiliations and the final
% name boxes used by \make@chaptoc.
\newsavebox\@AUonebox
\newsavebox\@AUtwobox
\newsavebox\@AUthreebox
\newsavebox\@AUfourbox
%
\newsavebox\@AUaffonebox
\newsavebox\@AUafftwobox
\newsavebox\@AUaffthreebox
\newsavebox\@AUafffourbox
%
\newsavebox\@finalAUboxfromone
\newsavebox\@finalAUboxfromtwo
\newsavebox\@finalAUboxfromthree
\newsavebox\@finalAUboxfromfour
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \@ca{<author>}{<affiliation>}: register one chapter author.
%  - writes \author{<author>}{} to the main TOC through the .aux file;
%  - steps numauthors and stores name and affiliation in the boxes of the
%    matching slot (1-4).  Slots beyond four are not stored.
% FIX: the counter reset used \resetcounter, which the LaTeX kernel does
% not provide.  It is still called when some other code defines it;
% otherwise \setcounter{numauthors}{0} is used, so a long author list no
% longer stops with "Undefined control sequence".
\def\@ca#1#2{%
%  \def\chapter@author{#1}%
%  \def\chapter@affiliation{#2}%
  \if@filesw%
    \write\@auxout{%
       \string\@writefile{toc}{\string\author{#1}{}}%
    }%
  \fi
%%%%%%%%%%%%%%%
\ifnum\c@numauthors>4
  \@ifundefined{resetcounter}%
    {\setcounter{numauthors}{0}}%
    {\resetcounter{numauthors}}
\fi
\stepcounter{numauthors}
%%\the\c@numauthors
\ifnum\c@numauthors=1 %
\sbox\@AUonebox{\CAPlusOneFont#1}
\sbox\@AUaffonebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
\sbox\@finalAUboxfromone{\copy\@AUonebox}
\def\chapter@authorone{\copy\@finalAUboxfromone}
\def\chapter@affiliationone{\copy\@AUaffonebox}
\fi
\ifnum\c@numauthors=2
\sbox\@AUtwobox{\CAPlusOneFont#1}
\sbox\@AUafftwobox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
\sbox\@finalAUboxfromtwo{\copy\@AUtwobox}
\def\chapter@authortwo{\copy\@finalAUboxfromtwo}
\def\chapter@affiliationtwo{\copy\@AUafftwobox}
\fi
\ifnum\c@numauthors=3
\sbox\@AUthreebox{\CAPlusOneFont#1}
\sbox\@AUaffthreebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
\sbox\@finalAUboxfromthree{\copy\@AUthreebox}
\def\chapter@authorthree{\copy\@finalAUboxfromthree}
\def\chapter@affiliationthree{\copy\@AUaffthreebox}
\fi
\ifnum\c@numauthors=4
\sbox\@AUfourbox{\CAPlusOneFont#1}
\sbox\@AUafffourbox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}}
\sbox\@finalAUboxfromfour{\copy\@AUfourbox}
\def\chapter@authorfour{\copy\@finalAUboxfromfour}
\def\chapter@affiliationfour{\copy\@AUafffourbox}
\fi}
% \@caplusone: meaning of \chapterauthor after the first \section or
% inside the TOC.  Starred form -> \@scaplusone; otherwise an optional
% argument is read (empty when absent).
\def\@caplusone{\@ifstar{\@scaplusone}{\@ifnextchar[{\@xcaplusone}{\@xcaplusone[]}}}
% \@xcaplusone[<opt>]{<author>}{<affiliation>}: registers the author with
% <affiliation> when <opt> is empty, with <opt> otherwise.
\def\@xcaplusone[#1]#2#3{%
  \def\@@empty{#1}\ifx\@empty\@@empty\@ca{#2}{#3}\else\@ca{#2}{#1}\fi\@scaplusone{#2}{#3}}
% \@scaplusone: the code that printed name and affiliation is commented
% out; only the conditional negative skip remains.
\def\@scaplusone#1#2{%
  \ifhmode\vskip-12pt\fi
%%Shashi Commented
%%%  \noindent\hskip3pc{\CAPlusOneFont\baselineskip14pt #1\def\@t{#2}\ifx\@t\@empty\else,\fi}\hskip6pt{\CAAPlusOneFont #2}\par
}
\def\chapterauthoronly#1#2{\@ca{#1}{}\@scaplusone{#1}{#2}}
% \myaddcontentsline{<stream>}{<type>}{<text>}: write a
% \chapcontentsline{<type>}{\raggedright <text>}{<page>} record to
% <stream>.  \label, \index and \glossary are gobbled, \break becomes a
% space and \protect writes the following token as a string.
\def\myaddcontentsline#1#2#3{%
  \if@filesw
    \begingroup
      \let\label\@gobble\let\index\@gobble\let\glossary\@gobble
      \def\break{\ }%
      \def\protect##1{\string ##1 }%
      \@temptokena{\thepage}%
      \edef\@tempa{\write#1{\string\chapcontentsline{#2}{\string\raggedright\space #3}{\the\@temptokena}}}\@tempa
      \if@nobreak\ifvmode\nobreak\fi\fi
    \endgroup
  \fi}
% \chapcontentsline{<type>} dispatches to \l@<type>.
\def\chapcontentsline#1{\csname l@#1\endcsname}
\def\l@chapsection{\@mydottedtocline{1}{\z@}{6pt}}
\def\l@chapsubsection{\@mydottedtocline{2}{\secnumwidth}{6pt}}
% ---------------------------------------------------------------------
% Chapter-TOC line macros, list of figures/tables, bibliography, index,
% footnotes, fixed names and final page-style defaults.
% ---------------------------------------------------------------------
\def\l@chapsubsubsection{\@mydottedtocline{3}{\subsecnumwidth}{36pt}}
% Depth limit for the per-chapter TOC.  The register is allocated by hand
% under the counter's internal name and set with \setcounter.
\newcount\c@chaptocdepth
\setcounter{chaptocdepth}{3}
% \@mytocline{<level>}{<indent>}{<numwidth>}{<title>}{<page>}: entry
% without leaders.  Only <title> is typeset; <page> is ignored.  Level-1
% entries get 12pt extra space when chaptocdepth > 1.
\def\@mytocline#1#2#3#4#5{%
  \ifnum #1>\c@chaptocdepth \else
    \vskip 2pt plus.2\p@
    \ifnum #1=1\ifnum\c@chaptocdepth>1\addvspace{12pt}\fi\fi
    {\leftskip #2\relax%
     \rightskip \@tocrmarg \parfillskip -\rightskip
     \interlinepenalty\@M
     \leavevmode
     \@tempdima #3\relax
     \rightskip\z@
     \vbox{\ChapTOCFont #4\nobreak}%
     \par}\fi}
% \@mydottedtocline{<level>}{<indent>}{<numwidth>}{<title>}{<page>}:
% dotted entry of the per-chapter TOC, set in 10/12pt with a hanging
% indent of \secnumwidth.  The line width is 372pt for the seven-by-ten
% trim and 312pt otherwise.  With \if@pdf the page number is replaced by
% \hfill.
\def\@mydottedtocline#1#2#3#4#5{%
  \ifnum #1>\c@chaptocdepth \else
    \fontsize{10}{12}\selectfont
    {\leftskip #2\relax \rightskip \@tocrmarg \parfillskip -\rightskip
%     \parindent #2\relax\@afterindenttrue
     \interlinepenalty\@M
     \leavevmode
     \def\@dotsep{1.2}%
     \@tempdima #3\relax
     \rightskip\z@
%     \advance\hsize-\secnumwidth
%     \hskip-\secnumwidth
     \if@sevenbyten \hangindent\secnumwidth\hsize372pt\else\hangindent\secnumwidth\hsize312pt\fi
     #4
     \if@pdf
       \hfill
     \else
       \nobreak\leaders\hbox{$\m@th\mkern\@dotsep mu.\mkern\@dotsep mu$}\hfill\nobreak
       \hbox to24\p@{\hfil #5}\fi
     \par}\fi}
% \listoffigures / \listoftables: unnumbered chapter heading followed by
% the .lof / .lot file, always in one-column mode.
\newcommand\listoffigures{%
    \if@twocolumn
      \@restonecoltrue\onecolumn
    \else
      \@restonecolfalse
    \fi
    \chapter*{\listfigurename}%
      \@mkboth{\MakeUppercase\listfigurename}%
              {\MakeUppercase\listfigurename}%
    \@starttoc{lof}%
    \if@restonecol\twocolumn\fi
    }
\newcommand*\l@figure{\@dottedtocline{1}{1.5em}{2.3em}}
\newcommand\listoftables{%
    \if@twocolumn
      \@restonecoltrue\onecolumn
    \else
      \@restonecolfalse
    \fi
    \chapter*{\listtablename}%
      \@mkboth{%
          \MakeUppercase\listtablename}%
         {\MakeUppercase\listtablename}%
    \@starttoc{lot}%
    \if@restonecol\twocolumn\fi
    }
\let\l@table\l@figure
\newdimen\bibindent
\setlength\bibindent{1.5em}
% thebibliography{<widest label>}: unnumbered chapter plus a list whose
% label width is taken from the argument; items are numbered with enumiv.
% The explicit TOC entry is commented out here.
\newenvironment{thebibliography}[1]
     {\chapter*{\bibname}%
      \@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}%
%      \addcontentsline{toc}{chapter}{\bibname}
      \list{\@biblabel{\@arabic\c@enumiv}}%
           {\settowidth\labelwidth{\@biblabel{#1}}%
            \leftmargin\labelwidth
            \advance\leftmargin\labelsep
            \@openbib@code
            \usecounter{enumiv}%
            \let\p@enumiv\@empty
            \renewcommand\theenumiv{\@arabic\c@enumiv}}%
      \sloppy
      \clubpenalty4000
      \@clubpenalty \clubpenalty
      \widowpenalty4000%
      \sfcode`\.\@m}
     {\def\@noitemerr
       {\@latex@warning{Empty `thebibliography' environment}}%
      \endlist}
\newcommand\newblock{\hskip .11em\@plus.33em\@minus.07em}
\let\@openbib@code\@empty
\newcommand\indexname{Index}
% theindex: two-column index with an unnumbered chapter heading.
\newenvironment{theindex}
               {\cleardoublepage\if@twocolumn
                  \@restonecolfalse
                \else
                  \@restonecoltrue
                \fi
                \twocolumn[\@makeschapterhead{\indexname}]%
                \@mkboth{\MakeUppercase\indexname}%
                        {\MakeUppercase\indexname}%
                \pagestyle{headings}
                \addcontentsline{toc}{chapter}{\indexname}
                % there seems to be a weird bug in krantz.cls that prevents the very _last_ item
                % of \addcontentsline from being added to TOC, so I have to add an empty entry
                % (cause: the \l@... macros typeset TOC entries one step late through
                % \toc@draw, and \tableofcontents does not flush the entry queued last)
                \addcontentsline{toc}{section}{}
                \thispagestyle{folio}\parindent\z@\markboth{\indexname}{\indexname}
                \parskip\z@ \@plus .3\p@\relax\raggedright
                \columnseprule \z@
                \columnsep 35\p@
                \let\item\@idxitem}
               {\if@restonecol\onecolumn\else\clearpage\fi}
% Index entries: hanging indent of 40pt; sub-levels start 20pt / 30pt in.
\newcommand\@idxitem{\par\hangindent 40\p@}
\newcommand\subitem{\@idxitem \hspace*{20\p@}}
\newcommand\subsubitem{\@idxitem \hspace*{30\p@}}
\newcommand\indexspace{\par \vskip 10\p@ \@plus5\p@ \@minus3\p@\relax}
% Footnotes: rule of 0.4\columnwidth; numbering restarts in every chapter.
\renewcommand\footnoterule{%
  \kern-3\p@
  \hrule\@width.4\columnwidth
  \kern2.6\p@}
\@addtoreset{footnote}{chapter}
\newcommand\@makefntext[1]{%
    \parindent 1em%
    \noindent
    \hb@xt@1.8em{\hss\@makefnmark}#1}
% Fixed names used in headings and captions.
\newcommand\contentsname{Contents}
\newcommand\listfigurename{List of Figures}
\newcommand\listtablename{List of Tables}
\newcommand\bibname{Bibliography}
\newcommand\figurename{FIGURE}
\newcommand\tablename{TABLE}
\newcommand\partname{Part}
\newcommand\chaptername{Chapter}
\newcommand\appendixname{Appendix}
% \today in the form "Month day, year".
\def\today{\ifcase\month\or
  January\or February\or March\or April\or May\or June\or
  July\or August\or September\or October\or November\or December\fi
  \space\number\day, \number\year}
\setlength\columnsep{10\p@}
\setlength\columnseprule{0\p@}
\pagestyle{headings}
% ---------------------------------------------------------------------
% Final layout defaults, heading rules, the symbol list and the
% table-title machinery (\tabletitle, \tabletitlelet).
% ---------------------------------------------------------------------
\pagenumbering{arabic}
% One-sided documents use ragged-bottom pages.
\if@twoside
\else
  \raggedbottom
\fi
\if@twocolumn
  \twocolumn
  \sloppy
  \flushbottom
\else
  \onecolumn
\fi
% Heading rules: a thick 84pt x 4pt bar at the left, for numbered headings
% combined with a .5pt rule across the text width.
\newcommand\unnumcrcrule{\hbox to\textwidth{\rlap{\rule[-3.5\p@]{84\p@}{4\p@}}}}
\newcommand\unnumchap@rule{\unnumcrcrule}
\newcommand\crcrule{\hbox to\textwidth{\rlap{\rule[-3.5\p@]{84\p@}{4\p@}}\rule{\textwidth}{.5\p@}}}
\newcommand\chap@rule{\crcrule}
\newcommand\sec@rule{\crcrule}
% \@affiliate[<text>] stores the affiliation globally in \@affiliation.
\def\@affiliate[#1]{\gdef\@affiliation{#1}}
\def\@affiliation{}
% \def@theequation: (re)define \theequation as <section or chapter>.<n>,
% where <n> comes from the counter `shared' when numbering in sequence and
% from `equation' otherwise.
% NOTE(review): the counter `shared' is not defined in this part of the
% file -- presumably it is created elsewhere in the class.
\def\def@theequation{%
  \if@numberinsequence
    \def\theequation{%
      \if@numbysec\thesection\else\thechapter\fi.%
      \@arabic\c@shared}%
  \else
    \def\theequation{%
      \if@numbysec\thesection\else\thechapter\fi.%
      \@arabic\c@equation}\fi}
% \affiliation{<text>}: typeset an affiliation followed by 36bp of space.
\def\affiliation#1{{\AffiliationFont\noindent #1\vskip 36bp}}
\newbox\tempbox
\newdimen\nomenwidth
% symbollist{<widest symbol>}: two-column "Symbol Description" list; the
% argument (plus 1em) fixes the width of the symbol column.
% NOTE(review): uses the multicols environment, which is not defined in
% this part of the file -- presumably the multicol package is loaded
% elsewhere.
\newenvironment{symbollist}[1]{%
  \addvspace{12pt}
  \setbox\tempbox\hbox{#1\hskip1em}%
  \global\nomenwidth\wd\tempbox
  %\section*{Sumbol Description}
  \noindent{\SectionHeadFont Symbol Description}\vskip6pt
  \begin{multicols}{2}}{%
  \end{multicols}\par\addvspace{12pt}}
% \symbolentry{<symbol>}{<description>}: one hanging entry of the list.
\def\symbolentry#1#2{\par\noindent\@hangfrom{\hbox to \nomenwidth{#1\hss}}#2\par}
% Table parameters; these override the \setlength values given earlier in
% the class.
\tabcolsep 5pt
\arrayrulewidth .5pt
\doublerulesep 1pt
% NOTE(review): the subtable counter is commented out here, but
% \thesubtable, \resettableletter and \@ytabletitlelet below use it --
% confirm that it is defined elsewhere (class or a package).
%\newcounter{subtable}[table]
\newif\if@tablerules\@tablerulestrue
\newif\if@centertable\@centertabletrue
\newif\if@centertabletitle\@centertabletitletrue
\newbox\@tablebox
\newbox\@tabletitlebox
\newdimen\@tablewidth
\newdimen\@tabletitlewidth
\newdimen\max@tablewidth
\newcommand\automaticrules{\@tablerulestrue}
\newcommand\noautomaticrules{\@tablerulesfalse}
% Table numbers always carry the chapter prefix.
\def\thetable{%
  \thechapter.%
  \@arabic\c@table}
\def\thesubtable{%
  \thechapter.%
  \@arabic\c@table\alph{subtable}}
\def\resettableletter{\setcounter{subtable}{0}}
\def\@Tabletitle{}
% \tabletitle[<short>]{<title>}: step the table counter, remember the
% title and write the list-of-tables entry (the short form when given).
\newcommand\tabletitle{\@ifnextchar[{\@xtabletitle}{\@tabletitlewidth\z@\@ytabletitle}}
\def\@@tabletitle{}
\newif\ifshorttabletitle
\global\shorttabletitlefalse
%\def\@xtabletitle#1{\@tabletitlewidth#1\@ytabletitle}
%
% NOTE(review): unlike \@ytabletitle this macro is not \long, so a title
% given together with a short form cannot contain \par.
\def\@xtabletitle[#1]#2{%
  \gdef\@@tabletitle{#1}%
  \gdef\@tabletitle{#2}%
  \let\@Tabletitle\@TableTitle
  \refstepcounter{table}%
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@@tabletitle}}}}
%%%%
%\long\def\@xtabletitle[#1]#2{%
%  \setbox\@ttbox\hbox{#1}\global\shorttabletitletrue
%  \def\@@tabletitle{\ifx\@ttbox\@empty\else#1\fi}%
%  \def\@tabletitle{#2}%
%  \let\@Tabletitle\@TableTitle
%  \refstepcounter{table}%
%  {\let\footnotemark\@empty
%   \let\footnote\@gobble
%   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{%
%\ifshorttabletitle\@@tabletitle\else\@tabletitle\fi}}}}
%%%
%
% Title without short form: the full title goes to the list of tables,
% with footnote commands disabled for the write.
\long\def\@ytabletitle#1{%
  \def\@tabletitle{#1}%
  \let\@Tabletitle\@TableTitle
  \refstepcounter{table}%
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@tabletitle}}}}
% \tabletitlelet[<width>]{<title>}: title of a lettered sub-table.  The
% table counter is only stepped while the subtable counter is zero, and
% the current label is set to \thesubtable.
\def\tabletitlelet{\@ifnextchar[{\@xtabletitlelet}{\@tabletitlewidth\z@\@ytabletitlelet}}
\def\@xtabletitlelet[#1]{\@tabletitlewidth#1\@ytabletitlelet}
\long\def\@ytabletitlelet#1{%
  \def\@tabletitle{#1}%
  \let\@Tabletitle\@TableTitle
  \ifnum\c@subtable=0\stepcounter{table}\fi
  \let\@currentlabel\thesubtable
  {\let\footnotemark\@empty
   \let\footnote\@gobble
   \addcontentsline{\ext@table}{table}{\protect\numberline{\thetable}{\@tabletitle}}}}
% \@TableTitle: "TABLE <number>" on its own line, then the title.
\def\@TableTitle{%
  \noindent
  {%
  \vbox{{\TableNumberFont TABLE\ \thetable}}\par\TableTitleFont\@tabletitle}}
% \table starts a plain float (its \caption override is commented out);
% table* maps \caption to \tabletitle and prints the title immediately.
\def\table{%
  %\long\def\caption##1{\tabletitle{##1}\@TableTitle\par}%
  \@float{table}}
\@namedef{table*}{%
  \long\def\caption##1{\tabletitle{##1}\@TableTitle\par}%
  \@dblfloat{table}}
% tabular / tabular*: local redefinitions of the start and end code.
\def\endtabular{\crcr\egroup\egroup $\egroup}
\expandafter \let \csname endtabular*\endcsname = \endtabular
\def\tabular{\let\@halignto\@empty\@tabular}
\@namedef{tabular*}#1{%
  \setlength\dimen@{#1}%
  \edef\@halignto{to\the\dimen@}\@tabular}
% Column head and sub-head helpers, and the strut used in column heads.
\def\tch#1{\TableColHeadFont #1\llstrut\hfill}
\def\tsh#1{\TableSubheadFont #1\hfill}
\newcommand\llstrut{\rule[-6pt]{0pt}{14pt}}
\newcommand\flstrut{\rule{0pt}{10pt}} \newcommand\tabletitlestrut{\rule{0pt}{20pt}} \def\Boxhead#1{\par\addvspace{3pt plus2pt}\noindent{\centering\bfseries#1\par}\vskip3pt} \newbox\tempbox% \newdimen\tempdimen% % \newenvironment{shortbox}{\par\addvspace{12pt plus2pt}% \if@krantza \setbox\tempbox\vbox\bgroup\hsize27pc% \else\if@krantzb \setbox\tempbox\vbox\bgroup\hsize32pc% \else \setbox\tempbox\vbox\bgroup\hsize25pc% \fi\fi }{% \egroup% \noindent\fboxsep6pt\fboxrule.5pt\hspace*{0pt}\fbox{\box\tempbox} \par\addvspace{12pt plus2pt}}% % \def\grayink{\special{color cmyk 0 0 0 0.2}} \def\blackink{\special{color cmyk 0 0 0 1.0}} % \def\whiteink{\special{color cmyk 0 0 0 0}} % 0% \newenvironment{shadebox}{% \setbox\tempbox\hbox\bgroup\vbox\bgroup\leftskip12pt\rightskip\leftskip\vspace*{12pt}}{\par\addvspace{-6pt} \egroup\egroup\par\addvspace{15pt} \tempdimen\ht\tempbox \advance\tempdimen by 1pc \noindent{\hbox to \wd\tempbox{\vbox to \ht\tempbox{\hsize\textwidth{\special{color push}\grayink\noindent\vrule height\tempdimen width\textwidth \special{color pop}\blackink}}}}% \llap{\unhbox\tempbox}\par\addvspace{20pt}} %%%%%%%%%% Note %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newbox\tempbox \newdimen\notewidth \newenvironment{notelist}[1]{% \addvspace{6pt} \setbox\tempbox\hbox{#1\hskip.57em}% \global\notewidth\wd\tempbox }{% \par\addvspace{6pt}} \def\notes#1#2{\par\noindent\@hangfrom{\hbox to \notewidth{\bf #1\hss}}#2\par} %%%%%%%%%%%%%%%% wherelist %%%%%%%%%%%%%%%% \newbox\wherebox \newdimen\wherewidth \newenvironment{wherelist}[1]{\leftskip10pt% \addvspace{6pt} \setbox\wherebox\hbox{#1\hskip1em}% \global\wherewidth\wd\wherebox \noindent\hspace*{-14pt} where }{% \par\addvspace{6pt}} \def\whereentry#1#2#3{\par\noindent\@hangfrom{\hbox to \wherewidth{#1\hss}#2\hskip6pt}#3\par} %%%%%%%%%%%% \newenvironment{unnumlist}{% \ifnum \@enumdepth >3 \@toodeep\else \advance\@enumdepth\@ne \list{}{% \leftmargini27.5pt \leftmarginii17.5pt\leftmarginiv17.5pt % \leftmargin\parindent 
\advance\leftmargin-.2em \advance\leftmarginii.2em \advance\leftmarginiii.1em \advance\leftmarginiv.2em \def\makelabel##1{\hss\llap{##1}}} \fi% }{% \endlist} % \newenvironment{extract}{% \par\addvspace{11.5pt minus2pt}% \leftskip2em\rightskip\leftskip \noindent\ignorespaces }{% \par\addvspace{11.5pt minus2pt}% \@endparenv} % % \def\VA#1#2{\addvspace{12pt}\raggedleft #1\rightskip3em\par #2\rightskip3em} % \newenvironment{VF}{\VfFont% \par\addvspace{12pt minus2pt}% \noindent{\vrule height2pt width\textwidth}\par\vskip7.3pt \leftskip3em\rightskip\leftskip \noindent\ignorespaces }{% \par\vskip6pt\leftskip0pt\noindent{{\vrule height2pt width\textwidth}}\par\addvspace{12pt minus2pt}% \@endparenv} % \def\VTA#1#2{\addvspace{12pt}\raggedleft #1\rightskip3em\par {\it #2}\rightskip3em} % % \def\VT{\par\addvspace{3.5pt}\noindent} \def\VH#1{{\normalfont\fontsize{12.5}{14.5}\itshape\centering\selectfont #1\par}\addvspace{5.5pt}} % \newenvironment{VT1}{\VfFont% \par\addvspace{12pt minus2pt}% \noindent{\vrule height2pt width\textwidth}\par\vskip7.5pt \leftskip3em\rightskip\leftskip %\@afterheading \parindent0pt \noindent\ignorespaces }{% \par\vskip6pt\leftskip0pt\noindent{{\vrule height2pt width\textwidth}}\par\addvspace{10pt minus2pt}% \@endparenv} % %%%%%%%%%%%% Glossary %%%%%%%%%%%%%%%%%%%%%%% \newenvironment{Glossary} {\list{}{\labelwidth\z@\leftmargin18pt \itemindent-18pt \let\makelabel\glosslabel}} {\endlist} \newcommand\glosslabel[1]{\hspace\labelsep\normalfont\bfseries #1:} %%%%%%%%%%%% \newif\iffnalpha \global\fnalphafalse \newskip\listtextleftmargin\listtextleftmargin 20pt%24pt \newskip\listtextleftmarginii\listtextleftmarginii0pt% 24pt \newskip\listtextleftmarginiii\listtextleftmarginiii0pt% 24pt \newskip\listtextrightmargin\listtextrightmargin12pt%.5pc \newskip\listlabelleftskip \listlabelleftskip4pt%3.3pt \newskip\listlabelleftskipii \listlabelleftskipii0pt%3.3pt \newskip\listlabelleftskipiii \listlabelleftskipiii0pt%3.3pt \newskip\abovelistskipi\abovelistskipi6pt 
plus2pt \newskip\belowlistskipi\belowlistskipi6pt plus2pt \newskip\abovelistskipii\abovelistskipii0pt plus2pt \newskip\belowlistskipii\belowlistskipii0pt plus2pt \newskip\abovelistskipiii\abovelistskipiii0pt plus2pt \newskip\belowlistskipiii\belowlistskipiii0pt plus2pt \newskip\labelsepi \labelsepi6pt \newskip\labelsepii \labelsepii6pt \newskip\labelsepiii \labelsepiii6pt%\z@ \newskip\itemsepi \itemsepi0pt%10pt \newskip\itemsepii \itemsepii0pt \newskip\itemsepiii \itemsepiii0pt \newdimen\enumdimwd \newif\iflabelrightalign\labelrightaligntrue \newdimen\enumdim% % \def\enummax#1{% \labelsep\csname labelsep\romannumeral\the\@enumdepth\endcsname \ifdim\listtextleftmargin>\z@\labelsepi0pt\fi \ifdim\listtextleftmarginii>\z@\labelsepii0pt\fi \ifdim\listtextleftmarginiii>\z@\labelsepiii0pt\fi \setbox\tempbox\hbox{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname#1\hskip\labelsep}% \enumdim\wd\tempbox \setbox\tempbox\hbox{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname#1}% \enumdimwd\wd\tempbox \expandafter\global\csname leftmargin\romannumeral\the\@enumdepth\endcsname\enumdim \ifdim\listtextleftmargin>\z@ \leftmargini\listtextleftmargin \ifdim\listlabelleftskip>\z@ \advance\leftmargini-\listlabelleftskip \fi \fi \ifdim\listtextleftmarginii>\z@ \leftmarginii\listtextleftmarginii \ifdim\listlabelleftskipii>\z@ \advance\leftmarginii-\listlabelleftskipii \fi \fi \ifdim\listtextleftmarginiii>\z@ \leftmarginiii\listtextleftmarginiii \ifdim\listlabelleftskipiii>\z@ \advance\leftmarginiii-\listlabelleftskipiii \fi \fi } % \enummax{1.} % \def\enumerate{\@ifnextchar[{\@enumerate}{\@enumerate[\csname label\@enumctr\endcsname]}}%% % \def\@enumerate[#1]{\par \ifnum \@enumdepth >3 \@toodeep \else \advance\@enumdepth\@ne \edef\@enumctr{enum\romannumeral\the\@enumdepth}% \setcounter{\@enumctr}{1}\enummax{#1}% \list {\csname label\@enumctr\endcsname}{\usecounter{\@enumctr}% \topsep\csname abovelistskip\romannumeral\the\@enumdepth\endcsname \itemsep\csname 
itemsep\romannumeral\the\@enumdepth\endcsname % \listfont %\listparindent18.25pt \ifnum \@enumdepth=1 \leftmargin32.7pt \rightmargin\listtextrightmargin \advance\rightmargin\rightskip \advance\leftmargin\leftskip \tempdimen\leftmargini \advance\tempdimen-\labelsep %%%%%%%%%%% \iffnalpha \def\makelabel##1{{\hskip\listlabelleftskip{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname{\iflabelrightalign\hss\fi\textlistlabel##1}}}}% \global\fnalphafalse \else \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskip{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi\textlistlabel##1}}\blackink}}% \fi %%%%%%%%%%%%%%%%%%%%%%%%%%% \else \ifnum \@enumdepth=2 \tempdimen\leftmarginii \advance\tempdimen-\labelsep \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskipii{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi##1}\blackink}}}% \else \ifnum \@enumdepth=3 \tempdimen\leftmarginiii \advance\tempdimen-\labelsep \def\makelabel##1{\hbox to \tempdimen{\hskip\listlabelleftskipiii{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname\hbox to \enumdimwd{\iflabelrightalign\hss\fi##1}\blackink}}}% \else \def\makelabel##1{\hss\llap{\csname listdevicefont\romannumeral\the\@enumdepth\endcsname##1}}% \fi \fi \fi} \fi} % \def\endenumerate{\@topsepadd\csname belowlistskip\romannumeral\the\@enumdepth\endcsname\endlist}% % \def\textlistlabel{} %%%%%%%%%%%%%%%%%%%%%%%%%%% \newdimen\concolwidth \newbox\stempbox \def\contributor#1#2#3{\addvspace{10pt}{% \setbox\stempbox\hbox{\ContributorAffiliationFont #2} \concolwidth\wd\stempbox \noindent{\ContributorNameFont #1}\par \ifdim\concolwidth>\columnwidth \vspace*{3pt} \else \fi \noindent{\vbox{\hangindent12pt\ContributorAffiliationFont #2}}\vskip-1\p@ \noindent{\vbox{\hangindent12pt\ContributorAffiliationFont #3}}}} %%\def\contributors{% %% \twocolumn[\contributorshead] %% \pagestyle{empty} %% \leftskip1pc %% 
\parindent-1pc} %%\def\contributorshead{% %% \vbox{}\vskip2pc %% {\centering\HeadFont CONTRIBUTORS\vskip2\p@} %% \noindent\rule{\textwidth}{1\p@}\vskip25\p@} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \def\cleardoublepage{\clearpage\if@twoside \ifodd\c@page\else \hbox{}\thispagestyle{empty}\newpage\if@twocolumn\hbox{}\newpage\fi\fi\fi} \frenchspacing \tolerance=5000 \raggedbottom %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \@centertabletitlefalse %\HeadingsBookChapter \HeadingsChapterSection \endinput %% %% End of file `krantz.cls'. ================================================ FILE: rep-res-3rd-edition/latex/after_body.tex ================================================ \backmatter \printindex ================================================ FILE: rep-res-3rd-edition/latex/before_body.tex ================================================ % you may need to leave a few empty pages before the dedication page %\cleardoublepage\newpage\thispagestyle{empty}\null %\cleardoublepage\newpage\thispagestyle{empty}\null %\cleardoublepage\newpage \thispagestyle{empty} % \begin{center} % To Kristina Gandrud, % % who is currently at the movies with our son, so that I can finish the third edition. 
% %\includegraphics{images/dedication.pdf} % \end{center} \setlength{\abovedisplayskip}{-5pt} \setlength{\abovedisplayshortskip}{-5pt} ================================================ FILE: rep-res-3rd-edition/latex/preamble.tex ================================================ \usepackage{booktabs} \usepackage{longtable} \usepackage[bf,singlelinecheck=off]{caption} \usepackage{framed,color} \definecolor{shadecolor}{RGB}{248,248,248} \renewcommand{\textfraction}{0.05} \renewcommand{\topfraction}{0.8} \renewcommand{\bottomfraction}{0.8} \renewcommand{\floatpagefraction}{0.75} \renewenvironment{quote}{\begin{VF}}{\end{VF}} \let\oldhref\href \renewcommand{\href}[2]{#2\footnote{\url{#1}}} \makeatletter \newenvironment{kframe}{% \medskip{} \setlength{\fboxsep}{.8em} \def\at@end@of@kframe{}% \ifinner\ifhmode% \def\at@end@of@kframe{\end{minipage}}% \begin{minipage}{\columnwidth}% \fi\fi% \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep \colorbox{shadecolor}{##1}\hskip-\fboxsep % There is no \\@totalrightmargin, so: \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% \MakeFramed {\advance\hsize-\width \@totalleftmargin\z@ \linewidth\hsize \@setminipage}}% {\par\unskip\endMakeFramed% \at@end@of@kframe} \makeatother \renewenvironment{Shaded}{\begin{kframe}}{\end{kframe}} \usepackage{makeidx} \makeindex \urlstyle{tt} \usepackage{amsthm} \makeatletter \def\thm@space@setup{% \thm@preskip=8pt plus 2pt minus 4pt \thm@postskip=\thm@preskip } \makeatother \frontmatter ================================================ FILE: rep-res-3rd-edition/packages.bib ================================================ @Manual{R-animation, title = {animation: A Gallery of Animations in Statistics and Utilities to Create Animations}, author = {Yihui Xie}, year = {2018}, note = {R package version 2.6}, url = {https://CRAN.R-project.org/package=animation}, } @Manual{R-base, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, organization 
= {R Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2019}, url = {https://www.R-project.org/}, } @Manual{R-bookdown, title = {bookdown: Authoring Books and Technical Documents with R Markdown}, author = {Yihui Xie}, year = {2020}, note = {R package version 0.17}, url = {https://CRAN.R-project.org/package=bookdown}, } @Manual{R-brew, title = {brew: Templating Framework for Report Generation}, author = {Jeffrey Horner}, year = {2011}, note = {R package version 1.0-6}, url = {https://CRAN.R-project.org/package=brew}, } @Manual{R-brms, title = {brms: Bayesian Regression Models using 'Stan'}, author = {Paul-Christian Bürkner}, year = {2020}, note = {R package version 2.11.0}, url = {https://CRAN.R-project.org/package=brms}, } @Manual{R-countrycode, title = {countrycode: Convert Country Names and Country Codes}, author = {Vincent Arel-Bundock}, year = {2018}, note = {R package version 1.1.0}, url = {https://CRAN.R-project.org/package=countrycode}, } @Manual{R-data.table, title = {data.table: Extension of `data.frame`}, author = {Matt Dowle and Arun Srinivasan}, year = {2019}, note = {R package version 1.12.8}, url = {https://CRAN.R-project.org/package=data.table}, } @Manual{R-dbplyr, title = {dbplyr: A 'dplyr' Back End for Databases}, author = {Hadley Wickham and Edgar Ruiz}, year = {2019}, note = {R package version 1.4.2}, url = {https://CRAN.R-project.org/package=dbplyr}, } @Manual{R-devtools, title = {devtools: Tools to Make Developing R Packages Easier}, author = {Hadley Wickham and Jim Hester and Winston Chang}, year = {2019}, note = {R package version 2.2.1}, url = {https://CRAN.R-project.org/package=devtools}, } @Manual{R-dplyr, title = {dplyr: A Grammar of Data Manipulation}, author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller}, year = {2019}, note = {R package version 0.8.3}, url = {https://CRAN.R-project.org/package=dplyr}, } @Manual{R-formatR, title = {formatR: Format R Code Automatically}, author = 
{Yihui Xie}, year = {2019}, note = {R package version 1.7}, url = {https://CRAN.R-project.org/package=formatR}, } @Manual{R-ggplot2, title = {ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics}, author = {Hadley Wickham and Winston Chang and Lionel Henry and Thomas Lin Pedersen and Kohske Takahashi and Claus Wilke and Kara Woo and Hiroaki Yutani}, year = {2019}, note = {R package version 3.2.1}, url = {https://CRAN.R-project.org/package=ggplot2}, } @Manual{R-googleVis, title = {googleVis: R Interface to Google Charts}, author = {Markus Gesmann and Diego {de Castillo}}, year = {2019}, note = {R package version 0.6.4}, url = {https://CRAN.R-project.org/package=googleVis}, } @Manual{R-here, title = {here: A Simpler Way to Find Your Files}, author = {Kirill Müller}, year = {2017}, note = {R package version 0.1}, url = {https://CRAN.R-project.org/package=here}, } @Manual{R-htmlwidgets, title = {htmlwidgets: HTML Widgets for R}, author = {Ramnath Vaidyanathan and Yihui Xie and JJ Allaire and Joe Cheng and Kenton Russell}, year = {2019}, note = {R package version 1.5.1}, url = {https://CRAN.R-project.org/package=htmlwidgets}, } @Manual{R-httr, title = {httr: Tools for Working with URLs and HTTP}, author = {Hadley Wickham}, year = {2019}, note = {R package version 1.4.1}, url = {https://CRAN.R-project.org/package=httr}, } @Manual{R-IRkernel, title = {IRkernel: Native R Kernel for the 'Jupyter Notebook'}, author = {Thomas Kluyver and Philipp Angerer and Jan Schulz and Karthik Ram}, year = {2019}, note = {R package version 1.1}, url = {https://CRAN.R-project.org/package=IRkernel}, } @Manual{R-jsonlite, title = {jsonlite: A Robust, High Performance JSON Parser and Generator for R}, author = {Jeroen Ooms and Duncan {Temple Lang} and Lloyd Hilaiel}, year = {2018}, note = {R package version 1.6}, url = {https://CRAN.R-project.org/package=jsonlite}, } @Manual{R-knitr, title = {knitr: A General-Purpose Package for Dynamic Report Generation in R}, author = 
{Yihui Xie}, year = {2020}, note = {R package version 1.27}, url = {https://CRAN.R-project.org/package=knitr}, } @Manual{R-magick, title = {magick: Advanced Graphics and Image-Processing in R}, author = {Jeroen Ooms}, year = {2019}, note = {R package version 2.2}, url = {https://CRAN.R-project.org/package=magick}, } @Manual{R-magrittr, title = {magrittr: A Forward-Pipe Operator for R}, author = {Stefan Milton Bache and Hadley Wickham}, year = {2014}, note = {R package version 1.5}, url = {https://CRAN.R-project.org/package=magrittr}, } @Manual{R-markdown, title = {markdown: Render Markdown with the C Library 'Sundown'}, author = {JJ Allaire and Jeffrey Horner and Yihui Xie and Vicent Marti and Natacha Porte}, year = {2019}, note = {R package version 1.1}, url = {https://CRAN.R-project.org/package=markdown}, } @Manual{R-packrat, title = {packrat: A Dependency Management System for Projects and their R Package Dependencies}, author = {Kevin Ushey and Jonathan McPherson and Joe Cheng and Aron Atkins and JJ Allaire}, year = {2018}, note = {R package version 0.5.0}, url = {https://CRAN.R-project.org/package=packrat}, } @Manual{R-pacman, title = {pacman: Package Management Tool}, author = {Tyler Rinker and Dason Kurkiewicz}, year = {2019}, note = {R package version 0.5.1}, url = {https://CRAN.R-project.org/package=pacman}, } @Manual{R-pdftools, title = {pdftools: Text Extraction, Rendering and Converting of PDF Documents}, author = {Jeroen Ooms}, year = {2019}, note = {R package version 2.3}, url = {https://CRAN.R-project.org/package=pdftools}, } @Manual{R-ProjectTemplate, title = {ProjectTemplate: Automates the Creation of New Statistical Analysis Projects}, author = {John Myles White}, year = {2019}, note = {R package version 0.9.0}, url = {https://CRAN.R-project.org/package=ProjectTemplate}, } @Manual{R-purrr, title = {purrr: Functional Programming Tools}, author = {Lionel Henry and Hadley Wickham}, year = {2019}, note = {R package version 0.3.3}, url = 
{https://CRAN.R-project.org/package=purrr}, } @Manual{R-RCurl, title = {RCurl: General Network (HTTP/FTP/...) Client Interface for R}, author = {Duncan {Temple Lang} and the CRAN team}, year = {2020}, note = {R package version 1.95-4.13}, url = {https://CRAN.R-project.org/package=RCurl}, } @Manual{R-rio, title = {rio: A Swiss-Army Knife for Data I/O}, author = {Chung-hong Chan and Thomas J. Leeper}, year = {2018}, note = {R package version 0.5.16}, url = {https://CRAN.R-project.org/package=rio}, } @Manual{R-rmarkdown, title = {rmarkdown: Dynamic Documents for R}, author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone}, year = {2019}, note = {R package version 2.0}, url = {https://CRAN.R-project.org/package=rmarkdown}, } @Manual{R-rvest, title = {rvest: Easily Harvest (Scrape) Web Pages}, author = {Hadley Wickham}, year = {2019}, note = {R package version 0.3.5}, url = {https://CRAN.R-project.org/package=rvest}, } @Manual{R-shiny, title = {shiny: Web Application Framework for R}, author = {Winston Chang and Joe Cheng and JJ Allaire and Yihui Xie and Jonathan McPherson}, year = {2019}, note = {R package version 1.4.0}, url = {https://CRAN.R-project.org/package=shiny}, } @Manual{R-stargazer, title = {stargazer: Well-Formatted Regression and Summary Statistics Tables}, author = {Marek Hlavac}, year = {2018}, note = {R package version 5.2.2}, url = {https://CRAN.R-project.org/package=stargazer}, } @Manual{R-styler, title = {styler: Non-Invasive Pretty Printing of R Code}, author = {Kirill Müller and Lorenz Walthert}, year = {2019}, note = {R package version 1.2.0}, url = {https://CRAN.R-project.org/package=styler}, } @Manual{R-survival, title = {survival: Survival Analysis}, author = {Terry M Therneau}, year = {2019}, note = {R package version 3.1-8}, url = {https://CRAN.R-project.org/package=survival}, } @Manual{R-texreg, title = {texreg: 
Conversion of R Regression Output to LaTeX or HTML Tables}, author = {Philip Leifeld}, year = {2017}, note = {R package version 1.36.23}, url = {https://CRAN.R-project.org/package=texreg}, } @Manual{R-tibble, title = {tibble: Simple Data Frames}, author = {Kirill Müller and Hadley Wickham}, year = {2019}, note = {R package version 2.1.3}, url = {https://CRAN.R-project.org/package=tibble}, } @Manual{R-tidyr, title = {tidyr: Tidy Messy Data}, author = {Hadley Wickham and Lionel Henry}, year = {2019}, note = {R package version 1.0.0}, url = {https://CRAN.R-project.org/package=tidyr}, } @Manual{R-tidyverse, title = {tidyverse: Easily Install and Load the 'Tidyverse'}, author = {Hadley Wickham}, year = {2019}, note = {R package version 1.3.0}, url = {https://CRAN.R-project.org/package=tidyverse}, } @Manual{R-tinytex, title = {tinytex: Helper Functions to Install and Maintain TeX Live, and Compile LaTeX Documents}, author = {Yihui Xie}, year = {2020}, note = {R package version 0.19}, url = {https://CRAN.R-project.org/package=tinytex}, } @Manual{R-WDI, title = {WDI: World Development Indicators (World Bank)}, author = {Vincent Arel-Bundock}, year = {2019}, note = {R package version 2.6.0}, url = {https://CRAN.R-project.org/package=WDI}, } @Manual{R-xfun, title = {xfun: Miscellaneous Functions by 'Yihui Xie'}, author = {Yihui Xie}, year = {2020}, note = {R package version 0.12}, url = {https://CRAN.R-project.org/package=xfun}, } @Manual{R-XML, title = {XML: Tools for Parsing and Generating XML Within R and S-Plus}, author = {Duncan {Temple Lang}}, year = {2020}, note = {R package version 3.99-0.2}, url = {https://CRAN.R-project.org/package=XML}, } @Manual{R-xtable, title = {xtable: Export Tables to LaTeX or HTML}, author = {David B. 
Dahl and David Scott and Charles Roosen and Arni Magnusson and Jonathan Swinton}, year = {2019}, note = {R package version 1.8-4}, url = {https://CRAN.R-project.org/package=xtable}, } @Article{animation2013, title = {{animation}: An {R} Package for Creating Animations and Demonstrating Statistical Methods}, author = {Yihui Xie}, journal = {Journal of Statistical Software}, year = {2013}, volume = {53}, number = {1}, pages = {1--27}, url = {http://www.jstatsoft.org/v53/i01/}, } @Book{bookdown2016, title = {bookdown: Authoring Books and Technical Documents with {R} Markdown}, author = {Yihui Xie}, publisher = {Chapman and Hall/CRC}, address = {Boca Raton, Florida}, year = {2016}, note = {ISBN 978-1138700109}, url = {https://github.com/rstudio/bookdown}, } @Article{brms2017, title = {{brms}: An {R} Package for {Bayesian} Multilevel Models Using {Stan}}, author = {Paul-Christian Bürkner}, journal = {Journal of Statistical Software}, year = {2017}, volume = {80}, number = {1}, pages = {1--28}, doi = {10.18637/jss.v080.i01}, encoding = {UTF-8}, } @Article{brms2018, title = {Advanced {Bayesian} Multilevel Modeling with the {R} Package {brms}}, author = {Paul-Christian Bürkner}, journal = {The R Journal}, year = {2018}, volume = {10}, number = {1}, pages = {395--411}, doi = {10.32614/RJ-2018-017}, encoding = {UTF-8}, } @Article{countrycode2018, title = {countrycode: An R package to convert country names and country codes}, author = {Vincent Arel-Bundock and Nils Enevoldsen and CJ Yetman}, journal = {Journal of Open Source Software}, year = {2018}, volume = {3}, number = {28}, pages = {848}, url = {https://doi.org/10.21105/joss.00848}, } @Book{ggplot22016, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, publisher = {Springer-Verlag New York}, year = {2016}, isbn = {978-3-319-24277-4}, url = {https://ggplot2.tidyverse.org}, } @Article{googleVis2011, title = {googleVis: Interface between R and the Google Visualisation API}, author = {Markus 
Gesmann and Diego {de Castillo}}, journal = {The R Journal}, year = {2011}, volume = {3}, number = {2}, pages = {40--44}, month = {December}, url = {https://journal.r-project.org/archive/2011-2/RJournal_2011-2_Gesmann+de~Castillo.pdf}, } @Article{jsonlite2014, title = {The jsonlite Package: A Practical and Consistent Mapping Between JSON Data and R Objects}, author = {Jeroen Ooms}, journal = {arXiv:1403.2805 [stat.CO]}, year = {2014}, url = {https://arxiv.org/abs/1403.2805}, } @Book{knitr2015, title = {Dynamic Documents with {R} and knitr}, author = {Yihui Xie}, publisher = {Chapman and Hall/CRC}, address = {Boca Raton, Florida}, year = {2015}, edition = {2nd}, note = {ISBN 978-1498716963}, url = {https://yihui.org/knitr/}, } @InCollection{knitr2014, booktitle = {Implementing Reproducible Computational Research}, editor = {Victoria Stodden and Friedrich Leisch and Roger D. Peng}, title = {knitr: A Comprehensive Tool for Reproducible Research in {R}}, author = {Yihui Xie}, publisher = {Chapman and Hall/CRC}, year = {2014}, note = {ISBN 978-1466561595}, url = {http://www.crcpress.com/product/isbn/9781466561595}, } @Manual{pacman2018, title = {{pacman}: {P}ackage Management for {R}}, author = {Tyler W. Rinker and Dason Kurkiewicz}, address = {Buffalo, New York}, note = {version 0.5.0}, year = {2018}, url = {http://github.com/trinker/pacman}, } @Book{rmarkdown2018, title = {R Markdown: The Definitive Guide}, author = {Yihui Xie and J.J. Allaire and Garrett Grolemund}, publisher = {Chapman and Hall/CRC}, address = {Boca Raton, Florida}, year = {2018}, note = {ISBN 9781138359338}, url = {https://bookdown.org/yihui/rmarkdown}, } @Manual{survival-package, title = {A Package for Survival Analysis in S}, author = {Terry M Therneau}, year = {2015}, note = {version 2.38}, url = {https://CRAN.R-project.org/package=survival}, } @Book{survival-book, title = {Modeling Survival Data: Extending the {C}ox Model}, author = {{Terry M. Therneau} and {Patricia M. 
Grambsch}}, year = {2000}, publisher = {Springer}, address = {New York}, isbn = {0-387-98784-3}, } @Article{texreg2013, title = {{texreg}: Conversion of Statistical Model Output in {R} to {\LaTeX} and {HTML} Tables}, author = {Philip Leifeld}, journal = {Journal of Statistical Software}, year = {2013}, volume = {55}, number = {8}, pages = {1--24}, url = {http://www.jstatsoft.org/v55/i08/}, } @Article{tidyverse2019, title = {Welcome to the {tidyverse}}, author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, year = {2019}, journal = {Journal of Open Source Software}, volume = {4}, number = {43}, pages = {1686}, doi = {10.21105/joss.01686}, } @Article{tinytex2019, title = {TinyTeX: A lightweight, cross-platform, and easy-to-maintain LaTeX distribution based on TeX Live}, author = {Yihui Xie}, journal = {TUGboat}, year = {2019}, volume = {40}, number = {1}, pages = {30--32}, url = {http://tug.org/TUGboat/Contents/contents40-1.html}, } ================================================ FILE: rep-res-3rd-edition/rep-res-3rd-edition.Rproj ================================================ Version: 1.0 RestoreWorkspace: No SaveWorkspace: No AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 4 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX BuildType: Website