Showing preview only (2,158K chars total). Download the full file or copy to clipboard to get everything.
Repository: geocompx/geocompr
Branch: main
Commit: 2cb2a1a6ec6a
Files: 204
Total size: 2.0 MB
Directory structure:
gitextract_34buill7/
├── .Rbuildignore
├── .binder/
│ ├── Dockerfile
│ ├── LICENSE
│ └── README.md
├── .devcontainer.json
├── .gitattributes
├── .github/
│ ├── .gitignore
│ ├── ISSUE_TEMPLATE.md
│ └── workflows/
│ ├── dev-sf.yaml
│ ├── main-no-deploy.yml
│ ├── main.yaml
│ └── qgis-ext.yaml
├── .gitignore
├── .htaccess
├── .lintr
├── .nojekyll
├── .vscode/
│ └── settings.json
├── 01-introduction.Rmd
├── 02-spatial-data.Rmd
├── 03-attribute-operations.Rmd
├── 04-spatial-operations.Rmd
├── 05-geometry-operations.Rmd
├── 06-raster-vector.Rmd
├── 07-reproj.Rmd
├── 08-read-write-plot.Rmd
├── 09-mapping.Rmd
├── 10-gis.Rmd
├── 11-algorithms.Rmd
├── 12-spatial-cv.Rmd
├── 13-transport.Rmd
├── 14-location.Rmd
├── 15-eco.Rmd
├── 16-synthesis.Rmd
├── CITATION.bib
├── CITATION_ed1.bib
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── LICENSE.md
├── README.Rmd
├── README.md
├── _01-ex.Rmd
├── _02-ex.Rmd
├── _03-ex.Rmd
├── _04-ex.Rmd
├── _05-ex.Rmd
├── _06-ex.Rmd
├── _07-ex.Rmd
├── _08-ex.Rmd
├── _09-ex.Rmd
├── _10-ex.Rmd
├── _11-ex.Rmd
├── _12-ex.Rmd
├── _13-ex.Rmd
├── _14-ex.Rmd
├── _15-ex.Rmd
├── _404.Rmd
├── _bookdown.yml
├── _output.yml
├── _redirects
├── apps/
│ ├── CycleHireApp/
│ │ ├── app.R
│ │ └── manifest.json
│ └── coffeeApp/
│ ├── app.R
│ └── manifest.json
├── benchmarks.csv
├── code/
│ ├── 01-cranlogs.R
│ ├── 01-sf-revdep.R
│ ├── 02-contpop.R
│ ├── 02-datum-fig.R
│ ├── 02-raster-crs.R
│ ├── 02-raster-intro-plot.R
│ ├── 02-raster-intro-plot2.R
│ ├── 02-sfdiagram.R
│ ├── 02-sfheaders.R
│ ├── 02-vector-crs.R
│ ├── 02-vectorplots.R
│ ├── 03-cont-raster-plot.R
│ ├── 04-areal-example.R
│ ├── 04-focal-example.R
│ ├── 04-local-operations.R
│ ├── 04-ndvi.R
│ ├── 04-raster-subset.R
│ ├── 04-spatial-join.R
│ ├── 05-bilinear.R
│ ├── 05-extend-example.R
│ ├── 05-us-regions.R
│ ├── 05-venn-clip.R
│ ├── 06-contour-tmap.R
│ ├── 06-pointextr.R
│ ├── 06-raster-vectorization1.R
│ ├── 06-raster-vectorization2.R
│ ├── 06-vector-rasterization1.R
│ ├── 06-vector-rasterization2.R
│ ├── 09-break-styles.R
│ ├── 09-layout1.R
│ ├── 09-layout2.R
│ ├── 09-map-pkgs.R
│ ├── 09-tmpal.R
│ ├── 09-tmshape.R
│ ├── 09-tmstyles.R
│ ├── 09-urban-animation.R
│ ├── 09-usboundaries.R
│ ├── 10-qgis-raster.R
│ ├── 10-saga-segments.R
│ ├── 10-saga-wetness.R
│ ├── 10-sliver.R
│ ├── 10-tsp.R
│ ├── 11-centroid-alg.R
│ ├── 11-centroid-setup.R
│ ├── 11-hello.R
│ ├── 11-polycent.R
│ ├── 12-cv.R
│ ├── 12-partitioning.R
│ ├── 13-cycleways.R
│ ├── 13-desire.R
│ ├── 13-transport-data-gen.R
│ ├── 13-zones.R
│ ├── 14-location-figures.R
│ ├── 15-rf_mlr3.R
│ ├── add-impact.R
│ ├── before_script.R
│ ├── benchmark.R
│ ├── chapters/
│ │ ├── 01-introduction.R
│ │ ├── 02-spatial-data.R
│ │ ├── 03-attribute-operations.R
│ │ ├── 04-spatial-operations.R
│ │ ├── 05-geometry-operations.R
│ │ ├── 06-raster-vector.R
│ │ ├── 07-reproj.R
│ │ ├── 08-read-write-plot.R
│ │ ├── 09-mapping.R
│ │ ├── 10-gis.R
│ │ ├── 11-algorithms.R
│ │ ├── 12-spatial-cv.R
│ │ ├── 13-transport.R
│ │ ├── 14-location.R
│ │ ├── 15-eco.R
│ │ ├── 16-synthesis.R
│ │ ├── README.R
│ │ ├── _01-ex.R
│ │ ├── _02-ex.R
│ │ ├── _03-ex.R
│ │ ├── _04-ex.R
│ │ ├── _05-ex.R
│ │ ├── _06-ex.R
│ │ ├── _07-ex.R
│ │ ├── _08-ex.R
│ │ ├── _10-ex.R
│ │ ├── _12-ex.R
│ │ ├── _13-ex.R
│ │ ├── _15-ex.R
│ │ ├── _404.R
│ │ ├── index.R
│ │ └── references.R
│ ├── de_9im.R
│ ├── extra-pkgs.R
│ ├── front_cover2.R
│ ├── frontcover.R
│ ├── generate-chapter-code.R
│ ├── hex_sticker.R
│ ├── list-contributors.R
│ ├── old-to-future-remove/
│ │ ├── 06_raster_reprojection_tests.R
│ │ ├── 08-uscolonize.R
│ │ ├── 10-centroid.R
│ │ ├── 10-earthquakes.R
│ │ ├── 12-code-extension.R
│ │ ├── 12-desire-front.R
│ │ ├── globe.R
│ │ ├── sfr-class-diagram-gen.R
│ │ └── spData.R
│ ├── sf-classes.R
│ └── sfheaders.Rmd
├── extdata/
│ ├── .gitignore
│ ├── 12-bmr_score.rds
│ ├── 15-bmr_exercises.rds
│ ├── 15-nmds.rds
│ ├── 15-rp_exercises.rds
│ ├── 15-tune.rds
│ ├── coffee-data-messy.csv
│ ├── coffee-data.csv
│ ├── contributors.csv
│ ├── generic_map_pkgs.csv
│ ├── gis-vs-gds-table.csv
│ ├── package_list.csv
│ ├── postgis_data.Rdata
│ ├── sfs-st-cast.csv
│ ├── specific_map_pkgs.csv
│ ├── svm_sp_sp_rbf_50it.rds
│ ├── top_dls.csv
│ └── word-count-time.csv
├── geocompr.Rproj
├── geocompr.bib
├── images/
│ └── r_logo.tif
├── index.Rmd
├── krantz.cls
├── makefile
├── misc/
│ ├── our-impact.csv
│ └── our-style.md
├── packages.bib
├── references.Rmd
└── style/
├── after_body.tex
├── before_body.tex
├── ga.html
├── preamble.tex
└── style.css
================================================
FILE CONTENTS
================================================
================================================
FILE: .Rbuildignore
================================================
^.*\.Rproj$
^\.Rproj\.user$
^\.travis\.yml$
^README\.Rmd$
^README-.*\.png$
^\.github$
================================================
FILE: .binder/Dockerfile
================================================
FROM ghcr.io/geocompx/docker:binder
## Declares build arguments
ARG NB_USER
ARG NB_UID
COPY --chown=${NB_USER} . /home/rstudio
================================================
FILE: .binder/LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2021, Yuvi Panda
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: .binder/README.md
================================================
# Template for RStudio on Binder / JupyterHub
[](https://mybinder.org/v2/gh/yuvipanda/rstudio-binder-template/HEAD?urlpath=rstudio)
Generate a Git repository that can run R code with RStudio on
the browser via [mybinder.org](https://mybinder.org) or any JupyterHub
from this template repository!
Based on the excellent [rocker/binder](https://hub.docker.com/r/rocker/binder)
image maintained by the [Rocker project](https://www.rocker-project.org/)
## How to use this repo
### 1. Create a new repo using this as a template
Use the [Use this template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)
button on GitHub. Use a descriptive name representing the
GUI app you are running / demoing. You can then follow the rest of
the instructions in this README from your newly created repository.
### 2. Install any packages you want
You can create an `install.R` file that will be executed on build.
Use `install.packages` or `devtools::install_version`.
```R
install.packages("ggplot2")
```
Packages are installed from [packagemanager.rstudio.com](https://packagemanager.rstudio.com/client/#/),
and binary packages are preferred wherever possible. For some R packages,
you might need to install system packages via apt - you can do so by writing
out a list of apt package names in `apt.txt`.
### 3. Modify the Binder Badge in the README.md
The 'Launch on Binder' badge in this README points to the template repository.
You should modify it to point to your own repository. Keep the `urlpath=rstudio`
parameter intact - that is what makes sure your repo will launch directly into
RStudio
### 4. Add your R code and update README
Finally, add the R code you want to demo to the repository! Cleanup the README
too so it talks about your code, not these instructions on setting up this repo
================================================
FILE: .devcontainer.json
================================================
{
"image": "pixi-r",
"customizations": {
"vscode": {
"extensions": ["reditorsupport.r"]
}
}
}
================================================
FILE: .gitattributes
================================================
*.html linguist-vendored
*.bib linguist-vendored
latex/ linguist-vendored
================================================
FILE: .github/.gitignore
================================================
*.html
================================================
FILE: .github/ISSUE_TEMPLATE.md
================================================
<!-- To report issues...
(1) If it's about content -- link to the offending link/section
(2) If it's a about code -- a reproducible example will help
(3) Any other comments/questions/suggestions are welcome
Please remove this message before posting. Thanks!
-->
================================================
FILE: .github/workflows/dev-sf.yaml
================================================
name: dev-pkgs
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
jobs:
  build:
    runs-on: ubuntu-latest
    container: ghcr.io/geocompx/suggests:latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout Project
        # v2 matches the checkout action version used by the other workflows
        # in this repository (main.yaml, main-no-deploy.yml, qgis-ext.yaml);
        # v1 is long deprecated.
        uses: actions/checkout@v2
      # Build the book against development versions of the core spatial
      # packages (sf, stars, terra) to catch upstream breakage early.
      - name: Build book
        run: |
          Rscript -e 'install.packages("geocompkg", repos = c("https://geocompr.r-universe.dev", "https://cloud.r-project.org"), dependencies = TRUE, force = TRUE)'
          Rscript -e 'remotes::install_github("r-spatial/sf")'
          Rscript -e 'remotes::install_github("r-spatial/stars")'
          Rscript -e 'remotes::install_github("rspatial/terra")'
          #Rscript -e 'remotes::install_github("geocompx/geocompkg", dependencies = TRUE, force = TRUE)'
          Rscript -e 'bookdown::render_book("index.Rmd")'
================================================
FILE: .github/workflows/main-no-deploy.yml
================================================
on:
pull_request:
branches:
- main
name: Render-no-deploy
jobs:
bookdown:
name: Render-Book
runs-on: ubuntu-latest
container: ghcr.io/geocompx/suggests:latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
- name: Render Book
run: |
Rscript -e 'bookdown::render_book("index.Rmd")'
================================================
FILE: .github/workflows/main.yaml
================================================
on:
  push:
    branches:
      # Branch filters must be a YAML sequence; the bare scalar form used
      # previously is inconsistent with the other workflow files.
      - main
name: Render
jobs:
  bookdown:
    name: Render-Book
    runs-on: ubuntu-latest
    container: ghcr.io/geocompx/suggests:latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - uses: actions/checkout@v2
      # Render the book and copy redirect/server config into the output
      # directory before deployment.
      - name: Render Book
        run: |
          #Rscript -e 'install.packages("remotes")'
          #Rscript -e 'remotes::install_github("geocompx/geocompkg", dependencies = TRUE, force = TRUE)'
          #Rscript -e 'install.packages("geocompkg", repos = c("https://geocompr.r-universe.dev", "https://cloud.r-project.org"), dependencies = TRUE, force = TRUE)'
          Rscript -e 'bookdown::render_book("index.Rmd")'
          cp -fvr _redirects _book/
          cp -fvr .htaccess _book/
          #cp -fvr es.html fr.html solutions.html htaccess.txt _book/
      # Publish the rendered book to the gh-pages branch.
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./_book
          publish_branch: gh-pages
          commit_message: 'Deploy commit: ${{ github.event.head_commit.message }}'
================================================
FILE: .github/workflows/qgis-ext.yaml
================================================
name: qgis
on:
push:
branches:
- main
pull_request:
branches:
- main
jobs:
bookdown:
name: Render-Book
runs-on: ubuntu-latest
container: ghcr.io/geocompx/qgis:latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
#- name: Install deps
# run: Rscript -e 'install.packages("geocompkg", repos = c("https://geocompx.r-universe.dev", "https://cloud.r-project.org"), dependencies = TRUE, force = TRUE)'
- name: Render Book
run: Rscript -e 'bookdown::render_book("index.Rmd")'
================================================
FILE: .gitignore
================================================
.bash_history
.rstudio/
.Rproj.user
.Rhistory
.RData
_bookdown_files
.DS_Store
.Rapp.history
_book
_main.*
*.html
*.pdf
*.utf8.md
!ga.html
libs/
geocompr2.rds
*.gpkg
figures/
.claude/
CLAUDE.md
*_cache/
================================================
FILE: .htaccess
================================================
RewriteCond %{REQUEST_FILENAME} !-f
RewriteRule ^(.*)([^/])$ /$1$2/ [L,R=301]
================================================
FILE: .lintr
================================================
linters: with_defaults(
line_length_linter(120),
assignment_linter = NULL
)
================================================
FILE: .nojekyll
================================================
================================================
FILE: .vscode/settings.json
================================================
{
"editor.wordWrap": "on",
"makefile.configureOnOpen": false
}
================================================
FILE: 01-introduction.Rmd
================================================
```{asis index-2, echo=knitr::is_latex_output()}
\mainmatter
```
# Introduction {#intro}
```{r, include=FALSE}
# Run the shared per-chapter setup script (package/options setup common to
# all chapters); include=FALSE hides both code and output from the book.
source("code/before_script.R")
```
This book is about using the power of computers to *do things* with geographic data.
It teaches a range of spatial skills, including: reading, writing and manipulating geographic file formats; making static and interactive maps; and applying geocomputation\index{geocomputation} to support more evidence-based decision-making related to a range of geographic phenomena, from transport systems to ecosystems.
By demonstrating how various geographic operations can be linked, in 'code chunks' that intersperse the prose, the book also teaches reproducible, open and thus scientific workflows.
The book is not just about using the wealth of *existing tools* for geocomputation: it's also about understanding the geographic data structures and software needed to build *new tools*.
The approach we teach throughout, and programming techniques covered in Chapter \@ref(algorithms)\index{algorithm} in particular, can remove constraints on your creativity imposed by software.
After reading the book and completing the exercises, you should be ready to tackle real-world problems, communicate your work in maps and code, and contribute to the open source communities developing tools and documentation for reproducible geocomputation.
Over the last few decades, free and open source software for geospatial (FOSS4G\index{FOSS4G}) has progressed at an astonishing rate.
Thanks to organizations such as OSGeo, advanced geographic techniques are no longer the preserve of those with expensive hardware and software: anyone can now download and run high-performance software for geocomputation.
Open source Geographic Information Systems (GIS\index{GIS}), such as [QGIS](https://qgis.org/en/site/)\index{QGIS}, have made geographic analysis accessible worldwide.
GIS software products are powerful, but they tend to emphasize a graphical user interface\index{graphical user interface} (GUI) approach over the command-line interface (CLI) approach advocated in this book.
The 'GUI focus' of many GIS products has the unintended consequence of disabling many users from making their work fully reproducible\index{reproducibility}, a problem that can be overcome by calling 'geoalgorithms' contained in GIS software from the command line, as we'll see in Chapter \@ref(gis).
A simplistic comparison between the different approaches is illustrated in Table \@ref(tab:gdsl).
```{r gdsl, echo=FALSE, message=FALSE}
# Read the GUI-GIS vs. command-line comparison data and render it as
# Table \@ref(tab:gdsl); only the table appears in the book (echo=FALSE).
d = readr::read_csv("extdata/gis-vs-gds-table.csv")
knitr::kable(x = d,
caption = paste("Differences in emphasis between software",
"packages (Graphical User Interface (GUI) of",
"Geographic Information Systems (GIS) and R)."),
caption.short = "Differences between GUI and CLI",
booktabs = TRUE)
```
R is not the only language providing a CLI for geocomputation.
Other command environments with powerful geographic capabilities exist, including Python\index{Python} (covered in the book [Geocomputation with Python](https://py.geocompx.org/)), Julia, and JavaScript.
However, R has advantages that make it a good language for learning geocomputation and for many geocomputation tasks, especially for statistics, modeling and visualization, as outlined in Section \@ref(why-open-source).
This book is also motivated by the importance of reproducibility\index{reproducibility} for scientific research.
It aims to make reproducible geographic data analysis\index{geographic data analysis} workflows more accessible, and demonstrate the power of open geospatial software available from the command line.
R provides ways to interface with other languages [@eddelbuettel_extending_2018], enabling numerous spatial software libraries to be called from R, as explained in Section \@ref(why-use-r-for-geocomputation) and demonstrated in Chapter \@ref(gis).
Before going into the details of the software, however, it is worth taking a step back and thinking about what we mean by geocomputation\index{geocomputation}.
```{block2 01-introduction-1, type='rmdnote'}
Reproducibility is a major advantage of command-line interfaces, but what does it mean in practice?
We define it as follows: "A process in which the same results can be generated by others using publicly accessible code".
This may sound simple and easy to achieve (which it is if you carefully maintain your R code in script files), but it has profound implications for teaching and the scientific process [@pebesma_r_2012].
```
\index{reproducibility}
## What is geocomputation?
We define geocomputation as
> Academic research, software development and practical applications that use geographic data to solve problems, with a focus on reproducibility, flexibility and tool development.
Geocomputation\index{geocomputation!definition} is a young term, dating back to the first conference on the subject in 1996.^[
The first 'GeoComputation' conference took place at the University of Leeds, where one of the authors (Robin) is currently based.
In 2017 the GeoComputation conference returned to University of Leeds, providing a chance for us to work on and present the book (see www.geocomputation.org for more on the conference series, and papers/presentations spanning more than two decades).
]
What distinguished geocomputation from the (at the time) commonly used term 'quantitative geography' was its emphasis on "creative and experimental" applications [@longley_geocomputation_1998] and the development of new tools and methods.
In the words of Stan Openshaw, a pioneer in the field who was an advocate (and possibly originator) of the term, "GeoComputation is about using the various different types of geodata and about developing relevant geo-tools within the overall context of a 'scientific' approach" [@openshaw_geocomputation_2000].
Building on this early definition, *Geocomputation with R* goes beyond data analysis and modeling to include the development of new tools and methods for work that is not just interesting academically but beneficial.
Our approach differs from early definitions of geocomputation in one important way, however: in its emphasis on reproducibility\index{reproducibility} and collaboration.
At the turn of the 21^st^ Century, it was unrealistic to expect readers to be able to reproduce code examples, due to barriers preventing access to the necessary hardware, software and data.
Fast-forward to today and things have progressed rapidly.
Anyone with access to a laptop with sufficient RAM (at least eight GB recommended) can install and run software for geocomputation, and reproduce the contents of this book.
Financial and hardware barriers to geocomputation that existed in the 1990s and early 2000s, when high-performance computers were too expensive for most people, have been removed.^[
A suitable laptop can be acquired second-hand for $100 or less in most countries today from websites such as [Ebay](https://www.ebay.com/sch/i.html?_from=R40&_nkw=laptop&_sacat=0&_oaa=1&_udhi=100&rt=nc&RAM%2520Size=4%2520GB%7C16%2520GB%7C8%2520GB&_dcat=177).
Guidance on installing R and a suitable code editor is provided in Chapter \@ref(spatial-class).
]
Geocomputation is also more accessible because publicly accessible datasets are more widely available than ever before, as we will see in Chapter \@ref(read-write).
Unlike early works in the field, all the work presented in this book is reproducible using code and example data supplied alongside the book, in R\index{R} packages such as **spData**, the installation of which is covered in Chapter \@ref(spatial-class).
Geocomputation\index{geocomputation} is closely related to other terms including: Geographic Information Science (GIScience); Geomatics; Geoinformatics; Spatial Information Science; Geoinformation Engineering [@longley_geographic_2015]; and Spatial Data Science\index{spatial data science}\index{geographical data science|see spatial data science} (SDS).
Each term shares an emphasis on a 'scientific' (implying reproducible and falsifiable) approach influenced by GIS\index{GIS!definition}, although their origins and main fields of application differ.
SDS, for example, emphasizes 'data science' skills and large datasets, while Geoinformatics tends to focus on data structures.
But the overlaps between the terms are larger than the differences between them and we use geocomputation as a rough synonym encapsulating all of them:
they all seek to use geographic data for applied scientific work.
Unlike early users of the term, however, we do not seek to imply that there is any cohesive academic field called 'Geocomputation' (or 'GeoComputation' as Stan Openshaw called it).
Geocomputation is a recent term but is influenced by old ideas.
It can be seen as a part of Geography\index{geography}, which has a 2000+ year history [@talbert_ancient_2014];
and an extension of GIS\index{GIS} [@neteler_open_2008], which emerged in the 1960s [@coppock_history_1991].
Geography\index{geography} has played an important role in explaining and influencing humanity's relationship with the natural world long before the invention of the computer.
The famous explorer, early geographer and pioneering polymath Alexander von Humboldt\index{von Humboldt} (who has dozens of species, geographic features, places and even universities named after him, such was his influence) illustrates this role:
not only did his travels to South America in the early 1800s and resulting observations lay the foundations for physical geography and ecology, they also paved the way towards policies to protect the natural world [@wulf_invention_2015].
This book aims to contribute to the still-evolving 'Geographic Tradition' [@livingstone_geographical_1992] by harnessing the power of modern computers and open source software.
The book's links to older disciplines were reflected in suggested titles for the book: *Geography with R* and *R for GIS*.
Each has advantages.
The former conveys the applied nature of the content, which is about more than where something is on the map.
The latter communicates that this is a book about using R as a powerful command-line geographic information system, to perform spatial operations on *geographic data*.
However, the term GIS has connotations which fail to communicate some of R's\index{R} greatest strengths:
its abilities to seamlessly switch between geographic and non-geographic data processing, modeling and visualization tasks while enabling reproducibility go far beyond the capabilities of GIS.
Geocomputation\index{geocomputation} implies working with geographic data in a reproducible code-driven environment and programming new results, methods and tools, which is what this book is all about.\index{GIS!connotations}
## Why use open source tools for geocomputation? {#why-open-source}
Early geographers used a variety of tools including barometers, compasses and [sextants](https://en.wikipedia.org/wiki/Sextant) to advance knowledge about the world [@wulf_invention_2015].
It was only with the invention of the marine [chronometer](https://en.wikipedia.org/wiki/Marine_chronometer) in 1761 that it became possible to calculate longitude at sea, enabling ships to take more direct routes, for example.
Before the turn of the century, there was an acute shortage of data and tools for geographic analysis.
<!-- Using a sextant was a difficult-to-learn but essential tool for navigation in the 18th century.
Likewise, learning to program and write and share reproducible code can be hard, but is a key skill for modern geocomputation. -->
<!-- Maps were predominantly hand-drawn until modern computing enabled digitisation, a process which only become widespread in the 1990s and which is still ongoing [@gold_outsidein_1996; @auffret_histmapr_2017]. -->
Nowadays, researchers and practitioners have no such limitations and in some cases face the opposite problem: too much data and too many tools.
Most phones now have a global positioning (GPS\index{GPS}) receiver.
Sensors ranging from satellites and semi-autonomous vehicles to citizen scientists incessantly measure every part of the world.
The rate of data produced can be overwhelming, with emerging technologies such as autonomous vehicles generating hundreds or even thousands of gigabytes of data daily.
Remote sensing\index{remote sensing} datasets from satellites are too large to analyze with a single computer, as outlined in Chapter \@ref(gis).
This 'geodata revolution' drives demand for high performance computer hardware and efficient, scalable software to handle and extract signal from the noise.
Evolving open source tools can import and process subsets from the vast geographic data stores directly, via application programming interfaces (APIs) and via interfaces to databases. \index{spatial database}
With the rapidly changing hardware, software and data landscapes, it's important to choose tools that are future-proof.
A major advantage of open source software is its **rate of development and longevity**, with thousands of potential contributors.
Hundreds of people submit bug reports and suggest new features as well as documentation improvements to open source projects every day --- a rate of evolution that most proprietary solutions simply cannot keep up with.
A linked advantage is **interoperability**.
While proprietary products tend to be monolithic 'empires' that are difficult to maintain (linked to the previously mentioned advantage), open source software is more like a 'federation' of modular tools that can be combined in different ways.
This has allowed open source data science languages such as R to rapidly incorporate new developments such as interfaces to high performance visualization libraries and file formats, while proprietary solutions struggle to keep up.
Another major advantage is **reproducibility**.
Being able to replicate findings is vital for scientific research, and open source software removes an important barrier of reproducibility by enabling others to check your findings or applying your methods in new contexts using the same tools.
The combination of using tools that can be accessed by anyone for free with the ability to share code and data means that the results of your work can be checked and built upon by others, which is a huge advantage if you want your work to be used and cited.
The biggest advantage of open source software combined with sharing of reproducible code for many people, however, is the **community**.
The community enables you to get support far quicker and often of higher quality than is possible with a centralized and budget-limited support team associated with proprietary software.
The community can provide feedback, ideas and, as discussed in Chapter \@ref(conclusion), can help you to develop your own tools and methods.
R is an open source software project, a powerful language, and an ever-evolving community of statisticians and developers [@wickham_advanced_2019].
R is not the only language enabling reproducible geocomputation with open source software, as outlined in Section \@ref(software-for-geocomputation).
Many of the reasons for using R also apply to other open source languages for reproducible data science, such as Python\index{Python} and Julia.
However, R has some key advantages, as outlined in Section \@ref(why-use-r-for-geocomputation).
## Why use R for geocomputation? {#why-use-r-for-geocomputation}
R is a multi-platform, open source language and environment for statistical computing and graphics ([r-project.org/](https://www.r-project.org/)).
With a wide range of packages, R also supports advanced geospatial statistics\index{statistics}, modeling and visualization.
\index{R!language}
Integrated development environments (IDEs\index{IDE}) such as RStudio\index{RStudio} have made R more user-friendly for many, easing map-making with a panel dedicated to interactive visualization.
At its core, R is an object-oriented, [functional programming language](https://adv-r.hadley.nz/fp.html) [@wickham_advanced_2019] and was specifically designed as an interactive interface to other software [@chambers_extending_2016].
The latter also includes many 'bridges' to a treasure trove of GIS\index{GIS} software, 'geolibraries' and functions (see Chapter \@ref(gis)).
It is thus ideal for quickly creating 'geo-tools', without needing to master lower level languages (compared to R) such as C\index{C}, FORTRAN\index{FORTRAN} or Java\index{Java} (see Section \@ref(software-for-geocomputation)).
\index{R}
This can feel like breaking free from the metaphorical 'glass ceiling' imposed by GUI-based or proprietary geographic information systems (see Table \@ref(tab:gdsl) for a definition of GUI\index{graphical user interface}).
Furthermore, R facilitates access to other languages:
the packages **Rcpp** and **reticulate** enable access to C++\index{C++} and Python\index{Python} code, for example.
This means R can be used as a 'bridge' to a wide range of geospatial programs (see Section \@ref(software-for-geocomputation)).
Another example showing R's flexibility and evolving geographic capabilities is interactive map-making\index{map-making!interactive maps}.
As we'll see in Chapter \@ref(adv-map), the statement that R has "limited interactive [plotting] facilities" [@bivand_applied_2013] is no longer true.
This is demonstrated by the following code chunk, which creates Figure \@ref(fig:interactive) (the functions that generate the plot are covered in Section \@ref(interactive-maps)).
```{r 01-introduction-2, eval=FALSE, echo=FALSE}
# Look up the bounding boxes of two of the authors' cities and take the
# row means to get approximate center coordinates (used when choosing the
# lng/lat values in the interactive map chunk below).
# eval=FALSE: kept for reference only, never executed during the build.
a = osmdata::getbb("Hereford")
b = osmdata::getbb("Bialystok")
rowMeans(a)
rowMeans(b)
```
```{r interactive-demo, eval=FALSE}
# Displayed to the reader but not evaluated (eval=FALSE); the figure itself
# is produced by the 'interactive' chunk that follows.
library(leaflet)
popup = c("Robin", "Jakub", "Jannes")
leaflet() |>
addProviderTiles("NASAGIBS.ViirsEarthAtNight2012") |>
addMarkers(lng = c(-3, 23, 11),
lat = c(52, 53, 49),
popup = popup)
```
```{r interactive, fig.cap="The blue markers indicate where the authors are from. The basemap is a tiled image of the Earth at night provided by NASA. Interact with the online version at r.geocompx.org, for example by zooming in and clicking on the pop-ups.", out.width="100%", fig.scap="Where the authors are from.", echo=FALSE}
# Produce Figure \@ref(fig:interactive): a static screenshot for LaTeX/PDF
# output, or the pre-rendered interactive map embedded for HTML output.
# (Caption typo fixed: "Earthat night" -> "Earth at night".)
if (knitr::is_latex_output()){
knitr::include_graphics("images/interactive.png")
} else if (knitr::is_html_output()){
# Commented code below records how interactive.html was generated:
# library(leaflet)
# popup = c("Robin", "Jakub", "Jannes")
# interactive = leaflet() |>
# addProviderTiles("NASAGIBS.ViirsEarthAtNight2012") |>
# addMarkers(lng = c(-3, 23, 11),
# lat = c(52, 53, 49),
# popup = popup)
# library(htmlwidgets)
# saveWidget(interactive, file = "interactive.html")
# file.copy("interactive.html", "~/geocompr/geocompr.github.io/static/img/interactive.html")
knitr::include_url("https://geocompr.github.io/img/interactive.html")
}
```
\index{map-making!interactive}
It would have been difficult to produce Figure \@ref(fig:interactive) using R (or any open source language for data science) a few years ago, let alone as an interactive map.
This illustrates R's flexibility and how, thanks to developments such as **knitr** and **leaflet**, it can be used as an interface to other software, a theme that will recur throughout this book.
The use of R code, therefore, enables teaching geocomputation with reference to reproducible examples representing real-world phenomena, rather than just abstract concepts.
The 'R-spatial stack' is easy to install and has comprehensive, well-maintained and highly interoperable packages.
R has 'batteries included' with statistical functions as part of the base installation and hundreds of well-maintained packages implementing many cutting edge methods.
With R, you can dive in and get things working with surprisingly few lines of code, enabling you to focus on the geographic methods and data, rather than debugging and managing package dependencies.
A particular strength of R is the ease with which it allows you to create publication quality interactive maps thanks to excellent mapping packages, as outlined in Chapter \@ref(adv-map).
## Software for geocomputation
R is a powerful language for geocomputation, but there are many other options for geographic data analysis providing thousands of geographic functions\index{function}.
Awareness of other languages for geocomputation will help decide when a different tool may be more appropriate for a specific task, and will place R in the wider geospatial ecosystem.
This section briefly introduces the languages [C++](https://isocpp.org/)\index{C++}, [Java](https://www.oracle.com/java/)\index{Java} and [Python](https://www.python.org/)\index{Python} for geocomputation, in preparation for Chapter \@ref(gis).
An important feature of R (and Python) is that it is an interpreted language.
This is advantageous because it enables interactive programming in a Read–Eval–Print Loop (REPL):\index{REPL}
code entered into the console is immediately executed and the result is printed, rather than waiting for the intermediate stage of compilation.
On the other hand, compiled languages such as C++\index{C++} and Java\index{Java} tend to run faster (once they have been compiled).
C++\index{C++} provides the basis for many GIS packages such as [QGIS](https://www.qgis.org/en/site/)\index{QGIS}, [GRASS GIS](https://grass.osgeo.org/)\index{GRASS GIS} and [SAGA](https://saga-gis.sourceforge.io/)\index{SAGA}, so it is a sensible starting point.
Well-written C++\index{C++} is very fast, making it a good choice for performance-critical applications such as processing large geographic datasets, but is harder to learn than Python or R.
C++\index{C++} has become more accessible with the **Rcpp** package, which provides a good 'way in' to C++\index{C++} programming for R users.
Proficiency with such low-level languages opens the possibility of creating new, high-performance 'geoalgorithms' and a better understanding of how GIS software works (see Chapter \@ref(algorithms)).
However, it is not necessary to learn C++\index{C++} to use R for geocomputation.
Python\index{Python} is an important language for geocomputation, especially because many Desktop GIS\index{GIS} such as GRASS GIS\index{GRASS GIS}, SAGA\index{SAGA} and QGIS\index{QGIS} provide a Python API\index{API} (see Chapter \@ref(gis)).
Like R\index{R}, Python is a popular language for data science.
Both languages are object-oriented, and have many areas of overlap, leading to initiatives such as the **reticulate** package that facilitates access to Python\index{Python} from R and the [Ursa Labs](https://ursalabs.org/) initiative to support portable libraries to the benefit of the entire open source data science ecosystem.
In practice both R and Python have their strengths.
To some extent which you use is less important than the domain of application and communication of results.
Learning either will provide a head-start in learning the other.
However, there are major advantages of R\index{R} over Python\index{Python} for geocomputation\index{geocomputation}.
This includes its much better support of the geographic raster data model in the language itself (see Chapter \@ref(spatial-class)) and corresponding visualization possibilities (see Chapters \@ref(spatial-class) and \@ref(adv-map)).
Equally important, R has unparalleled support for statistics\index{statistics}, including spatial statistics\index{spatial!statistics}, with hundreds of packages (unmatched by Python\index{Python}) supporting thousands of statistical methods.
The major advantage of Python is that it is a *general-purpose* programming language.
It is used in many domains, including desktop software, computer games, websites and data science\index{data science}.
Python\index{Python} is often the only shared language between different (geocomputation) communities and can be seen as the 'glue' that holds many GIS\index{GIS} programs together.
Many geoalgorithms\index{geoalgorithm}, including those in QGIS\index{QGIS} and ArcMap, can be accessed from the Python command line, making it well suited as a starter language for command line GIS.^[
Python modules providing access to geoalgorithms\index{geoalgorithm} include `grass.script` for GRASS GIS\index{GRASS GIS},
`saga-python` for SAGA-GIS\index{SAGA},
`processing` for QGIS\index{QGIS} and `arcpy` for ArcGIS\index{ArcGIS}.
]
For spatial statistics\index{spatial!statistics} and predictive modeling, however, R is second-to-none.
This does not mean you must choose either R or Python: Python\index{Python} supports most common statistical techniques (though R tends to support new developments in spatial statistics earlier) and many concepts learned from Python can be applied to the R\index{R} world.
Like R, Python also supports geographic data analysis and manipulation with packages such as **shapely**, **geopandas**, **rasterio** and **xarray**.
## R's spatial ecosystem {#r-ecosystem}
There are many ways to handle geographic data in R, with dozens of packages\index{R-spatial} in the area.^[
An overview of R's spatial ecosystem can be found in the CRAN\index{CRAN} Task View on the Analysis of Spatial Data
(see https://cran.r-project.org/view=Spatial).
]
In this book, we endeavor to teach the state-of-the-art in the field whilst ensuring that the methods are future-proof.
Like many areas of software development, R's spatial ecosystem is rapidly evolving (Figure \@ref(fig:cranlogs)).
Because R is open source, these developments can easily build on previous work, by 'standing on the shoulders of giants', as Isaac Newton put it in [1675](https://digitallibrary.hsp.org/index.php/Detail/objects/9792).
This approach is advantageous because it encourages collaboration and avoids 'reinventing the wheel'.
The package **sf**\index{sf} (covered in Chapter \@ref(spatial-class)), for example, builds on its predecessor **sp**.
A surge in development time (and interest) in 'R-spatial\index{R-spatial}' has followed the award of a grant by the R Consortium for the development of support for *simple features*, an open-source standard and model to store and access vector geometries.
This resulted in the **sf** package (covered in Section \@ref(intro-sf)).
The immense interest in **sf** is reflected in multiple places.
This is especially true for the [R-sig-Geo Archives](https://stat.ethz.ch/pipermail/r-sig-geo/), a long-standing open access email list containing much R-spatial wisdom accumulated over the years.
```{r cranlogs, fig.cap="Downloads of selected R packages for working with geographic data from early 2013 to present. The y axis shows the average number of daily downloads from the popular cloud.r-project.org CRAN mirror with a 91-day rolling window (log scale).", echo=FALSE, fig.scap="The popularity of spatial packages in R."}
# Pre-generated figure — presumably produced by code/01-cranlogs.R in the
# book's repository (see the repository file listing); regenerate there
knitr::include_graphics("images/01-cranlogs.png")
```
It is noteworthy that shifts in the wider R community, as exemplified by the data processing package **dplyr** (released in [2014](https://cran.r-project.org/src/contrib/Archive/dplyr/)), influenced shifts in R's spatial ecosystem.
Alongside other packages that have a shared style and emphasis on 'tidy data' (including, e.g., **ggplot2**), **dplyr** was placed in the **tidyverse** 'metapackage'\index{tidyverse (package)} in late [2016](https://cran.r-project.org/src/contrib/Archive/tidyverse/).
The **tidyverse**\index{tidyverse (package)} approach, with its focus on long-form data and fast intuitively named functions, has become immensely popular.
This has led to a demand for 'tidy geographic data' which has been partly met by **sf**.
An obvious feature of the **tidyverse** is the tendency for packages to work in harmony.
There is no equivalent 'geoverse', but the modern R-spatial ecosystem has consolidated around **sf**, as illustrated by key packages that depend on it shown in Table \@ref(tab:revdep), and **terra**, both of which are taught in this book.
The stack is highly interoperable both between packages and with other languages, as outlined in Chapter \@ref(gis).
```{r revdep, echo=FALSE, message=FALSE}
# Table of the most-downloaded packages that depend on sf, built from a
# pre-computed CSV shipped with the book's source
top_dls = readr::read_csv("extdata/top_dls.csv")
# paste0() with explicit spacing is used instead of paste(): paste()'s
# default sep = " " inserted a stray space before the comma after
# min(top_dls$date) and doubled spaces around nrow(top_dls)
knitr::kable(top_dls[1:5, 1:2], digits = 0,
caption = paste0("The top 5 most downloaded packages that depend ",
"on sf, in terms of average number of downloads ",
"per day over the previous month. As of ",
min(top_dls$date), ", there are ", nrow(top_dls),
" packages which import sf."),
caption.short = "Top 5 most downloaded packages depending on sf.",
booktabs = TRUE,
col.names = c("Package", "Downloads"))
# cranlogs::cran_top_downloads(when = "last-month") # most downloaded pkgs
```
## History of R-spatial
There are many benefits of using modern spatial packages such as **sf**, but there is value in understanding the history of R's spatial capabilities.
Many functions, use cases and teaching materials are contained in older packages, many of which are still useful, provided you know where to look.
\index{R!history}
\index{R-spatial!history}
R's spatial capabilities originated in early spatial packages in the S language [@bivand_implementing_2000].
\index{S}
The 1990s saw the development of numerous S scripts and a handful of packages for spatial statistics\index{statistics}.
By the year 2000, there were R packages for various spatial methods, including "point pattern analysis, geostatistics, exploratory spatial data analysis and spatial econometrics" [@bivand_open_2000].
Some of these, notably **spatial**, **sgeostat** and **splancs** are still available on CRAN\index{CRAN} [@rowlingson_splancs_1993; @rowlingson_splancs_2017;@venables_modern_2002; @majure_sgeostat_2016].
Key spatial packages were described in @ripley_spatial_2001, which outlined R packages for spatial smoothing and interpolation and point pattern analysis.
One of these (**spatstat**) is still being actively maintained, more than 20 years after its first release.
A following commentary outlined the future prospects of spatial statistics [@bivand_more_2001], setting the stage for the development of the popular **spdep** package [@bivand_spdep_2017].
Notably, the commentary mentioned the need for standardization of spatial interfaces, efficient mechanisms for exchanging data with GIS\index{GIS}, and handling of spatial metadata such as coordinate reference systems (CRS\index{CRS}).
These aims have largely been achieved.
**maptools** [@bivand_maptools_2017] is another important package from this time, which provided an interface to the [shapelib](http://shapelib.maptools.org/) library for reading the Shapefile\index{Shapefile} file format and which fed into **sp**.
An extended review of spatial packages proposed a class system to support the "data objects offered by GDAL"\index{GDAL}, including fundamental point, line, polygon, and raster types, and interfaces to external libraries [@hornik_approaches_2003].
To a large extent, these ideas were realized in the packages **rgdal** and **sp**, providing a foundation for the seminal book *Applied Spatial Data Analysis with R* (ASDAR) [@bivand_applied_2013], first published in 2008.
R's spatial capabilities have evolved substantially since then, but they still build on the ideas of early pioneers.
Interfaces to GDAL\index{GDAL} and PROJ\index{PROJ}, for example, still power R's high-performance geographic data I/O and CRS\index{CRS} transformation capabilities, as outlined in Chapters \@ref(read-write) and \@ref(reproj-geo-data), respectively.
**rgdal**, released in 2003, provided GDAL\index{GDAL} bindings for R which greatly enhanced its ability to import data from previously unavailable geographic data formats.
The initial release supported only raster drivers, but subsequent enhancements provided support for CRSs (via the PROJ library), reprojections and import of vector file formats.
Many of these additional capabilities were developed by Barry Rowlingson and released in the **rgdal** codebase in 2006, as described in @rowlingson_rasp:_2003 and the [R-help](https://stat.ethz.ch/pipermail/r-help/2003-January/028413.html) email list.
The **sp** package, released in 2005, was a significant advancement in R's spatial capabilities.
It introduced classes and generic methods for handling geographic coordinates, including points, lines, polygons, and grids, as well as attribute data.
With the S4 class system, **sp** stores information such as bounding box, coordinate reference system (CRS), and attributes in slots within `Spatial` objects.
This allows for efficient data operations on geographic data.
The package also provided generic methods like `summary()` and `plot()` for working with geographic data.
In the following decade, **sp** classes rapidly became popular for geographic data in R and the number of packages that depended on it increased from around 20 in 2008 to over 100 in 2013 [@bivand_applied_2013].
By 2019 more than 500 packages imported **sp**.
Although the number of packages that depend on **sp** has decreased since the release of **sf** it is still used by prominent R packages, including **gstat** (for spatial and spatiotemporal geostatistics)\index{spatial!statistics} and **geosphere** (for spherical trigonometry) [@R-gstat; @hijmans_geosphere_2016].
```{r, eval=FALSE, echo=FALSE}
# Aim: show n. pkgs that depend on sf and sp
deps_sp = devtools::revdep(pkg = "sp")
deps_sf = devtools::revdep(pkg = "sf")
length(deps_sp) # 449 # 2023-11-16
length(deps_sf) # 739 # 2023-11-16
```
While **rgdal** and **sp** solved many spatial issues, it was not until **rgeos** was developed during a Google Summer of Code project in 2010 [@R-rgeos] that geometry operations could be undertaken on **sp** objects.
Functions such as `gIntersection()` enabled users to find spatial relationships between geographic objects and to modify their geometries (see Chapter \@ref(geometry-operations) for details on geometric operations with **sf**).
\index{raster (package)}
A limitation of the **sp** ecosystem was its limited support for raster data.
This was overcome by **raster**\index{raster (package)}, first released in 2010 [@R-raster].
**raster**'s class system and functions enabled a range of raster operations, capabilities now implemented in the **terra** package, which supersedes **raster**, as outlined in Section \@ref(raster-data).
An important capability of **raster** and **terra** is their ability to work with datasets that are too large to fit into RAM by supporting off-disk operations.
**raster** and **terra** also support map algebra, as described in Section \@ref(map-algebra).
In parallel with these developments of class systems and methods, came the support for R as an interface to dedicated GIS software.
**GRASS** [@bivand_using_2000] and follow-on packages **spgrass6**, **rgrass7** and **rgrass** were prominent examples in this direction [@bivand_rgrass7_2016;@bivand_spgrass6_2016;@R-rgrass].
Other examples of bridges between R and GIS include bridges to QGIS via **qgisprocess** [@R-qgisprocess], SAGA via **Rsagacmd** [@R-Rsagacmd] or **RSAGA** [@R-RSAGA]\index{RSAGA (package)} and ArcGIS via **RPyGeo** [@brenning_arcgis_2012, first published in 2008], and more (see Chapter \@ref(gis)).
Visualization was not a focus initially, with the bulk of R-spatial development focused on analysis and geographic operations.
**sp** provided methods for map-making using both the base and lattice plotting system, but demand was growing for advanced map-making capabilities.
**RgoogleMaps**, first released in 2009, allowed users to overlay R spatial data on top of 'basemap' tiles from online services such as Google Maps or OpenStreetMap [@loecher_rgooglemaps_2015].
\index{ggplot2 (package)}
It was followed by the **ggmap** package that added similar 'basemap' tiles capabilities to **ggplot2** [@kahle_ggmap_2013].
Though **ggmap** facilitated map-making with **ggplot2**, its utility was limited by the need to `fortify` spatial objects, which means converting them into long data frames.
While this works well for points, it is computationally inefficient for lines and polygons, since each coordinate (vertex) is converted into a row, leading to huge data frames to represent complex geometries.
Although geographic visualization tended to focus on vector data, raster visualization was supported in **raster** and received a boost with the release of **rasterVis** [@lamigueiro_displaying_2018].
Since then map-making in R has become a hot topic, with dedicated packages such as **tmap**, **leaflet** and **mapview** gaining popularity, as highlighted in Chapter \@ref(adv-map).
Since 2018, when the First Edition of Geocomputation with R was published, the development of geographic R packages has accelerated.
\index{terra (package)}\index{raster (package)}
**terra**, a successor of the **raster** package, was first released in 2020 [@R-terra], bringing several benefits to R users working with raster datasets: it is faster and has a more straightforward user interface than its predecessor, as described in Section \@ref(raster-data).
In mid-2021, **sf** started using the S2 spherical geometry engine for geometry operations on unprojected datasets, as described in Section \@ref(s2).
Additional ways of representing and working with geographic data in R since 2018 have been developed, including with the **stars** and **lidR** packages [@pebesma_stars_2021; @Roussel2020].
\index{stars (package)}
\index{lidR (package)}
Such developments have been motivated by the emergence of new technologies, standards and software outside of the R environment [@bivand_progress_2021].
Major updates to the PROJ library\index{PROJ} beginning in 2018 forced the replacement of 'proj-string' representations of CRSs with 'Well Known Text', as described in Section \@ref(crs-intro) and Chapter \@ref(reproj-geo-data).
\index{rayshader (package)}
Since the publication of the first version of Geocomputation with R in 2018, several packages for spatial data visualization have been developed and improved.
The **rayshader** package, for example, enables the development of striking and easy-to-animate 3D visualizations via raytracing and multiple hill-shading methods [@morganwall_rayshader_2021].
\index{ggplot2 (package)}
The very popular **ggplot2** package gained new spatial capabilities, thanks to work on the **ggspatial** package, which provides scale bars and north arrows [@dunnington_ggspatial_2021].
**gganimate** enables smooth and customizable spatial animations [@pedersen_gganimate_2020].
Existing visualization packages have also been improved or rewritten.
Large raster objects are automatically downscaled in **tmap** and high-performance interactive maps are now possible thanks to packages including **leafgl** and **mapdeck**.
<!-- TODO: add release date of tmap 4.0 when ready -->
The **mapsf** package (successor of **cartography**) was rewritten to reduce dependencies and improve performance [@giraud_mapsf_2021]; and **tmap** underwent a major update in Version 4, in which most of the internal code was revised.
In late 2021, the planned retirement of **rgdal**, **rgeos** and **maptools** [was announced](https://stat.ethz.ch/pipermail/r-sig-geo/2021-September/028760.html) and in October 2023 they were archived on CRAN.
This retirement at the end of 2023 not only has had a large impact on existing workflows applying these packages, but also [influenced the packages that depend on them](https://geocompx.org/post/2023/rgdal-retirement/).
Modern R packages such as **sf** and **terra**, described in Chapter \@ref(spatial-class) provide a strong and future-proof foundation for geocomputation that we build on in this book.
## Exercises
```{r, echo=FALSE, results='asis'}
# Knit the exercises from _01-ex.Rmd into this chapter; the child's chunks
# are neither shown nor evaluated (include = FALSE, eval = FALSE)
res = knitr::knit_child('_01-ex.Rmd', quiet = TRUE, options = list(include = FALSE, eval = FALSE))
cat(res, sep = '\n')
```
================================================
FILE: 02-spatial-data.Rmd
================================================
# (PART) Foundations {-}
# Geographic data in R {#spatial-class}
```{r, include=FALSE}
# Run the shared per-chapter setup (see code/before_script.R)
source("code/before_script.R")
```
## Prerequisites {-}
This is the first practical chapter of the book, and therefore it comes with some software requirements.
You need access to a computer with a recent version of R installed (R [4.3.2](https://stat.ethz.ch/pipermail/r-announce/2023/000697.html) or a later version).
We recommend not only reading the prose but also *running the code* in each chapter to build your geocomputational skills.
To keep track of your learning journey, it may be worth starting by creating a new folder on your computer to save your R scripts, outputs and other things related to Geocomputation with R as you go.
You can also [download](https://github.com/geocompx/geocompr/archive/refs/heads/main.zip) or [clone](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) the [source code](https://github.com/geocompx/geocompr) underlying the book to support your learning.
We strongly recommend using R with an integrated development environment (IDE) such as [RStudio](https://posit.co/download/rstudio-desktop/#download)\index{RStudio} (quicker to get up and running) or [VS Code](https://github.com/REditorSupport/vscode-R)\index{VS Code} (which requires additional setup).
If you are new to R, we recommend following introductory R resources such as [Hands on Programming with R](https://rstudio-education.github.io/hopr/starting.html) and [Introduction to R](https://cengel.github.io/R-intro/) before you dive into Geocomputation with R code.
These resources cover in detail how to install R, which simply involves downloading the latest version from the [Comprehensive R Archive Network (CRAN)](https://cran.r-project.org/).
See the note below for more information on installing R for geocomputation on Mac and Linux.
Organize your work into [projects](https://r4ds.had.co.nz/workflow-projects.html) and give scripts sensible names such as `chapter-02.R` (or equivalent RMarkdown or Quarto file names) to document the code as you learn.
\index{R!prerequisites}
\index{R!installation}
```{block2 02-spatial-data-2, type='rmdnote'}
Mac and Linux operating systems (OSs) have additional system requirements, which can be found in the README of the [**sf** package](https://github.com/r-spatial/sf).
See also OS-specific instructions such as that provided by the website [rtask.thinkr.fr](https://rtask.thinkr.fr/installation-of-r-4-2-on-ubuntu-22-04-lts-and-tips-for-spatial-packages/), which covers installing R on the open source OS Ubuntu.
```
After you have got a good set-up, it's time to run some code!
Unless you already have these packages installed, the first thing to do is to install foundational R packages used in this chapter, with the following commands:^[
**spDataLarge** is not on CRAN\index{CRAN}, meaning it must be installed via *r-universe* or with the following command: `remotes::install_github("Nowosad/spDataLarge")`.
]
```{r 02-spatial-data-1, eval=FALSE}
install.packages("sf")     # vector data classes and functions
install.packages("terra")  # raster data classes and functions
install.packages("spData") # geographic datasets
install.packages("spDataLarge", repos = "https://geocompr.r-universe.dev")
```
```{r, eval=FALSE, echo=FALSE, message=FALSE, results='hide'}
# Install the development version of tmap from GitHub (not shown or run)
remotes::install_github("r-tmap/tmap")
```
The packages needed to reproduce Part I of this book can be installed with the following command: `remotes::install_github("geocompx/geocompkg")`.
This command uses the function `install_github()` from the **remotes** package to install source code hosted on the GitHub code hosting, version and collaboration platform.
The following command will install **all** dependencies required to reproduce the entire book (warning: this may take several minutes): `remotes::install_github("geocompx/geocompkg", dependencies = TRUE)`.
The packages needed to run the code presented in this chapter can be 'loaded' (technically they are attached) with the `library()` function as follows:
```{r 02-spatial-data-3-1, message=TRUE}
# message=TRUE keeps sf's start-up message, which is discussed below
library(sf) # classes and functions for vector data
```
The output from `library(sf)` reports which versions of key geographic libraries such as GEOS the package is using, as outlined in Section \@ref(intro-sf).
```{r 02-spatial-data-3-2, message=FALSE}
# message=FALSE suppresses terra's start-up message
library(terra) # classes and functions for raster data
```
The other packages that were installed contain data that will be used in the book:
```{r 02-spatial-data-4}
#| message: FALSE
#| results: hide
# Data packages providing the example datasets used in the book
library(spData)      # load geographic data
library(spDataLarge) # load larger geographic data
```
## Introduction {#intro-spatial-class}
This chapter will provide explanations of the fundamental geographic data models:\index{data models} vector and raster.
We will introduce the theory behind each data model and the disciplines in which they predominate, before demonstrating their implementation in R.
The *vector data model* represents the world using points, lines and polygons.
These have discrete, well-defined borders, meaning that vector datasets usually have a high level of precision (but not necessarily accuracy as we will see in Section \@ref(units)).
The *raster data model* divides the surface up into cells of constant size.
Raster datasets are the basis of background images used in web mapping and have been a vital source of geographic data since the origins of aerial photography and satellite-based remote sensing devices.
Rasters aggregate spatially specific features to a given resolution, meaning that they are consistent over space and scalable (many worldwide raster datasets are available).
Which to use?
The answer likely depends on your domain of application:
- Vector data tends to dominate the social sciences because human settlements tend to have discrete borders
- Raster dominates many environmental sciences partially because of the reliance on remote sensing data
Both raster and vector datasets are used in many fields and raster and vector datasets can be used together:
ecologists and demographers, for example, commonly use both vector and raster data.
Furthermore, it is possible to convert between the two forms (see Chapter \@ref(raster-vector)).
Whether your work involves more use of vector or raster datasets, it is worth understanding the underlying data model before using them, as discussed in subsequent chapters.
This book uses **sf** and **terra** packages to work with vector data and raster datasets, respectively.
## Vector data
```{block2 02-spatial-data-5, type="rmdnote"}
Take care when using the word 'vector', as it can have two meanings in this book:
geographic vector data and the `vector` class (note the `monospace` font) in R.
The former is a data model, the latter is an R class just like `data.frame` and `matrix`.
Still, there is a link between the two: the spatial coordinates which are at the heart of the geographic vector data model can be represented in R using `vector` objects.
```
The geographic vector data model\index{vector data model} is based on points located within a coordinate reference system\index{coordinate reference system|see {CRS}} (CRS\index{CRS}).
Points can represent self-standing features (e.g., the location of a bus stop) or they can be linked together to form more complex geometries such as lines and polygons.
Most point geometries contain only two dimensions (much less prominent three-dimensional geometries contain an additional $z$ value, typically representing height above sea level).
In this system, for example, London can be represented by the coordinates `c(-0.1, 51.5)`.
This means that its location is $-0.1$ degrees east and $51.5$ degrees north of the origin.
The origin in this case is at 0 degrees longitude (Prime Meridian) and 0 degrees latitude (Equator) in a geographic ('lon/lat') CRS (Figure \@ref(fig:vectorplots), left panel).
The same point could also be approximated in a projected CRS with 'Easting/Northing' values of `c(530000, 180000)` in the [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid), meaning that London is located 530 km *East* and 180 km *North* of the $origin$ of the CRS.
This can be verified visually: slightly more than 5 'boxes' --- square areas bounded by the gray grid lines 100 km in width --- separate the point representing London from the origin (Figure \@ref(fig:vectorplots), right panel).
The location of National Grid's\index{National Grid} origin, in the sea beyond the South West peninsula, ensures that all locations in the UK have positive Easting and Northing values.^[
The origin we are referring to, depicted in blue in Figure \@ref(fig:vectorplots), is in fact the 'false' origin.
The 'true' origin, the location at which distortions are at a minimum, is located at 2° W and 49° N.
This was selected by the Ordnance Survey to be roughly in the center of the British landmass longitudinally.
]
There is more to CRSs, as described in Section \@ref(crs-intro) and Chapter \@ref(reproj-geo-data).
For this section, it is sufficient to know that coordinates consist of two numbers representing distance from an origin, usually in $x$ then $y$ dimensions.
```{r vectorplots-source, include=FALSE, eval=FALSE}
# Not run during the build; execute manually to regenerate the figure images
source("https://github.com/geocompx/geocompr/raw/main/code/02-vectorplots.R") # generate subsequent figure
```
```{r vectorplots, fig.cap="Vector (point) data in which the location of London (red X) is represented with reference to an origin (blue circle). The left plot represents a geographic CRS with an origin at 0° longitude and latitude. The right plot represents a projected CRS with an origin located in the sea west of the South West Peninsula.", out.width="49%", fig.show='hold', echo=FALSE, fig.scap="Vector (point) data."}
# Pre-generated images; see the vectorplots-source chunk above to regenerate
knitr::include_graphics(c("images/vector_lonlat.png", "images/vector_projected.png"))
```
The **sf** package provides classes for geographic vector data and a consistent command line interface to important low-level libraries for geocomputation:
- [GDAL](https://gdal.org/)\index{GDAL}, for reading, writing and manipulating a wide range of geographic data formats, covered in Chapter \@ref(read-write)
- [PROJ](https://proj.org/), a powerful library for coordinate system transformations, which underlies the content covered in Chapter \@ref(reproj-geo-data)
- [GEOS](https://libgeos.org/)\index{GEOS}, a planar geometry engine for operations such as calculating buffers and centroids on data with a projected CRS, covered in Chapter \@ref(geometry-operations)
- [S2](https://s2geometry.io/)\index{S2}, a spherical geometry engine written in C++ developed by Google, via the [**s2**](https://r-spatial.github.io/s2/) package, covered in Section \@ref(s2) below and in Chapter \@ref(reproj-geo-data)
Information about these interfaces is printed by **sf** the first time the package is loaded: the message `r print(capture.output(sf:::.onAttach(), type = "message"))` that appears below the `library(sf)` command at the beginning of this chapter tells us the versions of linked GEOS, GDAL and PROJ libraries (these vary between computers and over time) and whether or not the S2\index{S2} interface is turned on.
We may take these low-level libraries for granted, but without their tight integration with languages such as R much reproducible geocomputation would be impossible.
A neat feature of **sf** is that you can change the default geometry engine used on unprojected data: 'switching off' S2\index{S2} can be done with the command `sf::sf_use_s2(FALSE)`, meaning that the planar geometry engine GEOS\index{GEOS} will be used by default for all geometry operations, including geometry operations on unprojected data.
As we will see in Section \@ref(s2), planar geometry is based on two-dimensional space.
Planar geometry engines such as GEOS assume 'flat' (projected) coordinates, while spherical geometry engines such as S2 assume unprojected (lon/lat) coordinates.
This section introduces **sf** classes in preparation for subsequent chapters (Chapters \@ref(geometry-operations) and \@ref(read-write) cover the GEOS and GDAL interface, respectively).
### Introduction to simple features {#intro-sf}
Simple features is an [open standard](http://portal.opengeospatial.org/files/?artifact_id=25355) developed and endorsed by the Open Geospatial Consortium (OGC), a not-for-profit organization whose activities we will revisit in a later chapter (Section \@ref(file-formats)).
\index{simple features|see {sf}}
Simple features is a hierarchical data model that represents a wide range of geometry types.
Of 18 geometry types supported by the specification, only seven are used in the vast majority of geographic research (see Figure \@ref(fig:sf-ogc));
these core geometry types are fully supported by the R package **sf** [@pebesma_simple_2018].^[
The full OGC standard includes rather exotic geometries, including 'surface' and 'curve' types, which currently have limited application in real-world work.
You can find the whole list of possible feature types in the [PostGIS manual](http://postgis.net/docs/using_postgis_dbmanagement.html).
All 18 types can be represented with the **sf** package, although at the time of writing (2024), plotting only works for the 'core 7'.
]
```{r sf-ogc, fig.cap="Simple feature types fully supported by sf.", out.width="60%", echo=FALSE}
knitr::include_graphics("images/sf-classes.png")
```
**sf** can represent all common vector geometry types (raster data classes are not supported by **sf**): points, lines, polygons and their respective 'multi' versions (which group together features of the same type into a single feature).
\index{sf}
\index{sf (package)|see {sf}}
**sf** also supports geometry collections, which can contain multiple geometry types in a single object.
**sf** provides the same functionality (and more) previously provided in three packages --- **sp**\index{sp (package)} for data classes [@R-sp], **rgdal** for data read/write via an interface to GDAL and PROJ [@R-rgdal] and **rgeos** for spatial operations via an interface to GEOS [@R-rgeos].
To reiterate the message from Chapter 1, geographic R packages have a long history of interfacing with lower level libraries, and **sf** continues this tradition with a unified interface to recent versions of GEOS for geometry operations, the GDAL library for reading and writing geographic data files, and the PROJ library for representing and transforming projected CRSs.
Through **s2**\index{S2}, an R interface to Google's spherical geometry library, [`s2`](https://s2geometry.io/), **sf** also has access to fast and accurate "measurements and operations on non-planar geometries" [@bivand_progress_2021].
Since **sf** version 1.0.0, launched in [June 2021](https://cran.r-project.org/src/contrib/Archive/sf/), **s2** functionality is now used by [default](https://r-spatial.org/r/2020/06/17/s2.html) on geometries with geographic (longitude/latitude) coordinate systems, a unique feature of **sf** that differs from spatial libraries that only support GEOS for geometry operations such as the Python package [GeoPandas](https://github.com/geopandas/geopandas/issues/2098).
We will discuss **s2** in subsequent chapters.
**sf**'s ability to integrate multiple powerful libraries for geocomputation into a single framework is a notable achievement that reduces 'barriers to entry' into the world of reproducible geographic data analysis with high-performance libraries.
**sf**'s functionality is well documented on its website at [r-spatial.github.io/sf/](https://r-spatial.github.io/sf/index.html) which contains seven vignettes.
These can be viewed offline as follows:
```{r 02-spatial-data-6, eval=FALSE}
vignette(package = "sf") # see which vignettes are available
vignette("sf1") # an introduction to the package
```
```{r 02-spatial-data-7, eval=FALSE, echo=FALSE}
vignette("sf1") # an introduction to the package
vignette("sf2") # reading, writing and converting simple features
vignette("sf3") # manipulating simple feature geometries
vignette("sf4") # manipulating simple features
vignette("sf5") # plotting simple features
vignette("sf6") # miscellaneous long-form documentation
vignette("sf7") # spherical geometry operations
```
As the first vignette explains, simple feature objects in R are stored in a data frame, with geographic data occupying a special column, usually named 'geom' or 'geometry'\index{vector!geometry}.
We will use the `world` dataset provided by **spData** [@R-spData], loaded at the beginning of this chapter, to show what `sf` objects are and how they work.
`world` is an '`sf` data frame' containing spatial and attribute columns, the names of which are returned by the function `names()` (the last column in this example contains the geographic information).
```{r 02-spatial-data-8}
class(world)
names(world)
```
The contents of this `geom` column give `sf` objects their spatial powers: `world$geom` is a '[list column](https://adv-r.hadley.nz/vectors-chap.html#list-columns)' that contains all the coordinates of the country polygons.
\index{list column}
`sf` objects can be plotted quickly with the function `plot()`.
Although part of R's default installation (base R), `plot()` is a [*generic*](https://adv-r.hadley.nz/s3.html#s3-methods) that is extended by other packages.
**sf** contains the non-exported (hidden from users most of the time) `plot.sf()` function which is what is called behind the scenes in the following command, which creates Figure \@ref(fig:world-all).
```{r world-all, fig.cap="Map of the world using the sf package, with a facet for each attribute.", warning=FALSE, fig.scap="Map of the world using the sf package."}
plot(world)
```
Note that instead of creating a single map by default for geographic objects, as most GIS programs do, `plot()`ing `sf` objects results in a map for each variable in the datasets.
This behavior can be useful for exploring the spatial distribution of different variables and is discussed further in Section \@ref(basic-map).
More broadly, treating geographic objects as regular data frames with spatial powers has many advantages, especially if you are already used to working with data frames.
The commonly used `summary()` function, for example, provides a useful overview of the variables within the `world` object.
```{r 02-spatial-data-9}
summary(world["lifeExp"])
```
Although we have only selected one variable for the `summary()` command, it also outputs a report on the geometry.
This demonstrates the 'sticky' behavior of the geometry columns of **sf** objects: they are kept unless the user deliberately removes them, as we'll see in Section \@ref(vector-attribute-manipulation).
The result provides a quick summary of both the non-spatial and spatial data contained in `world`: the mean average life expectancy is 71 years (ranging from less than 51 to more than 83 years with a median of 73 years) across all countries.
```{block2 02-spatial-data-10, type='rmdnote'}
The word `MULTIPOLYGON` in the summary output above refers to the geometry type of features (countries) in the `world` object.
This representation is necessary for countries with islands such as Indonesia and Greece.
Other geometry types are described in Section \@ref(geometry).
```
It is also worth taking a deeper look at the basic behavior and contents of this simple feature object, which can usefully be thought of as a '**s**patial data **f**rame'.
`sf` objects are easy to subset: the code below shows how to return an object containing only the first two rows and the first three columns of the `world` object.
The output shows two major differences compared with a regular `data.frame`: the inclusion of additional geographic metadata (`Geometry type`, `Dimension`, `Bounding box` and coordinate reference system information), and the presence of a 'geometry column', here named `geom`:
```{r 02-spatial-data-11}
world_mini = world[1:2, 1:3]
world_mini
```
All this may seem rather complex, especially for a class system that is supposed to be 'simple'!
However, there are good reasons for organizing things this way and using **sf** to work with vector geographic datasets.
Before describing each geometry type that the **sf** package supports, it is worth taking a step back to understand the building blocks of `sf` objects.
Section \@ref(sf) shows how simple features objects are data frames, with special geometry columns.
These spatial columns are often called `geom` or `geometry`: `world$geom` refers to the spatial element of the `world` object described above.
These geometry columns are 'list columns' of class `sfc` (see Section \@ref(sfc)).
In turn, `sfc` objects are composed of one or more objects of class `sfg`: simple feature geometries that we describe in Section \@ref(sfg).
\index{sf!sfc}
\index{simple feature columns|see {sf!sfc}}
To understand how the spatial components of simple features work, it is vital to understand simple feature geometries.
For this reason, we cover each currently supported simple features geometry type in Section \@ref(geometry) before moving on to describe how these can be represented in R using `sf` objects, which are based on `sfg` and `sfc` objects.
```{block2 assignment, type='rmdnote'}
The preceding code chunk uses `=` to create a new object called `world_mini` in the command `world_mini = world[1:2, 1:3]`.
This is called assignment.
An equivalent command to achieve the same result is `world_mini <- world[1:2, 1:3]`.
Although 'arrow assignment' is more commonly used, we use 'equals assignment' because it's slightly faster to type and easier to teach due to compatibility with commonly used languages such as Python and JavaScript.
Which to use is largely a matter of preference as long as you're consistent (packages such as **styler** can be used to change style).
```
### Why simple features?
Simple features is a widely supported data model that underlies data structures in many GIS applications including QGIS\index{QGIS} and PostGIS\index{PostGIS}.
A major advantage of this is that using the data model ensures your work is cross-transferable to other setups, for example importing from and exporting to spatial databases.
\index{sf!why simple features}
A more specific question from an R perspective is "why use the **sf** package?"
There are many reasons (linked to the advantages of the simple features model):
- Fast reading and writing of data
- Enhanced plotting performance
- **sf** objects can be treated as data frames in most operations
- **sf** function names are relatively consistent and intuitive (all begin with `st_`)
- **sf** functions can be combined with the `|>` operator and works well with the [tidyverse](https://www.tidyverse.org/) collection of R packages\index{tidyverse}.
**sf**'s support for **tidyverse** packages is exemplified by `read_sf()`, a function for importing geographic vector data covered in detail in Section \@ref(iovec).
Unlike the function `st_read()`, which returns attributes stored in a base R `data.frame` (and which emits verbose messages, not shown in the code chunk below), `read_sf()` silently returns data as a **tidyverse** `tibble`.
This is demonstrated below:
```{r, message=FALSE}
# st_read() returns attributes in a base R data.frame (and normally
# prints verbose messages, suppressed here with message=FALSE)
world_dfr = st_read(system.file("shapes/world.gpkg", package = "spData"))
# read_sf() silently returns the same data as a tidyverse tibble
world_tbl = read_sf(system.file("shapes/world.gpkg", package = "spData"))
class(world_dfr)
class(world_tbl)
```
As described in Chapter \@ref(attr), which shows how to manipulate `sf` objects with **tidyverse** functions, **sf** is now the go-to package for analysis of spatial vector data in R.
**spatstat**, a package ecosystem which provides numerous functions for spatial statistics, and **terra** both have vector geographic data classes, but neither has the same level of uptake as **sf** does for working with vector data.
Many popular packages build on **sf**, as shown by the rise in its popularity in terms of number of downloads per day, as shown in Section \@ref(r-ecosystem) in the previous chapter.
### Basic maps {#basic-map}
Basic geographic visualizations (maps) are created in **sf** with base R's `plot()` function.
By default, this creates a multi-panel plot, one sub-plot for each variable of the object, as illustrated in the left-hand panel in Figure \@ref(fig:sfplot).
A legend or 'key' with a continuous color is produced if the object to be plotted has a single variable (see the right-hand panel).
You can also set fixed colors in `plot()` commands with `col` and `border` arguments.
\index{map-making!basic}
```{r sfplot, fig.cap="Plotting with sf, with multiple variables (left) and a single variable (right).", out.width="49%", fig.show='hold', warning=FALSE, fig.scap="Plotting with sf."}
plot(world[3:6])
plot(world["pop"])
```
Plots are added as layers to existing images by setting `add = TRUE`.^[
`plot()`ing of **sf** objects uses `sf:::plot.sf()` behind the scenes.
`plot()` is a generic method that behaves differently depending on the class of object being plotted.
]
To demonstrate this, and to provide an insight into the contents of Chapters \@ref(attr) and \@ref(spatial-operations) on attribute and spatial data operations, the subsequent code chunk filters countries in Asia and combines them into a single feature:
```{r 02-spatial-data-14, warning=FALSE}
# Attribute subsetting: keep only rows (countries) whose continent is Asia
world_asia = world[world$continent == "Asia", ]
# Combine the Asian country geometries into a single feature
asia = st_union(world_asia)
```
We can now plot the Asian continent over a map of the world.
Note that the first plot must only have one facet for `add = TRUE` to work.
If the first plot has a key, `reset = FALSE` must be used:
```{r asia, out.width='50%', fig.cap="Plot of Asia added as a layer on top of countries worldwide."}
plot(world["pop"], reset = FALSE)
plot(asia, add = TRUE, col = "red")
```
```{block2 plottingpacks, type='rmdnote'}
Adding layers in this way can be used to verify the geographic correspondence between layers:
the `plot()` function is fast and requires few lines of code, but its functionality is limited.
For more advanced map-making we recommend using dedicated visualization packages such as **tmap** [@tmap2018] (see Chapter \@ref(adv-map)).
```
There are various ways to modify maps with **sf**'s `plot()` method.
Because **sf** extends base R plotting methods, `plot()`'s arguments work with `sf` objects (see `?graphics::plot` and `?par` for information on arguments such as `main =`).^[
Note: many plot arguments are ignored in facet maps, when more than one `sf` column is plotted.]
\index{base plot|see {map-making}}\index{map-making!base plotting} Figure \@ref(fig:contpop) illustrates this flexibility by overlaying circles, whose diameters (set with `cex =`) represent country populations, on a map of the world.
An unprojected version of this figure can be created with the following commands (see exercises at the end of this chapter and the script [`02-contpop.R`](https://github.com/geocompx/geocompr/blob/main/code/02-contpop.R) to reproduce Figure \@ref(fig:contpop)):
```{r 02-spatial-data-16, eval=FALSE}
plot(world["continent"], reset = FALSE)
# Square root of population so that circle *area* is proportional to population
cex = sqrt(world$pop) / 10000
# of_largest_polygon = TRUE places each centroid on the largest polygon of
# multipolygon countries (the abbreviated `of_largest` only worked via
# R's partial argument matching)
world_cents = st_centroid(world, of_largest_polygon = TRUE)
plot(st_geometry(world_cents), add = TRUE, cex = cex)
```
```{r contpop, fig.cap="Country continents (represented by fill color) and 2015 populations (represented by circles, with area proportional to population).", echo=FALSE, warning=FALSE, fig.scap="Country continents and 2015 populations."}
source("https://github.com/geocompx/geocompr/raw/main/code/02-contpop.R")
```
The code above uses the function `st_centroid()` to convert one geometry type (polygons) to another (points) (see Chapter \@ref(geometry-operations)), the aesthetics of which are varied with the `cex` argument.
\index{bounding box}
**sf**'s plot method also has arguments specific to geographic data.
`expandBB`, for example, can be used to plot an `sf` object in context:
it takes a numeric vector of length four that expands the bounding box of the plot relative to zero in the following order: bottom, left, top, right.
This is used to plot India in the context of its giant Asian neighbors, with an emphasis on China to the east, in the following code chunk, which generates Figure \@ref(fig:china) (see exercises below on adding text to plots):^[
Note the use of `st_geometry(india)` to return only the geometry associated with the object to prevent attributes being plotted in a simple feature column (`sfc`) object.
An alternative is to use `india[0]`, which returns an `sf` object that contains no attribute data.
]
```{r 02-spatial-data-17, eval=FALSE}
india = world[world$name_long == "India", ]
plot(st_geometry(india), expandBB = c(0, 0.2, 0.1, 1), col = "gray", lwd = 3)
plot(st_geometry(world_asia), add = TRUE)
```
```{r china, fig.cap="India in context, demonstrating the expandBB argument.", warning=FALSE, echo=FALSE, out.width="50%"}
# Remove plot margins so the map fills the figure area
old_par = par(mar = rep(0, 4))
india = world[world$name_long == "India", ]
# Select India and China by name pattern so they can be labeled on the map
indchi = world_asia[grepl("Indi|Chi", world_asia$name_long), ]
indchi_points = st_centroid(indchi)
indchi_coords = st_coordinates(indchi_points)
# expandBB expands the bounding box in the order: bottom, left, top, right
plot(st_geometry(india), expandBB = c(-0.2, 0.5, 0, 1), col = "gray", lwd = 3)
# world_asia[0] drops all attribute columns, plotting geometry only
plot(world_asia[0], add = TRUE)
text(indchi_coords[, 1], indchi_coords[, 2], indchi$name_long)
# Restore the previous graphics parameters
par(old_par)
```
```{r, eval=FALSE, echo=FALSE}
waldo::compare(st_geometry(world), world[0])
```
Note the use of `lwd` to emphasize India in the plotting code.
See Section \@ref(static-maps) for other visualization techniques for representing a range of geometry types, the subject of the next section.
### Geometry types {#geometry}
Geometries are the basic building blocks of simple features.
Simple features in R can take on one of the 18 geometry types supported by the **sf** package.
In this chapter we will focus on the seven most commonly used types: `POINT`, `LINESTRING`, `POLYGON`, `MULTIPOINT`, `MULTILINESTRING`, `MULTIPOLYGON` and `GEOMETRYCOLLECTION`.
\index{geometry types|see {sf!geometry types}} \index{sf!geometry types}
Generally, well-known binary (WKB) or well-known text (WKT) are the standard encoding for simple feature geometries.
WKB representations are usually hexadecimal strings easily readable for computers.
This is why GIS and spatial databases use WKB to transfer and store geometry objects.
WKT, on the other hand, is a human-readable text markup description of simple features.
Both formats are exchangeable, and if we present one, we will naturally choose the WKT representation.
\index{well-known text}
\index{WKT|see {well-known text}}
\index{well-known binary}
\index{WKB|see {well-known binary}}
The basis for each geometry type is the point.
A point is simply a coordinate in two-, three-, or four-dimensional space (see `vignette("sf1")` for more information) such as (Figure \@ref(fig:sfcs), left panel):
\index{sf!point}
- `POINT (5 2)`
\index{sf!linestring}
A linestring is a sequence of points with a straight line connecting the points, for example (Figure \@ref(fig:sfcs), middle panel):
- `LINESTRING (1 5, 4 4, 4 1, 2 2, 3 2)`
A polygon is a sequence of points that form a closed, non-intersecting ring.
Closed means that the first and the last point of a polygon have the same coordinates (Figure \@ref(fig:sfcs), right panel).^[
By definition, a polygon has one exterior boundary (outer ring) and can have zero or more interior boundaries (inner rings), also known as holes.
A polygon with a hole would be, for example, `POLYGON ((1 5, 2 2, 4 1, 4 4, 1 5), (2 4, 3 4, 3 3, 2 3, 2 4))`
]
\index{sf!hole}
- Polygon without a hole: `POLYGON ((1 5, 2 2, 4 1, 4 4, 1 5))`
```{r sfcs, echo=FALSE, fig.cap="Point, linestring and polygon geometries.", fig.asp=0.4}
old_par = par(mfrow = c(1, 3), pty = "s", mar = c(0, 3, 1, 0))
plot(st_as_sfc(c("POINT(5 2)")), axes = TRUE, main = "POINT")
plot(st_as_sfc("LINESTRING(1 5, 4 4, 4 1, 2 2, 3 2)"), axes = TRUE, main = "LINESTRING")
plot(st_as_sfc("POLYGON((1 5, 2 2, 4 1, 4 4, 1 5))"), col="gray", axes = TRUE, main = "POLYGON")
par(old_par)
```
```{r polygon_hole, echo=FALSE, out.width="30%", eval=FALSE}
# not printed - enough of these figures already (RL)
par(pty = "s")
plot(st_as_sfc("POLYGON((1 5, 2 2, 4 1, 4 4, 1 5), (2 4, 3 4, 3 3, 2 3, 2 4))"), col = "gray", axes = TRUE, main = "POLYGON with a hole")
```
So far we have created geometries with only one geometric entity per feature.
\index{sf!multi features}
The simple feature standard also allows multiple geometries of a single type to exist within a single feature, using the "multi" version of each geometry type (Figure \@ref(fig:multis)):
- Multipoint: `MULTIPOINT (5 2, 1 3, 3 4, 3 2)`
- Multilinestring: `MULTILINESTRING ((1 5, 4 4, 4 1, 2 2, 3 2), (1 2, 2 4))`
- Multipolygon: `MULTIPOLYGON (((1 5, 2 2, 4 1, 4 4, 1 5), (0 2, 1 2, 1 3, 0 3, 0 2)))`
```{r multis, echo=FALSE, fig.cap="Illustration of multi* geometries.", fig.asp=0.4}
old_par = par(mfrow = c(1, 3), pty = "s", mar = c(0, 3, 1, 0))
plot(st_as_sfc("MULTIPOINT (5 2, 1 3, 3 4, 3 2)"), axes = TRUE, main = "MULTIPOINT")
plot(st_as_sfc("MULTILINESTRING ((1 5, 4 4, 4 1, 2 2, 3 2), (1 2, 2 4))"), axes = TRUE, main = "MULTILINESTRING")
plot(st_as_sfc("MULTIPOLYGON (((1 5, 2 2, 4 1, 4 4, 1 5), (0 2, 1 2, 1 3, 0 3, 0 2)))"), col = "gray", axes = TRUE, main = "MULTIPOLYGON")
par(old_par)
```
Finally, a geometry collection can contain any combination of geometries including (multi)points and linestrings (see Figure \@ref(fig:geomcollection)):
\index{sf!geometry collection}
- Geometry collection: `GEOMETRYCOLLECTION (MULTIPOINT (5 2, 1 3, 3 4, 3 2), LINESTRING (1 5, 4 4, 4 1, 2 2, 3 2))`
```{r geomcollection, echo=FALSE, fig.cap="Illustration of a geometry collection.", fig.asp=0.4}
# Plotted - it is referenced in ch5 (st_cast)
old_par = par(pty = "s", mar = c(2, 3, 3, 0))
plot(st_as_sfc("GEOMETRYCOLLECTION (MULTIPOINT (5 2, 1 3, 3 4, 3 2), LINESTRING (1 5, 4 4, 4 1, 2 2, 3 2))"),
axes = TRUE, main = "GEOMETRYCOLLECTION", col = 1)
par(old_par)
```
### The sf class {#sf}
Simple features consist of two main parts: geometries and non-geographic attributes.
Figure \@ref(fig:02-sfdiagram) shows how an sf object is created -- geometries come from an `sfc` object, while attributes are taken from a `data.frame` or `tibble`.^[To learn more about building sf geometries from scratch, see the following Sections \@ref(sfg) and \@ref(sfc).]
```{r 02-sfdiagram, fig.cap="Building blocks of sf objects.", echo=FALSE}
# source("code/02-sfdiagram.R")
knitr::include_graphics("images/02-sfdiagram.png")
```
Non-geographic attributes represent the name of the feature or other attributes such as measured values, groups, and other things.
\index{sf!class}
To illustrate attributes, we will represent a temperature of 25°C in London on June 21, 2023.
This example contains a geometry (coordinates), and three attributes with three different classes (place name, temperature and date).^[
Other attributes might include an urbanity category (city or village), or a remark if the measurement was made using an automatic station.
]
Objects of class `sf` represent such data by combining the attributes (`data.frame`) with the simple feature geometry column (`sfc`).
They are created with `st_sf()` as illustrated below, which creates the London example described above:
```{r 02-spatial-data-33}
lnd_point = st_point(c(0.1, 51.5)) # sfg object
lnd_geom = st_sfc(lnd_point, crs = "EPSG:4326") # sfc object
lnd_attrib = data.frame( # data.frame object
name = "London",
temperature = 25,
date = as.Date("2023-06-21")
)
lnd_sf = st_sf(lnd_attrib, geometry = lnd_geom) # sf object
```
What just happened? First, the coordinates were used to create the simple feature geometry (`sfg`).
Second, the geometry was converted into a simple feature geometry column (`sfc`), with a CRS.
Third, attributes were stored in a `data.frame`, which was combined with the `sfc` object with `st_sf()`.
This results in an `sf` object, as demonstrated below (some output is omitted):
```{r 02-spatial-data-34, eval=FALSE}
lnd_sf
#> Simple feature collection with 1 features and 3 fields
#> ...
#> name temperature date geometry
#> 1 London 25 2023-06-21 POINT (0.1 51.5)
```
```{r 02-spatial-data-35}
class(lnd_sf)
```
The result shows that `sf` objects actually have two classes, `sf` and `data.frame`.
Simple features are simply data frames (square tables), but with spatial attributes stored in a list column, usually called `geometry` or `geom`, as described in Section \@ref(intro-sf).
This duality is central to the concept of simple features:
most of the time a `sf` can be treated as and behaves like a `data.frame`.
Simple features are, in essence, data frames with a spatial extension.
```{r 02-spatial-data-36, eval=FALSE, echo=FALSE}
ruan_point = st_point(c(-9, 53))
# sfc object
our_geometry = st_sfc(lnd_point, ruan_point, crs = 4326)
# data frame object
our_attributes = data.frame(
name = c("London", "Ruan"),
temperature = c(25, 13),
date = c(as.Date("2023-06-21"), as.Date("2023-06-22")),
category = c("city", "village"),
automatic = c(FALSE, TRUE))
# sf object
sf_points = st_sf(our_attributes, geometry = our_geometry)
```
### Simple feature geometries (sfg) {#sfg}
The `sfg` class represents the different simple feature geometry types in R: point, linestring, polygon (and their 'multi' equivalents, such as multipoints) or geometry collection.
\index{simple feature geometries|see {sf!sfg}}
Usually you are spared the tedious task of creating geometries on your own since you can simply import an already existing spatial file.
However, there are a set of functions to create simple feature geometry objects (`sfg`) from scratch, if needed.
The names of these functions are simple and consistent, as they all start with the `st_` prefix and end with the name of the geometry type in lowercase letters:
- A point: `st_point()`
- A linestring: `st_linestring()`
- A polygon: `st_polygon()`
- A multipoint: `st_multipoint()`
- A multilinestring: `st_multilinestring()`
- A multipolygon: `st_multipolygon()`
- A geometry collection: `st_geometrycollection()`
`sfg` objects can be created from three base R data types:
1. A numeric vector: a single point
2. A matrix: a set of points, where each row represents a point, a multipoint or linestring
3. A list: a collection of objects such as matrices, multilinestrings or geometry collections
The function `st_point()` creates single points from numeric vectors:
```{r 02-spatial-data-18}
st_point(c(5, 2)) # XY point
st_point(c(5, 2, 3)) # XYZ point
st_point(c(5, 2, 1), dim = "XYM") # XYM point
st_point(c(5, 2, 3, 1)) # XYZM point
```
The results show that XY (2D coordinates), XYZ (3D coordinates) and XYZM (3D with an additional variable, typically measurement accuracy) point types are created from vectors of lengths 2, 3, and 4, respectively.
The XYM type must be specified using the `dim` argument (which is short for dimension).
By contrast, use matrices in the case of multipoint (`st_multipoint()`) and linestring (`st_linestring()`) objects:
```{r 02-spatial-data-19}
# the rbind function simplifies the creation of matrices
## MULTIPOINT
multipoint_matrix = rbind(c(5, 2), c(1, 3), c(3, 4), c(3, 2))
st_multipoint(multipoint_matrix)
## LINESTRING
linestring_matrix = rbind(c(1, 5), c(4, 4), c(4, 1), c(2, 2), c(3, 2))
st_linestring(linestring_matrix)
```
Finally, use lists for the creation of multilinestrings, (multi-)polygons and geometry collections:
```{r 02-spatial-data-20}
## POLYGON
polygon_list = list(rbind(c(1, 5), c(2, 2), c(4, 1), c(4, 4), c(1, 5)))
st_polygon(polygon_list)
```
```{r 02-spatial-data-21}
## POLYGON with a hole
polygon_border = rbind(c(1, 5), c(2, 2), c(4, 1), c(4, 4), c(1, 5))
polygon_hole = rbind(c(2, 4), c(3, 4), c(3, 3), c(2, 3), c(2, 4))
polygon_with_hole_list = list(polygon_border, polygon_hole)
st_polygon(polygon_with_hole_list)
```
```{r 02-spatial-data-22}
## MULTILINESTRING
multilinestring_list = list(rbind(c(1, 5), c(4, 4), c(4, 1), c(2, 2), c(3, 2)),
rbind(c(1, 2), c(2, 4)))
st_multilinestring(multilinestring_list)
```
```{r 02-spatial-data-23}
## MULTIPOLYGON
multipolygon_list = list(list(rbind(c(1, 5), c(2, 2), c(4, 1), c(4, 4), c(1, 5))),
list(rbind(c(0, 2), c(1, 2), c(1, 3), c(0, 3), c(0, 2))))
st_multipolygon(multipolygon_list)
```
```{r 02-spatial-data-24, eval=FALSE}
## GEOMETRYCOLLECTION
geometrycollection_list = list(st_multipoint(multipoint_matrix),
st_linestring(linestring_matrix))
st_geometrycollection(geometrycollection_list)
#> GEOMETRYCOLLECTION (MULTIPOINT (5 2, 1 3, 3 4, 3 2),
#> LINESTRING (1 5, 4 4, 4 1, 2 2, 3 2))
```
### Simple feature columns (sfc) {#sfc}
One `sfg` object contains only a single simple feature geometry.
A simple feature geometry column (`sfc`) is a list of `sfg` objects, which is additionally able to contain information about the CRS in use.
For instance, to combine two simple features into one object with two features, we can use the `st_sfc()` function.
\index{sf!simple feature columns (sfc)}
This is important since `sfc` represents the geometry column in **sf** data frames:
```{r 02-spatial-data-25}
# sfc POINT
point1 = st_point(c(5, 2))
point2 = st_point(c(1, 3))
points_sfc = st_sfc(point1, point2)
points_sfc
```
In most cases, an `sfc` object contains objects of the same geometry type.
Therefore, when we convert `sfg` objects of type polygon into a simple feature geometry column, we would also end up with an `sfc` object of type polygon, which can be verified with `st_geometry_type()`.
Equally, a geometry column of multilinestrings would result in an `sfc` object of type multilinestring:
```{r 02-spatial-data-26}
# sfc POLYGON
polygon_list1 = list(rbind(c(1, 5), c(2, 2), c(4, 1), c(4, 4), c(1, 5)))
polygon1 = st_polygon(polygon_list1)
polygon_list2 = list(rbind(c(0, 2), c(1, 2), c(1, 3), c(0, 3), c(0, 2)))
polygon2 = st_polygon(polygon_list2)
polygon_sfc = st_sfc(polygon1, polygon2)
st_geometry_type(polygon_sfc)
```
```{r 02-spatial-data-27}
# sfc MULTILINESTRING
multilinestring_list1 = list(rbind(c(1, 5), c(4, 4), c(4, 1), c(2, 2), c(3, 2)),
                             rbind(c(1, 2), c(2, 4)))
multilinestring1 = st_multilinestring(multilinestring_list1)
multilinestring_list2 = list(rbind(c(2, 9), c(7, 9), c(5, 6), c(4, 7), c(2, 7)),
                             rbind(c(1, 7), c(3, 8)))
multilinestring2 = st_multilinestring(multilinestring_list2)
# A column of same-type geometries yields an sfc of that type
multilinestring_sfc = st_sfc(multilinestring1, multilinestring2)
st_geometry_type(multilinestring_sfc)
```
It is also possible to create an `sfc` object from `sfg` objects with different geometry types:
```{r 02-spatial-data-28}
# sfc GEOMETRY
point_multilinestring_sfc = st_sfc(point1, multilinestring1)
st_geometry_type(point_multilinestring_sfc)
```
As mentioned before, `sfc` objects can additionally store information on the CRS.
The default value is `NA` (*Not Available*), as can be verified with `st_crs()`:
```{r 02-spatial-data-29}
st_crs(points_sfc)
```
All geometries in `sfc` objects must have the same CRS.
A CRS can be specified with the `crs` argument of `st_sfc()` (or `st_sf()`), which takes a **CRS identifier** provided as a text string, such as `crs = "EPSG:4326"` (see Section \@ref(crs-in-r) for other CRS representations and details on what this means).
```{r 02-spatial-data-30, eval=FALSE}
# Set the CRS with an identifier referring to an 'EPSG' CRS code:
points_sfc_wgs = st_sfc(point1, point2, crs = "EPSG:4326")
st_crs(points_sfc_wgs) # print CRS (only first 4 lines of output shown)
#> Coordinate Reference System:
#> User input: EPSG:4326
#> wkt:
#> GEOGCRS["WGS 84",
#> ...
```
### The sfheaders package {#sfheaders}
```{r sfheaers-setup, echo=FALSE}
## Detach {sf} to remove 'print' methods
## because I want to show the underlying structure
##
## library(sf) will be called later
# unloadNamespace("sf") # errors
# pkgload::unload("sf")
```
\index{sfheaders}
**sfheaders** is an R package that speeds up the construction, conversion and manipulation of `sf` objects [@cooley_sfheaders_2020].
It focuses on building `sf` objects from vectors, matrices and data frames, rapidly, and without depending on the **sf** library; and exposing its underlying C++ code through header files (hence the name, **sfheaders**).
This approach enables others to extend it using compiled and fast-running code.
Every core **sfheaders** function has a corresponding C++ implementation, as described in the [`Cpp` vignette](https://dcooley.github.io/sfheaders/articles/Cpp.html).
For most people, the R functions will be more than sufficient to benefit from the computational speed of the package.
**sfheaders** was developed separately from **sf**, but aims to be fully compatible, creating valid `sf` objects of the type described in preceding sections.
The simplest use case for **sfheaders** is demonstrated in the code chunks below with examples of building `sfg`, `sfc`, and `sf` objects showing:
- A vector converted to `sfg_POINT`
- A matrix converted to `sfg_LINESTRING`
- A data frame converted to `sfg_POLYGON`
We will start by creating the simplest possible `sfg` object, a single coordinate pair, assigned to a vector named `v`:
```{r sfheaders-sfg_point}
#| eval: false
v = c(1, 1)
v_sfg_sfh = sfheaders::sfg_point(obj = v)
v_sfg_sfh # printing without sf loaded
#> [,1] [,2]
#> [1,] 1 1
#> attr(,"class")
#> [1] "XY" "POINT" "sfg"
```
```{r}
#| echo: false
v = c(1, 1)
v_sfg_sfh = sfheaders::sfg_point(obj = v)
```
```{r, eval=FALSE, echo=FALSE}
v_sfg_sfh = sf::st_point(v)
```
The example above shows how the `sfg` object `v_sfg_sfh` is printed when **sf** is not loaded, demonstrating its underlying structure.
When **sf** is loaded (as is the case here), the result of the above command is indistinguishable from `sf` objects:
```{r}
v_sfg_sf = st_point(v)
print(v_sfg_sf) == print(v_sfg_sfh)
```
```{r, echo=FALSE, eval=FALSE}
# (although `sfg` objects created with **sfheaders** have a dimension while `sfg` objects created with the **sf** package do not)
waldo::compare(v_sfg_sf, v_sfg_sfh)
dim(v_sfg_sf)
dim(v_sfg_sfh)
attr(v_sfg_sfh, "dim")
```
The next examples shows how **sfheaders** creates `sfg` objects from matrices and data frames:
```{r sfheaders-sfg_linestring}
# matrices
m = matrix(1:8, ncol = 2)
sfheaders::sfg_linestring(obj = m)
# data frames
df = data.frame(x = 1:4, y = 4:1)
sfheaders::sfg_polygon(obj = df)
```
Reusing the objects `v`, `m`, and `df` we can also build simple feature columns (`sfc`) as follows (outputs not shown):
```{r sfheaders-sfc_point2, eval=FALSE}
sfheaders::sfc_point(obj = v)
sfheaders::sfc_linestring(obj = m)
sfheaders::sfc_polygon(obj = df)
```
Similarly, `sf` objects can be created as follows:
```{r sfheaders-sfc_point, eval=FALSE}
sfheaders::sf_point(obj = v)
sfheaders::sf_linestring(obj = m)
sfheaders::sf_polygon(obj = df)
```
In each of these examples, the CRS is not defined.
If you plan on doing any calculations or geometric operations using **sf** functions, we encourage you to set the CRS (see Chapter \@ref(reproj-geo-data) for details):
```{r sfheaders-crs}
df_sf = sfheaders::sf_polygon(obj = df)
st_crs(df_sf) = "EPSG:4326"
```
**sfheaders** is also good at 'deconstructing' and 'reconstructing' `sf` objects, meaning converting geometry columns into data frames that contain data on the coordinates of each vertex and geometry feature (and multi-feature) ids.
It is fast and reliable at 'casting' geometry columns to different types, a topic covered in Chapter \@ref(geometry-operations).
Benchmarks, in the package's [documentation](https://dcooley.github.io/sfheaders/articles/examples.html#performance) and in test code developed for this book, show it is much faster than the `sf` package for such operations.
### Spherical geometry operations with S2 {#s2}
Spherical geometry engines are based on the fact that the world is round, while simple mathematical procedures for geocomputation, such as calculating a straight line between two points or the area enclosed by a polygon, assume planar (projected) geometries.
Since **sf** version 1.0.0, R supports spherical geometry operations 'out of the box' (and by default), thanks to its interface to Google's S2 spherical geometry engine via the **s2** interface package
\index{S2}.
S2 is perhaps best known as an example of a Discrete Global Grid System (DGGS).
Another example is the [H3](https://h3geo.org/) global hexagonal hierarchical spatial index [@bondaruk_assessing_2020].
Although potentially useful for describing locations anywhere on Earth using character strings, the main benefit of **sf**'s interface to S2 is its provision of drop-in functions for calculations such as distance, buffer, and area calculations, as described in **sf**'s built-in documentation which can be opened with the command [`vignette("sf7")`](https://r-spatial.github.io/sf/articles/sf7.html).
**sf** can run in two modes with respect to S2: on and off.
By default the S2 geometry engine is turned on, as can be verified with the following command:
```{r}
sf_use_s2()
```
An example of the consequences of turning the geometry engine off is shown below, by creating buffers around the `india` object created earlier in the chapter (note the warnings emitted when S2 is turned off) (Figure \@ref(fig:s2example)):
```{r}
india_buffer_with_s2 = st_buffer(india, 1) # 1 meter
sf_use_s2(FALSE)
india_buffer_without_s2 = st_buffer(india, 1) # 1 degree
```
```{r s2example, echo=FALSE, fig.cap="Example of the consequences of turning off the S2 geometry engine. Both representations of a buffer around India were created with the same command but the purple polygon object was created with S2 switched on, resulting in a buffer of 1 m. The larger light green polygon was created with S2 switched off, resulting in a buffer of 1 degree, which is not accurate.", message=FALSE}
library(tmap)
tm1 = tm_shape(india_buffer_with_s2) +
tm_fill(fill = hcl.colors(4, palette = "purple green")[2], lwd = 0.01) +
tm_shape(india) +
tm_fill(fill = "gray95") +
tm_title("st_buffer() with dist = 1") +
tm_title("s2 switched on (default)", position = tm_pos_in("right", "bottom"), size = 1)
tm2 = tm_shape(india_buffer_without_s2) +
tm_fill(fill = hcl.colors(4, palette = "purple green")[3], lwd = 0.01) +
tm_shape(india) +
tm_fill(fill = "gray95") +
tm_title(" ") +
tm_title("s2 switched off", position = tm_pos_in("right", "bottom"), size = 1)
tmap_arrange(tm1, tm2, ncol = 2)
```
The right panel of Figure \@ref(fig:s2example) is incorrect, as the buffer of 1 degree does not return the equal distance around the `india` polygon (for more explanation of this issue, see Section \@ref(geom-proj)).
Throughout this book, we will assume that S2 is turned on, unless explicitly stated.
Turn it on again with the following command.
```{r}
sf_use_s2(TRUE)
```
```{block2 09-gis-2, type="rmdnote"}
Although the **sf**'s use of S2 makes sense in many cases, in some cases there are good reasons for turning S2 off for the duration of an R session or even for an entire project.
As documented in issue [1771](https://github.com/r-spatial/sf/issues/1771) in **sf**'s GitHub repo, the default behavior can make code that would work with S2 turned off (and with older versions of **sf**) fail.
These edge cases include operations on polygons that are not valid according to S2's stricter definition.
If you see error messages such as `#> Error in s2_geography_from_wkb ...` it may be worth trying the command that generated the error message again, after turning off S2.
To turn off S2 for the entirety of a project, you can create a file called .Rprofile in the root directory (the main folder) of your project containing the command `sf::sf_use_s2(FALSE)`.
```
## Raster data
The spatial raster data model represents the world with a continuous grid of cells (often also called pixels; Figure \@ref(fig:raster-intro-plot):A)\index{raster data model}.
This data model often refers to so-called regular grids, in which each cell has the same, constant size -- and we will focus only on regular grids in this book.
However, several other types of grids exist, including rotated, sheared, rectilinear, and curvilinear grids (see chapter 1 of @pebesma_spatial_2023 or chapter 2 of @tennekes_elegant_2022).
The raster data model usually consists of a raster header\index{raster!header}
and a matrix (with rows and columns) representing equally spaced cells (often also called pixels; Figure \@ref(fig:raster-intro-plot):A).^[
Depending on the file format, the header is part of the actual image data file, e.g., GeoTIFF, or stored in an extra header or world file, e.g., ASCII grid formats.
There is also the headerless (flat) binary raster format which should facilitate the import into various software programs.]
The raster header\index{raster!header} defines the CRS, the extent and the origin.
\index{raster}
\index{raster data model}
The origin (or starting point) is frequently the coordinate of the lower left corner of the matrix (the **terra** package, however, uses the upper left corner, by default (Figure \@ref(fig:raster-intro-plot):B)).
The header defines the extent via the number of columns, the number of rows and the cell size resolution.
The resolution can be calculated as follows:
$$
\text{resolution} = \frac{\text{xmax} - \text{xmin}}{\text{ncol}}, \frac{\text{ymax} - \text{ymin}}{\text{nrow}}
$$
Starting from the origin, we can easily access and modify each single cell by either using the ID of a cell (Figure \@ref(fig:raster-intro-plot):B) or by explicitly specifying the rows and columns.
This matrix representation avoids storing explicitly the coordinates for the four corner points (in fact, it only stores one coordinate, namely the origin) of each cell corner as would be the case for rectangular vector polygons.
This and map algebra (Section \@ref(map-algebra)) make raster processing much more efficient and faster than vector data processing.
In contrast to vector data, the cell of one raster layer can only hold a single value.^[Thus, to store many values for a single location we need to have many raster layers.]
The value might be continuous or categorical (Figure \@ref(fig:raster-intro-plot)C).
```{r raster-intro-plot, echo = FALSE, fig.cap = "Raster data types.", fig.scap="Raster data types.", fig.asp=0.5, message=FALSE}
source("https://github.com/geocompx/geocompr/raw/main/code/02-raster-intro-plot.R", print.eval = TRUE)
```
Raster maps usually represent continuous phenomena such as elevation, temperature, population density or spectral data.
Discrete features such as soil or land-cover classes can also be represented in the raster data model.
Both uses of raster datasets are illustrated in Figure \@ref(fig:raster-intro-plot2), which shows how the borders of discrete features may become blurred in raster datasets.
Depending on the nature of the application, vector representations of discrete features may be more suitable.
```{r raster-intro-plot2, echo=FALSE, fig.cap="Examples of (A) continuous and (B) categorical rasters.", warning=FALSE, message=FALSE}
source("code/02-raster-intro-plot2.R", print.eval = TRUE)
# knitr::include_graphics("https://user-images.githubusercontent.com/1825120/146617327-45919232-a6a3-4d9d-a158-afa87f47381b.png")
```
### R packages for working with raster data
Over the last two decades, several packages for reading and processing raster datasets have been developed.
\index{raster (package)}\index{terra (package)}\index{stars (package)}
As outlined in Section \@ref(history-of-r-spatial), chief among them was **raster**, which led to a step change in R's raster capabilities when it was launched in 2010 and remained the premier package in this space until the development of **terra** and **stars**.
Both more recently developed packages provide powerful and performant functions for working with raster datasets, and there is substantial overlap between their possible use cases.
In this book, we focus on **terra**, which replaces the older and (in most cases) slower **raster**.
Before learning about how **terra**'s class system works, this section describes similarities and differences between **terra** and **stars**; this knowledge will help decide which is most appropriate in different situations.
First, **terra** focuses on the most common raster data model (regular grids), while **stars** also allows storing less popular models (including regular, rotated, sheared, rectilinear, and curvilinear grids).
While **terra** usually handles one or multi-layered rasters^[It also has an additional class `SpatRasterDataset` for storing many collections of datasets.], the **stars** package provides ways to store raster data cubes -- a raster object with many layers (e.g., bands), for many moments in time (e.g., months), and many attributes (e.g., sensor type A and sensor type B).
Importantly, in both packages, all layers or elements of a data cube must have the same spatial dimensions and extent.
Second, both packages allow either reading all of the raster data into memory or just reading its metadata -- this is usually done automatically based on the input file size.
However, they store raster values very differently.
**terra** is based on C++ code and mostly uses C++ pointers.
**stars** stores values as lists of arrays for smaller rasters or just a file path for larger ones.
Third, **stars** functions are closely related to the vector objects and functions in **sf**, while **terra** uses its own class of objects for vector data, namely `SpatVector`, but also accepts `sf` ones.^[It is also possible to convert between these classes with `vect()` (from `sf` to `SpatVector`) and `st_as_sf()` (from `SpatVector` to `sf`).]
Fourth, both packages have a different approach for how various functions work on their objects.
The **terra** package mostly relies on a large number of built-in functions, where each function has a specific purpose (e.g., resampling or cropping).
On the other hand, **stars** uses some built-in functions (usually with names starting with `st_`), some existing **dplyr** functions (e.g., `filter()` or `slice()`), and also has its own methods for existing R functions (e.g., `split()` or `aggregate()`).
Importantly, it is straightforward to convert objects from **terra** to **stars** (using `st_as_stars()`) and the other way round (using `rast()`).
We also encourage you to read @pebesma_spatial_2023 for the most comprehensive introduction to the **stars** package.
### Introduction to terra
\index{terra (package)}
The **terra** package supports raster objects in R.
It provides an extensive set of functions to create, read, export, manipulate and process raster datasets.
**terra**'s functionality is largely the same as the more mature **raster** package, but there are some differences: **terra** functions are usually more computationally efficient than **raster** equivalents.
On the other hand, the **raster** class system is popular and used by many other packages.
You can seamlessly translate between the two types of object to ensure backward compatibility with older scripts and packages, for example, with the functions [`raster()`](https://rspatial.github.io/raster/reference/raster.html), [`stack()`](https://rspatial.github.io/raster/reference/stack.html), and `brick()` in the **raster** package (see the previous chapter for more on the evolution of R packages for working with geographic data).
In addition to functions for raster data manipulation, **terra** provides many low-level functions that can form a foundation for developing new tools for working with raster datasets.
\index{terra (package)}
**terra** also lets you work on raster datasets that are too large to fit into the main memory.
In this case, **terra** provides the possibility to divide the raster into smaller chunks, and processes these iteratively instead of loading the whole raster file into RAM.
For the illustration of **terra** concepts, we will use datasets from the **spDataLarge** [@R-spDataLarge].
It consists of a few raster objects and one vector object covering an area of Zion National Park (Utah, USA).
For example, `srtm.tif` is a digital elevation model of this area (for more details, see its documentation `?srtm`).
First, let's create a `SpatRaster` object named `my_rast`:
```{r 02-spatial-data-37, message=FALSE}
raster_filepath = system.file("raster/srtm.tif", package = "spDataLarge")
my_rast = rast(raster_filepath)
class(my_rast)
```
Typing the name of the raster into the console, will print out the raster header (dimensions, resolution, extent, CRS) and some additional information (class, data source, summary of the raster values):
```{r 02-spatial-data-38}
my_rast
```
Dedicated functions report each component: `dim()` returns the number of rows, columns and layers; `ncell()` the number of cells (pixels); `res()` the spatial resolution; `ext()` its spatial extent; and `crs()` its CRS (raster reprojection is covered in Section \@ref(reproj-ras)).
`inMemory()` reports whether the raster data is stored in memory or on disk, and `sources()` specifies the file location.
```{block2 terrahelp, type='rmdnote'}
`help("terra-package")` returns a full list of all available **terra** functions.
```
### Basic map-making {#basic-map-raster}
Similar to the **sf** package, **terra** also provides `plot()` methods for its own classes.
As shown in the following command, the `plot()` function creates a basic raster plot, resulting in Figure \@ref(fig:basic-new-raster-plot).
\index{map-making!basic raster}
```{r basic-new-raster-plot, fig.cap="Basic raster plot."}
plot(my_rast)
```
There are several other approaches for plotting raster data in R that are outside the scope of this section, including:
- `plotRGB()` function from the **terra** package to create a plot based on three layers in a `SpatRaster` object
- Packages such as **tmap** to create static and interactive maps of raster and vector objects (see Chapter \@ref(adv-map))
- Functions, for example `levelplot()` from the **rasterVis** package, to create facets, a common technique for visualizing change over time
### Raster classes {#raster-classes}
\index{terra (package)}
The `SpatRaster` class represents raster objects in **terra**.
The easiest way to create a raster object in R is to read-in a raster file from disk or from a server (Section \@ref(raster-data-read)).
\index{raster!class}
```{r 02-spatial-data-41}
# Read a single-layer raster file from disk into a SpatRaster object
single_raster_file = system.file("raster/srtm.tif", package = "spDataLarge")
single_rast = rast(single_raster_file)
```
The **terra** package supports numerous drivers with the help of the GDAL library.
Rasters from files are usually not read entirely into RAM, with an exception of their header and a pointer to the file itself.
Rasters can also be created from scratch, using the same `rast()` function.
This is illustrated in the subsequent code chunk, which results in a new `SpatRaster` object.
The resulting raster consists of 36 cells (6 columns and 6 rows specified by `nrows` and `ncols`) centered around the Prime Meridian and the Equator (see `xmin`, `xmax`, `ymin` and `ymax` parameters).
Values (`vals`) are assigned to each cell: 1 to cell 1, 2 to cell 2, and so on.
Remember: `rast()` fills cells row-wise (unlike `matrix()`) starting at the upper left corner, meaning the top row contains the values 1 to 6, the second 7 to 12, etc.
For other ways of creating raster objects, see `?rast`.
```{r 02-spatial-data-42}
new_raster = rast(nrows = 6, ncols = 6,
xmin = -1.5, xmax = 1.5, ymin = -1.5, ymax = 1.5,
vals = 1:36)
```
Given the number of rows and columns as well as the extent (`xmin`, `xmax`, `ymin`, `ymax`), the resolution has to be 0.5.
The unit of the resolution is that of the underlying CRS.
Here, it is degrees, because the default CRS of raster objects is WGS84.
However, one can specify any other CRS with the `crs` argument.
The `SpatRaster` class also handles multiple layers, which typically correspond to a single multi-spectral satellite file or a time-series of rasters.
```{r 02-spatial-data-45}
multi_raster_file = system.file("raster/landsat.tif", package = "spDataLarge")
multi_rast = rast(multi_raster_file)
multi_rast
```
`nlyr()` retrieves the number of layers stored in a `SpatRaster` object:
```{r 02-spatial-data-47}
nlyr(multi_rast)
```
For multi-layer raster objects, layers can be selected with the `[[` and `$` operators, for example with commands `multi_rast[["landsat_1"]]` and `multi_rast$landsat_1`.
The `terra::subset()` function can also be used to select layers.
It accepts a layer number or its name as the second argument:
```{r, eval=FALSE}
multi_rast3 = subset(multi_rast, 3)
multi_rast4 = subset(multi_rast, "landsat_4")
```
The opposite operation, combining several `SpatRaster` objects into one, can be done using the `c` function:
```{r, eval=FALSE}
multi_rast34 = c(multi_rast3, multi_rast4)
```
```{block2 02-spatial-data-2a, type='rmdnote'}
Most `SpatRaster` objects do not store raster values, but rather a pointer to the file itself.
This has a significant side-effect -- they cannot be directly saved to `".rds"` or `".rda"` files or used in cluster computing.
In these cases, there are two main possible solutions: (1) use of the `wrap()` function that creates a special kind of temporary object that can be saved as an R object or used in cluster computing, or (2) save the object as a regular raster with `writeRaster()`.
```
<!--jn:toDo-->
<!--consider new section with other data models-->
<!-- e.g., point clouds, data cubes, meshes, etc. -->
## Coordinate Reference Systems {#crs-intro}
\index{CRS!introduction}
Vector and raster spatial data types share concepts intrinsic to spatial data.
Perhaps the most fundamental of these is the coordinate reference systems (CRSs), which defines how the spatial elements of the data relate to the surface of the Earth (or other bodies).
CRSs are either geographic or projected, as introduced at the beginning of this chapter (see Figure \@ref(fig:vectorplots)).
This section explains each type, laying the foundations for Chapter \@ref(reproj-geo-data), which provides a deep dive into setting, transforming and querying CRSs.
### Geographic coordinate reference systems
\index{CRS!geographic}
Geographic CRSs identify any location on the Earth's surface using two values --- longitude and latitude (Figure \@ref(fig:vector-crs), left panel).
*Longitude* is location in the East-West direction in angular distance from the Prime Meridian plane.
*Latitude* is angular distance North or South of the equatorial plane.
Distances in geographic CRSs are therefore not measured in meters.
This has important consequences, as demonstrated in Section \@ref(reproj-geo-data).
The surface of the Earth in geographic CRSs is represented by a spherical or ellipsoidal surface.
Spherical models assume that the Earth is a perfect sphere of a given radius -- they have the advantage of simplicity but, at the same time, they are inaccurate as the Earth is not exactly a sphere.
Ellipsoidal models are slightly more accurate, and are defined by two parameters: the equatorial radius and the polar radius.
These are suitable because the Earth is compressed: the equatorial radius is around 11.5 km longer than the polar radius [@maling_coordinate_1992].^[
The degree of compression is often referred to as *flattening*, defined in terms of the equatorial radius ($a$) and polar radius ($b$) as follows: $f = (a - b) / a$. The terms *ellipticity* and *compression* can also be used.
Because $f$ is a rather small value, digital ellipsoid models use the 'inverse flattening' ($rf = 1/f$) to define the Earth's compression.
Values of $a$ and $rf$ in various ellipsoidal models can be seen by executing `sf_proj_info(type = "ellps")`.
]
Ellipsoids are part of a wider component of CRSs: the *datum*.
This contains information on what ellipsoid to use and the precise relationship between the coordinates and location on the Earth's surface.
There are two types of datum --- geocentric (such as `WGS84`) and local (such as `NAD83`).
You can see examples of these two types of datums in Figure \@ref(fig:datum-fig).
Black lines represent a *geocentric datum*, whose center is located in the Earth's center of gravity and is not optimized for a specific location.
In a *local datum*, shown as a purple dashed line, the ellipsoidal surface is shifted to align with the surface at a particular location.
These allow local variations in Earth's surface, for example due to large mountain ranges, to be accounted for in a local CRS.
This can be seen in Figure \@ref(fig:datum-fig), where the local datum is fitted to the area of Philippines, but is misaligned with most of the rest of the planet's surface.
Both datums in Figure \@ref(fig:datum-fig) are put on top of a geoid --- a model of global mean sea level.^[Note that the geoid in Figure \@ref(fig:datum-fig) is exaggerated by a factor of 10,000 to highlight the irregular shape of the planet.]
(ref:datum-fig) Geocentric and local geodetic datums shown on top of a geoid (in false color and the vertical exaggeration by 10,000 scale factor). Image of the geoid is adapted from the work of @essd-11-647-2019.
```{r datum-fig, echo=FALSE, message=FALSE, fig.cap="(ref:datum-fig)", fig.scap="Geocentric and local geodetic datums on a geoid."}
knitr::include_graphics("images/02_datum_fig.png")
```
### Projected coordinate reference systems
\index{CRS!projected}
All projected CRSs are based on a geographic CRS, described in the previous section, and rely on map projections to convert the three-dimensional surface of the Earth into Easting and Northing (x and y) values in a projected CRS.
Projected CRSs are based on Cartesian coordinates on an implicitly flat surface (Figure \@ref(fig:vector-crs), right panel).
They have an origin, x and y axes, and a linear unit of measurement such as meters.
This transition cannot be done without adding some deformations.
Therefore, some properties of the Earth's surface are distorted in this process, such as area, direction, distance, and shape.
A projected coordinate reference system can preserve only one or two of those properties.
Projections are often named based on a property they preserve: equal-area projections preserve area, azimuthal projections preserve direction, equidistant projections preserve distance, and conformal projections preserve local shape.
There are three main groups of projection types: conic, cylindrical, and planar (azimuthal).
In a conic projection, the Earth's surface is projected onto a cone along a single line of tangency or two lines of tangency.
Distortions are minimized along the tangency lines and rise with the distance from those lines in this projection.
Therefore, it is the best suited for maps of mid-latitude areas.
A cylindrical projection maps the surface onto a cylinder.
This projection could also be created by touching the Earth's surface along a single line of tangency or two lines of tangency.
Cylindrical projections are used most often when mapping the entire world.
A planar projection projects data onto a flat surface touching the globe at a point or along a line of tangency.
It is typically used in mapping polar regions.
`sf_proj_info(type = "proj")` gives a list of the available projections supported by the PROJ library.
A quick summary of different projections, their types, properties, and suitability can be found at [www.geo-projections.com](https://www.geo-projections.com/).
We will expand on CRSs and explain how to project from one CRS to another in Chapter \@ref(reproj-geo-data).
For now, it is sufficient to know:
- That coordinate systems are a key component of geographic objects
- Which CRS your data is in, and whether it is in geographic (lon/lat) or projected (typically meters), is important and has consequences for how R handles spatial and geometry operations
- That CRSs of `sf` objects can be queried with the function `st_crs()` and CRSs of `terra` objects can be queried with the function `crs()`
```{r vector-crs, echo=FALSE, fig.cap="Examples of geographic (WGS 84; left) and projected (NAD83 / UTM zone 12N; right) coordinate systems for a vector data type.", message=FALSE, fig.asp=0.56, fig.scap="Examples of geographic and projected CRSs (vector data)."}
# source("https://github.com/geocompx/geocompr/raw/main/code/02-vector-crs.R")
knitr::include_graphics("images/02_vector_crs.png")
```
## Units
An important feature of CRSs is that they contain information about spatial units.
Clearly, it is vital to know whether a house's measurements are in feet or meters, and the same applies to maps.
It is good cartographic practice to add a *scale bar* or some other distance indicator onto maps to demonstrate the relationship between distances on the page or screen and distances on the ground.
Likewise, it is important to formally specify the units in which the geometry data or cells are measured to provide context, and to ensure that subsequent calculations are done in context.
A novel feature of geometry data in `sf` objects is that they have *native support* for units.
This means that distance, area and other geometric calculations in **sf** return values that come with a `units` attribute, defined by the **units** package [@pebesma_measurement_2016].
This is advantageous, preventing confusion caused by different units (most CRSs use meters, some use feet) and providing information on dimensionality.
This is demonstrated in the code chunk below, which calculates the area of Luxembourg:
\index{units}
\index{sf!units}
```{r 02-spatial-data-57}
luxembourg = world[world$name_long == "Luxembourg", ]
```
```{r 02-spatial-data-58}
st_area(luxembourg) # requires the s2 package in recent versions of sf
```
The output is in units of square meters (m^2^), showing that the result represents two-dimensional space.
This information, stored as an attribute (which interested readers can discover with `attributes(st_area(luxembourg))`), can feed into subsequent calculations that use units, such as population density (which is measured in people per unit area, typically per km^2^).
Reporting units prevents confusion.
To take the Luxembourg example, if the units remained unspecified, one could incorrectly assume that the units were in hectares.
To translate the huge number into a more digestible size, it is tempting to divide the results by a million (the number of square meters in a square kilometer):
```{r 02-spatial-data-59}
st_area(luxembourg) / 1000000
```
However, the result is incorrectly given again as square meters.
The solution is to set the correct units with the **units** package:
```{r 02-spatial-data-60}
units::set_units(st_area(luxembourg), km^2)
```
Units are of equal importance in the case of raster data.
However, so far **sf** is the only spatial package that supports units, meaning that people working on raster data should approach changes in the units of analysis (for example, converting pixel widths from imperial to decimal units) with care.
The `my_rast` object (see above) uses a WGS84 projection with decimal degrees as units.
Consequently, its resolution is also given in decimal degrees, but you have to know it, since the `res()` function simply returns a numeric vector.
```{r 02-spatial-data-61}
res(my_rast)
```
If we used the Universal Transverse Mercator (UTM) projection, the units would change.
```{r 02-spatial-data-62, warning=FALSE, message=FALSE}
repr = project(my_rast, "EPSG:26912")
res(repr)
```
Again, the `res()` command gives back a numeric vector without any unit, forcing us to know that the unit of the UTM projection is meters.
## Exercises {#ex2}
```{r, echo=FALSE, results='asis'}
res = knitr::knit_child('_02-ex.Rmd', quiet = TRUE, options = list(include = FALSE, eval = FALSE))
cat(res, sep = '\n')
```
================================================
FILE: 03-attribute-operations.Rmd
================================================
# Attribute data operations {#attr}
```{r, include=FALSE}
source("code/before_script.R")
```
## Prerequisites {-}
- This chapter requires the following packages to be installed and attached:
```{r 03-attribute-operations-1, message=FALSE}
library(sf) # vector data package introduced in Chapter 2
library(terra) # raster data package introduced in Chapter 2
library(dplyr) # tidyverse package for data frame manipulation
```
- It relies on **spData**, which loads datasets used in the code examples of this chapter:
```{r 03-attribute-operations-2, results='hide'}
#| message: FALSE
#| results: hide
library(spData) # spatial data package introduced in Chapter 2
```
- Also ensure you have installed the **tidyr** package, or the **tidyverse** of which it is a part, if you want to run data 'tidying' operations in Section \@ref(vec-attr-creation).
## Introduction
\index{attribute}
Attribute data is non-spatial information associated with geographic (geometry) data.
A bus stop provides a simple example: its position would typically be represented by latitude and longitude coordinates (geometry data), in addition to its name.
The [Elephant & Castle / New Kent Road](https://www.openstreetmap.org/relation/6610626) stop in London, for example, has coordinates of $-0.098$ degrees longitude and 51.495 degrees latitude which can be represented as `POINT (-0.098 51.495)` in the `sfc` representation described in Chapter \@ref(spatial-class).
Attributes, such as *name*\index{attribute}, of the POINT feature (to use simple features terminology) are the topic of this chapter.
```{r, eval=FALSE, echo=FALSE}
# Aim: find a bus stop in central London
library(osmdata)
london_coords = c(-0.1, 51.5)
london_bb = c(-0.11, 51.49, -0.09, 51.51)
bb = tmaptools::bb(london_bb)
osm_data = opq(bbox = london_bb) |>
add_osm_feature(key = "highway", value = "bus_stop") |>
osmdata_sf()
osm_data_points = osm_data$osm_points
osm_data_points[4, ]
point_vector = round(sf::st_coordinates(osm_data_points[4, ]), 3)
point_df = data.frame(name = "London bus stop", point_vector)
point_sf = sf::st_as_sf(point_df, coords = c("X", "Y"))
```
\index{attribute}
Another example is the elevation value (attribute) for a specific grid cell in raster data.
Unlike the vector data model, the raster data model stores the coordinate of the grid cell indirectly, meaning the distinction between attribute and spatial information is less clear.
To illustrate the point, think of a pixel in row 3 and column 4 of a raster matrix.
Its spatial location is defined by its index in the matrix: move from the origin four cells in the x direction (typically east and right on maps) and three cells in the y direction (typically south and down).
The raster's *resolution* defines the distance for each x- and y-step which is specified in a *header*.
The header is a vital component of raster datasets which specifies how pixels relate to spatial coordinates (see also Chapter \@ref(spatial-operations)).
This chapter teaches how to manipulate geographic objects based on attributes such as the names of bus stops in a vector dataset and elevations of pixels in a raster dataset.
For vector data, this means techniques such as subsetting and aggregation (see Sections \@ref(vector-attribute-subsetting) to \@ref(vector-attribute-aggregation)).
Sections \@ref(vector-attribute-joining) and \@ref(vec-attr-creation) demonstrate how to join data onto simple feature objects using a shared ID and how to create new variables, respectively.
Each of these operations has a spatial equivalent:
the `[` operator in base R, for example, works equally for subsetting objects based on their attributes and for subsetting spatial objects; you can also join attributes in two geographic datasets using spatial joins.
This is good news: skills developed in this chapter are cross-transferable.
After a deep dive into various types of *vector* attribute operations in the next section, *raster* attribute data operations are covered.
Creation of raster layers containing continuous and categorical attributes and extraction of cell values from one or more layers (raster subsetting) (Section \@ref(raster-subsetting)) are demonstrated.
Section \@ref(summarizing-raster-objects) provides an overview of 'global' raster operations which can be used to summarize entire raster datasets.
Chapter \@ref(spatial-operations) extends the methods presented here to the spatial world.
## Vector attribute manipulation
\index{attribute}
Geographic vector datasets are well supported in R thanks to the `sf` class, which extends base R's `data.frame`.
Like data frames, `sf` objects have one column per attribute variable (such as 'name') and one row per observation or *feature* (e.g., per bus station).
`sf` objects differ from basic data frames because they have a `geometry` column of class `sfc` which can contain a range of geographic entities (single and 'multi' point, line, and polygon features) per row.
This was described in Chapter \@ref(spatial-class), which demonstrated how *generic methods* such as `plot()` and `summary()` work with `sf` objects.
**sf** also provides generics that allow `sf` objects to behave like regular data frames, as shown by printing the class's methods:
```{r 03-attribute-operations-3, eval=FALSE}
methods(class = "sf") # methods for sf objects, first 12 shown
```
```{r 03-attribute-operations-4}
#> [1] [ [[<- $<- aggregate
#> [5] as.data.frame cbind coerce filter
#> [9] identify initialize merge plot
```
```{r 03-attribute-operations-5, eval=FALSE, echo=FALSE}
# Another way to show sf methods:
attributes(methods(class = "sf"))$info |>
dplyr::filter(!visible)
```
Many of these (`aggregate()`, `cbind()`, `merge()`, `rbind()` and `[`) are for manipulating data frames.
`rbind()`, for example, binds rows of data frames together, one 'on top' of the other.
`$<-` creates new columns.
A key feature of `sf` objects is that they store spatial and non-spatial data in the same way, as columns in a `data.frame`.
```{block2 03-attribute-operations-6, type = 'rmdnote'}
The geometry column of `sf` objects is typically called `geometry` or `geom`, but any name can be used.
The following command, for example, creates a geometry column named g:
`st_sf(data.frame(n = world$name_long), g = world$geom)`
This enables geometries imported from spatial databases to have a variety of names such as `wkb_geometry` and `the_geom`.
```
`sf` objects can also extend the tidyverse classes for data frames, `tbl_df` and `tbl`.\index{tidyverse (package)}
Thus **sf** enables the full power of R's data analysis capabilities to be unleashed on geographic data, whether you use base R or tidyverse functions for data analysis.
\index{tibble}
**sf** objects can also be used with the high-performance data processing package **data.table** although, as documented in the issue [`Rdatatable/data.table#2273`](https://github.com/Rdatatable/data.table/issues/2273), it is not fully [compatible](https://github.com/Rdatatable/data.table/issues/5352) with `sf` objects.
Before using these capabilities, it is worth recapping how to discover the basic properties of vector data objects.
Let's start by using base R functions to learn about the `world` dataset from the **spData** package:
```{r 03-attribute-operations-7}
class(world) # it's an sf object and a (tidy) data frame
dim(world) # it is a two-dimensional object, with 177 rows and 11 columns
```
\index{attribute!dropping geometries}
`world` contains ten non-geographic columns (and one geometry list column) with almost 200 rows representing the world's countries.
The function `st_drop_geometry()` keeps only the attributes data of an `sf` object, in other words removing its geometry:
```{r 03-attribute-operations-8}
world_df = st_drop_geometry(world)
class(world_df)
ncol(world_df)
```
Dropping the geometry column before working with attribute data can be useful; data manipulation processes can run faster when they work only on the attribute data and geometry columns are not always needed.
For most cases, however, it makes sense to keep the geometry column, explaining why the column is 'sticky' (it remains after most attribute operations unless specifically dropped).
Non-spatial data operations on `sf` objects only change an object's geometry when appropriate (e.g., by dissolving borders between adjacent polygons following aggregation).
Becoming skilled at geographic attribute data manipulation means becoming skilled at manipulating data frames.
For many applications, the tidyverse\index{tidyverse (package)} package **dplyr** [@R-dplyr] offers an effective approach for working with data frames.
Tidyverse compatibility is an advantage of **sf** over its predecessor **sp**, but there are some pitfalls to avoid (see the supplementary `tidyverse-pitfalls` vignette at [geocompx.org](https://geocompx.github.io/geocompkg/articles/tidyverse-pitfalls.html) for details).
### Vector attribute subsetting
Base R subsetting methods include the operator `[` and the function `subset()`.
The key **dplyr** subsetting functions are `filter()` and `slice()` for subsetting rows, and `select()` for subsetting columns.
Both approaches preserve the spatial components of attribute data in `sf` objects, while using the operator `$` or the **dplyr** function `pull()` to return a single attribute column as a vector will lose the geometry data, as we will see.
\index{attribute!subsetting}
This section focuses on subsetting `sf` data frames; for further details on subsetting vectors and non-geographic data frames we recommend reading Section [2.7](https://cran.r-project.org/doc/manuals/r-release/R-intro.html#Index-vectors) of *An Introduction to R* [@rcoreteam_introduction_2021] and chapter [4](https://adv-r.hadley.nz/subsetting.html) of *Advanced R Programming* [@wickham_advanced_2019], respectively.
\index{attribute!subsetting}
The `[` operator can subset both rows and columns.
Indices placed inside square brackets directly after a data frame object name specify the elements to keep.
The command `object[i, j]` means 'return the rows represented by `i` and the columns represented by `j`', where `i` and `j` typically contain integers or `TRUE`s and `FALSE`s (indices can also be character strings, indicating row or column names).
`object[5, 1:3]`, for example, means 'return data containing the 5th row and columns 1 to 3: the result should be a data frame with only 1 row and 3 columns, and a fourth geometry column if it's an `sf` object'.
Leaving `i` or `j` empty returns all rows or columns, so `world[1:5, ]` returns the first five rows and all 11 columns.
The examples below demonstrate subsetting with base R.
Guess the number of rows and columns in the `sf` data frames returned by each command and check the results on your own computer (see the end of the chapter for more exercises):
```{r 03-attribute-operations-9, eval=FALSE}
world[1:6, ] # subset rows by position
world[, 1:3] # subset columns by position
world[1:6, 1:3] # subset rows and columns by position
world[, c("name_long", "pop")] # columns by name
world[, c(T, T, F, F, F, F, F, T, T, F, F)] # by logical indices
world[, 888] # an index representing a non-existent column
```
```{r, eval=FALSE, echo=FALSE}
# these fail
world[c(1, 5), c(T, T)]
world[c(1, 5), c(T, T, F, F, F, F, F, T, T, F, F, F)]
```
A demonstration of the utility of using `logical` vectors for subsetting is shown in the code chunk below.
This creates a new object, `small_countries`, containing nations whose surface area is smaller than 10,000 km^2^.
```{r 03-attribute-operations-10}
i_small = world$area_km2 < 10000
summary(i_small) # a logical vector
small_countries = world[i_small, ]
```
The intermediary `i_small` (short for index representing small countries) is a logical vector that can be used to subset the seven smallest countries in the `world` by surface area.
A more concise command, which omits the intermediary object, generates the same result:
```{r 03-attribute-operations-11}
small_countries = world[world$area_km2 < 10000, ]
```
The base R function `subset()` provides another way to achieve the same result:
```{r 03-attribute-operations-12, eval=FALSE}
small_countries = subset(world, area_km2 < 10000)
```
\index{attribute!subsetting}
Base R functions are mature, stable and widely used, making them a rock solid choice, especially in contexts where reproducibility and reliability are key.
**dplyr** functions enable 'tidy' workflows which some people (the authors of this book included) find intuitive and productive for interactive data analysis, especially when combined with code editors such as RStudio that enable [auto-completion](https://support.posit.co/hc/en-us/articles/205273297-Code-Completion-in-the-RStudio-IDE) of column names.
Key functions for subsetting data frames (including `sf` data frames) with **dplyr** functions are demonstrated below.
```{r, echo=FALSE, eval=FALSE}
# Aim: benchmark base vs. dplyr subsetting
# Could move elsewhere?
i = sample(nrow(world), size = 10)
benchmark_subset = bench::mark(
world[i, ],
world |> slice(i)
)
benchmark_subset[c("expression", "itr/sec", "mem_alloc")]
# # October 2021 on laptop with CRAN version of dplyr:
# # A tibble: 2 × 3
# expression `itr/sec` mem_alloc
# <bch:expr> <dbl> <bch:byt>
# 1 world[i, ] 1744. 5.55KB
# 2 world |> slice(i) 671. 4.45KB
```
`select()` selects columns by name or position.
For example, you could select only two columns, `name_long` and `pop`, with the following command:
```{r 03-attribute-operations-14}
world1 = select(world, name_long, pop)
names(world1)
```
Note: as with the equivalent command in base R (`world[, c("name_long", "pop")]`), the sticky `geom` column remains.
`select()` also allows selecting a range of columns with the help of the `:` operator:
```{r 03-attribute-operations-15}
# all columns between name_long and pop (inclusive)
world2 = select(world, name_long:pop)
```
You can remove specific columns with the `-` operator:
```{r 03-attribute-operations-16}
# all columns except subregion and area_km2
world3 = select(world, -subregion, -area_km2)
```
Subset and rename columns at the same time with the `new_name = old_name` syntax:
```{r 03-attribute-operations-17}
world4 = select(world, name_long, population = pop)
```
It is worth noting that the command above is more concise than the base R equivalent, which requires two lines of code:
```{r 03-attribute-operations-18, eval=FALSE}
world5 = world[, c("name_long", "pop")] # subset columns by name
names(world5)[names(world5) == "pop"] = "population" # rename column manually
```
`select()` also works with 'helper functions' for more advanced subsetting operations, including `contains()`, `starts_with()` and `num_range()` (see the help page with `?select` for details).
Most **dplyr** verbs return a data frame, but you can extract a single column as a vector with `pull()`.
You can get the same result in base R with the list subsetting operators `$` and `[[`, the three following commands return the same numeric vector:
```{r 03-attribute-operations-21, eval = FALSE}
pull(world, pop)
world$pop
world[["pop"]]
```
```{r 03-attribute-operations-19, eval=FALSE, echo=FALSE}
# create throw-away data frame
d = data.frame(pop = 1:10, area = 1:10)
# return data frame object when selecting a single column
d[, "pop", drop = FALSE] # equivalent to d["pop"]
select(d, pop)
# return a vector when selecting a single column
d[, "pop"]
pull(d, pop)
```
```{r 03-attribute-operations-20, echo=FALSE, eval=FALSE}
x1 = d[, "pop", drop = FALSE] # equivalent to d["pop"]
x2 = d["pop"]
identical(x1, x2)
```
`slice()` is the row-equivalent of `select()`.
The following code chunk, for example, selects rows 1 to 6:
```{r 03-attribute-operations-22, eval=FALSE}
slice(world, 1:6)
```
`filter()` is **dplyr**'s equivalent of base R's `subset()` function.
It keeps only rows matching given criteria, e.g., only countries with an area below a certain threshold, or with a high average of life expectancy, as shown in the following examples:
```{r 03-attribute-operations-23, eval=FALSE}
world7 = filter(world, area_km2 < 10000) # countries with a small area
world7 = filter(world, lifeExp > 82) # with high life expectancy
```
The standard set of comparison operators can be used in the `filter()` function, as illustrated in Table \@ref(tab:operators).
```{r operators0, echo=FALSE}
if (knitr::is_html_output()){
operators = c("`==`", "`!=`", "`>`, `<`", "`>=`, `<=`", "`&`, <code>|</code>, `!`")
} else {
operators = c("==", "!=", ">, <", ">=, <=", "&, |, !")
}
```
```{r operators, echo=FALSE}
operators_exp = c("Equal to", "Not equal to", "Greater/Less than",
"Greater/Less than or equal",
"Logical operators: And, Or, Not")
knitr::kable(tibble(Symbol = operators, Name = operators_exp),
caption = "Comparison operators that return Boolean (true/false) values.",
caption.short = "Comparison operators.",
booktabs = TRUE)
```
### Chaining commands with pipes
\index{pipe operator}
Key to workflows using **dplyr** functions is the ['pipe'](https://r4ds.had.co.nz/pipes.html) operator `%>%` (or since R `4.1.0` the native pipe `|>`), which takes its name from the Unix pipe `|` [@grolemund_r_2016].
Pipes enable expressive code: the output of a previous function becomes the first argument of the next function, enabling *chaining*.
This is illustrated below, in which only countries from Asia are filtered from the `world` dataset, next the object is subset by columns (`name_long` and `continent`) and the first five rows (result not shown).
```{r 03-attribute-operations-24}
world7 = world |>
filter(continent == "Asia") |>
select(name_long, continent) |>
slice(1:5)
```
The above chunk shows how the pipe operator allows commands to be written in a clear order:
the commands above run from top to bottom (line-by-line) and left to right.
An alternative to piped operations is nested function calls, which are harder to read:
```{r 03-attribute-operations-25}
world8 = slice(
select(
filter(world, continent == "Asia"),
name_long, continent),
1:5)
```
Another alternative is to split the operations into multiple self-contained lines, which is recommended when developing new R packages, an approach which has the advantage of saving intermediate results with distinct names which can be later inspected for debugging purposes (an approach which has disadvantages of being verbose and cluttering the global environment when undertaking interactive analysis):
```{r 03-attribute-operations-25-2}
world9_filtered = filter(world, continent == "Asia")
world9_selected = select(world9_filtered, name_long, continent)
world9 = slice(world9_selected, 1:5)
```
Each approach has advantages and disadvantages, the importance of which depend on your programming style and applications.
For interactive data analysis, the focus of this chapter, we find piped operations fast and intuitive, especially when combined with [RStudio](https://support.posit.co/hc/en-us/articles/200711853-Keyboard-Shortcuts-in-the-RStudio-IDE)/[VSCode](https://github.com/REditorSupport/vscode-R/wiki/Keyboard-shortcuts) shortcuts for creating pipes and [auto-completing](https://support.posit.co/hc/en-us/articles/205273297-Code-Completion-in-the-RStudio-IDE) variable names.
### Vector attribute aggregation
\index{attribute!aggregation}
\index{aggregation}
Aggregation involves summarizing data with one or more 'grouping variables', typically from columns in the data frame to be aggregated (geographic aggregation is covered in the next chapter).
An example of attribute aggregation is calculating the number of people per continent based on country-level data (one row per country).
The `world` dataset contains the necessary ingredients: the columns `pop` and `continent`, the population and the grouping variable, respectively.
The aim is to find the `sum()` of country populations for each continent, resulting in a smaller data frame (aggregation is a form of data reduction and can be a useful early step when working with large datasets).
This can be done with the base R function `aggregate()` as follows:
```{r 03-attribute-operations-26}
world_agg1 = aggregate(pop ~ continent, FUN = sum, data = world,
na.rm = TRUE)
class(world_agg1)
```
The result is a non-spatial data frame with six rows, one per continent, and two columns reporting the name and population of each continent (see Table \@ref(tab:continents) with results for the top three most populous continents).
`aggregate()` is a [generic function](https://adv-r.hadley.nz/s3.html#s3-methods) which means that it behaves differently depending on its inputs.
**sf** provides the method `aggregate.sf()` which is activated automatically when `x` is an `sf` object and a `by` argument is provided:
```{r 03-attribute-operations-27}
world_agg2 = aggregate(world["pop"], by = list(world$continent), FUN = sum,
na.rm = TRUE)
class(world_agg2)
nrow(world_agg2)
```
The resulting `world_agg2` object is a spatial object containing eight features representing the continents of the world (and the open ocean).
\index{attribute!aggregation}
`group_by() |> summarize()` is the **dplyr** equivalent of `aggregate()`.
Grouping variables are defined in the `group_by()` function and the aggregation formula is defined in the `summarize()` function, as shown below:
```{r 03-attribute-operations-28}
world_agg3 = world |>
group_by(continent) |>
summarize(pop = sum(pop, na.rm = TRUE))
```
The approach may seem more complex, but it has benefits: flexibility, readability, and control over the new column names.
This flexibility is illustrated in the command below, which calculates not only the population but also the area and number of countries in each continent:
```{r 03-attribute-operations-29}
world_agg4 = world |>
group_by(continent) |>
summarize(Pop = sum(pop, na.rm = TRUE), Area = sum(area_km2), N = n())
```
In the previous code chunk `Pop`, `Area` and `N` are column names in the result, and `sum()` and `n()` were the aggregating functions.
These aggregating functions return `sf` objects with rows representing continents and geometries containing the multiple polygons representing each land mass and associated islands (this works thanks to the geometric operation 'union', as explained in Section \@ref(geometry-unions)).
\index{pipe operator}
\index{attribute!subsetting}
\index{attribute!aggregation}
Let's combine what we have learned so far about **dplyr** functions, by chaining multiple commands to summarize attribute data about countries worldwide by continent.
The following command calculates population density (with `mutate()`), arranges continents by the number of countries they contain (with `arrange()`), and keeps only the three most populous continents (with `slice_max()`), the result of which is presented in Table \@ref(tab:continents)):
```{r 03-attribute-operations-30}
world_agg5 = world |>
st_drop_geometry() |> # drop the geometry for speed
select(pop, continent, area_km2) |> # subset the columns of interest
group_by(Continent = continent) |> # group by continent and summarize:
summarize(Pop = sum(pop, na.rm = TRUE), Area = sum(area_km2), N = n()) |>
mutate(Density = round(Pop / Area)) |> # calculate population density
slice_max(Pop, n = 3) |> # keep only the top 3
arrange(desc(N)) # arrange in order of n. countries
```
```{r continents, echo=FALSE}
options(scipen = 999)
knitr::kable(
world_agg5,
caption = "The top three most populous continents ordered by number of countries.",
caption.short = "Top three most populous continents.",
booktabs = TRUE
)
```
```{block2 03-attribute-operations-31, type='rmdnote'}
More details are provided in the help pages (which can be accessed via `?summarize` and `vignette(package = "dplyr")`) and in Chapter 5 of [R for Data Science](https://r4ds.had.co.nz/transform.html#grouped-summaries-with-summarize).
```
### Vector attribute joining
Combining data from different sources is a common task in data preparation.
Joins do this by combining tables based on a shared 'key' variable.
**dplyr** has multiple join functions including `left_join()` and `inner_join()` --- see `vignette("two-table")` for a full list.
These function names follow conventions used in the database language [SQL](https://r4ds.had.co.nz/relational-data.html) [@grolemund_r_2016, Chapter 13]; using them to join non-spatial datasets to `sf` objects is the focus of this section.
**dplyr** join functions work the same on data frames and `sf` objects, the only important difference being the `geometry` list column.
The result of data joins can be either an `sf` or `data.frame` object.
The most common type of attribute join on spatial data takes an `sf` object as the first argument and adds columns to it from a `data.frame` specified as the second argument.
\index{join}
\index{attribute!join}
To demonstrate joins, we will combine data on coffee production with the `world` dataset.
The coffee data is in a data frame called `coffee_data` from the **spData** package (see `?coffee_data` for details).
It has three columns:
`name_long` names major coffee-producing nations and `coffee_production_2016` and `coffee_production_2017` contain estimated values for coffee production in units of 60-kg bags in each year.
A 'left join', which preserves the first dataset, merges `world` with `coffee_data`.
```{r 03-attribute-operations-32, warning=FALSE}
world_coffee = left_join(world, coffee_data)
class(world_coffee)
```
Because the input datasets share a 'key variable' (`name_long`) the join worked without using the `by` argument (see `?left_join` for details).
The result is an `sf` object identical to the original `world` object but with two new variables (with column indices 11 and 12) on coffee production.
This can be plotted as a map, as illustrated in Figure \@ref(fig:coffeemap), generated with the `plot()` function below.
```{r coffeemap, fig.cap="World coffee production (thousand 60-kg bags) by country, 2017. Source: International Coffee Organization.", fig.scap="World coffee production by country."}
names(world_coffee)
plot(world_coffee["coffee_production_2017"])
```
For joining to work, a 'key variable' must be supplied in both datasets.
By default, **dplyr** uses all variables with matching names.
In this case, both `coffee_data` and `world` objects contained a variable called `name_long`, explaining the message `Joining with 'by = join_by(name_long)'`.
In the majority of cases where variable names are not the same, you have two options:
1. Rename the key variable in one of the objects so they match.
2. Use the `by` argument to specify the joining variables.
The latter approach is demonstrated below on a renamed version of `coffee_data`.
```{r 03-attribute-operations-33, warning=FALSE}
coffee_renamed = rename(coffee_data, nm = name_long)
world_coffee2 = left_join(world, coffee_renamed, by = join_by(name_long == nm))
```
```{r 03-attribute-operations-34, eval=FALSE, echo=FALSE}
identical(world_coffee, world_coffee2)
nrow(world)
nrow(world_coffee)
```
Note that the name in the original object is kept, meaning that `world_coffee` and the new object `world_coffee2` are identical.
Another feature of the result is that it has the same number of rows as the original dataset.
Although there are only 47 rows of data in `coffee_data`, all 177 country records are kept intact in `world_coffee` and `world_coffee2`:
rows in the original dataset with no match are assigned `NA` values for the new coffee production variables.
What if we only want to keep countries that have a match in the key variable?
\index{attribute!join}
In that case, an inner join can be used.
```{r 03-attribute-operations-35, warning=FALSE}
world_coffee_inner = inner_join(world, coffee_data)
nrow(world_coffee_inner)
```
Note that the result of `inner_join()` has only 45 rows compared with 47 in `coffee_data`.
What happened to the remaining rows?
We can identify the rows that did not match using the `setdiff()` function as follows:
```{r 03-attribute-operations-36}
setdiff(coffee_data$name_long, world$name_long)
```
The result shows that `Others` accounts for one row not present in the `world` dataset and that the name of the `Democratic Republic of the Congo` accounts for the other:
it has been abbreviated, causing the join to miss it.
The following command uses a string matching (*regex*) function from the **stringr** package to confirm what `Congo, Dem. Rep. of` should be.
```{r 03-attribute-operations-37}
drc = stringr::str_subset(world$name_long, "Dem*.+Congo")
drc
```
```{r, echo=FALSE, eval=FALSE}
world$name_long[grepl(pattern = "Dem*.+Congo", world$name_long)] # base R
```
```{r 03-attribute-operations-38, eval=FALSE, echo=FALSE}
# aim: test names in coffee_data and world objects
str_subset(coffee_data$name_long, "Ivo|Congo,")
.Last.value %in% str_subset(world$name_long, "Ivo|Dem*.+Congo")
```
To fix this issue, we will create a new version of `coffee_data` and update the name.
`inner_join()`ing the updated data frame returns a result with all 46 coffee-producing nations.
```{r 03-attribute-operations-39, warning=FALSE}
coffee_data$name_long[grepl("Congo,", coffee_data$name_long)] = drc
world_coffee_match = inner_join(world, coffee_data)
nrow(world_coffee_match)
```
It is also possible to join in the other direction: starting with a non-spatial dataset and adding variables from a simple features object.
This is demonstrated below, which starts with the `coffee_data` object and adds variables from the original `world` dataset.
In contrast with the previous joins, the result is *not* another simple feature object, but a data frame in the form of a **tidyverse** tibble:
the output of a join tends to match its first argument.
```{r 03-attribute-operations-40, warning=FALSE}
coffee_world = left_join(coffee_data, world)
class(coffee_world)
```
```{block2 03-attribute-operations-41, type='rmdnote'}
In most cases, the geometry column is only useful in an `sf` object.
The geometry column can only be used for creating maps and spatial operations if R 'knows' it is a spatial object, defined by a spatial package such as **sf**.
Fortunately, non-spatial data frames with a geometry list column (like `coffee_world`) can be coerced into an `sf` object as follows: `st_as_sf(coffee_world)`.
```
This section covers the majority of joining use cases.
For more information, we recommend reading the chapter [Relational data](https://r4ds.had.co.nz/relational-data.html?q=join#relational-data) in @grolemund_r_2016, the [join vignette](https://geocompx.github.io/geocompkg/articles/join.html) in the **geocompkg** package that accompanies this book, and [documentation](https://asardaes.github.io/table.express/articles/joins.html) describing joins with **data.table** and other packages.
Additionally, spatial joins are covered in the next chapter (Section \@ref(spatial-joining)).
### Creating attributes and removing spatial information {#vec-attr-creation}
\index{attribute!create}
Often, we would like to create a new column based on already existing columns.
For example, we want to calculate population density for each country.
For this we need to divide a population column, here `pop`, by an area column, here `area_km2` with unit area in square kilometers.
Using base R, we can type:
```{r 03-attribute-operations-42}
world_new = world # do not overwrite our original data
world_new$pop_dens = world_new$pop / world_new$area_km2
```
\index{attribute!create}
Alternatively, we can use one of **dplyr** functions: `mutate()` or `transmute()`.
`mutate()` adds new columns at the penultimate position in the `sf` object (the last one is reserved for the geometry):
```{r 03-attribute-operations-43, eval=FALSE}
world_new2 = world |>
mutate(pop_dens = pop / area_km2)
```
The difference between `mutate()` and `transmute()` is that the latter drops all other existing columns (except for the sticky geometry column).
\index{attribute!create}
`unite()` from the **tidyr** package (which provides many useful functions for reshaping datasets, including `pivot_longer()`) pastes together existing columns.
For example, we want to combine the `continent` and `region_un` columns into a new column named `con_reg`.
Additionally, we can define a separator (here, a colon `:`) which defines how the values of the input columns should be joined, and if the original columns should be removed (here, `TRUE`).
```{r 03-attribute-operations-45, eval=FALSE}
world_unite = world |>
tidyr::unite("con_reg", continent:region_un, sep = ":", remove = TRUE)
```
The resulting `sf` object has a new column called `con_reg` representing the continent and region of each country, e.g., `South America:Americas` for Argentina and other South America countries.
\index{attribute!create}
**tidyr**'s `separate()` function does the opposite of `unite()`: it splits one column into multiple columns using either a regular expression or character positions.
```{r 03-attribute-operations-46, eval=FALSE}
world_separate = world_unite |>
tidyr::separate(con_reg, c("continent", "region_un"), sep = ":")
```
```{r 03-attribute-operations-47, echo=FALSE, eval=FALSE}
identical(world, world_separate)
```
\index{attribute!create}
The **dplyr** function `rename()` and the base R function `setNames()` are useful for renaming columns.
The first replaces an old name with a new one.
The following command, for example, renames the lengthy `name_long` column to simply `name`:
```{r 03-attribute-operations-48, eval=FALSE}
world |>
rename(name = name_long)
```
\index{attribute!create}
`setNames()` changes all column names at once, and requires a character vector with a name matching each column.
This is illustrated below, which outputs the same `world` object, but with very short names:
```{r 03-attribute-operations-49, eval=FALSE, echo=FALSE}
abbreviate(names(world), minlength = 1) |> dput()
```
```{r 03-attribute-operations-50, eval=FALSE}
new_names = c("i", "n", "c", "r", "s", "t", "a", "p", "l", "gP", "geom")
world_new_names = world |>
setNames(new_names)
```
\index{attribute!create}
Each of these attribute data operations preserves the geometry of the simple features.
Sometimes, it makes sense to remove the geometry, for example to speed up aggregation.
Do this with `st_drop_geometry()`, **not** manually with commands such as `select(world, -geom)`, as shown below.^[
`st_geometry(world) = NULL` also works to remove the geometry from `world`, but it overwrites the original object.
]
```{r 03-attribute-operations-51}
world_data = world |> st_drop_geometry()
class(world_data)
```
## Manipulating raster objects
In contrast to the vector data model underlying simple features (which represents points, lines and polygons as discrete entities in space), raster data represent continuous surfaces.
This section shows how raster objects work by creating them *from scratch*, building on Section \@ref(introduction-to-terra).
Because of their unique structure, subsetting and other operations on raster datasets work in a different way, as demonstrated in Section \@ref(raster-subsetting).
\index{raster!manipulation}
The following code recreates the raster dataset used in Section \@ref(raster-classes), the result of which is illustrated in Figure \@ref(fig:cont-raster).
This demonstrates how the `rast()` function works to create an example raster named `elev` (representing elevations).
```{r 03-attribute-operations-52, message=FALSE, eval = FALSE}
elev = rast(nrows = 6, ncols = 6,
xmin = -1.5, xmax = 1.5, ymin = -1.5, ymax = 1.5,
vals = 1:36)
```
The result is a raster object with 6 rows and 6 columns (specified by the `nrows` and `ncols` arguments), and a minimum and maximum spatial extent in x and y direction (`xmin`, `xmax`, `ymin`, `ymax`).
The `vals` argument sets the values that each cell contains: numeric data ranging from 1 to 36 in this case.
\index{raster!manipulation}
\index{raster!categorical}
Raster objects can also contain categorical values of class `logical` or `factor` variables in R.
The following code creates the raster datasets shown in Figure \@ref(fig:cont-raster):
```{r 03-attribute-operations-53, eval=FALSE}
grain_order = c("clay", "silt", "sand")
grain_char = sample(grain_order, 36, replace = TRUE)
grain_fact = factor(grain_char, levels = grain_order)
grain = rast(nrows = 6, ncols = 6,
xmin = -1.5, xmax = 1.5, ymin = -1.5, ymax = 1.5,
vals = grain_fact)
```
```{r 03-attribute-operations-54, include=FALSE}
elev = rast(system.file("raster/elev.tif", package = "spData"))
grain = rast(system.file("raster/grain.tif", package = "spData"))
```
\index{raster!categorical}
\index{raster attribute table}
The raster object stores the corresponding look-up table or "Raster Attribute Table" (RAT) as a list of data frames, which can be viewed with `cats(grain)` (see `?cats()` for more information).
Each element of this list is a layer of the raster.
It is also possible to use the function `levels()` to retrieve factor levels, and to add new or replace existing ones.
```{r 03-attribute-operations-56}
grain2 = grain # do not overwrite the original data
levels(grain2) = data.frame(value = c(0, 1, 2), wetness = c("wet", "moist", "dry"))
levels(grain2)
```
```{r cont-raster, echo = FALSE, message = FALSE, fig.asp=0.5, fig.cap = "Raster datasets with numeric (left) and categorical values (right).", fig.scap="Raster datasets with numeric and categorical values.", warning=FALSE}
# knitr::include_graphics("https://user-images.githubusercontent.com/1825120/146617366-7308535b-30f6-4c87-83f7-21702c7d993b.png")
source("code/03-cont-raster-plot.R", print.eval = TRUE)
```
```{block2 coltab, type='rmdnote'}
Categorical raster objects can also store information about the colors associated with each value using a color table.
The color table is a data frame with three (red, green, blue) or four (alpha) columns, where each row relates to one value.
Color tables in **terra** can be viewed or set with the `coltab()` function (see `?coltab`).
Importantly, saving a raster object with a color table to a file (e.g., GeoTIFF) will also save the color information.
```
### Raster subsetting
Raster subsetting is done with the base R operator `[`, which accepts a variety of inputs:
\index{raster!subsetting}
- Row-column indexing
- Cell IDs
- Coordinates
- Another spatial object
Here, we only show the first two options since these can be considered non-spatial operations.
If we need a spatial object to subset another or the output is a spatial object, we refer to this as spatial subsetting.
Therefore, the latter two options will be shown in the next chapter (see Section \@ref(spatial-raster-subsetting)).
\index{raster!subsetting}
The first two subsetting options are demonstrated in the commands below ---
both return the value of the top left pixel in the raster object `elev` (results not shown).
```{r 03-attribute-operations-58, eval = FALSE}
# row 1, column 1
elev[1, 1]
# cell ID 1
elev[1]
```
Subsetting of multi-layered raster objects will return the cell value(s) for each layer.
For example, `two_layers = c(grain, elev); two_layers[1]` returns a data frame with one row and two columns --- one for each layer.
To extract all values, you can also use `values()`.
Cell values can be modified by overwriting existing values in conjunction with a subsetting operation.
The following code chunk, for example, sets the upper left cell of `elev` to 0 (results not shown):
```{r 03-attribute-operations-60, results='hide'}
elev[1, 1] = 0
elev[]
```
Leaving the square brackets empty is a shortcut version of `values()` for retrieving all values of a raster.
Multiple cells can also be modified in this way:
```{r 03-attribute-operations-61}
elev[1, c(1, 2)] = 0
```
Replacing values of multi-layered rasters can be done with a matrix that has as many columns as layers and as many rows as cells to be replaced (results not shown):
```{r 03-attribute-operations-61b, eval=FALSE}
two_layers = c(grain, elev)
two_layers[1] = cbind(c(1), c(4))
two_layers[]
```
### Summarizing raster objects
**terra** contains functions for extracting descriptive statistics\index{statistics} for entire rasters.
Printing a raster object to the console by typing its name returns minimum and maximum values of a raster.
\index{raster!summarizing}
`summary()` provides common descriptive statistics\index{statistics} -- minimum, maximum, quartiles and number of `NA`s for continuous rasters and the number of cells of each class for categorical rasters.
\index{raster!summarizing}
Further summary operations such as the standard deviation (see below) or custom summary statistics can be calculated with `global()`.
```{r 03-attribute-operations-62, eval = FALSE}
global(elev, sd)
```
```{block2 03-attribute-operations-63, type='rmdnote'}
If you provide the `summary()` and `global()` functions with a multi-layered raster object, they will summarize each layer separately, as can be illustrated by running: `summary(c(elev, grain))`.
```
\index{raster!summarizing}
Additionally, the `freq()` function returns the frequency table of categorical values.
```{r}
freq(grain)
```
Raster value statistics can be visualized in a variety of ways.
Specific functions such as `boxplot()`, `density()`, `hist()` and `pairs()` work also with raster objects, as demonstrated in the histogram created with the command below (not shown).
```{r 03-attribute-operations-64, eval=FALSE}
hist(elev)
```
\index{raster!values}
In case the desired visualization function does not work with raster objects, one can extract the raster data to be plotted with the help of `values()` (Section \@ref(raster-subsetting)).
Descriptive raster statistics belong to the so-called global raster operations.
These and other typical raster processing operations are part of the map algebra scheme, which are covered in the next chapter (Section \@ref(map-algebra)).
```{block 03-attribute-operations-65, type='rmdnote'}
Some function names clash between packages (e.g., functions with the name `extract()` exist in both **terra** and **tidyr** packages).
This may lead to unexpected results when loading packages in a different order.
In addition to calling functions verbosely with their full namespace (e.g., `tidyr::extract()`) to avoid attaching packages with `library()`, another way to prevent function name clashes is by unloading the offending package with `detach()`.
The following command, for example, unloads the **terra** package (this can also be done in the *package* tab which resides by default in the right-bottom pane in RStudio): `detach("package:terra", unload = TRUE, force = TRUE)`.
The `force` argument makes sure that the package will be detached even if other packages depend on it.
This, however, may lead to a restricted usability of packages depending on the detached package, and it is therefore not recommended.
```
## Exercises
```{r, echo=FALSE, results='asis'}
res = knitr::knit_child('_03-ex.Rmd', quiet = TRUE, options = list(include = FALSE, eval = FALSE))
cat(res, sep = '\n')
```
================================================
FILE: 04-spatial-operations.Rmd
================================================
# Spatial data operations {#spatial-operations}
```{r, include=FALSE}
source("code/before_script.R")
```
## Prerequisites {-}
- This chapter requires the same packages used in Chapter \@ref(attr):
```{r 04-spatial-operations-1, message=FALSE, results='hide'}
library(sf)
library(terra)
library(dplyr)
library(spData)
```
## Introduction
Spatial operations, including spatial joins between vector datasets and local and focal operations on raster datasets, are a vital part of geocomputation\index{geocomputation}.
This chapter shows how spatial objects can be modified in a multitude of ways based on their location and shape.
Many spatial operations have a non-spatial (attribute) equivalent, so concepts such as subsetting and joining datasets demonstrated in the previous chapter are applicable here.
This is especially true for *vector* operations: Section \@ref(vector-attribute-manipulation) on vector attribute manipulation provides the basis for understanding its spatial counterpart, namely spatial subsetting (covered in Section \@ref(spatial-subsetting)).
Spatial joining (Sections \@ref(spatial-joining), \@ref(non-overlapping-joins) and \@ref(incongruent)) and aggregation (Section \@ref(spatial-aggr)) also have non-spatial counterparts, covered in the previous chapter.
Spatial operations differ from non-spatial operations in a number of ways, however:
spatial joins, for example, can be done in a number of ways --- including matching entities that intersect with or are within a certain distance of the target dataset --- while the attribution joins discussed in Section \@ref(vector-attribute-joining) in the previous chapter can only be done in one way (except when using fuzzy joins, as described in the documentation of the [**fuzzyjoin**](https://cran.r-project.org/package=fuzzyjoin) package).
Different *types* of spatial relationship between objects, including intersects and disjoint, are described in Sections \@ref(topological-relations) and \@ref(DE-9IM-strings).
\index{spatial operations}
Another unique aspect of spatial objects is distance: all spatial objects are related through space, and distance calculations can be used to explore the strength of this relationship, as described in the context of vector data in Section \@ref(distance-relations).
Spatial operations on raster objects include subsetting --- covered in Section \@ref(spatial-raster-subsetting).
*Map algebra* covers a range of operations that modify raster cell values, with or without reference to surrounding cell values.
The concept of map algebra, vital for many applications, is introduced in Section \@ref(map-algebra); local, focal and zonal map algebra operations are covered in sections \@ref(local-operations), \@ref(focal-operations), and \@ref(zonal-operations), respectively.
Global map algebra operations, which generate summary statistics representing an entire raster dataset, and distance calculations on rasters, are discussed in Section \@ref(global-operations-and-distances).
Next, the relationships between map algebra and vector operations are discussed in Section \@ref(map-algebra-counterparts-in-vector-processing).
In Section \@ref(merging-rasters), the process of merging two raster datasets is discussed and demonstrated with reference to a reproducible example.
```{block2 04-spatial-operations-2, type='rmdnote'}
It is important to note that spatial operations that use two spatial objects rely on both objects having the same coordinate reference system, a topic that was introduced in Section \@ref(crs-intro) and which will be covered in more depth in Chapter \@ref(reproj-geo-data).
```
## Spatial operations on vector data {#spatial-vec}
This section provides an overview of spatial operations on vector geographic data represented as simple features in the **sf** package.
Section \@ref(spatial-ras) presents spatial operations on raster datasets using classes and functions from the **terra** package.
### Spatial subsetting
Spatial subsetting is the process of taking a spatial object and returning a new object containing only features that *relate* in space to another object.
Analogous to *attribute subsetting* (covered in Section \@ref(vector-attribute-subsetting)), subsets of `sf` data frames can be created with square bracket (`[`) operator using the syntax `x[y, , op = st_intersects]`, where `x` is an `sf` object from which a subset of rows will be returned, `y` is the 'subsetting object' and `, op = st_intersects` is an optional argument that specifies the topological relation (also known as the binary predicate) used to do the subsetting.
The default topological relation used when an `op` argument is not provided is `st_intersects()`: the command `x[y, ]` is identical to `x[y, , op = st_intersects]` shown above but not `x[y, , op = st_disjoint]` (the meaning of these and other topological relations is described in the next section).
The `filter()` function from the **tidyverse**\index{tidyverse (package)} can also be used, but this approach is more verbose, as we will see in the examples below.
\index{vector!subsetting}
\index{spatial!subsetting}
To demonstrate spatial subsetting, we will use the `nz` and `nz_height` datasets in the **spData** package, which contain geographic data on the 16 main regions and 101 highest points in New Zealand, respectively (Figure \@ref(fig:nz-subset)), in a projected coordinate reference system.
The following code chunk creates an object representing Canterbury, then uses spatial subsetting to return all high points in the region.
```{r 04-spatial-operations-3}
canterbury = nz |> filter(Name == "Canterbury")
canterbury_height = nz_height[canterbury, ]
```
```{r nz-subset, echo=FALSE, warning=FALSE, fig.cap="Spatial subsetting, with red triangles representing 101 high points in New Zealand, clustered near the central Canterbury region (left). The points in Canterbury were created with the `[` subsetting operator (highlighted in gray, right).", fig.scap="Spatial subsetting.", message=FALSE}
library(tmap)
# Left panel: all 101 high points plotted over every NZ region
p_hpnz1 = tm_shape(nz) +
  tm_polygons(fill = "white") +
  tm_shape(nz_height) +
  tm_symbols(shape = 2, col = "red", size = 0.5, col_alpha = 0.75) +
  tm_title("High points in New Zealand") +
  tm_layout(bg.color = "lightblue")
# Right panel: Canterbury shaded gray, with only the spatially
# subset points (canterbury_height, created above) overlaid
p_hpnz2 = tm_shape(nz) +
  tm_polygons(fill = "white") +
  tm_shape(canterbury) +
  tm_fill(col = "gray") +
  tm_shape(canterbury_height) +
  tm_symbols(shape = 2, col = "red", size = 0.5, col_alpha = 0.75) +
  tm_title("High points in Canterbury") +
  tm_layout(bg.color = "lightblue")
# Arrange the two maps side by side
tmap_arrange(p_hpnz1, p_hpnz2, ncol = 2)
```
Like attribute subsetting, the command `x[y, ]` (equivalent to `nz_height[canterbury, ]`) subsets features of a *target* `x` using the contents of a *source* object `y`.
Instead of `y` being a vector of class `logical` or `integer`, however, for spatial subsetting both `x` and `y` must be geographic objects.
Specifically, objects used for spatial subsetting in this way must have the class `sf` or `sfc`: both `nz` and `nz_height` are geographic vector data frames and have the class `sf`, and the result of the operation returns another `sf` object representing the features in the target `nz_height` object that intersect with (in this case high points that are located within) the `canterbury` region.
Various *topological relations*\index{topological relations} can be used for spatial subsetting which determine the type of spatial relationship that features in the target object must have with the subsetting object to be selected.
These include *touches*, *crosses* or *within*, as we will see shortly in Section \@ref(topological-relations).
The default setting `st_intersects` is a 'catch all' topological relation that will return features in the target that *touch*, *cross* or are *within* the source 'subsetting' object.
Alternative spatial operators can be specified with the `op =` argument, as demonstrated in the following command which returns the opposite of `st_intersects()`, points that do not intersect with Canterbury (see Section \@ref(topological-relations)).
```{r 04-spatial-operations-4, eval=FALSE}
nz_height[canterbury, , op = st_disjoint]
```
```{block2 04-spatial-operations-5, type='rmdnote'}
Note the empty argument --- denoted with `, ,` --- in the preceding code chunk is included to highlight `op`, the third argument in `[` for `sf` objects.
One can use this to change the subsetting operation in many ways.
`nz_height[canterbury, 2, op = st_disjoint]`, for example, returns the same rows but only includes the second attribute column (see `` sf::`[.sf` `` and `?sf` for details).
```
For many applications, this is all you'll need to know about spatial subsetting for vector data: it just works.
If you are impatient to learn about more topological relations, beyond `st_intersects()` and `st_disjoint()`, skip to the next section (\@ref(topological-relations)).
If you're interested in the details, including other ways of subsetting, read on.
Another way of doing spatial subsetting uses objects returned by topological operators.
These objects can be useful in their own right, for example when exploring the graph network of relationships between contiguous regions, but they can also be used for subsetting, as demonstrated in the code chunk below.
```{r 04-spatial-operations-6, out.lines=9}
sel_sgbp = st_intersects(x = nz_height, y = canterbury)
class(sel_sgbp)
sel_sgbp
sel_logical = lengths(sel_sgbp) > 0
canterbury_height2 = nz_height[sel_logical, ]
```
The above code chunk creates an object of class `sgbp` (a sparse geometry binary predicate, a list with one element for each feature in `x` in the spatial operation) and then converts it into a logical vector `sel_logical` (containing only `TRUE` and `FALSE` values, something that can also be used by **dplyr**'s filter function).
\index{binary predicate|see {topological relations}}
The function `lengths()` identifies which features in `nz_height` intersect with *any* objects in `y`.
In this case, 1 is the greatest possible value, but for more complex operations one could use the method to subset only features that intersect with, for example, 2 or more features from the source object.
```{block2 04-spatial-operations-7, type='rmdnote'}
Note: another way to return a logical output is by setting `sparse = FALSE` (meaning 'return a dense matrix not a sparse one') in operators such as `st_intersects()`. The command `st_intersects(x = nz_height, y = canterbury, sparse = FALSE)[, 1]`, for example, would return an output identical to `sel_logical`.
Note: the solution involving `sgbp` objects is more generalizable though, as it works for many-to-many operations and has lower memory requirements.
```
The same result can be also achieved with the **sf** function `st_filter()` which was [created](https://github.com/r-spatial/sf/issues/1148) to increase compatibility between `sf` objects and **dplyr** data manipulation code:
```{r}
canterbury_height3 = nz_height |>
st_filter(y = canterbury, .predicate = st_intersects)
```
```{r 04-spatial-operations-7b-old, eval=FALSE, echo=FALSE}
# Additional tests of subsetting
canterbury_height4 = nz_height |>
filter(st_intersects(x = _, y = canterbury, sparse = FALSE))
canterbury_height5 = nz_height |>
filter(sel_logical)
identical(canterbury_height3, canterbury_height4)
identical(canterbury_height3, canterbury_height5)
identical(canterbury_height2, canterbury_height4)
identical(canterbury_height, canterbury_height4)
waldo::compare(canterbury_height2, canterbury_height4)
```
At this point, there are three identical (in all but row names) versions of `canterbury_height`, one created using the `[` operator, one created via an intermediary selection object, and another using **sf**'s convenience function `st_filter()`.
<!-- RL: commented out for now as old. Todo: if we ever update that vignette uncomment the next line. -->
<!-- To explore spatial subsetting in more detail, see the supplementary vignettes on `subsetting` and [`tidyverse-pitfalls`](https://geocompr.github.io/geocompkg/articles/) on the [geocompkg website](https://geocompr.github.io/geocompkg/articles/). -->
The next section explores different types of spatial relation, also known as binary predicates, that can be used to identify whether two features are spatially related or not.
### Topological relations
Topological relations\index{topological relations} describe the spatial relationships between objects.
"Binary topological relationships", to give them their full name, are logical statements (in that the answer can only be `TRUE` or `FALSE`) about the spatial relationships between two objects defined by ordered sets of points (typically forming points, lines and polygons) in two or more dimensions [@egenhofer_mathematical_1990].
That may sound rather abstract and, indeed, the definition and classification of topological relations is based on mathematical foundations first published in book form in 1966 [@spanier_algebraic_1995], with the field of algebraic topology continuing beyond the year 2000 [@dieck_algebraic_2008].
Despite their mathematical origins, topological relations can be understood intuitively with reference to visualizations of commonly used functions that test for common types of spatial relationships.
Figure \@ref(fig:relations) shows a variety of geometry pairs and their associated relations.
The third and fourth pairs in Figure \@ref(fig:relations) (from left to right and then down) demonstrate that, for some relations, order is important.
While the relations *equals*, *intersects*, *crosses*, *touches* and *overlaps* are symmetrical, meaning that if `function(x, y)` is true, `function(y, x)` will also be true, relations in which the order of the geometries are important such as *contains* and *within* are not.
Notice that each geometry pair has a "DE-9IM" string such as FF2F11212, described in the next section.
\index{topological relations}
```{r relations, echo=FALSE, fig.cap="Topological relations between vector geometries, inspired by figures 1 and 2 in Egenhofer and Herring (1990). The relations for which the function(x, y) is true are printed for each geometry pair, with x represented in pink and y represented in blue. The nature of the spatial relationship for each pair is described by the Dimensionally Extended 9-Intersection Model string.", fig.show='hold', message=FALSE, fig.asp=0.66, warning=FALSE}
# de_9im() is presumably defined in code/de_9im.R (sourced below) and builds a
# small map panel for one geometry pair annotated with its DE-9IM string —
# confirm against that script
# source("https://github.com/geocompx/geocompr/raw/main/code/de_9im.R")
source("code/de_9im.R")
library(sf)
# Helper: turn x/y coordinate vectors into a single-polygon sfc object
xy2sfc = function(x, y) st_sfc(st_polygon(list(cbind(x, y))))
# Geometry pairs for the figure; p1 is the reference polygon the others are
# compared against (p7 is compared in the reverse order to show asymmetry)
p1 = xy2sfc(x = c(0, 0, 1, 1, 0), y = c(0, 1, 1, 0.5, 0))
p2 = xy2sfc(x = c(0, 1, 1, 0), y = c(0, 0, 0.5, 0))
p3 = xy2sfc(x = c(0, 1, 1, 0), y = c(0, 0, 0.7, 0))
p4 = xy2sfc(x = c(0.7, 0.7, 0.9, 0.7), y = c(0.8, 0.5, 0.5, 0.8))
p5 = xy2sfc(x = c(0.6, 0.7, 1, 0.6), y = c(0.7, 0.5, 0.5, 0.7))
p6 = xy2sfc(x = c(0.1, 1, 1, 0.1), y = c(0, 0, 0.3, 0))
p7 = xy2sfc(x = c(0.05, 0.05, 0.6, 0.5, 0.05), y = c(0.4, 0.97, 0.97, 0.4, 0.4))
# todo: add 3 more with line/point relations?
# Lay the six pair-panels out in a 2x3 grid
tmap::tmap_arrange(de_9im(p1, p2), de_9im(p1, p3), de_9im(p1, p4),
de_9im(p7, p1), de_9im(p1, p5), de_9im(p1, p6), nrow = 2)
```
In `sf`, functions testing for different types of topological relations are called 'binary predicates', as described in the vignette *Manipulating Simple Feature Geometries*, which can be viewed with the command [`vignette("sf3")`](https://r-spatial.github.io/sf/articles/sf3.html), and in the help page [`?geos_binary_pred`](https://r-spatial.github.io/sf/reference/geos_binary_ops.html).
To see how topological relations work in practice, let's create a simple reproducible example, building on the relations illustrated in Figure \@ref(fig:relations) and consolidating knowledge of how vector geometries are represented from a previous chapter (Section \@ref(geometry)).
Note that to create tabular data representing coordinates (x and y) of the polygon vertices, we use the base R function `cbind()` to create a matrix representing coordinates points, a `POLYGON`, and finally an `sfc` object, as described in Chapter \@ref(spatial-class):
```{r}
polygon_matrix = cbind(
x = c(0, 0, 1, 1, 0),
y = c(0, 1, 1, 0.5, 0)
)
polygon_sfc = st_sfc(st_polygon(list(polygon_matrix)))
```
We will create additional geometries to demonstrate spatial relations with the following commands which, when plotted on top of the polygon created above, relate in space to one another, as shown in Figure \@ref(fig:relation-objects).
Note the use of the function `st_as_sf()` and the argument `coords` to efficiently convert from a data frame containing columns representing coordinates to an `sf` object containing points:
```{r}
point_df = data.frame(
x = c(0.2, 0.7, 0.4),
y = c(0.1, 0.2, 0.8)
)
point_sf = st_as_sf(point_df, coords = c("x", "y"))
```
```{r relation-objects, echo=FALSE, fig.cap="Points, line and polygon objects arranged to illustrate topological relations.", fig.asp=1, out.width="50%", fig.scap="Demonstration of topological relations."}
# Square plotting region so the unit coordinates are not distorted
par(pty = "s")
plot(polygon_sfc, border = "red", col = "gray", axes = TRUE)
# NOTE(review): `lab = 1:4` looks vestigial — only 3 points exist and the
# labels are drawn by text() below; confirm it has any effect here
plot(point_sf, add = TRUE, lab = 1:4, cex = 2)
# Number the points 1-3, offset slightly from each point's coordinates
text(point_df[, 1] + 0.02, point_df[, 2] + 0.04, 1:3, cex = 1.3)
```
A simple query is: which of the points in `point_sf` intersect in some way with polygon `polygon_sfc`?
The question can be answered by inspection (points 1 and 3 are touching and within the polygon, respectively).
This question can be answered with the spatial predicate `st_intersects()` as follows:
```{r 04-spatial-operations-9, eval=FALSE}
st_intersects(point_sf, polygon_sfc)
#> Sparse geometry binary predicate... `intersects'
#> 1: 1
#> 2: (empty)
#> 3: 1
```
The result should match your intuition:
positive (`1`) results are returned for the first and third points, and a negative result (represented by an empty vector) for the second point, which lies outside the polygon's border.
What may be unexpected is that the result comes in the form of a list of vectors.
This *sparse matrix* output only registers a relation if one exists, reducing the memory requirements of topological operations on multi-feature objects.
As we saw in the previous section, a *dense matrix* consisting of `TRUE` or `FALSE` values is returned when `sparse = FALSE`.
```{r 04-spatial-operations-10}
st_intersects(point_sf, polygon_sfc, sparse = FALSE)
```
In the above output each row represents a feature in the target (argument `x`) object, and each column represents a feature in the selecting object (`y`).
In this case, there is only one feature in the `y` object `polygon_sfc` so the result, which can be used for subsetting as we saw in Section \@ref(spatial-subsetting), has only one column.
`st_intersects()` returns `TRUE` even in cases where the features just touch: *intersects*\index{intersects} is a 'catch-all' topological operation which identifies many types of spatial relation, as illustrated in Figure \@ref(fig:relations).
More restrictive questions include which points lie within the polygon, and which features are on or contain a shared boundary with `y`?
These can be answered as follows (results not shown):
```{r 04-spatial-operations-9-2, eval=FALSE}
st_within(point_sf, polygon_sfc)
st_touches(point_sf, polygon_sfc)
```
Note that although the first point *touches* the boundary of the polygon, it is not within it; the third point is within the polygon but does not touch any part of its border.
The opposite of `st_intersects()` is `st_disjoint()`, which returns only objects that do not spatially relate in any way to the selecting object (note `[, 1]` converts the result into a vector).
```{r 04-spatial-operations-11}
st_disjoint(point_sf, polygon_sfc, sparse = FALSE)[, 1]
```
The function `st_is_within_distance()` detects features that *almost touch* the selection object, which has an additional `dist` argument.
It can be used to set how close target objects need to be before they are selected.
The 'is within distance' binary spatial predicate is demonstrated in the code chunk below, the results of which show that every point is within 0.2 units of the polygon.
```{r 04-spatial-operations-14}
# TRUE for points within 0.2 units of any part of the polygon (edge or vertex)
st_is_within_distance(point_sf, polygon_sfc, dist = 0.2, sparse = FALSE)[, 1]
```
Note that although point 2 is more than 0.2 units of distance from the nearest vertex of `polygon_sfc`, it is still selected when the distance is set to 0.2.
This is because distance is measured to the nearest edge, in this case the part of the polygon that lies directly above point 2 in Figure \@ref(fig:relation-objects).
(You can verify the actual distance between point 2 and the polygon is 0.13 with the command `st_distance(point_sf, polygon_sfc)`.)
```{r, eval=FALSE, echo=FALSE}
# verify distances to the polygon with reference to paragraph above:
# (output recorded below; chunk is neither run nor shown in the built book)
st_distance(point_sf, polygon_sfc)
# [,1]
# [1,] 0.0000000
# [2,] 0.1341641
# [3,] 0.0000000
```
```{block2 04-spatial-operations-15, type='rmdnote'}
Functions for calculating topological relations use spatial indices to greatly improve spatial query performance.
They achieve that using the Sort-Tile-Recursive (STR) algorithm.
The `st_join` function, mentioned in the next section, also uses spatial indexing.
You can learn more at https://www.r-spatial.org/r/2017/06/22/spatial-index.html.
```
```{r 04-spatial-operations-16, eval=FALSE, echo=FALSE}
# other tests
# Additional binary predicates kept for reference; neither run nor shown
st_overlaps(point_sf, polygon_sfc, sparse = FALSE)
st_covers(point_sf, polygon_sfc, sparse = FALSE)
st_covered_by(point_sf, polygon_sfc, sparse = FALSE)
```
```{r 04-spatial-operations-17, eval=FALSE, echo=FALSE}
# NOTE(review): `a` and `p` are created in the *following* chunk (04-18) —
# confirm this ordering is intentional (harmless while eval=FALSE)
st_contains(a, p[2, ], sparse = TRUE)
```
```{r 04-spatial-operations-18, eval=FALSE, echo=FALSE}
# starting simpler so commented
# Two base polygons: a1 a triangle, a2 a rectangle to its right
a1 = st_polygon(list(rbind(c(-1, -1), c(1, -1), c(1, 1), c(-1, -1))))
a2 = st_polygon(list(rbind(c(2, 0), c(2, 2), c(3, 2), c(3, 0), c(2, 0))))
a = st_sfc(a1, a2)
# Scaled (and, for b2, shifted) copies of the polygons above
b1 = a1 * 0.5
b2 = a2 * 0.4 + c(1, 0.5)
b = st_sfc(b1, b2)
# Linestrings from 2-column coordinate matrices (x values, then y values)
l1 = st_linestring(x = matrix(c(0, 3, -1, 1), , 2))
l2 = st_linestring(x = matrix(c(-1, -1, -0.5, 1), , 2))
l = st_sfc(l1, l2)
# A multipoint of three points
p = st_multipoint(x = matrix(c(0.5, 1, -1, 0, 1, 0.5), , 2))
# Overlay all layers on one base plot
plot(a, border = "red", axes = TRUE)
plot(b, border = "green", add = TRUE)
plot(l, add = TRUE)
plot(p, add = TRUE)
```
### Distance relations
While the topological relations presented in the previous section are binary (a feature either intersects with another or does not), distance relations are continuous\index{distance relations}.
The distance between two `sf` objects is calculated with `st_distance()`, which is also used behind the scenes in Section \@ref(non-overlapping-joins) for distance-based joins.
This is illustrated in the code chunk below, which finds the distance between the highest point in New Zealand and the geographic centroid of the Canterbury region, created in Section \@ref(spatial-subsetting):
\index{vector!distance relations}
```{r 04-spatial-operations-31, warning=FALSE}
# Keep the single row of nz_height with the maximum elevation
nz_highest = nz_height |> slice_max(n = 1, order_by = elevation)
canterbury_centroid = st_centroid(canterbury)
# Result is a 1x1 matrix with units attached
st_distance(nz_highest, canterbury_centroid)
```
There are two potentially surprising things about the result:
- It has `units`, telling us the distance is 100,000 meters, not 100,000 inches, or any other measure of distance
- It is returned as a matrix, even though the result only contains a single value
This second feature hints at another useful feature of `st_distance()`, its ability to return *distance matrices* between all combinations of features in objects `x` and `y`.
This is illustrated in the command below, which finds the distances between the first three features in `nz_height` and the Otago and Canterbury regions of New Zealand represented by the object `co`.
```{r 04-spatial-operations-32}
# Select the Canterbury and Otago regions by partial name match
co = filter(nz, grepl("Canter|Otag", Name))
# 3x2 distance matrix: one row per point, one column per region
st_distance(nz_height[1:3, ], co)
```
Note that the distance between the second and third features in `nz_height` and the second feature in `co` is zero.
This demonstrates the fact that distances between points and polygons refer to the distance to *any part of the polygon*.
The second and third points in `nz_height` are *in* Otago, which can be verified by plotting them (result not shown):
```{r 04-spatial-operations-33, eval=FALSE}
# Plot the second region (Otago) and overlay the second and third points
plot(st_geometry(co)[2])
plot(st_geometry(nz_height)[2:3], add = TRUE)
```
### DE-9IM strings {#DE-9IM-strings}
Underlying the binary predicates demonstrated in the previous section is the Dimensionally Extended 9-Intersection Model (DE-9IM)\index{topological relations!DE-9IM}.
As the cryptic name suggests, this is not an easy topic to understand, but it is worth knowing about because it underlies many spatial operations and enables the creation of custom spatial predicates.
The model was originally labelled "DE + 9IM" by its inventors, referring to the "dimension of the intersections of boundaries, interiors, and exteriors of two features" [@clementini_comparison_1995], but it is now referred to as DE-9IM [@shen_classification_2018].
DE-9IM is applicable to two-dimensional objects (points, lines and polygons) in Euclidean space, meaning that the model (and software implementing it such as GEOS) assumes you are working with data in a projected coordinate reference system, described in Chapter \@ref(reproj-geo-data).
```{r de-9im, echo=FALSE, eval=FALSE}
# Todo one day: revive this
# Abandoned draft of a 9IM facet figure; never evaluated or shown
b = st_sfc(st_point(c(0, 1)), st_point(c(1, 1))) # create 2 points
b = st_buffer(b, dist = 1) # convert points to circles
bsf = sf::st_sf(data.frame(Object = c("a", "b")), geometry = b)
# Replicate the two circles 9 times: one copy per facet of the 3x3 grid
b9 = replicate(bsf, n = 9, simplify = FALSE)
b9sf = do.call(rbind, b9)
domains = c("Interior", "Boundary", "Exterior")
b9sf$domain_a = rep(rep(domains, 3), each = 2)
b9sf$domain_b = rep(rep(domains, each = 3), each = 2)
library(ggplot2)
ggplot(b9sf) +
geom_sf() +
facet_grid(domain_a ~ domain_b)
plot(b9sf)
# NOTE(review): only the first two facet titles were filled in; the remaining
# seven tm_shape calls are identical placeholders
tmap_arrange(
tm_shape(b) + tm_polygons(alpha = 0.5) + tm_layout(title = "Interior-Interior"),
tm_shape(b) + tm_polygons(alpha = 0.5) + tm_layout(title = "Interior-Boundary"),
tm_shape(b) + tm_polygons(alpha = 0.5),
tm_shape(b) + tm_polygons(alpha = 0.5),
tm_shape(b) + tm_polygons(alpha = 0.5),
tm_shape(b) + tm_polygons(alpha = 0.5),
tm_shape(b) + tm_polygons(alpha = 0.5),
tm_shape(b) + tm_polygons(alpha = 0.5),
tm_shape(b) + tm_polygons(alpha = 0.5),
nrow = 3
)
plot(b)
text(x = c(-0.5, 1.5), y = 1, labels = c("x", "y")) # add text
```
To demonstrate how DE-9IM strings work, let's take a look at the various ways that the first geometry pair in Figure \@ref(fig:relations) relate.
Figure \@ref(fig:de9imgg) illustrates the 9-intersection model (9IM) which shows the intersections between every combination of each object's interior, boundary and exterior: when each component of the first object `x` is arranged as columns, and each component of `y` is arranged as rows, a facetted graphic is created with the intersections between each element highlighted.
```{r de9imgg, echo=FALSE, warning=FALSE, fig.cap="Illustration of how the Dimensionally Extended 9 Intersection Model (DE-9IM) works. Colors not in the legend represent the overlap between different components. The thick lines highlight two-dimensional intersections, e.g., between the boundary of object x and the interior of object y, shown in the middle top facet.", message=FALSE}
p1_2 = st_as_sf(c(p1, p3))
ii = st_as_sf(st_intersection(p1, p3))
ii$Object = "Intersection"
ii$domain_a = "Interior"
ii$domain_b = "Interior"
bi = st_sf(x = st_intersection(
st_cast(p1, "LINESTRING"),
st_difference(p3, st_buffer(st_cast(p3, "LINESTRING"), dist = 0.01))
))
bi = st_buffer(bi, dist = 0.01)
bi$Object = "Intersection"
bi$domain_a = "Boundary"
bi$domain_b = "Interior"
ei = st_sf(x = st_difference(p3, p1))
ei$Object = "Intersection"
ei$domain_a = "Exterior"
ei$domain_b = "Interior"
ib = st_sf(x = st_intersection(
st_cast(p3, "LINESTRING"),
st_difference(p1, st_buffer(st_cast(p1, "LINESTRING"), dist = 0.005))
))
ib = st_buffer(ib, dist = 0.01)
ib$Object = "Intersection"
ib$domain_a = "Interior"
ib$domain_b = "Boundary"
bb = st_cast(ii, "POINT")
bb_line = st_sf(x = st_sfc(st_linestring(matrix(c(1, 0.5, 1, 0.7), nrow = 2, byrow = TRUE))))
bb_line_buffer = st_buffer(bb_line, dist = 0.01)
bb_buffer = st_buffer(bb, dist = 0.01)
bb = st_union(bb_buffer, bb_line_buffer)
bb$Object = "Intersection"
bb$domain_a = "Boundary"
bb$domain_b = "Boundary"
eb = st_sf(x = st_difference(
st_cast(p3, "LINESTRING"),
p1
))
eb = st_buffer(eb, dist = 0.01)
eb$Object = "Intersection"
eb$domain_a = "Exterior"
eb$domain_b = "Boundary"
ie = st_sf(x = st_difference(p1, p3))
ie$Object = "Intersection"
ie$domain_a = "Interior"
ie$domain_b = "Exterior"
be = st_sf(x = st_difference(
st_cast(p1, "LINESTRING"),
p3
))
be = st_buffer(be, dist = 0.01)
be$Object = "Intersection"
be$domain_a = "Boundary"
be$domain_b = "Exterior"
ee = st_sf(x = st_difference(
st_buffer(st_union(p1, p3), 0.02),
st_union(p1, p3)
))
ee$Object = "Intersection"
ee$domain_a = "Exterior"
ee$domain_b = "Exterior"
b9 = replicate(p1_2, n = 9, simplify = FALSE)
b9sf = do.call(rbind, b9)
b9sf$Object = rep(c("x", "y"), 9)
domains = c("Interior", "Boundary", "Exterior")
b9sf$domain_a = rep(rep(domains, 3), each = 2)
b9sf$domain_b = rep(rep(domains, each = 3), each = 2)
b9sf = rbind(b9sf, ii, bi, ei, ib, bb, eb, ie, be, ee)
b9sf$domain_a = ordered(b9sf$domain_a, levels = c("Interior", "Boundary", "Exterior"))
b9sf$domain_b = ordered(b9sf$domain_b, levels = c("Interior", "Boundary", "Exterior"))
b9sf = b9sf |>
mutate(alpha = case_when(
Object == "x" ~ 0.1,
Object == "y" ~ 0.1,
TRUE ~ 0.2
))
library(ggplot2)
ggplot(b9sf) +
geom_sf(aes(fill = Object, alpha = alpha)) +
facet_grid(domain_b ~ domain_a) +
scale_fill_manual(values = c("red", "lightblue", "yellow"), position = "top", name = "") +
scale_al
gitextract_34buill7/
├── .Rbuildignore
├── .binder/
│ ├── Dockerfile
│ ├── LICENSE
│ └── README.md
├── .devcontainer.json
├── .gitattributes
├── .github/
│ ├── .gitignore
│ ├── ISSUE_TEMPLATE.md
│ └── workflows/
│ ├── dev-sf.yaml
│ ├── main-no-deploy.yml
│ ├── main.yaml
│ └── qgis-ext.yaml
├── .gitignore
├── .htaccess
├── .lintr
├── .nojekyll
├── .vscode/
│ └── settings.json
├── 01-introduction.Rmd
├── 02-spatial-data.Rmd
├── 03-attribute-operations.Rmd
├── 04-spatial-operations.Rmd
├── 05-geometry-operations.Rmd
├── 06-raster-vector.Rmd
├── 07-reproj.Rmd
├── 08-read-write-plot.Rmd
├── 09-mapping.Rmd
├── 10-gis.Rmd
├── 11-algorithms.Rmd
├── 12-spatial-cv.Rmd
├── 13-transport.Rmd
├── 14-location.Rmd
├── 15-eco.Rmd
├── 16-synthesis.Rmd
├── CITATION.bib
├── CITATION_ed1.bib
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── LICENSE.md
├── README.Rmd
├── README.md
├── _01-ex.Rmd
├── _02-ex.Rmd
├── _03-ex.Rmd
├── _04-ex.Rmd
├── _05-ex.Rmd
├── _06-ex.Rmd
├── _07-ex.Rmd
├── _08-ex.Rmd
├── _09-ex.Rmd
├── _10-ex.Rmd
├── _11-ex.Rmd
├── _12-ex.Rmd
├── _13-ex.Rmd
├── _14-ex.Rmd
├── _15-ex.Rmd
├── _404.Rmd
├── _bookdown.yml
├── _output.yml
├── _redirects
├── apps/
│ ├── CycleHireApp/
│ │ ├── app.R
│ │ └── manifest.json
│ └── coffeeApp/
│ ├── app.R
│ └── manifest.json
├── benchmarks.csv
├── code/
│ ├── 01-cranlogs.R
│ ├── 01-sf-revdep.R
│ ├── 02-contpop.R
│ ├── 02-datum-fig.R
│ ├── 02-raster-crs.R
│ ├── 02-raster-intro-plot.R
│ ├── 02-raster-intro-plot2.R
│ ├── 02-sfdiagram.R
│ ├── 02-sfheaders.R
│ ├── 02-vector-crs.R
│ ├── 02-vectorplots.R
│ ├── 03-cont-raster-plot.R
│ ├── 04-areal-example.R
│ ├── 04-focal-example.R
│ ├── 04-local-operations.R
│ ├── 04-ndvi.R
│ ├── 04-raster-subset.R
│ ├── 04-spatial-join.R
│ ├── 05-bilinear.R
│ ├── 05-extend-example.R
│ ├── 05-us-regions.R
│ ├── 05-venn-clip.R
│ ├── 06-contour-tmap.R
│ ├── 06-pointextr.R
│ ├── 06-raster-vectorization1.R
│ ├── 06-raster-vectorization2.R
│ ├── 06-vector-rasterization1.R
│ ├── 06-vector-rasterization2.R
│ ├── 09-break-styles.R
│ ├── 09-layout1.R
│ ├── 09-layout2.R
│ ├── 09-map-pkgs.R
│ ├── 09-tmpal.R
│ ├── 09-tmshape.R
│ ├── 09-tmstyles.R
│ ├── 09-urban-animation.R
│ ├── 09-usboundaries.R
│ ├── 10-qgis-raster.R
│ ├── 10-saga-segments.R
│ ├── 10-saga-wetness.R
│ ├── 10-sliver.R
│ ├── 10-tsp.R
│ ├── 11-centroid-alg.R
│ ├── 11-centroid-setup.R
│ ├── 11-hello.R
│ ├── 11-polycent.R
│ ├── 12-cv.R
│ ├── 12-partitioning.R
│ ├── 13-cycleways.R
│ ├── 13-desire.R
│ ├── 13-transport-data-gen.R
│ ├── 13-zones.R
│ ├── 14-location-figures.R
│ ├── 15-rf_mlr3.R
│ ├── add-impact.R
│ ├── before_script.R
│ ├── benchmark.R
│ ├── chapters/
│ │ ├── 01-introduction.R
│ │ ├── 02-spatial-data.R
│ │ ├── 03-attribute-operations.R
│ │ ├── 04-spatial-operations.R
│ │ ├── 05-geometry-operations.R
│ │ ├── 06-raster-vector.R
│ │ ├── 07-reproj.R
│ │ ├── 08-read-write-plot.R
│ │ ├── 09-mapping.R
│ │ ├── 10-gis.R
│ │ ├── 11-algorithms.R
│ │ ├── 12-spatial-cv.R
│ │ ├── 13-transport.R
│ │ ├── 14-location.R
│ │ ├── 15-eco.R
│ │ ├── 16-synthesis.R
│ │ ├── README.R
│ │ ├── _01-ex.R
│ │ ├── _02-ex.R
│ │ ├── _03-ex.R
│ │ ├── _04-ex.R
│ │ ├── _05-ex.R
│ │ ├── _06-ex.R
│ │ ├── _07-ex.R
│ │ ├── _08-ex.R
│ │ ├── _10-ex.R
│ │ ├── _12-ex.R
│ │ ├── _13-ex.R
│ │ ├── _15-ex.R
│ │ ├── _404.R
│ │ ├── index.R
│ │ └── references.R
│ ├── de_9im.R
│ ├── extra-pkgs.R
│ ├── front_cover2.R
│ ├── frontcover.R
│ ├── generate-chapter-code.R
│ ├── hex_sticker.R
│ ├── list-contributors.R
│ ├── old-to-future-remove/
│ │ ├── 06_raster_reprojection_tests.R
│ │ ├── 08-uscolonize.R
│ │ ├── 10-centroid.R
│ │ ├── 10-earthquakes.R
│ │ ├── 12-code-extension.R
│ │ ├── 12-desire-front.R
│ │ ├── globe.R
│ │ ├── sfr-class-diagram-gen.R
│ │ └── spData.R
│ ├── sf-classes.R
│ └── sfheaders.Rmd
├── extdata/
│ ├── .gitignore
│ ├── 12-bmr_score.rds
│ ├── 15-bmr_exercises.rds
│ ├── 15-nmds.rds
│ ├── 15-rp_exercises.rds
│ ├── 15-tune.rds
│ ├── coffee-data-messy.csv
│ ├── coffee-data.csv
│ ├── contributors.csv
│ ├── generic_map_pkgs.csv
│ ├── gis-vs-gds-table.csv
│ ├── package_list.csv
│ ├── postgis_data.Rdata
│ ├── sfs-st-cast.csv
│ ├── specific_map_pkgs.csv
│ ├── svm_sp_sp_rbf_50it.rds
│ ├── top_dls.csv
│ └── word-count-time.csv
├── geocompr.Rproj
├── geocompr.bib
├── images/
│ └── r_logo.tif
├── index.Rmd
├── krantz.cls
├── makefile
├── misc/
│ ├── our-impact.csv
│ └── our-style.md
├── packages.bib
├── references.Rmd
└── style/
├── after_body.tex
├── before_body.tex
├── ga.html
├── preamble.tex
└── style.css
Condensed preview — 204 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,227K chars).
[
{
"path": ".Rbuildignore",
"chars": 86,
"preview": "^.*\\.Rproj$\n^\\.Rproj\\.user$\n^\\.travis\\.yml$\n^README\\.Rmd$\n^README-.*\\.png$\n^\\.github$\n"
},
{
"path": ".binder/Dockerfile",
"chars": 129,
"preview": "FROM ghcr.io/geocompx/docker:binder\n\n## Declares build arguments\nARG NB_USER\nARG NB_UID\n\nCOPY --chown=${NB_USER} . /home"
},
{
"path": ".binder/LICENSE",
"chars": 1518,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2021, Yuvi Panda\nAll rights reserved.\n\nRedistribution and use in source and binary f"
},
{
"path": ".binder/README.md",
"chars": 1949,
"preview": "# Template for RStudio on Binder / JupyterHub\n\n[](https://mybinder.org/v2"
},
{
"path": ".devcontainer.json",
"chars": 113,
"preview": "{\n \"image\": \"pixi-r\",\n \"customizations\": {\n \"vscode\": {\n \"extensions\": [\"reditorsupport.r\"]\n }\n }\n}"
},
{
"path": ".gitattributes",
"chars": 74,
"preview": "*.html linguist-vendored\n*.bib linguist-vendored\nlatex/ linguist-vendored\n"
},
{
"path": ".github/.gitignore",
"chars": 7,
"preview": "*.html\n"
},
{
"path": ".github/ISSUE_TEMPLATE.md",
"chars": 268,
"preview": "<!-- To report issues...\n\n(1) If it's about content -- link to the offending link/section\n\n(2) If it's a about code -- a"
},
{
"path": ".github/workflows/dev-sf.yaml",
"chars": 898,
"preview": "name: dev-pkgs\non:\n push:\n branches:\n - main\n pull_request:\n branches:\n - main\njobs:\n build:\n runs-on:"
},
{
"path": ".github/workflows/main-no-deploy.yml",
"chars": 384,
"preview": "on:\n pull_request:\n branches:\n - main\nname: Render-no-deploy\njobs:\n bookdown:\n name: Render-Book\n runs-o"
},
{
"path": ".github/workflows/main.yaml",
"chars": 1108,
"preview": "on:\n push:\n branches:\n main\nname: Render\njobs:\n bookdown:\n name: Render-Book\n runs-on: ubuntu-latest\n "
},
{
"path": ".github/workflows/qgis-ext.yaml",
"chars": 592,
"preview": "name: qgis\non:\n push:\n branches:\n - main\n pull_request:\n branches:\n - main\njobs:\n bookdown:\n name: Ren"
},
{
"path": ".gitignore",
"chars": 204,
"preview": ".bash_history\n.rstudio/\n.Rproj.user\n.Rhistory\n.RData\n_bookdown_files\n.DS_Store\n.Rapp.history\n_book\n_main.*\n*.html\n*.pdf\n"
},
{
"path": ".htaccess",
"chars": 84,
"preview": "RewriteCond %{REQUEST_FILENAME} !-f\nRewriteRule ^(.*)([^/])$ /$1$2/ [L,R=301]"
},
{
"path": ".lintr",
"chars": 83,
"preview": "linters: with_defaults(\n line_length_linter(120), \n assignment_linter = NULL\n )\n"
},
{
"path": ".nojekyll",
"chars": 0,
"preview": ""
},
{
"path": ".vscode/settings.json",
"chars": 70,
"preview": "{\n \"editor.wordWrap\": \"on\",\n \"makefile.configureOnOpen\": false\n}"
},
{
"path": "01-introduction.Rmd",
"chars": 40179,
"preview": "```{asis index-2, echo=knitr::is_latex_output()}\n\\mainmatter\n```\n\n# Introduction {#intro}\n\n```{r, include=FALSE}\nsource("
},
{
"path": "02-spatial-data.Rmd",
"chars": 76135,
"preview": "# (PART) Foundations {-}\n\n# Geographic data in R {#spatial-class}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n"
},
{
"path": "03-attribute-operations.Rmd",
"chars": 43551,
"preview": "# Attribute data operations {#attr}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisites {-}\n\n- T"
},
{
"path": "04-spatial-operations.Rmd",
"chars": 72332,
"preview": "# Spatial data operations {#spatial-operations}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisi"
},
{
"path": "05-geometry-operations.Rmd",
"chars": 54015,
"preview": "# Geometry operations {#geometry-operations}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisites"
},
{
"path": "06-raster-vector.Rmd",
"chars": 26424,
"preview": "# Raster-vector interactions {#raster-vector}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisite"
},
{
"path": "07-reproj.Rmd",
"chars": 56928,
"preview": "# Reprojecting geographic data {#reproj-geo-data}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequi"
},
{
"path": "08-read-write-plot.Rmd",
"chars": 52671,
"preview": "# Geographic data I/O {#read-write}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisites {-}\n\nThi"
},
{
"path": "09-mapping.Rmd",
"chars": 80106,
"preview": "# (PART) Extensions {-}\n\n# Making maps with R {#adv-map}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## P"
},
{
"path": "10-gis.Rmd",
"chars": 71745,
"preview": "# Bridges to GIS software {#gis}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisites {-}\n\n- This"
},
{
"path": "11-algorithms.Rmd",
"chars": 31815,
"preview": "# Scripts, algorithms and functions {#algorithms}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequi"
},
{
"path": "12-spatial-cv.Rmd",
"chars": 48493,
"preview": "# Statistical learning {#spatial-cv}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n```{r 12-knitr-settings,"
},
{
"path": "13-transport.Rmd",
"chars": 68326,
"preview": "# (PART) Applications {.unnumbered}\n\n# Transportation {#transport}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")"
},
{
"path": "14-location.Rmd",
"chars": 26927,
"preview": "# Geomarketing {#location}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisites {-}\n\n- This chapt"
},
{
"path": "15-eco.Rmd",
"chars": 41504,
"preview": "# Ecology {#eco}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Prerequisites {-}\n\nThis chapter assumes y"
},
{
"path": "16-synthesis.Rmd",
"chars": 32882,
"preview": "# Conclusion {#conclusion}\n\n```{r, include=FALSE}\nsource(\"code/before_script.R\")\n```\n\n## Introduction\n\nLike the introduc"
},
{
"path": "CITATION.bib",
"chars": 289,
"preview": "@book{lovelace_geocomputation_2025,\n title = {Geocomputation with {{R}}},\n isbn = {9781032248882},\n edition = {Second"
},
{
"path": "CITATION_ed1.bib",
"chars": 267,
"preview": "@book{lovelace_geocomputation_2019,\n title = {Geocomputation with {{R}}},\n isbn = {1-138-30451-4},\n abstract = {Book "
},
{
"path": "CODE_OF_CONDUCT.md",
"chars": 1396,
"preview": "# Contributor Code of Conduct\n\nAs contributors and maintainers of this project, we pledge to respect all people who \ncon"
},
{
"path": "DESCRIPTION",
"chars": 584,
"preview": "Type: Compendium\nPackage: Geocomputation with R\nTitle: Geocomputation with R\nVersion: 0.0.5\nAuthors@R: c(person(\"Robin\","
},
{
"path": "LICENSE.md",
"chars": 17468,
"preview": "## creative commons\n\n# Attribution-NonCommercial-NoDerivatives 4.0 International\n\nCreative Commons Corporation (“Creativ"
},
{
"path": "README.Rmd",
"chars": 16634,
"preview": "---\noutput: github_document\n---\n\n<!-- README.md is generated from README.Rmd. Please edit that file - rmarkdown::render("
},
{
"path": "README.md",
"chars": 17001,
"preview": "<!-- README.md is generated from README.Rmd. Please edit that file - rmarkdown::render('README.Rmd', output_format = 'gi"
},
{
"path": "_01-ex.Rmd",
"chars": 831,
"preview": "\nE1. Think about the terms 'GIS'\\index{GIS}, 'GDS' and 'geocomputation' described above. Which (if any) best describes t"
},
{
"path": "_02-ex.Rmd",
"chars": 4761,
"preview": "\n```{r 02-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(spData)\nlibrary(terra)\n```\n\nE1. Use `summary()` on the geometry colu"
},
{
"path": "_03-ex.Rmd",
"chars": 8100,
"preview": "\nFor these exercises we will use the `us_states` and `us_states_df` datasets from the **spData** package.\nYou must have "
},
{
"path": "_04-ex.Rmd",
"chars": 10035,
"preview": "```{r 04-ex-e0, include=TRUE, message=FALSE}\nlibrary(sf)\nlibrary(dplyr)\nlibrary(spData)\n```\n\nE1. It was established in S"
},
{
"path": "_05-ex.Rmd",
"chars": 5828,
"preview": "```{r 05-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(terra)\nlibrary(dplyr)\nlibrary(spData)\nlibrary(spDataLarge)\n```\n\nE1. G"
},
{
"path": "_06-ex.Rmd",
"chars": 4675,
"preview": "Some of the following exercises use a vector (`zion_points`) and raster dataset (`srtm`) from the **spDataLarge** packag"
},
{
"path": "_07-ex.Rmd",
"chars": 3168,
"preview": "\n```{r 07-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(terra)\nlibrary(spData)\n```\n\nE1. Create a new object called `nz_wgs` "
},
{
"path": "_08-ex.Rmd",
"chars": 3092,
"preview": "```{r 08-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(terra)\n```\n\nE1. List and describe three types of vector, raster, and "
},
{
"path": "_09-ex.Rmd",
"chars": 11298,
"preview": "```{r 09-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(terra)\nlibrary(dplyr)\nlibrary(spData)\n```\n\nThese exercises rely on a "
},
{
"path": "_10-ex.Rmd",
"chars": 7484,
"preview": "```{r 10-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(terra)\n```\n\n<!-- qgisprocess 1-3 -->\nE1. Compute global solar irradia"
},
{
"path": "_11-ex.Rmd",
"chars": 5970,
"preview": "```{asis 11-ex-asis1, message=FALSE}\nThe solutions assume the following packages are attached (other packages will be at"
},
{
"path": "_12-ex.Rmd",
"chars": 9188,
"preview": "```{asis 12-ex-asis1, message=FALSE}\nThe solutions assume the following packages are attached (other packages will be at"
},
{
"path": "_13-ex.Rmd",
"chars": 3250,
"preview": "```{r 13-ex-e0, message=FALSE}\nlibrary(sf)\nlibrary(spDataLarge)\n```\n\nE1. In much of the analysis presented in the chapte"
},
{
"path": "_14-ex.Rmd",
"chars": 5916,
"preview": "```{asis 14-ex-asis1, message=FALSE}\nThe solutions assume the following packages are attached (other packages will be at"
},
{
"path": "_15-ex.Rmd",
"chars": 9959,
"preview": "The solutions assume the following packages are attached (other packages will be attached when needed):\n\n```{r 15-ex-e0,"
},
{
"path": "_404.Rmd",
"chars": 455,
"preview": "<!-- You are nowhere! -->\n\n```{r c404, echo=FALSE, message=FALSE, fig.asp=1}\nlibrary(tmap)\nlibrary(sf)\nnull_island = st_"
},
{
"path": "_bookdown.yml",
"chars": 659,
"preview": "book_filename: geocompr2\nrmd_files: \n - \"index.Rmd\"\n - \"01-introduction.Rmd\"\n - \"02-spatial-data.Rmd\"\n - \"03"
},
{
"path": "_output.yml",
"chars": 930,
"preview": "bookdown::bs4_book:\n theme:\n primary: \"#3860b6\" #links\n base_font:\n google:\n family: Lato\n heading"
},
{
"path": "_redirects",
"chars": 437,
"preview": "# http and https need separate rules if you don’t force_ssl!\nhttp://geocompr.robinlovelace.net/* http://r.geocompx.org/:"
},
{
"path": "apps/CycleHireApp/app.R",
"chars": 2957,
"preview": "# Shiny app for cycle hire from https://github.com/geocompx/geocompr/issues/584\n# Author - Kiranmayi Vadlamudi\n# 2020-12"
},
{
"path": "apps/CycleHireApp/manifest.json",
"chars": 50307,
"preview": "{\n \"version\": 1,\n \"locale\": \"en_AU\",\n \"platform\": \"3.6.3\",\n \"metadata\": {\n \"appmode\": \"shiny\",\n \"primary_rmd\":"
},
{
"path": "apps/coffeeApp/app.R",
"chars": 1681,
"preview": "# Credit: build on the example in https://rstudio.github.io/leaflet/shiny.html\nlibrary(sf)\nlibrary(shiny)\nlibrary(spData"
},
{
"path": "apps/coffeeApp/manifest.json",
"chars": 291904,
"preview": "{\n \"version\": 1,\n \"locale\": \"en_GB\",\n \"platform\": \"4.0.2\",\n \"metadata\": {\n \"appmode\": \"shiny\",\n \"primary_rmd\":"
},
{
"path": "benchmarks.csv",
"chars": 304,
"preview": "command,date_benchmarked,build_time,platform,cpu_model,ram,commit,commit_date,laptop_or_desktop,comments\nbookdown::rende"
},
{
"path": "code/01-cranlogs.R",
"chars": 1130,
"preview": "# Code to download logs of various packages\n# By Robin Lovelace and Colin Gillespie:\n# https://github.com/csgillespie/ef"
},
{
"path": "code/01-sf-revdep.R",
"chars": 402,
"preview": "library(tidyverse)\nsf_revdeps = devtools::revdep(\"sf\",\n dependencies = c(\"Depends\", \"Import"
},
{
"path": "code/02-contpop.R",
"chars": 455,
"preview": "library(sf)\nlibrary(spData)\nworld_proj = st_transform(world, \"+proj=eck4\")\nworld_cents = st_centroid(world_proj, of_larg"
},
{
"path": "code/02-datum-fig.R",
"chars": 1821,
"preview": "library(grid)\nlibrary(gridExtra)\nlibrary(jpeg)\nlibrary(PlaneGeometry)\nlibrary(ggplot2)\ngeo_dat = Ellipse$new(center = c("
},
{
"path": "code/02-raster-crs.R",
"chars": 840,
"preview": "library(terra)\nlibrary(rcartocolor)\nlibrary(tmap)\nraster_filepath = system.file(\"raster/srtm.tif\", package = \"spDataLarg"
},
{
"path": "code/02-raster-intro-plot.R",
"chars": 1003,
"preview": "# first intro plot -----------------------------------------------------------\nlibrary(terra)\nlibrary(sf)\nlibrary(tmap)\n"
},
{
"path": "code/02-raster-intro-plot2.R",
"chars": 1146,
"preview": "# second intro plot -----------------------------------------------------------\nlibrary(tmap)\nlibrary(rcartocolor)\nlibra"
},
{
"path": "code/02-sfdiagram.R",
"chars": 1035,
"preview": "library(DiagrammeR)\nlibrary(DiagrammeRsvg)\nsave_png = function(plot, path){\n par(bg = NA)\n DiagrammeRsvg::export_svg(p"
},
{
"path": "code/02-sfheaders.R",
"chars": 1753,
"preview": "# Aim: compare sf vs. sfheaders in terms of speed\n\nlibrary(spData)\nlibrary(sf)\n\n# proof of concept\nworld_df = sf::st_coo"
},
{
"path": "code/02-vector-crs.R",
"chars": 1123,
"preview": "library(tmap)\nlibrary(sf)\nvector_filepath = system.file(\"vector/zion.gpkg\", package = \"spDataLarge\")\nnew_vector = read_s"
},
{
"path": "code/02-vectorplots.R",
"chars": 1291,
"preview": "library(globe)\nlibrary(dplyr)\nlibrary(sf)\n\nlondon_lonlat = st_point(c(-0.1, 51.5)) %>%\n st_sfc() %>%\n st_sf(crs = 4326"
},
{
"path": "code/03-cont-raster-plot.R",
"chars": 755,
"preview": "library(tmap)\nlibrary(terra)\n\nelev = rast(system.file(\"raster/elev.tif\", package = \"spData\"))\ngrain = rast(system.file(\""
},
{
"path": "code/04-areal-example.R",
"chars": 430,
"preview": "library(sf)\nlibrary(tmap)\nlibrary(spData)\nrx = rbind(congruent, incongruent)\n# tmap_mode(\"plot\")\nm1 = tm_shape(rx) +\n t"
},
{
"path": "code/04-focal-example.R",
"chars": 1740,
"preview": "library(tmap)\nlibrary(sf)\nlibrary(terra)\nlibrary(grid)\nelev = rast(system.file(\"raster/elev.tif\", package = \"spData\"))\ne"
},
{
"path": "code/04-local-operations.R",
"chars": 2888,
"preview": "library(tmap)\nlibrary(terra)\nelev = rast(system.file(\"raster/elev.tif\", package = \"spData\"))\n\nelev_poly = st_as_sf(as.po"
},
{
"path": "code/04-ndvi.R",
"chars": 999,
"preview": "library(tmap)\nlibrary(terra)\nmulti_raster_file = system.file(\"raster/landsat.tif\", package = \"spDataLarge\")\nmulti_rast ="
},
{
"path": "code/04-raster-subset.R",
"chars": 1497,
"preview": "library(tmap)\nlibrary(sf)\nlibrary(terra)\nset.seed(2023-03-10)\nelev = rast(system.file(\"raster/elev.tif\", package = \"spDa"
},
{
"path": "code/04-spatial-join.R",
"chars": 1604,
"preview": "# Aim: demonstrate spatial joins ------------------------------------------\nlibrary(sf)\nlibrary(spData)\nlibrary(tmap)\n# "
},
{
"path": "code/05-bilinear.R",
"chars": 1091,
"preview": "library(tmap)\nlibrary(sf)\nlibrary(terra)\n\nelev = rast(system.file(\"raster/elev.tif\", package = \"spData\"))\nelev_agg = agg"
},
{
"path": "code/05-extend-example.R",
"chars": 539,
"preview": "library(terra)\nlibrary(sf)\nlibrary(tmap)\nelev = rast(system.file(\"raster/elev.tif\", package = \"spData\"))\nelev2 = extend("
},
{
"path": "code/05-us-regions.R",
"chars": 657,
"preview": "library(tmap)\nlibrary(spData)\nlibrary(dplyr)\nlibrary(sf)\n\nregions = aggregate(x = us_states[, \"total_pop_15\"], by = list"
},
{
"path": "code/05-venn-clip.R",
"chars": 1483,
"preview": "if (!exists(\"b\")) {\n library(sf)\n b = st_sfc(st_point(c(0, 1)), st_point(c(1, 1))) # create 2 points\n b = st_buffer(b"
},
{
"path": "code/06-contour-tmap.R",
"chars": 1167,
"preview": "library(tmap)\nlibrary(sf)\nlibrary(terra)\ndem = rast(system.file(\"raster/dem.tif\", package = \"spDataLarge\"))\n# create hil"
},
{
"path": "code/06-pointextr.R",
"chars": 705,
"preview": "library(tmap)\nlibrary(terra)\nlibrary(sf)\nterrain_colors = rcartocolor::carto_pal(7, \"Geyser\")\nsrtm = rast(system.file(\"r"
},
{
"path": "code/06-raster-vectorization1.R",
"chars": 585,
"preview": "library(tmap)\nlibrary(spData)\nlibrary(terra)\nelev = rast(system.file(\"raster/elev.tif\", package = \"spData\"))\nelev_point "
},
{
"path": "code/06-raster-vectorization2.R",
"chars": 871,
"preview": "library(tmap)\nlibrary(spData)\nlibrary(terra)\ngrain = rast(system.file(\"raster/grain.tif\", package = \"spData\"))\ngrain_pol"
},
{
"path": "code/06-vector-rasterization1.R",
"chars": 1805,
"preview": "library(sf)\nlibrary(tmap)\nlibrary(spData)\nlibrary(terra)\n\nif (!exists(\"cycle_hire_osm_projected\")) {\n cycle_hire_osm_pr"
},
{
"path": "code/06-vector-rasterization2.R",
"chars": 1457,
"preview": "library(tmap)\n\nif (!exists(\"raster_template2\")) {\n library(sf)\n library(terra)\n library(spData)\n library(spDataLarge"
},
{
"path": "code/09-break-styles.R",
"chars": 1702,
"preview": "library(tmap)\nlibrary(spData)\nlibrary(spDataLarge)\n\n# ?tmap_style_save\nm_equal = tm_shape(nz) +\n tm_polygons(fill = \"Me"
},
{
"path": "code/09-layout1.R",
"chars": 236,
"preview": "library(spData)\nlibrary(tmap)\nmap_nz = tm_shape(nz) + tm_fill() + tm_borders()\nl2 = map_nz + tm_layout(scale = 4)\nl3 = m"
},
{
"path": "code/09-layout2.R",
"chars": 888,
"preview": "library(spData)\nlibrary(tmap)\nlegend_title = expression(\"Area (km\"^2*\")\")\nmap_nza = tm_shape(nz) +\n tm_fill(fill = \"Lan"
},
{
"path": "code/09-map-pkgs.R",
"chars": 894,
"preview": "# Aim: generate package metrics on common mapping packages\nremotes::install_github(\"ropenscilabs/packagemetrics\")\n\n# gen"
},
{
"path": "code/09-tmpal.R",
"chars": 512,
"preview": "library(tmap)\nlibrary(spData)\nmc1 = tm_shape(nz) + tm_polygons(fill = \"Median_income\")\nmc2 = tm_shape(nz) + tm_polygons("
},
{
"path": "code/09-tmshape.R",
"chars": 474,
"preview": "library(tmap)\nlibrary(spData)\n# Add fill layer to nz shape\nm1 = tm_shape(nz) + tm_fill() +\n tm_title_in(\"tm_shape(nz) +"
},
{
"path": "code/09-tmstyles.R",
"chars": 487,
"preview": "library(spData)\nlibrary(tmap)\nlegend_title = expression(\"Area (km\"^2*\")\")\nmap_nza = tm_shape(nz) +\n tm_fill(col = \"Land"
},
{
"path": "code/09-urban-animation.R",
"chars": 544,
"preview": "library(sf)\nlibrary(dplyr)\nlibrary(spData)\nlibrary(tmap)\nworld2 = filter(world, continent != \"Antarctica\")\nm_save = tm_s"
},
{
"path": "code/09-usboundaries.R",
"chars": 2332,
"preview": "# Aim: create animation showing shifting US boundaries\n# depends on 17 MB USAboundariesData package\n# link to script fil"
},
{
"path": "code/10-qgis-raster.R",
"chars": 2490,
"preview": "library(qgisprocess)\nlibrary(terra)\nlibrary(tmap)\n\ndem = rast(system.file(\"raster/dem.tif\", package = \"spDataLarge\"))\n\nd"
},
{
"path": "code/10-saga-segments.R",
"chars": 1057,
"preview": "library(terra)\nlibrary(sf)\nlibrary(Rsagacmd)\nlibrary(tmap)\n\nsaga = saga_gis(raster_backend = \"terra\", vector_backend = \""
},
{
"path": "code/10-saga-wetness.R",
"chars": 2180,
"preview": "# Filename: 09-saga-wetness.R (2018-06-19)\n#\n# TO DO: Compute and visualize SAGA wetness index\n#\n# Author(s): Jannes Mue"
},
{
"path": "code/10-sliver.R",
"chars": 1163,
"preview": "library(qgisprocess)\nlibrary(sf)\nlibrary(tmap)\n\ndata(\"incongruent\", \"aggregating_zones\", package = \"spData\")\nincongr_wgs"
},
{
"path": "code/10-tsp.R",
"chars": 2073,
"preview": "# Filename: 09-tsp.R (2018-06-19)\n#\n# TO DO: Traveling salesman figure\n#\n# Author(s): Jannes Muenchow\n#\n#***************"
},
{
"path": "code/11-centroid-alg.R",
"chars": 1532,
"preview": "# Aim: take a matrix representing a convex polygon, return its centroid,\n# demonstrate how algorithms work\n\n# Pre-requis"
},
{
"path": "code/11-centroid-setup.R",
"chars": 1283,
"preview": "library(sf)\n# centroid calculation with Python:\ndir.create(\"py\")\nif (!file.exists(\"py/__init__.py\")) {\n download.file(\""
},
{
"path": "code/11-hello.R",
"chars": 57,
"preview": "# Aim: provide a minimal R script\nprint(\"Hello geocompr\")"
},
{
"path": "code/11-polycent.R",
"chars": 1646,
"preview": "# Aim: create visualization showing steps in centroid algorithm\nif (!exists(\"poly_mat\")) {\n # source(\"code/chapters/10-"
},
{
"path": "code/12-cv.R",
"chars": 8946,
"preview": "# Filename: 12-cv.R (2022-02-16)\n#\n# TO DO: Introduce spatial cross-validation with the help of mlr3\n#\n# Author(s): Jann"
},
{
"path": "code/12-partitioning.R",
"chars": 1462,
"preview": "library(mlr3)\nlibrary(mlr3spatiotempcv)\nlibrary(sf)\nlibrary(purrr)\ntask_ecuador = tsk(\"ecuador\")\nset.seed(2024-01-14)\n\np"
},
{
"path": "code/13-cycleways.R",
"chars": 613,
"preview": "if (!exists(\"route_cycleway\")) {\n source(\"code/chapters/13-transport.R\")\n} \ntmap_mode(\"plot\")\nbristol_stations_top = br"
},
{
"path": "code/13-desire.R",
"chars": 1638,
"preview": "# Aim: generate tmap figure representing desire lines\n\n# load data if not already loaded:\nif (!exists(\"desire_lines\")) {"
},
{
"path": "code/13-transport-data-gen.R",
"chars": 178,
"preview": "# See 'bristol.R' code at https://github.com/Nowosad/spDataLarge/tree/master/data-raw\nsource(\"https://raw.githubusercont"
},
{
"path": "code/13-zones.R",
"chars": 446,
"preview": "library(tmap)\ntmap_mode(\"plot\")\ntm_shape(zones_od) + \n tm_fill(c(\"all\", \"all_dest\"), \n fill.scale = tm_scale(v"
},
{
"path": "code/14-location-figures.R",
"chars": 9026,
"preview": "# Filename: 14-location_figures.R (2022-11-30, last update: 2023-08-09)\n#\n# TO DO: Build figures for location chapter\n#\n"
},
{
"path": "code/15-rf_mlr3.R",
"chars": 4399,
"preview": "# Filename: 15-rf_mlr3.R (2022-04-14)\n\n# TO DO: use spatially cross-validated tuned hyperparameters to make a spatial pr"
},
{
"path": "code/add-impact.R",
"chars": 525,
"preview": "# Aim: add new impact to our-impact.csv -----------------------------------\n# Get impact details -----------------------"
},
{
"path": "code/before_script.R",
"chars": 1758,
"preview": "library(methods)\nlibrary(knitr)\nopts_chunk$set(\n background = \"#FCFCFC\", # code chunk color in latex\n comm"
},
{
"path": "code/benchmark.R",
"chars": 985,
"preview": "# Aim: benchmark and record how long it takes to build the book on different setups\n\nremotes::install_cran(\"benchmarkme\""
},
{
"path": "code/chapters/01-introduction.R",
"chars": 4258,
"preview": "## \\mainmatter\n\n\n## ----gdsl, echo=FALSE, message=FALSE----------------------------------------------------------------\n"
},
{
"path": "code/chapters/02-spatial-data.R",
"chars": 24466,
"preview": "## ----02-spatial-data-1, eval=FALSE------------------------------------------------------------------\n## install.packag"
},
{
"path": "code/chapters/03-attribute-operations.R",
"chars": 18768,
"preview": "## ----03-attribute-operations-1, message=FALSE-------------------------------------------------------\nlibrary(sf) "
},
{
"path": "code/chapters/04-spatial-operations.R",
"chars": 27990,
"preview": "## ----04-spatial-operations-1, message=FALSE, results='hide'-----------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/05-geometry-operations.R",
"chars": 24632,
"preview": "## ----05-geometry-operations-1, message=FALSE--------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/06-raster-vector.R",
"chars": 13647,
"preview": "## ----06-raster-vector-1, message=FALSE--------------------------------------------------------------\nlibrary(dplyr)\nli"
},
{
"path": "code/chapters/07-reproj.R",
"chars": 21282,
"preview": "## ----06-reproj-1, message=FALSE, warning=FALSE------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/08-read-write-plot.R",
"chars": 15835,
"preview": "## ----07-read-write-plot-1, message=FALSE------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/09-mapping.R",
"chars": 27440,
"preview": "## ----08-mapping-1, message=FALSE--------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/10-gis.R",
"chars": 18691,
"preview": "## ----09-gis-1, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/11-algorithms.R",
"chars": 7226,
"preview": "## ----10-algorithms-1--------------------------------------------------------------------------------\nsource(\"code/11-h"
},
{
"path": "code/chapters/12-spatial-cv.R",
"chars": 14150,
"preview": "## ----12-spatial-cv-1, message=FALSE-----------------------------------------------------------------\nlibrary(dplyr)\nli"
},
{
"path": "code/chapters/13-transport.R",
"chars": 16665,
"preview": "## ---- echo=FALSE------------------------------------------------------------------------------------\nknitr::opts_chunk"
},
{
"path": "code/chapters/14-location.R",
"chars": 10512,
"preview": "## ----14-location-1, message=FALSE-------------------------------------\nlibrary(sf)\nlibrary(dplyr)\nlibrary(purrr)\nlibra"
},
{
"path": "code/chapters/15-eco.R",
"chars": 15461,
"preview": "## ----15-eco-1, message=FALSE------------------------------------------------------------------------\nlibrary(data.tabl"
},
{
"path": "code/chapters/16-synthesis.R",
"chars": 1452,
"preview": "## ----15-synthesis-1---------------------------------------------------------------------------------\nlibrary(spData)\nn"
},
{
"path": "code/chapters/README.R",
"chars": 5273,
"preview": "## ---- echo = FALSE----------------------------------------------------------------------------------\nknitr::opts_chunk"
},
{
"path": "code/chapters/_01-ex.R",
"chars": 1,
"preview": "\n"
},
{
"path": "code/chapters/_02-ex.R",
"chars": 3999,
"preview": "## ----02-ex-e0, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_03-ex.R",
"chars": 5535,
"preview": "## ----03-ex-e0, include=TRUE, message=FALSE----------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_04-ex.R",
"chars": 7254,
"preview": "## ----04-ex-e0, include=TRUE, message=FALSE----------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_05-ex.R",
"chars": 4655,
"preview": "## ----05-ex-e0, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_06-ex.R",
"chars": 3484,
"preview": "## ----06-ex-e0, message=FALSE, include=TRUE----------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_07-ex.R",
"chars": 1712,
"preview": "## ----07-ex-e0, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_08-ex.R",
"chars": 2062,
"preview": "## ----08-ex-e0, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_10-ex.R",
"chars": 6178,
"preview": "## ----10-ex-e0, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_12-ex.R",
"chars": 8364,
"preview": "## The solutions assume the following packages are attached (other packages will be attached when needed):\n\n\n## ----12-e"
},
{
"path": "code/chapters/_13-ex.R",
"chars": 1553,
"preview": "## ----13-ex-e0, message=FALSE------------------------------------------------------------------------\nlibrary(sf)\nlibra"
},
{
"path": "code/chapters/_15-ex.R",
"chars": 8328,
"preview": "## ----15-ex-e0, message=FALSE, warning=FALSE----------------------------------------------------------------------\nlibr"
},
{
"path": "code/chapters/_404.R",
"chars": 505,
"preview": "## ----c404, echo=FALSE, message=FALSE, fig.asp=1-----------------------------------------------------\nlibrary(tmap)\nlib"
},
{
"path": "code/chapters/index.R",
"chars": 10059,
"preview": "## ----index-1, echo=FALSE----------------------------------------------------------------------------\nis_on_ghactions ="
},
{
"path": "code/chapters/references.R",
"chars": 1,
"preview": "\n"
},
{
"path": "code/de_9im.R",
"chars": 4381,
"preview": "#' This function visualises sf objects and returns info on the\n#' types of spatial relationship there is between them\n#'"
},
{
"path": "code/extra-pkgs.R",
"chars": 347,
"preview": "pkgs = c(\n \"cranlogs\", # automated cran-logs\n \"USAboundaries\", # for plots of the globe\n \"tidytext\" # for word count\n"
},
{
"path": "code/front_cover2.R",
"chars": 14061,
"preview": "remotes::install_github(\"BlakeRMills/MoMAColors\")\r\npkgs = c(\r\n \"camcorder\",\r\n \"cartogram\",\r\n \"glue\",\r\n \"ggtext\",\r\n "
},
{
"path": "code/frontcover.R",
"chars": 7001,
"preview": "# load packages ----\nlibrary(raster)\nlibrary(sf)\nlibrary(spData)\nlibrary(ggplot2)\nlibrary(hexSticker)\nlibrary(raster)\nli"
},
{
"path": "code/generate-chapter-code.R",
"chars": 962,
"preview": "#' Extracts R code from each chapter and dumps it in the code folder\ngenerate_chapter_code = function(dir = \".\", out_dir"
},
{
"path": "code/hex_sticker.R",
"chars": 2612,
"preview": "library(sf)\nlibrary(raster)\nlibrary(dplyr)\nlibrary(ggplot2)\nlibrary(hexSticker)\nlibrary(showtext)\n\n# read the logo file "
},
{
"path": "code/list-contributors.R",
"chars": 1235,
"preview": "# Filename: list_contributors.R (2018-03-16)\n#\n# TO DO: List all geocompr contributors\n#\n# Author(s): Jannes Muenchow & "
},
{
"path": "code/old-to-future-remove/06_raster_reprojection_tests.R",
"chars": 867,
"preview": "library(ggplot2)\nlibrary(visualraster)\ntheme_set(theme_fullframe())\nset.seed(2017-11-05)\n\nsmall_ras_val = raster(matrix("
},
{
"path": "code/old-to-future-remove/08-uscolonize.R",
"chars": 624,
"preview": "library(tmap)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(sf)\n\nstatepop = historydata::us_state_populations %>%\n select(-GISJ"
},
{
"path": "code/old-to-future-remove/10-centroid.R",
"chars": 1172,
"preview": "#' Find the centre-point of a polygon represented by a matrix\n#' \n#' Calculates the centroid and (optionally) area of a "
},
{
"path": "code/old-to-future-remove/10-earthquakes.R",
"chars": 891,
"preview": "# Aim: create up-to-date map of Earthquakes in previous week\n\n# setup --------------------------------------------------"
},
{
"path": "code/old-to-future-remove/12-code-extension.R",
"chars": 5601,
"preview": "# Aim: build on code in Chapter 12 of Geocomputation with R to demosntrate geographic levels\n\n## ----12-transport-1, mes"
},
{
"path": "code/old-to-future-remove/12-desire-front.R",
"chars": 1335,
"preview": "# Aim: generate tmap figure representing desire lines\n\n# load data if not already loaded:\nif (!exists(\"desire_lines\")) {"
},
{
"path": "code/old-to-future-remove/globe.R",
"chars": 343,
"preview": "# Aim: plot globe\nif (!require(globe)) {\n install.packages(\"globe\")\n}\nlibrary(sf)\ncenter = st_sf(st_sfc(st_point(c(0, 0"
},
{
"path": "code/old-to-future-remove/sfr-class-diagram-gen.R",
"chars": 1695,
"preview": "# Aim: create diagram representing sf classes\nlibrary(sf)\nlibrary(diagram)\nsf_classes = getClasses(where = asNamespace(\""
},
{
"path": "code/old-to-future-remove/spData.R",
"chars": 339,
"preview": "# Aim load data from spDataLarge if you cannot install the package\nif (!require(spDataLarge)) {\n download.file(\"https:/"
},
{
"path": "code/sf-classes.R",
"chars": 5893,
"preview": "# https://stackoverflow.com/questions/35631889/ggplot2-align-multiple-plots-with-varying-spacings-and-add-arrows-between"
},
{
"path": "code/sfheaders.Rmd",
"chars": 4516,
"preview": "---\ntitle: \"sfheaders\"\noutput: html_document\neditor_options: \n chunk_output_type: console\n---\n\n```{r setup, include=FAL"
},
{
"path": "extdata/.gitignore",
"chars": 33,
"preview": "l_all.rds\n12-sp_conv_cv_test.rds\n"
},
{
"path": "extdata/coffee-data-messy.csv",
"chars": 2525,
"preview": "Monthly export statistics - November 2017,,,,,,\nIn thousand 60kg bags,,,,,,\n,November 2016,November 2017,% change,Octobe"
},
{
"path": "extdata/coffee-data.csv",
"chars": 1410,
"preview": "name_long,y16,y17\nColombian Milds,1437,1272\nOther Milds,1590,1734\nBrazilian Naturals,3383,3054\nRobustas,3520,2959\nAngola"
},
{
"path": "extdata/contributors.csv",
"chars": 2809,
"preview": "name,link\nprosoitos,https://github.com/prosoitos\ntibbles-and-tribbles,https://github.com/tibbles-and-tribbles\nflorisvdh,"
},
{
"path": "extdata/generic_map_pkgs.csv",
"chars": 1390,
"preview": "package,published,title,depends_count,suggests_count,tidyverse_happy,has_vignette_build,has_tests,reverse_count,dl_last_"
},
{
"path": "extdata/gis-vs-gds-table.csv",
"chars": 188,
"preview": "\"Attribute\",\"Desktop GIS (GUI)\",\"R\"\n\"Home disciplines\",\"Geography\",\"Computing, Statistics\"\n\"Software focus\",\"Graphical U"
},
{
"path": "extdata/package_list.csv",
"chars": 3355,
"preview": "\"Name\",\"Title\",\"version\"\n\"bookdown\",\"Authoring Books and Technical Documents with R Markdown [@R-bookdown]\",\"0.7\"\n\"carto"
},
{
"path": "extdata/sfs-st-cast.csv",
"chars": 316,
"preview": "input_geom,POINT,MULTIPOINT,LINESTRING,MULTILINESTRING,POLYGON,MULTIPOLYGON,GEOMETRYCOLLECTION\nPOINT(1),1,1,1,NA,NA,NA,N"
},
{
"path": "extdata/specific_map_pkgs.csv",
"chars": 1029,
"preview": "package,published,title,depends_count,suggests_count,tidyverse_happy,has_vignette_build,has_tests,reverse_count,dl_last_"
},
{
"path": "extdata/top_dls.csv",
"chars": 11906,
"preview": "package,Downloads,date\nr5r,4774,2023-11-14\nstars,1354,2023-11-14\nleafem,1013,2023-11-14\nspdep,881,2023-11-14\ntmap,873,20"
},
{
"path": "extdata/word-count-time.csv",
"chars": 22964,
"preview": "n_words,chapter,date,n_pages\n934,1,2017-04-15,3.1133333333333333\n114,2,2017-04-15,0.38\n623,3,2017-04-15,2.07666666666666"
},
{
"path": "geocompr.Rproj",
"chars": 210,
"preview": "Version: 1.0\n\nRestoreWorkspace: No\nSaveWorkspace: No\nAlwaysSaveHistory: Yes\n\nEnableCodeIndexing: Yes\nUseSpacesForTab: Ye"
},
{
"path": "geocompr.bib",
"chars": 156107,
"preview": "@misc{_map_1993,\n title = {Map Projections},\n year = {1993},\n publisher = {US Geological Survey},\n doi = {10.3133/70"
},
{
"path": "index.Rmd",
"chars": 26059,
"preview": "--- \ntitle: 'Geocomputation with R'\nauthor: 'Robin Lovelace, Jakub Nowosad, Jannes Muenchow'\ndate: '`r Sys.Date()`'\nsite"
},
{
"path": "krantz.cls",
"chars": 57587,
"preview": "%% This is file `Krantz.cls'\n%%% Created by Shashi Kumar / ITC [August 2008]\n\n\n\\NeedsTeXFormat{LaTeX2e}[1995/12/01]\n\\Pro"
},
{
"path": "makefile",
"chars": 1457,
"preview": "html:\n\tRscript -e 'bookdown::render_book(\"index.Rmd\", output_format = \"bookdown::bs4_book\", clean = TRUE)'\n\tcp -fvr styl"
},
{
"path": "misc/our-impact.csv",
"chars": 8642,
"preview": "url,date,type,description,comments,state,creator\nhttps://github.com/edzer/sfr/issues/53,2016-11-06,enhancement,st_write "
},
{
"path": "misc/our-style.md",
"chars": 3129,
"preview": "# Code\n\nCode lives in the `code` directory in files named according to the chapter they are in, e.g. `01-venn.R`.\nThe co"
},
{
"path": "packages.bib",
"chars": 22374,
"preview": "@Manual{R-bookdown,\n title = {bookdown: Authoring Books and Technical Documents with R Markdown},\n author = {Yihui Xie"
},
{
"path": "references.Rmd",
"chars": 55,
"preview": "`r if (knitr:::is_html_output()) '\n# References {-}\n'`\n"
},
{
"path": "style/after_body.tex",
"chars": 24,
"preview": "\\backmatter\n\\printindex\n"
},
{
"path": "style/before_body.tex",
"chars": 300,
"preview": "%\\cleardoublepage\\newpage\\thispagestyle{empty}\\null\n%\\cleardoublepage\\newpage\\thispagestyle{empty}\\null\n%\\cleardoublepag"
},
{
"path": "style/ga.html",
"chars": 866,
"preview": "<link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"images/favicon-32x32.png\">\n<link rel=\"icon\" type=\"image/png\" sizes"
},
{
"path": "style/preamble.tex",
"chars": 2693,
"preview": "\\usepackage{booktabs}\n\\usepackage{longtable}\n\\usepackage[bf,singlelinecheck=off]{caption}\n\\captionsetup[table]{labelsep="
},
{
"path": "style/style.css",
"chars": 419,
"preview": "div.rmdnote, div.rstudio-tip, div.rmdwarning {\n padding: 1em;\n margin: 1em 0;\n padding-left: 100px;\n background-size"
}
]
// ... and 8 more files (download for full content)
About this extraction
This page contains the full source code of the geocompx/geocompr GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 204 files (2.0 MB), approximately 539.9k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub-repo-to-text converter for AI. Built by Nikandr Surkov.