Repository: hilaryparker/names Branch: master Commit: bb1ab7efef06 Files: 33 Total size: 2.8 MB Directory structure: gitextract_pfsj5zsr/ ├── .gitattributes ├── .gitignore ├── NYCR_hillary_2014/ │ ├── hillary_2014.Rmd │ ├── hillary_2014.Rproj │ ├── hillary_2014.md │ └── hillary_2014.nb.html ├── NYCR_hillary_2015/ │ ├── hillary_2015.R │ ├── hillary_2015.Rmd │ ├── hillary_2015.Rproj │ ├── hillary_2015.md │ └── hillary_2015.nb.html ├── README.markdown ├── cache/ │ ├── .gitignore │ ├── female.nums.RData │ ├── female.percents.RData │ ├── female.ranks.RData │ ├── male.nums.RData │ ├── male.percents.RData │ └── male.ranks.RData ├── config/ │ ├── .gitignore │ └── global.dcf ├── graphs/ │ ├── .Rhistory │ └── .gitignore ├── lib/ │ └── getNames.R ├── munge/ │ ├── .gitignore │ └── 01-A-scrapingdata.R ├── reports/ │ ├── .gitignore │ ├── bigdrops.csv │ └── bigdrops.xlsx └── src/ ├── analysis-scraping.Rout ├── analysis.R ├── analysis.sh └── analysis_for_ignite.R ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ # Auto detect text files and perform LF normalization * text=auto # Custom for Visual Studio *.cs diff=csharp *.sln merge=union *.csproj merge=union *.vbproj merge=union *.fsproj merge=union *.dbproj merge=union # Standard to msysgit *.doc diff=astextplain *.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain *.PDF diff=astextplain *.rtf diff=astextplain *.RTF diff=astextplain ================================================ FILE: .gitignore ================================================ ################# ## Eclipse ################# *.pydevproject .project .metadata bin/ tmp/ *.tmp *.bak *.swp *~.nib local.properties .classpath .settings/ .loadpath # External tool builders .externalToolBuilders/ # Locally 
stored "Eclipse launch configurations" *.launch # CDT-specific .cproject # PDT-specific .buildpath ################# ## Visual Studio ################# ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. # User-specific files *.suo *.user *.sln.docstates # Build results [Dd]ebug/ [Rr]elease/ *_i.c *_p.c *.ilk *.meta *.obj *.pch *.pdb *.pgc *.pgd *.rsp *.sbr *.tlb *.tli *.tlh *.tmp *.vspscc .builds *.dotCover ## TODO: If you have NuGet Package Restore enabled, uncomment this #packages/ # Visual C++ cache files ipch/ *.aps *.ncb *.opensdf *.sdf # Visual Studio profiler *.psess *.vsp # ReSharper is a .NET coding add-in _ReSharper* # Installshield output folder [Ee]xpress # DocProject is a documentation generator add-in DocProject/buildhelp/ DocProject/Help/*.HxT DocProject/Help/*.HxC DocProject/Help/*.hhc DocProject/Help/*.hhk DocProject/Help/*.hhp DocProject/Help/Html2 DocProject/Help/html # Click-Once directory publish # Others [Bb]in [Oo]bj sql TestResults *.Cache ClientBin stylecop.* ~$* *.dbmdl Generated_Code #added for RIA/Silverlight projects # Backup & report files from converting an old project file to a newer # Visual Studio version. 
Backup files are not needed, because we have git ;-) _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML ############ ## Windows ############ # Windows image file caches Thumbs.db # Folder config file Desktop.ini ############# ## Python ############# *.py[co] # Packages *.egg *.egg-info dist build eggs parts bin var sdist develop-eggs .installed.cfg # Installer logs pip-log.txt # Unit test / coverage reports .coverage .tox #Translations *.mo #Mr Developer .mr.developer.cfg # Mac crap .DS_Store ================================================ FILE: NYCR_hillary_2014/hillary_2014.Rmd ================================================ --- title: "R Notebook" output: html_notebook: default md_document: variant: markdown_github --- ```{r} library(babynames) library(dplyr) library(ggplot2) library(ggrepel) babynames %>% summarize(max(year)) ``` ```{r} babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) ``` ```{r} babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) %>% group_by(name) %>% mutate(yeardiff = c(NA, diff(year, lag = 1)), YoY_increase = 100*((prop / lag(prop, 1)) - 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) ``` ```{r} YoY_names <- babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) %>% group_by(name) %>% mutate(yeardiff = c(NA, diff(year)), YoY_increase = 100*((prop / lag(prop, 1)) - 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) %>% ungroup() %>% filter(!is.na(YoY_increase), yeardiff == 1) %>% arrange(YoY_increase) YoY_names ``` ```{r} poisoned_names <- YoY_names %>% filter(min_rank(YoY_increase) <= 30) %>% select(name, year, prop, YoY_increase, year_before, prop_before) poisoned_names ``` ```{r} babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + theme_bw() + theme(legend.position="none") + 
geom_text_repel(aes(x = year_before, y = prop_before, label = name), data = poisoned_names) ``` ```{r} trend_names <- babynames %>% filter(sex == "F", n >= 115) %>% left_join(YoY_names) %>% arrange(YoY_increase) %>% inner_join(poisoned_names %>% select(name)) %>% group_by(name) %>% mutate(max_YoY = max(YoY_increase, na.rm = TRUE), min_YoY = min(YoY_increase, na.rm = TRUE), year_rank = min_rank(year)) %>% filter(max_YoY >= 100 | (YoY_increase == min_YoY & year_rank %in% 2:10)) trend_names ``` ```{r} babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% anti_join(trend_names %>% select(name)) %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + geom_text_repel(aes(x = year_before, y = prop_before, label = paste0(name, ", ", round(YoY_increase, 1), "%")), data = poisoned_names %>% filter(name %in% c("Hilary", "Hillary"))) + theme_bw() ``` ================================================ FILE: NYCR_hillary_2014/hillary_2014.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX ================================================ FILE: NYCR_hillary_2014/hillary_2014.md ================================================ ``` r library(babynames) library(dplyr) ``` ## ## Attaching package: 'dplyr' ## The following objects are masked from 'package:stats': ## ## filter, lag ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union ``` r library(ggplot2) library(ggrepel) babynames %>% summarize(max(year)) ``` ## # A tibble: 1 × 1 ## `max(year)` ## ## 1 2014 ``` r babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) ``` ## # A tibble: 115,436 × 5 ## year sex name n_babies prop ## ## 1 2013 F Aadhya 172 8.965010e-05 ## 2 2014 F Aadhya 249 
1.284476e-04 ## 3 2014 F Aadya 164 8.460001e-05 ## 4 2013 F Aaleyah 116 6.046170e-05 ## 5 1994 F Aaliyah 1451 7.445779e-04 ## 6 1995 F Aaliyah 1254 6.528030e-04 ## 7 1996 F Aaliyah 831 4.335921e-04 ## 8 1997 F Aaliyah 1738 9.107105e-04 ## 9 1998 F Aaliyah 1399 7.220496e-04 ## 10 1999 F Aaliyah 1088 5.591907e-04 ## # ... with 115,426 more rows ``` r babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) %>% group_by(name) %>% mutate(yeardiff = c(NA, diff(year, lag = 1)), YoY_increase = 100*((prop / lag(prop, 1)) - 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) ``` ## Source: local data frame [115,436 x 9] ## Groups: name [4,076] ## ## year sex name n_babies prop yeardiff YoY_increase ## ## 1 2013 F Aadhya 172 8.965010e-05 NA NA ## 2 2014 F Aadhya 249 1.284476e-04 1 43.27655 ## 3 2014 F Aadya 164 8.460001e-05 NA NA ## 4 2013 F Aaleyah 116 6.046170e-05 NA NA ## 5 1994 F Aaliyah 1451 7.445779e-04 NA NA ## 6 1995 F Aaliyah 1254 6.528030e-04 1 -12.32577 ## 7 1996 F Aaliyah 831 4.335921e-04 1 -33.57995 ## 8 1997 F Aaliyah 1738 9.107105e-04 1 110.03856 ## 9 1998 F Aaliyah 1399 7.220496e-04 1 -20.71580 ## 10 1999 F Aaliyah 1088 5.591907e-04 1 -22.55509 ## # ... 
with 115,426 more rows, and 2 more variables: year_before , ## # prop_before ``` r YoY_names <- babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) %>% group_by(name) %>% mutate(yeardiff = c(NA, diff(year)), YoY_increase = 100*((prop / lag(prop, 1)) - 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) %>% ungroup() %>% filter(!is.na(YoY_increase), yeardiff == 1) %>% arrange(YoY_increase) YoY_names ``` ## # A tibble: 108,528 × 9 ## year sex name n_babies prop yeardiff YoY_increase ## ## 1 1978 F Farrah 332 2.019914e-04 1 -78.08493 ## 2 1995 F Kadijah 119 6.194861e-05 1 -75.15995 ## 3 1974 F Catina 328 2.094373e-04 1 -73.68934 ## 4 1990 F Stephani 173 8.424268e-05 1 -73.61845 ## 5 1995 F Khadijah 438 2.280125e-04 1 -72.48665 ## 6 1965 F Deneen 421 2.303977e-04 1 -71.88849 ## 7 1993 F Hilary 343 1.740336e-04 1 -70.21517 ## 8 1974 F Katina 765 4.884742e-04 1 -69.30519 ## 9 1981 F Renata 224 1.252889e-04 1 -69.02427 ## 10 1992 F Iesha 581 2.899060e-04 1 -68.91587 ## # ... with 108,518 more rows, and 2 more variables: year_before , ## # prop_before ``` r poisoned_names <- YoY_names %>% filter(min_rank(YoY_increase) <= 30) %>% select(name, year, prop, YoY_increase, year_before, prop_before) poisoned_names ``` ## # A tibble: 30 × 6 ## name year prop YoY_increase year_before prop_before ## ## 1 Farrah 1978 2.019914e-04 -78.08493 1977 0.0009217010 ## 2 Kadijah 1995 6.194861e-05 -75.15995 1994 0.0002493900 ## 3 Catina 1974 2.094373e-04 -73.68934 1973 0.0007960170 ## 4 Stephani 1990 8.424268e-05 -73.61845 1989 0.0003193242 ## 5 Khadijah 1995 2.280125e-04 -72.48665 1994 0.0008287342 ## 6 Deneen 1965 2.303977e-04 -71.88849 1964 0.0008195850 ## 7 Hilary 1993 1.740336e-04 -70.21517 1992 0.0005843028 ## 8 Katina 1974 4.884742e-04 -69.30519 1973 0.0015913904 ## 9 Renata 1981 1.252889e-04 -69.02427 1980 0.0004044744 ## 10 Iesha 1992 2.899060e-04 -68.91587 1991 0.0009326495 ## # ... 
with 20 more rows ``` r babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + theme_bw() + theme(legend.position="none") + geom_text_repel(aes(x = year_before, y = prop_before, label = name), data = poisoned_names) ``` ![](hillary_2014_files/figure-markdown_github/unnamed-chunk-6-1.png) ``` r trend_names <- babynames %>% filter(sex == "F", n >= 115) %>% left_join(YoY_names) %>% arrange(YoY_increase) %>% inner_join(poisoned_names %>% select(name)) %>% group_by(name) %>% mutate(max_YoY = max(YoY_increase, na.rm = TRUE), min_YoY = min(YoY_increase, na.rm = TRUE), year_rank = min_rank(year)) %>% filter(max_YoY >= 100 | (YoY_increase == min_YoY & year_rank %in% 2:10)) ``` ## Joining, by = c("year", "sex", "name", "prop") ## Joining, by = "name" ``` r trend_names ``` ## Source: local data frame [397 x 13] ## Groups: name [27] ## ## year sex name n prop n_babies yeardiff YoY_increase ## ## 1 1978 F Farrah 332 2.019914e-04 332 1 -78.08493 ## 2 1995 F Kadijah 119 6.194861e-05 119 1 -75.15995 ## 3 1974 F Catina 328 2.094373e-04 328 1 -73.68934 ## 4 1990 F Stephani 173 8.424268e-05 173 1 -73.61845 ## 5 1995 F Khadijah 438 2.280125e-04 438 1 -72.48665 ## 6 1965 F Deneen 421 2.303977e-04 421 1 -71.88849 ## 7 1974 F Katina 765 4.884742e-04 765 1 -69.30519 ## 8 1981 F Renata 224 1.252889e-04 224 1 -69.02427 ## 9 1992 F Iesha 581 2.899060e-04 581 1 -68.91587 ## 10 1998 F Yulissa 197 1.016753e-04 197 1 -68.29458 ## # ... 
with 387 more rows, and 5 more variables: year_before , ## # prop_before , max_YoY , min_YoY , year_rank ``` r babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% anti_join(trend_names %>% select(name)) %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + geom_text_repel(aes(x = year_before, y = prop_before, label = paste0(name, ", ", round(YoY_increase, 1), "%")), data = poisoned_names %>% filter(name %in% c("Hilary", "Hillary"))) + theme_bw() ``` ## Joining, by = "name" ![](hillary_2014_files/figure-markdown_github/unnamed-chunk-8-1.png) ================================================ FILE: NYCR_hillary_2014/hillary_2014.nb.html ================================================ R Notebook
library(babynames)
Warning messages:
1: In file(con, "rb") :
  cannot open file '/Users/hilaryparker/Desktop/names/NYCR_hillary_2014/.Rproj.user/shared/notebooks/A65B0A3D-hillary_2014/1/s/chunks.json': No such file or directory
2: In file(con, "rb") :
  cannot open file '/Users/hilaryparker/Desktop/names/NYCR_hillary_2014/.Rproj.user/shared/notebooks/A65B0A3D-hillary_2014/1/s/chunks.json': No such file or directory
3: In file(con, "rb") :
  cannot open file '/Users/hilaryparker/Desktop/names/NYCR_hillary_2014/.Rproj.user/shared/notebooks/A65B0A3D-hillary_2014/1/s/chunks.json': No such file or directory
4: In file(con, "rb") :
  cannot open file '/Users/hilaryparker/Desktop/names/NYCR_hillary_2014/.Rproj.user/shared/notebooks/A65B0A3D-hillary_2014/1/s/chunks.json': No such file or directory
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(ggplot2)
library(ggrepel)
# Most recent year present in the data.
summarize(babynames, max(year))

# Rows with sex == "F" and at least 115 babies in that year, listed by name
# and then chronologically, with the count column renamed to `n_babies`.
babynames %>%
  filter(sex == "F" & n >= 115) %>%
  arrange(name, year) %>%
  rename(n_babies = n)
# Preview of the year-over-year calculation: within each name, every row is
# paired with the previous retained row for that name. The result is only
# printed, not stored.
babynames %>%
  filter(sex == "F" & n >= 115) %>%
  rename(n_babies = n) %>%
  arrange(name, year) %>%
  group_by(name) %>%
  mutate(
    # Gap in years to the previous row; NA on a name's first row.
    yeardiff = year - lag(year),
    # Percent change of `prop` relative to the previous row.
    YoY_increase = 100 * (prop / lag(prop) - 1),
    year_before = lag(year),
    prop_before = lag(prop)
  )
# Year-over-year percent change in `prop` for every name/year row with
# sex == "F" and n >= 115, sorted so the steepest drops come first.
YoY_names <- babynames %>%
  filter(sex == "F" & n >= 115) %>%
  rename(n_babies = n) %>%
  arrange(name, year) %>%
  group_by(name) %>%
  mutate(
    # Gap in years to the previous retained row of the same name.
    yeardiff = year - lag(year),
    # Percent change of `prop` relative to that previous row.
    YoY_increase = 100 * (prop / lag(prop) - 1),
    year_before = lag(year),
    prop_before = lag(prop)
  ) %>%
  ungroup() %>%
  # Keep only changes between two consecutive calendar years; a name's first
  # row (no predecessor) and rows following a gap are dropped.
  filter(!is.na(YoY_increase) & yeardiff == 1) %>%
  arrange(YoY_increase)
YoY_names
# Rows of YoY_names whose YoY_increase ranks among the 30 lowest (ties at the
# cut-off are all kept), reduced to the columns used for plotting/labelling.
poisoned_names <- YoY_names %>%
  select(name, year, prop, YoY_increase, year_before, prop_before) %>%
  filter(min_rank(YoY_increase) <= 30)
poisoned_names
# One line per name that appears in poisoned_names, showing `prop` by year.
# Each poisoned_names row is labelled at (year_before, prop_before), i.e. at
# the point just before its drop. The legend is hidden.
babynames %>%
  filter(sex == "F" & n >= 115) %>%
  inner_join(select(poisoned_names, name), by = "name") %>%
  ggplot(mapping = aes(x = year, y = prop, colour = name)) +
  geom_line() +
  geom_text_repel(
    data = poisoned_names,
    mapping = aes(x = year_before, y = prop_before, label = name)
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Identify the "trend" names among the poisoned names, so they can be
# excluded from the final plot. A row is kept when either
#   * the name at least doubled year over year at some point
#     (max_YoY >= 100, which keeps every row of that name), or
#   * the row is the name's steepest drop and falls in the name's 2nd-10th
#     retained year (year_rank %in% 2:10).
trend_names <- babynames %>%
  filter(sex == "F", n >= 115) %>%
  # Natural join on the shared columns (year, sex, name, prop) attaches the
  # year-over-year columns; rows with no consecutive-year change get NA.
  left_join(YoY_names) %>%
  arrange(YoY_increase) %>%
  # Filtering join instead of inner_join: a name that occurs several times in
  # poisoned_names must not duplicate its rows here, otherwise min_rank(year)
  # below produces tied, gapped ranks and the 2:10 window covers fewer years.
  semi_join(poisoned_names %>% select(name)) %>%
  group_by(name) %>%
  mutate(
    max_YoY = max(YoY_increase, na.rm = TRUE),
    min_YoY = min(YoY_increase, na.rm = TRUE),
    # 1 = first retained year of the name, 2 = second, ...
    year_rank = min_rank(year)
  ) %>%
  filter(max_YoY >= 100 |
           (YoY_increase == min_YoY & year_rank %in% 2:10)) %>%
  # Do not leak the per-name grouping into later steps.
  ungroup()
Joining, by = c("year", "sex", "name", "prop")
Joining, by = "name"
trend_names
# Same line plot as before, restricted to poisoned names that are not in
# trend_names. Only "Hilary" and "Hillary" are labelled, with the name and
# the percent change rounded to one decimal, placed at the point just before
# the drop.
babynames %>%
  filter(sex == "F" & n >= 115) %>%
  inner_join(select(poisoned_names, name), by = "name") %>%
  anti_join(select(trend_names, name)) %>%
  ggplot(mapping = aes(x = year, y = prop, colour = name)) +
  geom_line() +
  geom_text_repel(
    data = filter(poisoned_names, name %in% c("Hilary", "Hillary")),
    mapping = aes(
      x = year_before,
      y = prop_before,
      label = paste0(name, ", ", round(YoY_increase, 1), "%")
    )
  ) +
  theme_bw()
Joining, by = "name"

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQKICBtZF9kb2N1bWVudDoKICAgIHZhcmlhbnQ6IG1hcmtkb3duX2dpdGh1YgotLS0KCmBgYHtyfQpsaWJyYXJ5KGJhYnluYW1lcykKbGlicmFyeShkcGx5cikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGdncmVwZWwpCmJhYnluYW1lcyAlPiUgc3VtbWFyaXplKG1heCh5ZWFyKSkKYGBgCgoKCgoKCgoKCmBgYHtyfQpiYWJ5bmFtZXMgJT4lIAogIGZpbHRlcihzZXggPT0gIkYiLCAgbiA+PSAxMTUpICU+JSAKICByZW5hbWUobl9iYWJpZXMgPSBuKSAlPiUgCiAgYXJyYW5nZShuYW1lLCB5ZWFyKSAKYGBgCgoKCgoKCgpgYGB7cn0KYmFieW5hbWVzICU+JSAKICBmaWx0ZXIoc2V4ID09ICJGIiwgIG4gPj0gMTE1KSAlPiUgCiAgcmVuYW1lKG5fYmFiaWVzID0gbikgJT4lIAogIGFycmFuZ2UobmFtZSwgeWVhcikgJT4lIAogIGdyb3VwX2J5KG5hbWUpICU+JSAKICBtdXRhdGUoeWVhcmRpZmYgPSBjKE5BLCBkaWZmKHllYXIsIGxhZyA9IDEpKSwKICAgICAgICAgWW9ZX2luY3JlYXNlID0gMTAwKigocHJvcCAvIGxhZyhwcm9wLCAxKSkgLSAxKSwKICAgICAgICAgeWVhcl9iZWZvcmUgPSBsYWcoeWVhciwgMSksIHByb3BfYmVmb3JlID0gbGFnKHByb3AsIDEpKQpgYGAKCgoKCgoKCgpgYGB7cn0KWW9ZX25hbWVzIDwtIGJhYnluYW1lcyAlPiUgCiAgZmlsdGVyKHNleCA9PSAiRiIsICBuID49IDExNSkgJT4lIAogIHJlbmFtZShuX2JhYmllcyA9IG4pICU+JSAKICBhcnJhbmdlKG5hbWUsIHllYXIpICU+JSAKICBncm91cF9ieShuYW1lKSAlPiUgCiAgbXV0YXRlKHllYXJkaWZmID0gYyhOQSwgZGlmZih5ZWFyKSksCiAgICAgICAgIFlvWV9pbmNyZWFzZSA9IDEwMCooKHByb3AgLyBsYWcocHJvcCwgMSkpIC0gMSksCiAgICAgICAgIHllYXJfYmVmb3JlID0gbGFnKHllYXIsIDEpLCBwcm9wX2JlZm9yZSA9IGxhZyhwcm9wLCAxKSkgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgZmlsdGVyKCFpcy5uYShZb1lfaW5jcmVhc2UpLCB5ZWFyZGlmZiA9PSAxKSAlPiUgCiAgYXJyYW5nZShZb1lfaW5jcmVhc2UpCllvWV9uYW1lcwpgYGAKCgoKCgoKCgoKYGBge3J9CnBvaXNvbmVkX25hbWVzIDwtIFlvWV9uYW1lcyAlPiUgCiAgZmlsdGVyKG1pbl9yYW5rKFlvWV9pbmNyZWFzZSkgPD0gMzApICU+JSAKICBzZWxlY3QobmFtZSwgeWVhciwgcHJvcCwgWW9ZX2luY3JlYXNlLCB5ZWFyX2JlZm9yZSwgcHJvcF9iZWZvcmUpCnBvaXNvbmVkX25hbWVzCmBgYAoKCgoKCgoKCgpgYGB7cn0KYmFieW5hbWVzICU+JSAKICBmaWx0ZXIoc2V4ID09ICJGIiwgIG4gPj0gMTE1KSAlPiUgCiAgaW5uZXJfam9pbihwb2lzb25lZF9uYW1lcyAlPiUgc2VsZWN0KG5hbWUpLCBieSA9ICJuYW1lIikgJT4lIAogIGdncGxvdChhZXMoeCA9IHllYXIsIHkgPSBwcm9wLCBjb2xvciA9IG5hbWUpKSArCiAgICBnZW9tX2xpbmUoKSArCiAgICB0aGVtZV9idygpICsgdGhlbWUobGVnZW5kLnBv
c2l0aW9uPSJub25lIikgKwogICAgZ2VvbV90ZXh0X3JlcGVsKGFlcyh4ID0geWVhcl9iZWZvcmUsIHkgPSBwcm9wX2JlZm9yZSwgbGFiZWwgPSBuYW1lKSwgZGF0YSA9IHBvaXNvbmVkX25hbWVzKSAKYGBgCgoKCgoKCgoKCmBgYHtyfQp0cmVuZF9uYW1lcyA8LSBiYWJ5bmFtZXMgJT4lIAogIGZpbHRlcihzZXggPT0gIkYiLCBuID49IDExNSkgJT4lIAogIGxlZnRfam9pbihZb1lfbmFtZXMpICU+JSAKICBhcnJhbmdlKFlvWV9pbmNyZWFzZSkgJT4lIAogIGlubmVyX2pvaW4ocG9pc29uZWRfbmFtZXMgJT4lIHNlbGVjdChuYW1lKSkgJT4lIAogIGdyb3VwX2J5KG5hbWUpICU+JSAKICBtdXRhdGUobWF4X1lvWSA9IG1heChZb1lfaW5jcmVhc2UsIG5hLnJtID0gVFJVRSksCiAgICAgICAgIG1pbl9Zb1kgPSBtaW4oWW9ZX2luY3JlYXNlLCBuYS5ybSA9IFRSVUUpLAogICAgICAgICB5ZWFyX3JhbmsgPSBtaW5fcmFuayh5ZWFyKSkgJT4lIAogIGZpbHRlcihtYXhfWW9ZID49IDEwMCB8IAogICAgICAgICAgIChZb1lfaW5jcmVhc2UgPT0gbWluX1lvWSAmIHllYXJfcmFuayAlaW4lIDI6MTApKSAgCnRyZW5kX25hbWVzCmBgYCAgCiAgCiAgCiAgCiAgCiAgCiAgCiAgCiAgCmBgYHtyfQpiYWJ5bmFtZXMgJT4lIAogIGZpbHRlcihzZXggPT0gIkYiLCAgbiA+PSAxMTUpICU+JSAKICBpbm5lcl9qb2luKHBvaXNvbmVkX25hbWVzICU+JSBzZWxlY3QobmFtZSksIGJ5ID0gIm5hbWUiKSAlPiUgCiAgYW50aV9qb2luKHRyZW5kX25hbWVzICU+JSBzZWxlY3QobmFtZSkpICU+JSAKICBnZ3Bsb3QoYWVzKHggPSB5ZWFyLCB5ID0gcHJvcCwgY29sb3IgPSBuYW1lKSkgKwogICAgZ2VvbV9saW5lKCkgKwogICAgZ2VvbV90ZXh0X3JlcGVsKGFlcyh4ID0geWVhcl9iZWZvcmUsIHkgPSBwcm9wX2JlZm9yZSwgbGFiZWwgPSBwYXN0ZTAobmFtZSwgIiwgIiwgcm91bmQoWW9ZX2luY3JlYXNlLCAxKSwgIiUiKSksIGRhdGEgPSBwb2lzb25lZF9uYW1lcyAlPiUgZmlsdGVyKG5hbWUgJWluJSBjKCJIaWxhcnkiLCAiSGlsbGFyeSIpKSkgKwogICAgdGhlbWVfYncoKQpgYGA=
================================================ FILE: NYCR_hillary_2015/hillary_2015.R ================================================ ```{r} library(dplyr) library(ggplot2) library(readr) babynames <- read_csv('https://cdn.rawgit.com/wharton-data-analytics/babynames/3192856738dee4a91bfc7d320355daa5ae428c17/data-raw/csv/babynames.csv') babynames %>% summarize(max(year)) ``` ```{r} poisoned_names <- babynames %>% filter(sex == "F") %>% filter(n >= 115) %>% arrange(name, year) %>% group_by(name) %>% mutate(years = n()) %>% mutate(yeardiff = c(NA, diff(year)), YoY_increase = 100* prop / lag(prop, 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) %>% ungroup() %>% mutate(YoY_decrease = 100-YoY_increase) %>% filter(!is.na(YoY_increase), yeardiff == 1) %>% arrange(YoY_increase) %>% top_n(30) %>% select(name, year, prop, YoY_decrease, year_before, prop_before) ``` ```{r} babynames %>% filter(sex == "F" & year > 1950) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + geom_text_repel(aes(x = year_before, y = prop_before, label = name), data = poisoned_names) + theme_bw() ``` ```{r} babynames %>% filter(sex == "F" & year > 1950 & n >= 200 & !(name %in% c("Marian", "Ashanti", "Christin", "Litzy"))) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% group_by(name) %>% mutate(years = n()) %>% filter(years > 18) %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + geom_text_repel(aes(x = year_before, y = prop_before, label = paste0(name, ", ", round(YoY_decrease), "%")), data = poisoned_names %>% filter(name %in% c("Hilary", "Hillary", "Isis"))) + theme_bw() ``` ================================================ FILE: NYCR_hillary_2015/hillary_2015.Rmd ================================================ --- title: "R Notebook" output: md_document: variant: markdown_github html_notebook: default --- ```{r} library(dplyr) library(ggplot2) library(readr) library(ggrepel) 
babynames <- read_csv('https://cdn.rawgit.com/wharton-data-analytics/babynames/3192856738dee4a91bfc7d320355daa5ae428c17/data-raw/csv/babynames.csv') babynames %>% summarize(max(year)) ``` ```{r} YoY_names <- babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) %>% group_by(name) %>% mutate(yeardiff = c(NA, diff(year)), YoY_increase = 100*((prop / lag(prop, 1)) - 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) %>% ungroup() %>% filter(!is.na(YoY_increase), yeardiff == 1) %>% arrange(YoY_increase) ``` ```{r} poisoned_names <- YoY_names %>% filter(min_rank(YoY_increase) <= 30) %>% select(name, year, prop, YoY_increase, year_before, prop_before) ``` ```{r} babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + theme_bw() + theme(legend.position="none") + geom_text_repel(aes(x = year_before, y = prop_before, label = name), data = poisoned_names) ``` ```{r} trend_names <- babynames %>% filter(sex == "F", n >= 115) %>% left_join(YoY_names) %>% arrange(YoY_increase) %>% inner_join(poisoned_names %>% select(name)) %>% group_by(name) %>% mutate(max_YoY = max(YoY_increase, na.rm = TRUE), min_YoY = min(YoY_increase, na.rm = TRUE), year_rank = min_rank(year)) %>% filter(max_YoY >= 100 | (YoY_increase == min_YoY & year_rank %in% 2:10)) ``` ```{r} babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% anti_join(trend_names %>% select(name)) %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + geom_text_repel(aes(x = year_before, y = prop_before, label = paste0(name, ", ", round(YoY_increase, 1), "%")), data = poisoned_names %>% filter(name %in% c("Hilary", "Hillary", "Isis"))) + theme_bw() ``` ================================================ FILE: NYCR_hillary_2015/hillary_2015.Rproj ================================================ Version: 1.0 
RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX ================================================ FILE: NYCR_hillary_2015/hillary_2015.md ================================================ ``` r library(dplyr) ``` ## ## Attaching package: 'dplyr' ## The following objects are masked from 'package:stats': ## ## filter, lag ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union ``` r library(ggplot2) library(readr) library(ggrepel) babynames <- read_csv('https://cdn.rawgit.com/wharton-data-analytics/babynames/3192856738dee4a91bfc7d320355daa5ae428c17/data-raw/csv/babynames.csv') ``` ## Parsed with column specification: ## cols( ## year = col_double(), ## sex = col_character(), ## name = col_character(), ## n = col_integer(), ## prop = col_double() ## ) ``` r babynames %>% summarize(max(year)) ``` ## # A tibble: 1 × 1 ## `max(year)` ## ## 1 2015 ``` r YoY_names <- babynames %>% filter(sex == "F", n >= 115) %>% rename(n_babies = n) %>% arrange(name, year) %>% group_by(name) %>% mutate(yeardiff = c(NA, diff(year)), YoY_increase = 100*((prop / lag(prop, 1)) - 1), year_before = lag(year, 1), prop_before = lag(prop, 1)) %>% ungroup() %>% filter(!is.na(YoY_increase), yeardiff == 1) %>% arrange(YoY_increase) ``` ``` r poisoned_names <- YoY_names %>% filter(min_rank(YoY_increase) <= 30) %>% select(name, year, prop, YoY_increase, year_before, prop_before) ``` ``` r babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + theme_bw() + theme(legend.position="none") + geom_text_repel(aes(x = year_before, y = prop_before, label = name), data = poisoned_names) ``` ![](hillary_2015_files/figure-markdown_github/unnamed-chunk-4-1.png) ``` r trend_names <- babynames %>% filter(sex == "F", n >= 115) 
%>% left_join(YoY_names) %>% arrange(YoY_increase) %>% inner_join(poisoned_names %>% select(name)) %>% group_by(name) %>% mutate(max_YoY = max(YoY_increase, na.rm = TRUE), min_YoY = min(YoY_increase, na.rm = TRUE), year_rank = min_rank(year)) %>% filter(max_YoY >= 100 | (YoY_increase == min_YoY & year_rank %in% 2:10)) ``` ## Joining, by = c("year", "sex", "name", "prop") ## Joining, by = "name" ``` r babynames %>% filter(sex == "F", n >= 115) %>% inner_join(poisoned_names %>% select(name), by = "name") %>% anti_join(trend_names %>% select(name)) %>% ggplot(aes(x = year, y = prop, color = name)) + geom_line() + geom_text_repel(aes(x = year_before, y = prop_before, label = paste0(name, ", ", round(YoY_increase, 1), "%")), data = poisoned_names %>% filter(name %in% c("Hilary", "Hillary", "Isis"))) + theme_bw() ``` ## Joining, by = "name" ![](hillary_2015_files/figure-markdown_github/unnamed-chunk-6-1.png) ================================================ FILE: NYCR_hillary_2015/hillary_2015.nb.html ================================================ R Notebook
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Warning messages:
1: In file(con, "rb") :
  cannot open file '/Users/hilaryparker/Desktop/names/NYCR_hillary_2015/.Rproj.user/shared/notebooks/6F15A132-hillary_2015/1/s/chunks.json': No such file or directory
2: In file(con, "rb") :
  cannot open file '/Users/hilaryparker/Desktop/names/NYCR_hillary_2015/.Rproj.user/shared/notebooks/6F15A132-hillary_2015/1/s/chunks.json': No such file or directory
library(ggplot2)
library(readr)
library(ggrepel)
# Load the baby-name counts from a CSV whose URL embeds a commit hash, so the
# snapshot being read is fixed.
# NOTE(review): the RawGit CDN has been retired -- confirm this URL still
# resolves before re-running the notebook.
babynames <- read_csv(
  file = "https://cdn.rawgit.com/wharton-data-analytics/babynames/3192856738dee4a91bfc7d320355daa5ae428c17/data-raw/csv/babynames.csv"
)
Parsed with column specification:
cols(
  year = col_double(),
  sex = col_character(),
  name = col_character(),
  n = col_integer(),
  prop = col_double()
)
# Most recent year present in the data.
summarize(babynames, max(year))

# Year-over-year percent change in `prop` for every name/year row with
# sex == "F" and n >= 115, sorted so the steepest drops come first.
YoY_names <- babynames %>%
  filter(sex == "F" & n >= 115) %>%
  rename(n_babies = n) %>%
  arrange(name, year) %>%
  group_by(name) %>%
  mutate(
    # Gap in years to the previous retained row of the same name.
    yeardiff = year - lag(year),
    # Percent change of `prop` relative to that previous row.
    YoY_increase = 100 * (prop / lag(prop) - 1),
    year_before = lag(year),
    prop_before = lag(prop)
  ) %>%
  ungroup() %>%
  # Keep only changes between two consecutive calendar years; a name's first
  # row (no predecessor) and rows following a gap are dropped.
  filter(!is.na(YoY_increase) & yeardiff == 1) %>%
  arrange(YoY_increase)
# Rows of YoY_names whose YoY_increase ranks among the 30 lowest (ties at the
# cut-off are all kept), reduced to the columns used for plotting/labelling.
poisoned_names <- YoY_names %>%
  select(name, year, prop, YoY_increase, year_before, prop_before) %>%
  filter(min_rank(YoY_increase) <= 30)
# One line per name that appears in poisoned_names, showing `prop` by year.
# Each poisoned_names row is labelled at (year_before, prop_before), i.e. at
# the point just before its drop. The legend is hidden.
babynames %>%
  filter(sex == "F" & n >= 115) %>%
  inner_join(select(poisoned_names, name), by = "name") %>%
  ggplot(mapping = aes(x = year, y = prop, colour = name)) +
  geom_line() +
  geom_text_repel(
    data = poisoned_names,
    mapping = aes(x = year_before, y = prop_before, label = name)
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Identify the "trend" names among the poisoned names, so they can be
# excluded from the final plot. A row is kept when either
#   * the name at least doubled year over year at some point
#     (max_YoY >= 100, which keeps every row of that name), or
#   * the row is the name's steepest drop and falls in the name's 2nd-10th
#     retained year (year_rank %in% 2:10).
trend_names <- babynames %>%
  filter(sex == "F", n >= 115) %>%
  # Natural join on the shared columns (year, sex, name, prop) attaches the
  # year-over-year columns; rows with no consecutive-year change get NA.
  left_join(YoY_names) %>%
  arrange(YoY_increase) %>%
  # Filtering join instead of inner_join: a name that occurs several times in
  # poisoned_names must not duplicate its rows here, otherwise min_rank(year)
  # below produces tied, gapped ranks and the 2:10 window covers fewer years.
  semi_join(poisoned_names %>% select(name)) %>%
  group_by(name) %>%
  mutate(
    max_YoY = max(YoY_increase, na.rm = TRUE),
    min_YoY = min(YoY_increase, na.rm = TRUE),
    # 1 = first retained year of the name, 2 = second, ...
    year_rank = min_rank(year)
  ) %>%
  filter(max_YoY >= 100 |
           (YoY_increase == min_YoY & year_rank %in% 2:10)) %>%
  # Do not leak the per-name grouping into later steps.
  ungroup()
Joining, by = c("year", "sex", "name", "prop")
Joining, by = "name"
# Same line plot as before, restricted to poisoned names that are not in
# trend_names. Only "Hilary", "Hillary" and "Isis" are labelled, with the
# name and the percent change rounded to one decimal, placed at the point
# just before the drop.
babynames %>%
  filter(sex == "F" & n >= 115) %>%
  inner_join(select(poisoned_names, name), by = "name") %>%
  anti_join(select(trend_names, name)) %>%
  ggplot(mapping = aes(x = year, y = prop, colour = name)) +
  geom_line() +
  geom_text_repel(
    data = filter(poisoned_names, name %in% c("Hilary", "Hillary", "Isis")),
    mapping = aes(
      x = year_before,
      y = prop_before,
      label = paste0(name, ", ", round(YoY_increase, 1), "%")
    )
  ) +
  theme_bw()
Joining, by = "name"

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OgogIG1kX2RvY3VtZW50OgogICAgdmFyaWFudDogbWFya2Rvd25fZ2l0aHViCiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAotLS0KCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkocmVhZHIpCmxpYnJhcnkoZ2dyZXBlbCkKCmJhYnluYW1lcyA8LSByZWFkX2NzdignaHR0cHM6Ly9jZG4ucmF3Z2l0LmNvbS93aGFydG9uLWRhdGEtYW5hbHl0aWNzL2JhYnluYW1lcy8zMTkyODU2NzM4ZGVlNGE5MWJmYzdkMzIwMzU1ZGFhNWFlNDI4YzE3L2RhdGEtcmF3L2Nzdi9iYWJ5bmFtZXMuY3N2JykKYmFieW5hbWVzICU+JSBzdW1tYXJpemUobWF4KHllYXIpKQpgYGAKCgoKCmBgYHtyfQpZb1lfbmFtZXMgPC0gYmFieW5hbWVzICU+JSAKICBmaWx0ZXIoc2V4ID09ICJGIiwgIG4gPj0gMTE1KSAlPiUgCiAgcmVuYW1lKG5fYmFiaWVzID0gbikgJT4lIAogIGFycmFuZ2UobmFtZSwgeWVhcikgJT4lIAogIGdyb3VwX2J5KG5hbWUpICU+JSAKICBtdXRhdGUoeWVhcmRpZmYgPSBjKE5BLCBkaWZmKHllYXIpKSwKICAgICAgICAgWW9ZX2luY3JlYXNlID0gMTAwKigocHJvcCAvIGxhZyhwcm9wLCAxKSkgLSAxKSwKICAgICAgICAgeWVhcl9iZWZvcmUgPSBsYWcoeWVhciwgMSksIHByb3BfYmVmb3JlID0gbGFnKHByb3AsIDEpKSAlPiUgCiAgdW5ncm91cCgpICU+JSAKICBmaWx0ZXIoIWlzLm5hKFlvWV9pbmNyZWFzZSksIHllYXJkaWZmID09IDEpICU+JSAKICBhcnJhbmdlKFlvWV9pbmNyZWFzZSkKYGBgCgpgYGB7cn0KcG9pc29uZWRfbmFtZXMgPC0gWW9ZX25hbWVzICU+JSAKICBmaWx0ZXIobWluX3JhbmsoWW9ZX2luY3JlYXNlKSA8PSAzMCkgJT4lIAogIHNlbGVjdChuYW1lLCB5ZWFyLCBwcm9wLCBZb1lfaW5jcmVhc2UsIHllYXJfYmVmb3JlLCBwcm9wX2JlZm9yZSkKYGBgCgoKCmBgYHtyfQpiYWJ5bmFtZXMgJT4lIAogIGZpbHRlcihzZXggPT0gIkYiLCAgbiA+PSAxMTUpICU+JSAKICBpbm5lcl9qb2luKHBvaXNvbmVkX25hbWVzICU+JSBzZWxlY3QobmFtZSksIGJ5ID0gIm5hbWUiKSAlPiUgCiAgZ2dwbG90KGFlcyh4ID0geWVhciwgeSA9IHByb3AsIGNvbG9yID0gbmFtZSkpICsKICAgIGdlb21fbGluZSgpICsKICAgIHRoZW1lX2J3KCkgKyB0aGVtZShsZWdlbmQucG9zaXRpb249Im5vbmUiKSArCiAgICBnZW9tX3RleHRfcmVwZWwoYWVzKHggPSB5ZWFyX2JlZm9yZSwgeSA9IHByb3BfYmVmb3JlLCBsYWJlbCA9IG5hbWUpLCBkYXRhID0gcG9pc29uZWRfbmFtZXMpIApgYGAKCgpgYGB7cn0KdHJlbmRfbmFtZXMgPC0gYmFieW5hbWVzICU+JSAKICBmaWx0ZXIoc2V4ID09ICJGIiwgbiA+PSAxMTUpICU+JSAKICBsZWZ0X2pvaW4oWW9ZX25hbWVzKSAlPiUgCiAgYXJyYW5nZShZb1lfaW5jcmVhc2UpICU+JSAKICBpbm5lcl9qb2luKHBvaXNvbmVkX25hbWVzICU+JSBzZWxlY3QobmFtZSkpICU+JSAKICBncm91cF9ieShuYW1lKSAlPiUgCiAgbXV0YXRlKG1heF9Z
b1kgPSBtYXgoWW9ZX2luY3JlYXNlLCBuYS5ybSA9IFRSVUUpLAogICAgICAgICBtaW5fWW9ZID0gbWluKFlvWV9pbmNyZWFzZSwgbmEucm0gPSBUUlVFKSwKICAgICAgICAgeWVhcl9yYW5rID0gbWluX3JhbmsoeWVhcikpICU+JSAKICBmaWx0ZXIobWF4X1lvWSA+PSAxMDAgfCAKICAgICAgICAgICAoWW9ZX2luY3JlYXNlID09IG1pbl9Zb1kgJiB5ZWFyX3JhbmsgJWluJSAyOjEwKSkgIApgYGAgIAogIApgYGB7cn0KYmFieW5hbWVzICU+JSAKICBmaWx0ZXIoc2V4ID09ICJGIiwgIG4gPj0gMTE1KSAlPiUgCiAgaW5uZXJfam9pbihwb2lzb25lZF9uYW1lcyAlPiUgc2VsZWN0KG5hbWUpLCBieSA9ICJuYW1lIikgJT4lIAogIGFudGlfam9pbih0cmVuZF9uYW1lcyAlPiUgc2VsZWN0KG5hbWUpKSAlPiUgCiAgZ2dwbG90KGFlcyh4ID0geWVhciwgeSA9IHByb3AsIGNvbG9yID0gbmFtZSkpICsKICAgIGdlb21fbGluZSgpICsKICAgIGdlb21fdGV4dF9yZXBlbChhZXMoeCA9IHllYXJfYmVmb3JlLCB5ID0gcHJvcF9iZWZvcmUsIGxhYmVsID0gcGFzdGUwKG5hbWUsICIsICIsIHJvdW5kKFlvWV9pbmNyZWFzZSwgMSksICIlIikpLCBkYXRhID0gcG9pc29uZWRfbmFtZXMgJT4lIGZpbHRlcihuYW1lICVpbiUgYygiSGlsYXJ5IiwgIkhpbGxhcnkiLCAiSXNpcyIpKSkgKwogICAgdGhlbWVfYncoKQpgYGAKCgoK
================================================ FILE: README.markdown ================================================ # Analysis of poisoned names This project is more completely described on [my blog post](http://hilaryparker.com/2013/01/30/hilary-the-most-poisoned-baby-name-in-us-history/). For access to the code for scraping the data from the social security administration baby names website, look in the `munge` and `lib` directories. For access to the data that has been pulled from the website and formatted as (rows=names, columns=years), go to the `cache` directory. For the code for the analysis, go to the `src` directory. I organized the code using [ProjectTemplate](http://projecttemplate.net/), an R package that provides a systematic template for organizing code. ProjectTemplate also allows for easy loading of the project. Have fun!! ================================================ FILE: cache/.gitignore ================================================ ================================================ FILE: config/.gitignore ================================================ ================================================ FILE: config/global.dcf ================================================ data_loading: on cache_loading: on munging: off logging: off load_libraries: on libraries: RCurl, XML, RColorBrewer, xtable as_factors: on data_tables: off ================================================ FILE: graphs/.Rhistory ================================================ plot( x = year.ind, y = male.percents["Adolf",], type = "l", ylim = c(0,0.15), col = cols[4], lwd = 3, xlab="Year", ylab="Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines(x = year.ind, y = male.percents["Adolph",], col = cols[3], lwd = 3) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) # Adolf text(x=1919,y=0.015,col=cols[4],labels="Adolf",font=2,cex=1.5) # Adolph 
text(x=1930,y=0.05,col=cols[3],labels="Adolph",font=2,cex=1.5) # Hilary text(x=2002,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) # Hillary text(x=1983,y=0.13,col=cols[1],labels="Hillary",font=2,cex=1.5) dev.off() # Adolph and Adolf pdf( file="Adolph_Adolf_Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind, y = male.percents["Adolf",], type = "l", ylim = c(0,0.15), col = cols[4], lwd = 3, xlab="Year", ylab="Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines(x = year.ind, y = male.percents["Adolph",], col = cols[3], lwd = 3) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) # Adolf text(x=1919,y=0.015,col=cols[4],labels="Adolf",font=2,cex=1.5) # Adolph text(x=1927,y=0.05,col=cols[3],labels="Adolph",font=2,cex=1.5) # Hilary text(x=2002,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) # Hillary text(x=1984,y=0.13,col=cols[1],labels="Hillary",font=2,cex=1.5) dev.off() # Adolph and Adolf pdf( file="Adolph_Adolf_Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind, y = male.percents["Adolf",], type = "l", ylim = c(0,0.15), col = cols[4], lwd = 3, xlab="Year", ylab="Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines(x = year.ind, y = male.percents["Adolph",], col = cols[3], lwd = 3) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) # Adolf text(x=1919,y=0.015,col=cols[4],labels="Adolf",font=2,cex=1.5) # Adolph text(x=1929,y=0.05,col=cols[3],labels="Adolph",font=2,cex=1.5) # Hilary text(x=2002,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) # Hillary text(x=1984,y=0.13,col=cols[1],labels="Hillary",font=2,cex=1.5) dev.off() # Adolph and Adolf pdf( file="Adolph_Adolf_Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind, y = male.percents["Adolf",], type = "l", ylim = c(0,0.15), col = cols[4], lwd = 3, xlab="Year", ylab="Percent", cex.lab = 
1.5, cex.axis = 1.5 ) lines(x = year.ind, y = male.percents["Adolph",], col = cols[3], lwd = 3) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) # Adolf text(x=1919,y=0.015,col=cols[4],labels="Adolf",font=2,cex=1.5) # Adolph text(x=1930,y=0.05,col=cols[3],labels="Adolph",font=2,cex=1.5) # Hilary text(x=2002,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) # Hillary text(x=1984,y=0.13,col=cols[1],labels="Hillary",font=2,cex=1.5) dev.off() # Adolph and Adolf pdf( file="Adolph_Adolf_Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind, y = male.percents["Adolf",], type = "l", ylim = c(0,0.15), col = cols[4], lwd = 3, xlab="Year", ylab="Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines(x = year.ind, y = male.percents["Adolph",], col = cols[3], lwd = 3) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) # Adolf text(x=1919,y=0.015,col=cols[4],labels="Adolf",font=2,cex=1.5) # Adolph text(x=1930,y=0.05,col=cols[3],labels="Adolph",font=2,cex=1.5) # Hilary text(x=2001,y=0.055,col=cols[2],labels="Hilary",font=2,cex=1.5) # Hillary text(x=1984,y=0.13,col=cols[1],labels="Hillary",font=2,cex=1.5) dev.off() plot( x = year.ind, y = female.percents["Lolita",], type = "l", ylim = c(0,0.03), xlab = "Year", ylab = "Percent", col = cols[3], lwd = 3 ) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) ## Analysis of the name Lolita ## plot( x = year.ind, y = female.percents["Lolita",], type = "l", ylim = c(0,0.2), xlab = "Year", ylab = "Percent", col = cols[3], lwd = 3 ) lines(x = year.ind, y = female.percents["Hilary",], col = cols[2], lwd=3) lines(x = year.ind, y = female.percents["Hillary",], col = cols[1], lwd=3) plot( x=year.ind, y=female.percents["Katrina",], type="l", 
xlim=c(1945,2011), # ylim=c(0,0.03), xlab="Year", ylab="Percent", main="Percent of baby girls given the name Lolita", col=cols[3], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) plot( x=year.ind, y=female.percents["Katrina",], type="l", xlim=c(1945,2011), # ylim=c(0,0.03), xlab="Year", ylab="Percent", col=cols[3], lwd=3, cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], col = cols[1], lwd=3 ) plot( x=year.ind, y=female.percents["Katrina",], type="l", xlim=c(1945,2011), # ylim=c(0,0.03), xlab="Year", ylab="Percent", col=cols[3], lwd=3, cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) text(x=2001,y=0.055,col=cols[2],labels="Hilary",font=2,cex=1.5) text(x=1980,y=0.055,col=cols[2],labels="Hilary",font=2,cex=1.5) text(x=1980,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) text(x=2000,y=0.06,col=cols[3],labels="Katrina",font=2,cex=1.5) text(x=2000,y=0.1,col=cols[3],labels="Katrina",font=2,cex=1.5) text(x=2002,y=0.1,col=cols[3],labels="Katrina",font=2,cex=1.5) # Katrina pdf( file="Katrina_Hilary.pdf", width = 10, height = 7 ) plot( x=year.ind, y=female.percents["Katrina",], type="l", xlim=c(1945,2011), # ylim=c(0,0.03), xlab="Year", ylab="Percent", col=cols[3], lwd=3, cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 
) text(x=1980,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) text(x=2002,y=0.1,col=cols[3],labels="Katrina",font=2,cex=1.5) dev.off() # Katrina pdf( file="Katrina_Hilary.pdf", width = 10, height = 7 ) plot( x=year.ind, y=female.percents["Katrina",], type="l", xlim=c(1945,2011), # ylim=c(0,0.03), xlab="Year", ylab="Percent", col=cols[3], lwd=3, cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) text(x=1980,y=0.06,col=cols[2],labels="Hilary",font=2,cex=1.5) text(x=2000,y=0.1,col=cols[3],labels="Katrina",font=2,cex=1.5) dev.off() year.ind<-1880:2011 rel.risk <- female.percents[,-1] for(i in 1:dim(female.percents)[2] - 1){ rel.risk[,i] <- female.percents[,i+1] / female.percents[,i] } # create table of biggest drops # bigdropsind <- which( rel.risk < 0.33, arr.ind = TRUE ) loss <- (1 - round(rel.risk[bigdropsind],2)) * 100 yearlost <- year.ind[bigdropsind[,2] + 1] bigdrops <- as.data.frame( cbind( name = rownames(bigdropsind), loss, yearlost ) ) View(bigdrops) plot( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], type = "l", # ylim = c(0,0.2), xlab = "Year", ylab = "Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], col = cols[3], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) # tweak each of these individually so graph looks nice in this case # text( x = 1997, y = 0.05, labels = "Hilary", col = cols[2], cex = 1.5, font = 2 ) text( x = 1996, y = 0.13, labels = "Hillary", col = cols[3], cex = 1.5, font = 2 ) text( x = 1985, y = 0.10, labels = "Hillary", col = cols[3], cex = 1.5, font = 2 ) # tweak each of these individually so graph looks nice in this 
case # text( x = 1999, y = 0.05, labels = "Hilary", col = cols[2], cex = 1.5, font = 2 ) pdf( file="Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], type = "l", # ylim = c(0,0.2), xlab = "Year", ylab = "Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], col = cols[3], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) # tweak each of these individually so graph looks nice in this case # text( x = 1999, y = 0.05, labels = "Hilary", col = cols[2], cex = 1.5, font = 2 ) text( x = 1985, y = 0.10, labels = "Hillary", col = cols[3], cex = 1.5, font = 2 ) dev.off() # Hilary and Hillary pdf( file="Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], type = "l", # ylim = c(0,0.2), xlab = "Year", ylab = "Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], col = cols[1], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) # tweak each of these individually so graph looks nice in this case # text( x = 1999, y = 0.05, labels = "Hilary", col = cols[1], cex = 1.5, font = 2 ) text( x = 1985, y = 0.10, labels = "Hillary", col = cols[3], cex = 1.5, font = 2 ) dev.off() # Hilary and Hillary pdf( file="Hilary_Hillary.pdf", width = 10, height = 7 ) plot( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], type = "l", # ylim = c(0,0.2), xlab = "Year", ylab = 
"Percent", cex.lab = 1.5, cex.axis = 1.5 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hillary", cutoff:dim(female.percents)[2]], col = cols[1], lwd=3 ) lines( x = year.ind[which(year.ind==1945):length(year.ind)], y = female.percents["Hilary", cutoff:dim(female.percents)[2]], col = cols[2], lwd=3 ) # tweak each of these individually so graph looks nice in this case # text( x = 1999, y = 0.05, labels = "Hilary", col = cols[2], cex = 1.5, font = 2 ) text( x = 1985, y = 0.10, labels = "Hillary", col = cols[1], cex = 1.5, font = 2 ) dev.off() ================================================ FILE: graphs/.gitignore ================================================ ================================================ FILE: lib/getNames.R ================================================ # Function for retrieving names from the SSA Website # # set the years you'd like to examine # # min year = 1880, max year = 2011 # # number="p" gives percent, number="n" gives raw number. 
# Rank is always given #
# female=TRUE female names, FALSE male names
# returns two matrices -- one with the raw number or percentages, and one with the ranks #

# Scrape the SSA "popular names" tables for a set of years and reshape them
# into name-by-year matrices.
#
# Arguments:
#   year.ind  Vector of years to request; one POST request is made per year.
#   number    "p" to request percentages, "n" to request raw counts.
#   female    TRUE to read the female columns, FALSE to read the male columns.
#
# Returns an unnamed list of two matrices. Rows are the unique names seen in
# any requested year (in first-seen order), columns are the years:
#   [[1]]  the requested value (percentage or count) as a number,
#   [[2]]  the rank, taken as the row position of the name in that year's table.
# A cell is NA when the name is not in that year's table.
#
# postForm() and readHTMLTable() must already be attached; the project
# configuration loads RCurl and XML for that purpose.
getNames <- function(year.ind = seq(1950, 2011), number = "p", female = TRUE) {
  # Reject an unsupported `number` before any network traffic; otherwise the
  # problem would only surface after every year had been downloaded.
  number <- as.character(number)
  if (length(number) != 1 || !(number %in% c("p", "n"))) {
    stop('`number` must be "p" (percent) or "n" (raw number)', call. = FALSE)
  }

  # The form is asked for the top 1000 names; row 1001 of the parsed table is
  # discarded below.
  top.n <- 1000

  # Column headers exactly as they come back from readHTMLTable().
  if (female) {
    name.col <- "Female name"
    value.col <- if (number == "n") {
      "Number of females"
    } else {
      "Percent oftotal females"
    }
  } else {
    name.col <- "Male name"
    value.col <- if (number == "n") {
      "Number of males"
    } else {
      "Percent oftotal males"
    }
  }
  # Counts carry thousands separators, percentages a "%" sign.
  strip.pattern <- if (number == "n") "," else "%"

  # scrape from website: one request per year #
  nametable <- vector("list", length(year.ind))
  for (i in seq_along(year.ind)) {
    # get the data from the website as a POST form #
    raw <- postForm(
      "http://www.ssa.gov/cgi-bin/popularnames.cgi",
      year = year.ind[i], top = top.n, number = number, style = "post"
    )
    # read the HTML output into an R table and tag it with its year
    nametable[[i]] <- cbind(
      readHTMLTable(raw, which = 3)[-(top.n + 1), ],
      "Year" = rep(year.ind[i], top.n)
    )
  }

  # names listed in each year, and their union over all years
  year.names <- lapply(nametable, function(tbl) as.character(tbl[[name.col]]))
  unique.names <- unique(unlist(year.names, use.names = FALSE))

  # results matrices: rows are the unique names, columns are the years #
  names.mat <- matrix(
    NA_real_,
    nrow = length(unique.names), ncol = length(year.ind),
    dimnames = list(unique.names, as.character(year.ind))
  )
  ranks.mat <- matrix(
    NA_integer_,
    nrow = length(unique.names), ncol = length(year.ind),
    dimnames = list(unique.names, as.character(year.ind))
  )

  for (i in seq_along(year.ind)) {
    # strip the formatting characters and convert to numbers #
    values <- as.numeric(
      gsub(strip.pattern, "", as.character(nametable[[i]][[value.col]]))
    )
    # Position of every unique name in this year's table (NA if absent).
    # Indexing with NA yields NA, so absent names stay NA without a loop.
    pos <- match(unique.names, year.names[[i]])
    names.mat[, i] <- values[pos]
    ranks.mat[, i] <- pos
  }

  list(names.mat, ranks.mat)
}
================================================
FILE: munge/.gitignore
================================================

================================================
FILE: munge/01-A-scrapingdata.R
================================================
# Gathering data from SSA website using getNames function #
# ranks will be the same from each iteration, only take ranks from the first one
tmp <- getNames(year.ind=seq(1880,2011),number="p",female=TRUE)
female.percents <- tmp[[1]]
female.ranks <- tmp[[2]]
ProjectTemplate::cache("female.percents")
ProjectTemplate::cache("female.ranks")

tmp <- getNames(year.ind=seq(1880,2011),number="p",female=FALSE)
male.percents <- tmp[[1]]
male.ranks <- tmp[[2]]
ProjectTemplate::cache("male.percents")
ProjectTemplate::cache("male.ranks")

tmp <- getNames(year.ind=seq(1880,2011),number="n",female=TRUE)
female.nums <- tmp[[1]]
ProjectTemplate::cache("female.nums")

tmp <- getNames(year.ind=seq(1880,2011),number="n",female=FALSE)
male.nums <- tmp[[1]]
ProjectTemplate::cache("male.nums")
================================================
FILE: reports/.gitignore
================================================

================================================
FILE: reports/bigdrops.csv
================================================
"name","loss","yearlost"
"Clementine","69","1881"
"Celestine","67","1881"
"Minna","68","1883"
"Dewey","74","1899"
"Deneen","72","1965"
"Katina","69","1974"
"Catina","74","1974"
"Farrah","78","1978"
"Renata","69","1981"
"Infant","67","1991"
"Iesha","69","1992"
"Hilary","70","1993"
"Khadijah","72","1995"
"Ashanti","68","2003"
================================================
FILE: src/analysis-scraping.Rout
================================================
R version 2.15.2 Patched (2012-10-28 r61038) -- "Trick or Treat"
Copyright (C) 2012 The R Foundation for Statistical Computing
ISBN 3-900051-07-0
Platform: x86_64-unknown-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> setwd("/home/bst/student/hiparker/names")
> library('ProjectTemplate')
Loading required package: testthat
> load.project()
Loading project configuration
Autoloading helper functions
Running helper script: getNames.R
Autoloading packages
Loading package: RCurl
Loading required package: bitops
Loading package: XML
Autoloading data
Loading cached data set: female.nums
Loading cached data set: female.percents
Loading cached data set: female.ranks
Loading cached data set: male.nums
Loading cached data set: male.percents
Loading cached data set: male.ranks
Munging data
Running preprocessing script: 01-A-scrapingdata.R
>
>
> #tmp1
>
> #dim(tmp1)
> #tmp2<-tmp1[,-1]
> #for(i in 1:dim(tmp1)[2]-1){
> # tmp2[,i]<-tmp1[,i+1]/tmp1[,i]
> #}
> #min(tmp2,na.rm=TRUE)
> #which(tmp2<0.5,arr.ind=TRUE)
> #tmp2
>
> #?lapply
>
> #names.mat["Hilary",]
> #names.mat["Hillary",]
>
>
> proc.time()
user system elapsed
623.337 0.805 1249.528
================================================
FILE: src/analysis.R
================================================
# Finds the names whose share of births collapsed from one year to the next
# and draws the comparison graphs.
#
# The working directory is set to the project root once (load.project() runs
# from there) and never changed again: every output file is addressed with a
# path relative to that root.
setwd("C:/Users/Hilary/GitHub/names")
library('ProjectTemplate')
load.project()

year.ind <- 1880:2011

# Ratio of each year's percentage to the previous year's, for every name.
# Column k of the result compares year k + 1 with year k and keeps the later
# year's column name. Done in one vectorised step: an index loop written as
# `1:ncol(x) - 1` actually runs over 0:(ncol(x) - 1), because `:` binds
# tighter than `-`.
yoy.ratio <- function(percents) {
  n.years <- ncol(percents)
  percents[, -1, drop = FALSE] / percents[, -n.years, drop = FALSE]
}

# Rows of a which(arr.ind = TRUE) index matrix whose name has a value in more
# than 20 years, i.e. names that were not a short-lived fad.
# drop = FALSE keeps the matrix shape even if a single row survives.
drop.fads <- function(dropsind) {
  keep <- rowSums(!is.na(female.percents[dropsind[, 1], , drop = FALSE])) > 20
  dropsind[keep, , drop = FALSE]
}

rel.risk <- yoy.ratio(female.percents)

# create table of biggest drops #
bigdropsind <- which(rel.risk < 0.33, arr.ind = TRUE)
loss <- (1 - round(rel.risk[bigdropsind], 2)) * 100
yearlost <- year.ind[bigdropsind[, 2] + 1]
bigdrops <- as.data.frame(cbind(name = rownames(bigdropsind), loss, yearlost))

write.table(
  bigdrops,
  file = file.path("reports", "bigdrops.csv"),
  row.names = FALSE, sep = ","
)

#compare to boys#
rel.risk.b <- yoy.ratio(male.percents)
bigdropsind.b <- which(rel.risk.b < 0.33, arr.ind = TRUE)
loss.b <- (1 - round(rel.risk.b[bigdropsind.b], 2)) * 100
yearlost.b <- year.ind[bigdropsind.b[, 2] + 1]
bigdrops.b <- as.data.frame(
  cbind(name = rownames(bigdropsind.b), loss.b, yearlost.b)
)

# create graph of these names #
len <- length(loss)
#display.brewer.all()
colors <- brewer.pal(12, "Paired")
pal <- colorRampPalette(colors)
cols <- pal(len)

# quick function for plotting names on graph #
# ind:  row of bigdropsind to label
# offx: horizontal offset (years) from the last year before the drop
# offy: vertical offset (percent) from that year's value
# Reads bigdropsind, year.ind, female.percents and cols from the global
# environment at call time.
plotname <- function(ind, offx, offy) {
  tmprow <- bigdropsind[ind, 1]
  tmpcol <- bigdropsind[ind, 2]
  text(
    x = year.ind[tmpcol] + offx,
    y = female.percents[tmprow, tmpcol] + offy,
    labels = rownames(bigdropsind)[ind], col = cols[ind], font = 2
  )
}

# create graph #
png(file = file.path("graphs", "names.png"), width = 480 * 2, height = 480)
plot(
  x = year.ind, y = female.percents["Hilary", ], type = "l",
  ylim = c(0, 0.2), xlab = "Year", ylab = "Percent",
  main = "Percent of baby girls given a name over time for the 14 most poisoned names"
)
for (i in seq_len(len)) {
  lines(
    x = year.ind, y = female.percents[bigdropsind[i, 1], ],
    col = cols[i], lwd = 3
  )
}
# redraw Hilary on top with a heavier line
lines(
  x = year.ind, y = female.percents["Hilary", ],
  type = "l", col = cols[12], lwd = 5
)

# put names on graph #
# tweak each of these individually so graph looks nice in this case #
plotname(1, 1, 0.005)
plotname(2, 0, 0.005)
plotname(3, 0, 0.012)
plotname(4, 0, 0.005)
plotname(5, -2, 0.005)
plotname(6, 0, 0.018)
plotname(7, -3, 0.01)
plotname(8, 1, 0.008)
plotname(9, 5, -0.015)
plotname(10, 0, -0.025)
plotname(11, 0, 0.005)
plotname(12, -7, 0.005)
plotname(13, 2.5, 0.005)
plotname(14, 0, 0.005)
dev.off()

## non-one-hit-wonders ##
bigdropsind.non <- drop.fads(bigdropsind)

png(
  file = file.path("graphs", "names_trimmed.png"),
  width = 480 * 2, height = 480
)
plot(
  x = year.ind, y = female.percents["Hilary", ], type = "l",
  ylim = c(0, 0.1), xlab = "Year", ylab = "Percent",
  main = "Percent of baby girls given a name over time for the 14 most poisoned names, controlling for fads"
)
for (i in seq_len(nrow(bigdropsind.non))) {
  lines(
    x = year.ind, y = female.percents[bigdropsind.non[i, 1], ],
    col = cols[i], lwd = 3
  )
}
lines(
  x = year.ind, y = female.percents["Hilary", ],
  type = "l", col = cols[12], lwd = 5
)
# labels still index the untrimmed bigdropsind
plotname(1, 1, 0.005)
plotname(2, 0, 0.005)
plotname(3, 0, 0.012)
plotname(12, 0, 0.008)
dev.off()

# get Hillary on the graph too #
# create table of biggest drops, with a looser threshold #
bigdropsind <- which(rel.risk < 0.39, arr.ind = TRUE)
loss <- (1 - round(rel.risk[bigdropsind], 2)) * 100
yearlost <- year.ind[bigdropsind[, 2] + 1]
bigdrops <- as.data.frame(cbind(name = rownames(bigdropsind), loss, yearlost))
print(bigdrops)

bigdropsind.non <- drop.fads(bigdropsind)
len <- nrow(bigdropsind.non)
cols <- pal(len)

png(
  file = file.path("graphs", "more_names_trimmed.png"),
  width = 480 * 2, height = 480
)
plot(
  x = year.ind, y = female.percents["Hilary", ], type = "l",
  ylim = c(0, 0.35), xlab = "Year", ylab = "Percent",
  main = "Percent of baby girls given a name over time for the 39 most poisoned names, controlling for fads"
)
for (i in seq_len(nrow(bigdropsind.non))) {
  lines(
    x = year.ind, y = female.percents[bigdropsind.non[i, 1], ],
    col = cols[i], lwd = 3
  )
}
# Hilary
text(x = 1996.5, y = 0.06, col = cols[20], labels = "Hilary", font = 2)
# Hillary
text(x = 1995, y = 0.14, col = cols[21], labels = "Hillary", font = 2)
# Marian
text(x = 1954, y = 0.21, col = cols[18], labels = "Marian", font = 2)
# Christin
text(x = 1993, y = -0.001, col = cols[19], labels = "Christin", font = 2)
dev.off()

# Adolph and Adolf
#display.brewer.all()
colors <- brewer.pal(8, "Dark2")
pal <- colorRampPalette(colors)
cols <- pal(5)

png(file = file.path("graphs", "names_adolf.png"), width = 480 * 2, height = 480)
plot(
  x = year.ind, y = male.percents["Adolf", ], type = "l",
  ylim = c(0, 0.15), col = cols[2], lwd = 3,
  xlab = "Year", ylab = "Percent",
  main = "Percent of babies named Adolf, Adolph, Hilary or Hillary over time"
)
lines(x = year.ind, y = male.percents["Adolph", ], col = cols[1], lwd = 3)
lines(x = year.ind, y = female.percents["Hilary", ], col = cols[3], lwd = 3)
lines(x = year.ind, y = female.percents["Hillary", ], col = cols[4], lwd = 3)
# Adolf
text(x = 1919, y = 0.01, col = cols[2], labels = "Adolf", font = 2)
# Adolph
text(x = 1937, y = 0.025, col = cols[1], labels = "Adolph", font = 2)
# Hilary
text(x = 1997, y = 0.06, col = cols[3], labels = "Hilary", font = 2)
# Hillary
text(x = 1995, y = 0.13, col = cols[4], labels = "Hillary", font = 2)
dev.off()

## Analysis of the name Lolita ##
# Written to graphs/ like the other figures. A relative setwd("./graphs") at
# this point would fail when the working directory is already graphs/.
png(file = file.path("graphs", "names_Lolita.png"), width = 480 * 2, height = 480)
plot(
  x = year.ind, y = female.percents["Lolita", ], type = "l",
  ylim = c(0, 0.03), xlab = "Year", ylab = "Percent",
  main = "Percent of baby girls given the name Lolita", col = cols[3], lwd = 3
)
text(x = 1954, y = 0.013, col = cols[3], labels = "Lolita", font = 2)
dev.off()
================================================
FILE: src/analysis.sh
================================================
#!/bin/bash
R CMD BATCH analysis.R
================================================
FILE: src/analysis_for_ignite.R
================================================
library('ProjectTemplate')
load.project()

year.ind <- 1880:2011

# Ratio of each year's percentage to the previous year's, for every name.
# Column k compares year k + 1 with year k and keeps the later year's column
# name. Done in one vectorised step: an index loop written as
# `1:ncol(x) - 1` actually runs over 0:(ncol(x) - 1), because `:` binds
# tighter than `-`.
rel.risk <- female.percents[, -1] / female.percents[, -ncol(female.percents)]

# create table of biggest drops #
bigdropsind <- which(rel.risk < 0.33, arr.ind = TRUE)
loss <- (1 - round(rel.risk[bigdropsind], 2)) * 100
yearlost <- year.ind[bigdropsind[, 2] + 1]
bigdrops <- as.data.frame(
  cbind(
    name =
rownames(bigdropsind), loss, yearlost ) )

# Output files are addressed relative to the current working directory, which
# is expected to be the project root (the same assumption the relative
# "./reports" and "./graphs" directories make). The working directory itself
# is never changed.
write.table(
  bigdrops,
  file = file.path("reports", "bigdrops.csv"),
  row.names = FALSE, sep = ","
)

#compare to boys#
# Year-over-year ratio in one vectorised step: an index loop written as
# `1:ncol(x) - 1` runs over 0:(ncol(x) - 1), because `:` binds tighter
# than `-`.
rel.risk.b <- male.percents[, -1] / male.percents[, -ncol(male.percents)]
bigdropsind.b <- which(rel.risk.b < 0.33, arr.ind = TRUE)
loss.b <- (1 - round(rel.risk.b[bigdropsind.b], 2)) * 100
yearlost.b <- year.ind[bigdropsind.b[, 2] + 1]
bigdrops.b <- as.data.frame(
  cbind(name = rownames(bigdropsind.b), loss.b, yearlost.b)
)

# create graph of these names #
len <- length(loss)
#display.brewer.all()
colors <- brewer.pal(12, "Paired")
pal <- colorRampPalette(colors)
cols <- pal(len)

# limit to names past 1935
bigdropsind <- bigdropsind[-c(1, 2, 3, 4), , drop = FALSE]
# adjust len: it bounds the loop over the remaining rows of bigdropsind, so it
# must be the number of names left, not the number of years plotted
len <- nrow(bigdropsind)

# Everything below is drawn from 1945 onward.
# cutoff is the column of female.percents that holds 1945 (rel.risk has one
# column fewer, hence the + 1).
cutoff <- which(colnames(rel.risk) == 1945) + 1
years.shown <- year.ind[which(year.ind == 1945):length(year.ind)]
cols.shown <- cutoff:dim(female.percents)[2]

# Percentages from 1945 onward for one row of female.percents
# (row may be a name or a row number).
recent.percents <- function(row) {
  female.percents[row, cols.shown]
}

# Open a 10 x 7 inch PDF device in the graphs directory.
open.pdf <- function(filename) {
  pdf(file = file.path("graphs", filename), width = 10, height = 7)
}

# create graph #
open.pdf("names_past_1945.pdf")
plot(
  x = years.shown, y = recent.percents("Hilary"),
  type = "l", ylim = c(0, 0.2),
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
for (i in seq_len(len)) {
  lines(
    x = years.shown, y = recent.percents(bigdropsind[i, 1]),
    # + 4 keeps each name's colour from before the first four rows were cut
    col = cols[i + 4], lwd = 3
  )
}
# redraw Hilary on top with a heavier line
lines(
  x = years.shown, y = recent.percents("Hilary"),
  type = "l", col = cols[2], lwd = 5
)

# put names on graph #
# quick function for plotting names on graph #
# ind:  row of the trimmed bigdropsind to label
# offx: horizontal offset (years) from the last year before the drop
# offy: vertical offset (percent) from that year's value
# Reads bigdropsind, year.ind, female.percents and cols from the global
# environment at call time.
plotname <- function(ind, offx, offy) {
  tmprow <- bigdropsind[ind, 1]
  tmpcol <- bigdropsind[ind, 2]
  text(
    x = year.ind[tmpcol] + offx,
    y = female.percents[tmprow, tmpcol] + offy,
    labels = rownames(bigdropsind)[ind],
    col = cols[ind + 4],
    cex = 1.5, font = 2
  )
}

# tweak each of these individually so graph looks nice in this case #
plotname(1, -5, 0.002)
plotname(2, -2, 0.02)
plotname(3, -4.8, 0.013)
plotname(4, 1, 0.008)
plotname(5, 0, -0.04)
plotname(6, 0, -0.025)
plotname(7, -2, 0.01)
# skip hilary
plotname(9, 3, 0.01)
plotname(10, 0, 0.01)
# add hilary by hand
text(
  x = 1955, y = 0.018, labels = "Hilary",
  col = cols[2], cex = 1.5, font = 2
)
dev.off()

# Catina, Katina
open.pdf("Catina_Katina.pdf")
plot(
  x = years.shown, y = recent.percents("Katina"),
  type = "l", ylim = c(0, 0.2),
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = years.shown, y = recent.percents("Katina"), col = cols[6], lwd = 3)
lines(x = years.shown, y = recent.percents("Catina"), col = cols[7], lwd = 3)
# tweak each of these individually so graph looks nice in this case #
plotname(2, -2, 0.02)
plotname(3, -4.8, 0.013)
dev.off()

# Farrah
open.pdf("Farrah.pdf")
plot(
  x = years.shown, y = recent.percents("Farrah"),
  type = "l", ylim = c(0, 0.2),
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = years.shown, y = recent.percents("Farrah"), col = cols[8], lwd = 3)
# tweak each of these individually so graph looks nice in this case #
plotname(4, 1, 0.008)
dev.off()

# Iesha
open.pdf("Iesha.pdf")
plot(
  x = years.shown, y = recent.percents("Iesha"),
  type = "l", ylim = c(0, 0.2),
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = years.shown, y = recent.percents("Iesha"), col = cols[11], lwd = 3)
# tweak each of these individually so graph looks nice in this case #
plotname(7, -2, 0.01)
dev.off()

# Ashanti
open.pdf("Ashanti.pdf")
plot(
  x = years.shown, y = recent.percents("Ashanti"),
  type = "l", ylim = c(0, 0.2),
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = years.shown, y = recent.percents("Ashanti"), col = cols[14], lwd = 3)
# tweak each of these individually so graph looks nice in this case #
plotname(10, 0, 0.01)
dev.off()

# Hilary and Hillary
open.pdf("Hilary_Hillary.pdf")
plot(
  x = years.shown, y = recent.percents("Hillary"),
  type = "l",
  # ylim = c(0,0.2),
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = years.shown, y = recent.percents("Hillary"), col = cols[1], lwd = 3)
lines(x = years.shown, y = recent.percents("Hilary"), col = cols[2], lwd = 3)
# tweak each of these individually so graph looks nice in this case #
text(
  x = 1999, y = 0.05, labels = "Hilary",
  col = cols[2], cex = 1.5, font = 2
)
text(
  x = 1985, y = 0.10, labels = "Hillary",
  col = cols[1], cex = 1.5, font = 2
)
dev.off()

# Adolph and Adolf (full year range, not cut at 1945)
open.pdf("Adolph_Adolf_Hilary_Hillary.pdf")
plot(
  x = year.ind, y = male.percents["Adolf", ],
  type = "l", ylim = c(0, 0.15),
  col = cols[4], lwd = 3,
  xlab = "Year", ylab = "Percent",
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = year.ind, y = male.percents["Adolph", ], col = cols[3], lwd = 3)
lines(x = year.ind, y = female.percents["Hilary", ], col = cols[2], lwd = 3)
lines(x = year.ind, y = female.percents["Hillary", ], col = cols[1], lwd = 3)
# Adolf
text(x = 1919, y = 0.015, col = cols[4], labels = "Adolf", font = 2, cex = 1.5)
# Adolph
text(x = 1930, y = 0.05, col = cols[3], labels = "Adolph", font = 2, cex = 1.5)
# Hilary
text(x = 2001, y = 0.055, col = cols[2], labels = "Hilary", font = 2, cex = 1.5)
# Hillary
text(x = 1984, y = 0.13, col = cols[1], labels = "Hillary", font = 2, cex = 1.5)
dev.off()

# Katrina
open.pdf("Katrina_Hilary.pdf")
plot(
  x = year.ind, y = female.percents["Katrina", ],
  type = "l", xlim = c(1945, 2011),
  # ylim=c(0,0.03),
  xlab = "Year", ylab = "Percent",
  col = cols[3], lwd = 3,
  cex.lab = 1.5, cex.axis = 1.5
)
lines(x = years.shown, y = recent.percents("Hilary"), col = cols[2], lwd = 3)
text(x = 1980, y = 0.06, col = cols[2], labels = "Hilary", font = 2, cex = 1.5)
text(x = 2000, y = 0.1, col = cols[3], labels = "Katrina", font = 2, cex = 1.5)
dev.off()