Full Code of wesm/pydata-book for AI

3rd-edition a892791ce5a3 cached
203 files
238.9 MB
4.0M tokens
1 requests
Copy disabled (too large) Download .txt
Showing preview only (15,826K chars total). Download the full file to get everything.
Repository: wesm/pydata-book
Branch: 3rd-edition
Commit: a892791ce5a3
Files: 203
Total size: 238.9 MB

Directory structure:
gitextract_mai23lqn/

├── .gitignore
├── COPYING
├── README.md
├── appa.ipynb
├── appb.ipynb
├── ch02.ipynb
├── ch03.ipynb
├── ch04.ipynb
├── ch05.ipynb
├── ch06.ipynb
├── ch07.ipynb
├── ch08.ipynb
├── ch09.ipynb
├── ch10.ipynb
├── ch11.ipynb
├── ch12.ipynb
├── ch13.ipynb
├── datasets/
│   ├── babynames/
│   │   ├── yob1880.txt
│   │   ├── yob1881.txt
│   │   ├── yob1882.txt
│   │   ├── yob1883.txt
│   │   ├── yob1884.txt
│   │   ├── yob1885.txt
│   │   ├── yob1886.txt
│   │   ├── yob1887.txt
│   │   ├── yob1888.txt
│   │   ├── yob1889.txt
│   │   ├── yob1890.txt
│   │   ├── yob1891.txt
│   │   ├── yob1892.txt
│   │   ├── yob1893.txt
│   │   ├── yob1894.txt
│   │   ├── yob1895.txt
│   │   ├── yob1896.txt
│   │   ├── yob1897.txt
│   │   ├── yob1898.txt
│   │   ├── yob1899.txt
│   │   ├── yob1900.txt
│   │   ├── yob1901.txt
│   │   ├── yob1902.txt
│   │   ├── yob1903.txt
│   │   ├── yob1904.txt
│   │   ├── yob1905.txt
│   │   ├── yob1906.txt
│   │   ├── yob1907.txt
│   │   ├── yob1908.txt
│   │   ├── yob1909.txt
│   │   ├── yob1910.txt
│   │   ├── yob1911.txt
│   │   ├── yob1912.txt
│   │   ├── yob1913.txt
│   │   ├── yob1914.txt
│   │   ├── yob1915.txt
│   │   ├── yob1916.txt
│   │   ├── yob1917.txt
│   │   ├── yob1918.txt
│   │   ├── yob1919.txt
│   │   ├── yob1920.txt
│   │   ├── yob1921.txt
│   │   ├── yob1922.txt
│   │   ├── yob1923.txt
│   │   ├── yob1924.txt
│   │   ├── yob1925.txt
│   │   ├── yob1926.txt
│   │   ├── yob1927.txt
│   │   ├── yob1928.txt
│   │   ├── yob1929.txt
│   │   ├── yob1930.txt
│   │   ├── yob1931.txt
│   │   ├── yob1932.txt
│   │   ├── yob1933.txt
│   │   ├── yob1934.txt
│   │   ├── yob1935.txt
│   │   ├── yob1936.txt
│   │   ├── yob1937.txt
│   │   ├── yob1938.txt
│   │   ├── yob1939.txt
│   │   ├── yob1940.txt
│   │   ├── yob1941.txt
│   │   ├── yob1942.txt
│   │   ├── yob1943.txt
│   │   ├── yob1944.txt
│   │   ├── yob1945.txt
│   │   ├── yob1946.txt
│   │   ├── yob1947.txt
│   │   ├── yob1948.txt
│   │   ├── yob1949.txt
│   │   ├── yob1950.txt
│   │   ├── yob1951.txt
│   │   ├── yob1952.txt
│   │   ├── yob1953.txt
│   │   ├── yob1954.txt
│   │   ├── yob1955.txt
│   │   ├── yob1956.txt
│   │   ├── yob1957.txt
│   │   ├── yob1958.txt
│   │   ├── yob1959.txt
│   │   ├── yob1960.txt
│   │   ├── yob1961.txt
│   │   ├── yob1962.txt
│   │   ├── yob1963.txt
│   │   ├── yob1964.txt
│   │   ├── yob1965.txt
│   │   ├── yob1966.txt
│   │   ├── yob1967.txt
│   │   ├── yob1968.txt
│   │   ├── yob1969.txt
│   │   ├── yob1970.txt
│   │   ├── yob1971.txt
│   │   ├── yob1972.txt
│   │   ├── yob1973.txt
│   │   ├── yob1974.txt
│   │   ├── yob1975.txt
│   │   ├── yob1976.txt
│   │   ├── yob1977.txt
│   │   ├── yob1978.txt
│   │   ├── yob1979.txt
│   │   ├── yob1980.txt
│   │   ├── yob1981.txt
│   │   ├── yob1982.txt
│   │   ├── yob1983.txt
│   │   ├── yob1984.txt
│   │   ├── yob1985.txt
│   │   ├── yob1986.txt
│   │   ├── yob1987.txt
│   │   ├── yob1988.txt
│   │   ├── yob1989.txt
│   │   ├── yob1990.txt
│   │   ├── yob1991.txt
│   │   ├── yob1992.txt
│   │   ├── yob1993.txt
│   │   ├── yob1994.txt
│   │   ├── yob1995.txt
│   │   ├── yob1996.txt
│   │   ├── yob1997.txt
│   │   ├── yob1998.txt
│   │   ├── yob1999.txt
│   │   ├── yob2000.txt
│   │   ├── yob2001.txt
│   │   ├── yob2002.txt
│   │   ├── yob2003.txt
│   │   ├── yob2004.txt
│   │   ├── yob2005.txt
│   │   ├── yob2006.txt
│   │   ├── yob2007.txt
│   │   ├── yob2008.txt
│   │   ├── yob2009.txt
│   │   └── yob2010.txt
│   ├── bitly_usagov/
│   │   └── example.txt
│   ├── fec/
│   │   ├── P00000001-ALL.csv
│   │   └── fec.parquet
│   ├── haiti/
│   │   ├── Haiti.csv
│   │   └── PortAuPrince_Roads/
│   │       ├── PortAuPrince_Roads.dbf
│   │       ├── PortAuPrince_Roads.prj
│   │       ├── PortAuPrince_Roads.sbn
│   │       ├── PortAuPrince_Roads.sbx
│   │       ├── PortAuPrince_Roads.shp
│   │       ├── PortAuPrince_Roads.shx
│   │       └── PortAuPrince_Roads_README.txt
│   ├── movielens/
│   │   └── README
│   ├── mta_perf/
│   │   ├── Performance_LIBUS.xml
│   │   ├── Performance_LIBUS.xsd
│   │   ├── Performance_LIRR.xml
│   │   ├── Performance_LIRR.xsd
│   │   ├── Performance_MNR.xml
│   │   ├── Performance_MNR.xsd
│   │   ├── Performance_MTABUS.xml
│   │   ├── Performance_MTABUS.xsd
│   │   ├── Performance_NYCT.xml
│   │   ├── Performance_NYCT.xsd
│   │   ├── Performance_TBTA.xml
│   │   ├── Performance_TBTA.xsd
│   │   └── parse.py
│   ├── titanic/
│   │   ├── genderclassmodel.csv
│   │   ├── gendermodel.csv
│   │   ├── test.csv
│   │   └── train.csv
│   └── usda_food/
│       └── database.json
├── examples/
│   ├── array_ex.txt
│   ├── csv_mindex.csv
│   ├── ex1.csv
│   ├── ex1.xlsx
│   ├── ex2.csv
│   ├── ex3.txt
│   ├── ex4.csv
│   ├── ex5.csv
│   ├── ex6.csv
│   ├── ex7.csv
│   ├── example.json
│   ├── fdic_failed_bank_list.html
│   ├── ipython_bug.py
│   ├── macrodata.csv
│   ├── segismundo.txt
│   ├── spx.csv
│   ├── stock_px.csv
│   ├── test_file.csv
│   ├── tips.csv
│   ├── tseries.csv
│   ├── volume.csv
│   ├── yahoo_price.pkl
│   └── yahoo_volume.pkl
├── pyproject.toml
└── requirements.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.ipynb_checkpoints

================================================
FILE: COPYING
================================================
Code examples from "Python for Data Analysis", 3rd Edition

The MIT License (MIT)

Copyright (c) 2022 Wes McKinney

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: README.md
================================================
# Python for Data Analysis, 3rd Edition

Materials and IPython notebooks for "Python for Data Analysis, 3rd
Edition" by Wes McKinney, published by O'Reilly Media. Book content
including updates and errata fixes can be [found for free on my
website][6].

[Buy the book on Amazon][1]

Follow Wes on Twitter: [![Twitter Follow](https://img.shields.io/twitter/follow/wesmckinn.svg?style=social&label=Follow)](https://twitter.com/wesmckinn)

## Setup Instructions

### Option 1: Using uv (Recommended)

[uv](https://github.com/astral-sh/uv) is a fast Python package installer and resolver. To get started:

1. Install uv if you haven't already:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```

2. Start Jupyter Notebook with all dependencies:
```bash
uv run jupyter notebook
```

That's it! uv will automatically create a virtual environment and install all required packages from `pyproject.toml`.

### Option 2: Using Conda

1. Create a new conda environment:
```bash
conda create -n pydata-book python=3.11
conda activate pydata-book
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Start Jupyter Notebook:
```bash
jupyter notebook
```

**Note:** This project uses pandas 2.0.3 to ensure compatibility with the notebooks.

# 2nd Edition Readers

If you are reading the 2nd Edition (published in 2017), please find the
reorganized book materials on the [`2nd-edition` branch][5].

# 1st Edition Readers

If you are reading the 1st Edition (published in 2012), please find the
reorganized book materials on the [`1st-edition` branch][2].

## IPython Notebooks:

* [Chapter 2: Python Language Basics, IPython, and Jupyter Notebooks](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch02.ipynb)
* [Chapter 3: Built-in Data Structures, Functions, and Files](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch03.ipynb)
* [Chapter 4: NumPy Basics: Arrays and Vectorized Computation](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch04.ipynb)
* [Chapter 5: Getting Started with pandas](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch05.ipynb)
* [Chapter 6: Data Loading, Storage, and File Formats](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch06.ipynb)
* [Chapter 7: Data Cleaning and Preparation](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch07.ipynb)
* [Chapter 8: Data Wrangling: Join, Combine, and Reshape](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch08.ipynb)
* [Chapter 9: Plotting and Visualization](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch09.ipynb)
* [Chapter 10: Data Aggregation and Group Operations](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch10.ipynb)
* [Chapter 11: Time Series](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch11.ipynb)
* [Chapter 12: Introduction to Modeling Libraries in Python](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch12.ipynb)
* [Chapter 13: Data Analysis Examples](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/ch13.ipynb)
* [Appendix A: Advanced NumPy](http://nbviewer.ipython.org/github/pydata/pydata-book/blob/3rd-edition/appa.ipynb)

## License

### Code

The code in this repository, including all code samples in the notebooks listed
above, is released under the [MIT license](LICENSE-CODE). Read more at the
[Open Source Initiative](https://opensource.org/licenses/MIT).

[1]: https://amzn.to/3DyLaJc
[2]: https://github.com/wesm/pydata-book/tree/1st-edition
[5]: https://github.com/wesm/pydata-book/tree/2nd-edition
[6]: https://wesmckinney.com/book/

================================================
FILE: appa.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc('figure', figsize=(10, 6))\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_colwidth = 80\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "rng = np.random.default_rng(seed=12345)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.ones((10, 5)).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.ones((3, 4, 5), dtype=np.float64).strides"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "ints = np.ones(10, dtype=np.uint16)\n",
    "floats = np.ones(10, dtype=np.float32)\n",
    "np.issubdtype(ints.dtype, np.integer)\n",
    "np.issubdtype(floats.dtype, np.floating)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.float64.mro()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.issubdtype(ints.dtype, np.number)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(8)\n",
    "arr\n",
    "arr.reshape((4, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.reshape((4, 2)).reshape((2, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(15)\n",
    "arr.reshape((5, -1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "other_arr = np.ones((3, 5))\n",
    "other_arr.shape\n",
    "arr.reshape(other_arr.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(15).reshape((5, 3))\n",
    "arr\n",
    "arr.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(12).reshape((3, 4))\n",
    "arr\n",
    "arr.ravel()\n",
    "arr.ravel('F')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n",
    "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n",
    "np.concatenate([arr1, arr2], axis=0)\n",
    "np.concatenate([arr1, arr2], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.vstack((arr1, arr2))\n",
    "np.hstack((arr1, arr2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((5, 2))\n",
    "arr\n",
    "first, second, third = np.split(arr, [1, 3])\n",
    "first\n",
    "second\n",
    "third"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(6)\n",
    "arr1 = arr.reshape((3, 2))\n",
    "arr2 = rng.standard_normal((3, 2))\n",
    "np.r_[arr1, arr2]\n",
    "np.c_[np.r_[arr1, arr2], arr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.c_[1:6, -10:-5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(3)\n",
    "arr\n",
    "arr.repeat(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.repeat([2, 3, 4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((2, 2))\n",
    "arr\n",
    "arr.repeat(2, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.repeat([2, 3], axis=0)\n",
    "arr.repeat([2, 3], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr\n",
    "np.tile(arr, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr\n",
    "np.tile(arr, (2, 1))\n",
    "np.tile(arr, (3, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(10) * 100\n",
    "inds = [7, 1, 2, 6]\n",
    "arr[inds]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.take(inds)\n",
    "arr.put(inds, 42)\n",
    "arr\n",
    "arr.put(inds, [40, 41, 42, 43])\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "inds = [2, 0, 2, 1]\n",
    "arr = rng.standard_normal((2, 4))\n",
    "arr\n",
    "arr.take(inds, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(5)\n",
    "arr\n",
    "arr * 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((4, 3))\n",
    "arr.mean(0)\n",
    "demeaned = arr - arr.mean(0)\n",
    "demeaned\n",
    "demeaned.mean(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr\n",
    "row_means = arr.mean(1)\n",
    "row_means.shape\n",
    "row_means.reshape((4, 1))\n",
    "demeaned = arr - row_means.reshape((4, 1))\n",
    "demeaned.mean(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr - arr.mean(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr - arr.mean(1).reshape((4, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.zeros((4, 4))\n",
    "arr_3d = arr[:, np.newaxis, :]\n",
    "arr_3d.shape\n",
    "arr_1d = rng.standard_normal(3)\n",
    "arr_1d[:, np.newaxis]\n",
    "arr_1d[np.newaxis, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((3, 4, 5))\n",
    "depth_means = arr.mean(2)\n",
    "depth_means\n",
    "depth_means.shape\n",
    "demeaned = arr - depth_means[:, :, np.newaxis]\n",
    "demeaned.mean(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.zeros((4, 3))\n",
    "arr[:] = 5\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "col = np.array([1.28, -0.42, 0.44, 1.6])\n",
    "arr[:] = col[:, np.newaxis]\n",
    "arr\n",
    "arr[:2] = [[-1.37], [0.509]]\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(10)\n",
    "np.add.reduce(arr)\n",
    "arr.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_rng = np.random.default_rng(12346)  # for reproducibility\n",
    "arr = my_rng.standard_normal((5, 5))\n",
    "arr\n",
    "arr[::2].sort(1) # sort a few rows\n",
    "arr[:, :-1] < arr[:, 1:]\n",
    "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(15).reshape((3, 5))\n",
    "np.add.accumulate(arr, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(3).repeat([1, 2, 2])\n",
    "arr\n",
    "np.multiply.outer(arr, np.arange(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "x, y = rng.standard_normal((3, 4)), rng.standard_normal(5)\n",
    "result = np.subtract.outer(x, y)\n",
    "result.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(10)\n",
    "np.add.reduceat(arr, [0, 5, 8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.multiply.outer(np.arange(4), np.arange(5))\n",
    "arr\n",
    "np.add.reduceat(arr, [0, 2, 4], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_elements(x, y):\n",
    "    return x + y\n",
    "add_them = np.frompyfunc(add_elements, 2, 1)\n",
    "add_them(np.arange(8), np.arange(8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "add_them = np.vectorize(add_elements, otypes=[np.float64])\n",
    "add_them(np.arange(8), np.arange(8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal(10000)\n",
    "%timeit add_them(arr, arr)\n",
    "%timeit np.add(arr, arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtype = [('x', np.float64), ('y', np.int32)]\n",
    "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n",
    "sarr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "sarr[0]\n",
    "sarr[0]['y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "sarr['x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtype = [('x', np.int64, 3), ('y', np.int32)]\n",
    "arr = np.zeros(4, dtype=dtype)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr[0]['x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr['x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]\n",
    "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n",
    "data['x']\n",
    "data['y']\n",
    "data['x']['a']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal(6)\n",
    "arr.sort()\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((3, 5))\n",
    "arr\n",
    "arr[:, 0].sort()  # Sort first column values in place\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal(5)\n",
    "arr\n",
    "np.sort(arr)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((3, 5))\n",
    "arr\n",
    "arr.sort(axis=1)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr[:, ::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = np.array([5, 0, 1, 3, 2])\n",
    "indexer = values.argsort()\n",
    "indexer\n",
    "values[indexer]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((3, 5))\n",
    "arr[0] = values\n",
    "arr\n",
    "arr[:, arr[0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n",
    "last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n",
    "sorter = np.lexsort((first_name, last_name))\n",
    "sorter\n",
    "list(zip(last_name[sorter], first_name[sorter]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = np.array(['2:first', '2:second', '1:first', '1:second',\n",
    "                   '1:third'])\n",
    "key = np.array([2, 2, 1, 1, 1])\n",
    "indexer = key.argsort(kind='mergesort')\n",
    "indexer\n",
    "values.take(indexer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "rng = np.random.default_rng(12345)\n",
    "arr = rng.standard_normal(20)\n",
    "arr\n",
    "np.partition(arr, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "indices = np.argpartition(arr, 3)\n",
    "indices\n",
    "arr.take(indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([0, 1, 7, 12, 15])\n",
    "arr.searchsorted(9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.searchsorted([0, 8, 11, 16])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([0, 0, 0, 1, 1, 1, 1])\n",
    "arr.searchsorted([0, 1])\n",
    "arr.searchsorted([0, 1], side='right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.floor(rng.uniform(0, 10000, size=50))\n",
    "bins = np.array([0, 100, 1000, 5000, 10000])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = bins.searchsorted(data)\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(data).groupby(labels).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def mean_distance(x, y):\n",
    "    nx = len(x)\n",
    "    result = 0.0\n",
    "    count = 0\n",
    "    for i in range(nx):\n",
    "        result += x[i] - y[i]\n",
    "        count += 1\n",
    "    return result / count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "mmap = np.memmap('mymmap', dtype='float64', mode='w+',\n",
    "                 shape=(10000, 10000))\n",
    "mmap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "section = mmap[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "section[:] = rng.standard_normal((5, 10000))\n",
    "mmap.flush()\n",
    "mmap\n",
    "del mmap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n",
    "mmap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "%xdel mmap\n",
    "!rm mymmap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_c = np.ones((100, 10000), order='C')\n",
    "arr_f = np.ones((100, 10000), order='F')\n",
    "arr_c.flags\n",
    "arr_f.flags\n",
    "arr_f.flags.f_contiguous"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit arr_c.sum(1)\n",
    "%timeit arr_f.sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_f.copy('C').flags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_c[:50].flags.contiguous\n",
    "arr_c[:, :50].flags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "%xdel arr_c\n",
    "%xdel arr_f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: appb.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc('figure', figsize=(10, 6))\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_colwidth = 80\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a very large list of strings\n",
    "strings = ['foo', 'foobar', 'baz', 'qux',\n",
    "           'python', 'Guido Van Rossum'] * 100000\n",
    "\n",
    "method1 = [x for x in strings if x.startswith('foo')]\n",
    "\n",
    "method2 = [x for x in strings if x[:3] == 'foo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time method1 = [x for x in strings if x.startswith('foo')]\n",
    "%time method2 = [x for x in strings if x[:3] == 'foo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch02.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(12345)\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "data = [np.random.standard_normal() for i in range(7)]\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = [1, 2, 3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "b = a\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "a.append(4)\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def append_element(some_list, element):\n",
    "    some_list.append(element)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [1, 2, 3]\n",
    "append_element(data, 4)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 5\n",
    "type(a)\n",
    "a = \"foo\"\n",
    "type(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"5\" + 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 4.5\n",
    "b = 2\n",
    "# String formatting, to be visited later\n",
    "print(f\"a is {type(a)}, b is {type(b)}\")\n",
    "a / b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 5\n",
    "isinstance(a, int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 5; b = 4.5\n",
    "isinstance(a, (int, float))\n",
    "isinstance(b, (int, float))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = \"foo\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "getattr(a, \"split\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def isiterable(obj):\n",
    "    try:\n",
    "        iter(obj)\n",
    "        return True\n",
    "    except TypeError: # not iterable\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "isiterable(\"a string\")\n",
    "isiterable([1, 2, 3])\n",
    "isiterable(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "5 - 7\n",
    "12 + 21.5\n",
    "5 <= 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = [1, 2, 3]\n",
    "b = a\n",
    "c = list(a)\n",
    "a is b\n",
    "a is not c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "a == c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = None\n",
    "a is None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "a_list = [\"foo\", 2, [4, 5]]\n",
    "a_list[2] = (3, 4)\n",
    "a_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "a_tuple = (3, 5, (4, 5))\n",
    "a_tuple[1] = \"four\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "ival = 17239871\n",
    "ival ** 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "fval = 7.243\n",
    "fval2 = 6.78e-5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "3 / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "3 // 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = \"\"\"\n",
    "This is a longer string that\n",
    "spans multiple lines\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.count(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = \"this is a string\"\n",
    "a[10] = \"f\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "b = a.replace(\"string\", \"longer string\")\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 5.6\n",
    "s = str(a)\n",
    "print(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"python\"\n",
    "list(s)\n",
    "s[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"12\\\\34\"\n",
    "print(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = r\"this\\has\\no\\special\\characters\"\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = \"this is the first half \"\n",
    "b = \"and this is the second half\"\n",
    "a + b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "template = \"{0:.2f} {1:s} are worth US${2:d}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "template.format(88.46, \"Argentine Pesos\", 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "amount = 10\n",
    "rate = 88.46\n",
    "currency = \"Pesos\"\n",
    "result = f\"{amount} {currency} is worth US${amount / rate}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "f\"{amount} {currency} is worth US${amount / rate:.2f}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "val = \"espa\u00f1ol\"\n",
    "val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_utf8 = val.encode(\"utf-8\")\n",
    "val_utf8\n",
    "type(val_utf8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_utf8.decode(\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.encode(\"latin1\")\n",
    "val.encode(\"utf-16\")\n",
    "val.encode(\"utf-16le\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "True and True\n",
    "False or True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "int(False)\n",
    "int(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = True\n",
    "b = False\n",
    "not a\n",
    "not b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"3.14159\"\n",
    "fval = float(s)\n",
    "type(fval)\n",
    "int(fval)\n",
    "bool(fval)\n",
    "bool(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = None\n",
    "a is None\n",
    "b = 5\n",
    "b is not None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime, date, time\n",
    "dt = datetime(2011, 10, 29, 20, 30, 21)\n",
    "dt.day\n",
    "dt.minute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt.date()\n",
    "dt.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt.strftime(\"%Y-%m-%d %H:%M\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "datetime.strptime(\"20091031\", \"%Y%m%d\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt_hour = dt.replace(minute=0, second=0)\n",
    "dt_hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt2 = datetime(2011, 11, 15, 22, 30)\n",
    "delta = dt2 - dt\n",
    "delta\n",
    "type(delta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "dt\n",
    "dt + delta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 5; b = 7\n",
    "c = 8; d = 4\n",
    "if a < b or c > d:\n",
    "    print(\"Made it\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "4 > 3 > 2 > 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(4):\n",
    "    for j in range(4):\n",
    "        if j > i:\n",
    "            break\n",
    "        print((i, j))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "range(10)\n",
    "list(range(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(range(0, 20, 2))\n",
    "list(range(5, 0, -1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq = [1, 2, 3, 4]\n",
    "for i in range(len(seq)):\n",
    "    print(f\"element {i}: {seq[i]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "total = 0\n",
    "for i in range(100_000):\n",
    "    # % is the modulo operator\n",
    "    if i % 3 == 0 or i % 5 == 0:\n",
    "        total += i\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch03.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup = (4, 5, 6)\n",
    "tup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup = 4, 5, 6\n",
    "tup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tuple([4, 0, 2])\n",
    "tup = tuple('string')\n",
    "tup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "nested_tup = (4, 5, 6), (7, 8)\n",
    "nested_tup\n",
    "nested_tup[0]\n",
    "nested_tup[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup = tuple(['foo', [1, 2], True])\n",
    "tup[2] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup[1].append(3)\n",
    "tup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "(4, None, 'foo') + (6, 0) + ('bar',)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "('foo', 'bar') * 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup = (4, 5, 6)\n",
    "a, b, c = tup\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tup = 4, 5, (6, 7)\n",
    "a, b, (c, d) = tup\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "a, b = 1, 2\n",
    "a\n",
    "b\n",
    "b, a = a, b\n",
    "a\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]\n",
    "for a, b, c in seq:\n",
    "    print(f'a={a}, b={b}, c={c}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = 1, 2, 3, 4, 5\n",
    "a, b, *rest = values\n",
    "a\n",
    "b\n",
    "rest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "a, b, *_ = values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = (1, 2, 2, 2, 3, 4, 2)\n",
    "a.count(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "a_list = [2, 3, 7, None]\n",
    "\n",
    "tup = (\"foo\", \"bar\", \"baz\")\n",
    "b_list = list(tup)\n",
    "b_list\n",
    "b_list[1] = \"peekaboo\"\n",
    "b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen = range(10)\n",
    "gen\n",
    "list(gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_list.append(\"dwarf\")\n",
    "b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_list.insert(1, \"red\")\n",
    "b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_list.pop(2)\n",
    "b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_list.append(\"foo\")\n",
    "b_list\n",
    "b_list.remove(\"foo\")\n",
    "b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"dwarf\" in b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"dwarf\" not in b_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "[4, None, \"foo\"] + [7, 8, (2, 3)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = [4, None, \"foo\"]\n",
    "x.extend([7, 8, (2, 3)])\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = [7, 2, 5, 1, 3]\n",
    "a.sort()\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "b = [\"saw\", \"small\", \"He\", \"foxes\", \"six\"]\n",
    "b.sort(key=len)\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq = [7, 2, 3, 7, 5, 6, 0, 1]\n",
    "seq[1:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq[3:5] = [6, 3]\n",
    "seq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq[:5]\n",
    "seq[3:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq[-4:]\n",
    "seq[-6:-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq[::2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "empty_dict = {}\n",
    "d1 = {\"a\": \"some value\", \"b\": [1, 2, 3, 4]}\n",
    "d1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "d1[7] = \"an integer\"\n",
    "d1\n",
    "d1[\"b\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"b\" in d1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "d1[5] = \"some value\"\n",
    "d1\n",
    "d1[\"dummy\"] = \"another value\"\n",
    "d1\n",
    "del d1[5]\n",
    "d1\n",
    "ret = d1.pop(\"dummy\")\n",
    "ret\n",
    "d1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(d1.keys())\n",
    "list(d1.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(d1.items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "d1.update({\"b\": \"foo\", \"c\": 12})\n",
    "d1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "tuples = zip(range(5), reversed(range(5)))\n",
    "tuples\n",
    "mapping = dict(tuples)\n",
    "mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = [\"apple\", \"bat\", \"bar\", \"atom\", \"book\"]\n",
    "by_letter = {}\n",
    "\n",
    "for word in words:\n",
    "    letter = word[0]\n",
    "    if letter not in by_letter:\n",
    "        by_letter[letter] = [word]\n",
    "    else:\n",
    "        by_letter[letter].append(word)\n",
    "\n",
    "by_letter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "by_letter = {}\n",
    "for word in words:\n",
    "    letter = word[0]\n",
    "    by_letter.setdefault(letter, []).append(word)\n",
    "by_letter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "by_letter = defaultdict(list)\n",
    "for word in words:\n",
    "    by_letter[word[0]].append(word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "hash(\"string\")\n",
    "hash((1, 2, (2, 3)))\n",
    "hash((1, 2, [2, 3])) # fails because lists are mutable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = {}\n",
    "d[tuple([1, 2, 3])] = 5\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "set([2, 2, 2, 1, 3, 3])\n",
    "{2, 2, 2, 1, 3, 3}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = {1, 2, 3, 4, 5}\n",
    "b = {3, 4, 5, 6, 7, 8}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "a.union(b)\n",
    "a | b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "a.intersection(b)\n",
    "a & b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = a.copy()\n",
    "c |= b\n",
    "c\n",
    "d = a.copy()\n",
    "d &= b\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_data = [1, 2, 3, 4]\n",
    "my_set = {tuple(my_data)}\n",
    "my_set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "a_set = {1, 2, 3, 4, 5}\n",
    "{1, 2, 3}.issubset(a_set)\n",
    "a_set.issuperset({1, 2, 3})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "{1, 2, 3} == {3, 2, 1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted([7, 1, 2, 6, 0, 3, 2])\n",
    "sorted(\"horse race\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq1 = [\"foo\", \"bar\", \"baz\"]\n",
    "seq2 = [\"one\", \"two\", \"three\"]\n",
    "zipped = zip(seq1, seq2)\n",
    "list(zipped)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq3 = [False, True]\n",
    "list(zip(seq1, seq2, seq3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "for index, (a, b) in enumerate(zip(seq1, seq2)):\n",
    "    print(f\"{index}: {a}, {b}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(reversed(range(10)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings = [\"a\", \"as\", \"bat\", \"car\", \"dove\", \"python\"]\n",
    "[x.upper() for x in strings if len(x) > 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_lengths = {len(x) for x in strings}\n",
    "unique_lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "set(map(len, strings))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "loc_mapping = {value: index for index, value in enumerate(strings)}\n",
    "loc_mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = [[\"John\", \"Emily\", \"Michael\", \"Mary\", \"Steven\"],\n",
    "            [\"Maria\", \"Juan\", \"Javier\", \"Natalia\", \"Pilar\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "names_of_interest = []\n",
    "for names in all_data:\n",
    "    enough_as = [name for name in names if name.count(\"a\") >= 2]\n",
    "    names_of_interest.extend(enough_as)\n",
    "names_of_interest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = [name for names in all_data for name in names\n",
    "          if name.count(\"a\") >= 2]\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]\n",
    "flattened = [x for tup in some_tuples for x in tup]\n",
    "flattened"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "flattened = []\n",
    "\n",
    "for tup in some_tuples:\n",
    "    for x in tup:\n",
    "        flattened.append(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "[[x for x in tup] for tup in some_tuples]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "def my_function(x, y):\n",
    "    return x + y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_function(1, 2)\n",
    "result = my_function(1, 2)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "def function_without_return(x):\n",
    "    print(x)\n",
    "\n",
    "result = function_without_return(\"hello!\")\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "def my_function2(x, y, z=1.5):\n",
    "    if z > 1:\n",
    "        return z * (x + y)\n",
    "    else:\n",
    "        return z / (x + y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_function2(5, 6, z=0.7)\n",
    "my_function2(3.14, 7, 3.5)\n",
    "my_function2(10, 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = []\n",
    "def func():\n",
    "    for i in range(5):\n",
    "        a.append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "func()\n",
    "a\n",
    "func()\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = None\n",
    "def bind_a_variable():\n",
    "    global a\n",
    "    a = []\n",
    "bind_a_variable()\n",
    "print(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "states = [\"   Alabama \", \"Georgia!\", \"Georgia\", \"georgia\", \"FlOrIda\",\n",
    "          \"south   carolina##\", \"West virginia?\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def clean_strings(strings):\n",
    "    result = []\n",
    "    for value in strings:\n",
    "        value = value.strip()\n",
    "        value = re.sub(\"[!#?]\", \"\", value)\n",
    "        value = value.title()\n",
    "        result.append(value)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_strings(states)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_punctuation(value):\n",
    "    return re.sub(\"[!#?]\", \"\", value)\n",
    "\n",
    "clean_ops = [str.strip, remove_punctuation, str.title]\n",
    "\n",
    "def clean_strings(strings, ops):\n",
    "    result = []\n",
    "    for value in strings:\n",
    "        for func in ops:\n",
    "            value = func(value)\n",
    "        result.append(value)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_strings(states, clean_ops)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "for x in map(remove_punctuation, states):\n",
    "    print(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "def short_function(x):\n",
    "    return x * 2\n",
    "\n",
    "equiv_anon = lambda x: x * 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_to_list(some_list, f):\n",
    "    return [f(x) for x in some_list]\n",
    "\n",
    "ints = [4, 0, 1, 5, 6]\n",
    "apply_to_list(ints, lambda x: x * 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings = [\"foo\", \"card\", \"bar\", \"aaaa\", \"abab\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings.sort(key=lambda x: len(set(x)))\n",
    "strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "some_dict = {\"a\": 1, \"b\": 2, \"c\": 3}\n",
    "for key in some_dict:\n",
    "    print(key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_iterator = iter(some_dict)\n",
    "dict_iterator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(dict_iterator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "def squares(n=10):\n",
    "    print(f\"Generating squares from 1 to {n ** 2}\")\n",
    "    for i in range(1, n + 1):\n",
    "        yield i ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen = squares()\n",
    "gen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "for x in gen:\n",
    "    print(x, end=\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "gen = (x ** 2 for x in range(100))\n",
    "gen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(x ** 2 for x in range(100))\n",
    "dict((i, i ** 2) for i in range(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "def first_letter(x):\n",
    "    return x[0]\n",
    "\n",
    "names = [\"Alan\", \"Adam\", \"Wes\", \"Will\", \"Albert\", \"Steven\"]\n",
    "\n",
    "for letter, names in itertools.groupby(names, first_letter):\n",
    "    print(letter, list(names)) # names is a generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "float(\"1.2345\")\n",
    "float(\"something\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "def attempt_float(x):\n",
    "    try:\n",
    "        return float(x)\n",
    "    except:\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "attempt_float(\"1.2345\")\n",
    "attempt_float(\"something\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "float((1, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "def attempt_float(x):\n",
    "    try:\n",
    "        return float(x)\n",
    "    except ValueError:\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "attempt_float((1, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "def attempt_float(x):\n",
    "    try:\n",
    "        return float(x)\n",
    "    except (TypeError, ValueError):\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"examples/segismundo.txt\"\n",
    "f = open(path, encoding=\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "lines = [x.rstrip() for x in open(path, encoding=\"utf-8\")]\n",
    "lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path, encoding=\"utf-8\") as f:\n",
    "    lines = [x.rstrip() for x in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "f1 = open(path)\n",
    "f1.read(10)\n",
    "f2 = open(path, mode=\"rb\")  # Binary mode\n",
    "f2.read(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "f1.tell()\n",
    "f2.tell()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.getdefaultencoding()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "f1.seek(3)\n",
    "f1.read(1)\n",
    "f1.tell()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "f1.close()\n",
    "f2.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "path\n",
    "\n",
    "with open(\"tmp.txt\", mode=\"w\") as handle:\n",
    "    handle.writelines(x for x in open(path) if len(x) > 1)\n",
    "\n",
    "with open(\"tmp.txt\") as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.remove(\"tmp.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path) as f:\n",
    "    chars = f.read(10)\n",
    "\n",
    "chars\n",
    "len(chars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path, mode=\"rb\") as f:\n",
    "    data = f.read(10)\n",
    "\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.decode(\"utf-8\")\n",
    "data[:4].decode(\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "sink_path = \"sink.txt\"\n",
    "with open(path) as source:\n",
    "    with open(sink_path, \"x\", encoding=\"iso-8859-1\") as sink:\n",
    "        sink.write(source.read())\n",
    "\n",
    "with open(sink_path, encoding=\"iso-8859-1\") as f:\n",
    "    print(f.read(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.remove(sink_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open(path, encoding='utf-8')\n",
    "f.read(5)\n",
    "f.seek(4)\n",
    "f.read(1)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch04.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "my_arr = np.arange(1_000_000)\n",
    "my_list = list(range(1_000_000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit my_arr2 = my_arr * 2\n",
    "%timeit my_list2 = [x * 2 for x in my_list]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "data = np.array([[1.5, -0.1, 3], [0, -3, 6.5]])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data * 10\n",
    "data + data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.shape\n",
    "data.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data1 = [6, 7.5, 8, 0, 1]\n",
    "arr1 = np.array(data1)\n",
    "arr1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]\n",
    "arr2 = np.array(data2)\n",
    "arr2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2.ndim\n",
    "arr2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr1.dtype\n",
    "arr2.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.zeros(10)\n",
    "np.zeros((3, 6))\n",
    "np.empty((2, 3, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.arange(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr1 = np.array([1, 2, 3], dtype=np.float64)\n",
    "arr2 = np.array([1, 2, 3], dtype=np.int32)\n",
    "arr1.dtype\n",
    "arr2.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([1, 2, 3, 4, 5])\n",
    "arr.dtype\n",
    "float_arr = arr.astype(np.float64)\n",
    "float_arr\n",
    "float_arr.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])\n",
    "arr\n",
    "arr.astype(np.int32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_strings = np.array([\"1.25\", \"-9.6\", \"42\"], dtype=np.string_)\n",
    "numeric_strings.astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "int_array = np.arange(10)\n",
    "calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)\n",
    "int_array.astype(calibers.dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "zeros_uint32 = np.zeros(8, dtype=\"u4\")\n",
    "zeros_uint32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([[1., 2., 3.], [4., 5., 6.]])\n",
    "arr\n",
    "arr * arr\n",
    "arr - arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "1 / arr\n",
    "arr ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])\n",
    "arr2\n",
    "arr2 > arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(10)\n",
    "arr\n",
    "arr[5]\n",
    "arr[5:8]\n",
    "arr[5:8] = 12\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_slice = arr[5:8]\n",
    "arr_slice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_slice[1] = 12345\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_slice[:] = 64\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n",
    "arr2d[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d[0][2]\n",
    "arr2d[0, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])\n",
    "arr3d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr3d[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "old_values = arr3d[0].copy()\n",
    "arr3d[0] = 42\n",
    "arr3d\n",
    "arr3d[0] = old_values\n",
    "arr3d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr3d[1, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = arr3d[1]\n",
    "x\n",
    "x[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr\n",
    "arr[1:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d\n",
    "arr2d[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d[:2, 1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "lower_dim_slice = arr2d[1, :2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "lower_dim_slice.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d[:2, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d[:, :1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2d[:2, 1:] = 0\n",
    "arr2d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = np.array([\"Bob\", \"Joe\", \"Will\", \"Bob\", \"Will\", \"Joe\", \"Joe\"])\n",
    "data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2],\n",
    "                 [-12, -4], [3, 4]])\n",
    "names\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "names == \"Bob\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[names == \"Bob\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[names == \"Bob\", 1:]\n",
    "data[names == \"Bob\", 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "names != \"Bob\"\n",
    "~(names == \"Bob\")\n",
    "data[~(names == \"Bob\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "cond = names == \"Bob\"\n",
    "data[~cond]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "mask = (names == \"Bob\") | (names == \"Will\")\n",
    "mask\n",
    "data[mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[data < 0] = 0\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[names != \"Joe\"] = 7\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.zeros((8, 4))\n",
    "for i in range(8):\n",
    "    arr[i] = i\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr[[4, 3, 0, 6]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr[[-3, -5, -7]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(32).reshape((8, 4))\n",
    "arr\n",
    "arr[[1, 5, 7, 2], [0, 3, 1, 2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr[[1, 5, 7, 2], [0, 3, 1, 2]]\n",
    "arr[[1, 5, 7, 2], [0, 3, 1, 2]] = 0\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(15).reshape((3, 5))\n",
    "arr\n",
    "arr.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([[0, 1, 0], [1, 2, -2], [6, 3, 2], [-1, 0, -1], [1, 0, 1]])\n",
    "arr\n",
    "np.dot(arr.T, arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.T @ arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr\n",
    "arr.swapaxes(0, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "samples = np.random.standard_normal(size=(4, 4))\n",
    "samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import normalvariate\n",
    "N = 1_000_000\n",
    "%timeit samples = [normalvariate(0, 1) for _ in range(N)]\n",
    "%timeit np.random.standard_normal(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "rng = np.random.default_rng(seed=12345)\n",
    "data = rng.standard_normal((2, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(rng)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(10)\n",
    "arr\n",
    "np.sqrt(arr)\n",
    "np.exp(arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = rng.standard_normal(8)\n",
    "y = rng.standard_normal(8)\n",
    "x\n",
    "y\n",
    "np.maximum(x, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal(7) * 5\n",
    "arr\n",
    "remainder, whole_part = np.modf(arr)\n",
    "remainder\n",
    "whole_part"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr\n",
    "out = np.zeros_like(arr)\n",
    "np.add(arr, 1)\n",
    "np.add(arr, 1, out=out)\n",
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "points = np.arange(-5, 5, 0.01) # 100 equally spaced points\n",
    "xs, ys = np.meshgrid(points, points)\n",
    "ys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "z = np.sqrt(xs ** 2 + ys ** 2)\n",
    "z"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.imshow(z, cmap=plt.cm.gray, extent=[-5, 5, -5, 5])\n",
    "plt.colorbar()\n",
    "plt.title(\"Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.draw()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])\n",
    "yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])\n",
    "cond = np.array([True, False, True, True, False])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = [(x if c else y)\n",
    "          for x, y, c in zip(xarr, yarr, cond)]\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = np.where(cond, xarr, yarr)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((4, 4))\n",
    "arr\n",
    "arr > 0\n",
    "np.where(arr > 0, 2, -2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.where(arr > 0, 2, arr) # set only positive values to 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((5, 4))\n",
    "arr\n",
    "arr.mean()\n",
    "np.mean(arr)\n",
    "arr.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.mean(axis=1)\n",
    "arr.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])\n",
    "arr.cumsum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.cumsum(axis=0)\n",
    "arr.cumsum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal(100)\n",
    "(arr > 0).sum() # Number of positive values\n",
    "(arr <= 0).sum() # Number of non-positive values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "bools = np.array([False, False, True, False])\n",
    "bools.any()\n",
    "bools.all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal(6)\n",
    "arr\n",
    "arr.sort()\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = rng.standard_normal((5, 3))\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.sort(axis=0)\n",
    "arr\n",
    "arr.sort(axis=1)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr2 = np.array([5, -10, 7, 1, 0, -3])\n",
    "sorted_arr2 = np.sort(arr2)\n",
    "sorted_arr2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = np.array([\"Bob\", \"Will\", \"Joe\", \"Bob\", \"Will\", \"Joe\", \"Joe\"])\n",
    "np.unique(names)\n",
    "ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])\n",
    "np.unique(ints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(set(names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = np.array([6, 0, 0, 3, 2, 5, 6])\n",
    "np.in1d(values, [2, 3, 6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(10)\n",
    "np.save(\"some_array\", arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.load(\"some_array.npy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savez(\"array_archive.npz\", a=arr, b=arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "arch = np.load(\"array_archive.npz\")\n",
    "arch[\"b\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savez_compressed(\"arrays_compressed.npz\", a=arr, b=arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm some_array.npy\n",
    "!rm array_archive.npz\n",
    "!rm arrays_compressed.npz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = np.array([[1., 2., 3.], [4., 5., 6.]])\n",
    "y = np.array([[6., 23.], [-1, 7], [8, 9]])\n",
    "x\n",
    "y\n",
    "x.dot(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.dot(x, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "x @ np.ones(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy.linalg import inv, qr\n",
    "X = rng.standard_normal((5, 5))\n",
    "mat = X.T @ X\n",
    "inv(mat)\n",
    "mat @ inv(mat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "position = 0\n",
    "walk = [position]\n",
    "nsteps = 1000\n",
    "for _ in range(nsteps):\n",
    "    step = 1 if random.randint(0, 1) else -1\n",
    "    position += step\n",
    "    walk.append(position)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(walk[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "nsteps = 1000\n",
    "rng = np.random.default_rng(seed=12345)  # fresh random generator\n",
    "draws = rng.integers(0, 2, size=nsteps)\n",
    "steps = np.where(draws == 0, 1, -1)\n",
    "walk = steps.cumsum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "walk.min()\n",
    "walk.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "(np.abs(walk) >= 10).argmax()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "nwalks = 5000\n",
    "nsteps = 1000\n",
    "draws = rng.integers(0, 2, size=(nwalks, nsteps)) # 0 or 1\n",
    "steps = np.where(draws > 0, 1, -1)\n",
    "walks = steps.cumsum(axis=1)\n",
    "walks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "walks.max()\n",
    "walks.min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "hits30 = (np.abs(walks) >= 30).any(axis=1)\n",
    "hits30\n",
    "hits30.sum() # Number that hit 30 or -30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "crossing_times = (np.abs(walks[hits30]) >= 30).argmax(axis=1)\n",
    "crossing_times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "crossing_times.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "draws = 0.25 * rng.standard_normal((nwalks, nsteps))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch05.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_colwidth = 80\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([4, 7, -5, 3])\n",
    "obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.array\n",
    "obj.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj2 = pd.Series([4, 7, -5, 3], index=[\"d\", \"b\", \"a\", \"c\"])\n",
    "obj2\n",
    "obj2.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj2[\"a\"]\n",
    "obj2[\"d\"] = 6\n",
    "obj2[[\"c\", \"a\", \"d\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj2[obj2 > 0]\n",
    "obj2 * 2\n",
    "import numpy as np\n",
    "np.exp(obj2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"b\" in obj2\n",
    "\"e\" in obj2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "sdata = {\"Ohio\": 35000, \"Texas\": 71000, \"Oregon\": 16000, \"Utah\": 5000}\n",
    "obj3 = pd.Series(sdata)\n",
    "obj3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj3.to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "states = [\"California\", \"Ohio\", \"Oregon\", \"Texas\"]\n",
    "obj4 = pd.Series(sdata, index=states)\n",
    "obj4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.isna(obj4)\n",
    "pd.notna(obj4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj4.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj3\n",
    "obj4\n",
    "obj3 + obj4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj4.name = \"population\"\n",
    "obj4.index.name = \"state\"\n",
    "obj4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj\n",
    "obj.index = [\"Bob\", \"Steve\", \"Jeff\", \"Ryan\"]\n",
    "obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\"state\": [\"Ohio\", \"Ohio\", \"Ohio\", \"Nevada\", \"Nevada\", \"Nevada\"],\n",
    "        \"year\": [2000, 2001, 2002, 2001, 2002, 2003],\n",
    "        \"pop\": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n",
    "frame = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(data, columns=[\"year\", \"state\", \"pop\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2 = pd.DataFrame(data, columns=[\"year\", \"state\", \"pop\", \"debt\"])\n",
    "frame2\n",
    "frame2.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2[\"state\"]\n",
    "frame2.year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2.loc[1]\n",
    "frame2.iloc[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2[\"debt\"] = 16.5\n",
    "frame2\n",
    "frame2[\"debt\"] = np.arange(6.)\n",
    "frame2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "val = pd.Series([-1.2, -1.5, -1.7], index=[\"two\", \"four\", \"five\"])\n",
    "frame2[\"debt\"] = val\n",
    "frame2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2[\"eastern\"] = frame2[\"state\"] == \"Ohio\"\n",
    "frame2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "del frame2[\"eastern\"]\n",
    "frame2.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "populations = {\"Ohio\": {2000: 1.5, 2001: 1.7, 2002: 3.6},\n",
    "               \"Nevada\": {2001: 2.4, 2002: 2.9}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame3 = pd.DataFrame(populations)\n",
    "frame3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame3.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(populations, index=[2001, 2002, 2003])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "pdata = {\"Ohio\": frame3[\"Ohio\"][:-1],\n",
    "         \"Nevada\": frame3[\"Nevada\"][:2]}\n",
    "pd.DataFrame(pdata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame3.index.name = \"year\"\n",
    "frame3.columns.name = \"state\"\n",
    "frame3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame3.to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2.to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series(np.arange(3), index=[\"a\", \"b\", \"c\"])\n",
    "index = obj.index\n",
    "index\n",
    "index[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = pd.Index(np.arange(3))\n",
    "labels\n",
    "obj2 = pd.Series([1.5, -2.5, 0], index=labels)\n",
    "obj2\n",
    "obj2.index is labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame3\n",
    "frame3.columns\n",
    "\"Ohio\" in frame3.columns\n",
    "2003 in frame3.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Index([\"foo\", \"foo\", \"bar\", \"bar\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=[\"d\", \"b\", \"a\", \"c\"])\n",
    "obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj2 = obj.reindex([\"a\", \"b\", \"c\", \"d\", \"e\"])\n",
    "obj2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj3 = pd.Series([\"blue\", \"purple\", \"yellow\"], index=[0, 2, 4])\n",
    "obj3\n",
    "obj3.reindex(np.arange(6), method=\"ffill\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame(np.arange(9).reshape((3, 3)),\n",
    "                     index=[\"a\", \"c\", \"d\"],\n",
    "                     columns=[\"Ohio\", \"Texas\", \"California\"])\n",
    "frame\n",
    "frame2 = frame.reindex(index=[\"a\", \"b\", \"c\", \"d\"])\n",
    "frame2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "states = [\"Texas\", \"Utah\", \"California\"]\n",
    "frame.reindex(columns=states)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.reindex(states, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.loc[[\"a\", \"d\", \"c\"], [\"California\", \"Texas\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series(np.arange(5.), index=[\"a\", \"b\", \"c\", \"d\", \"e\"])\n",
    "obj\n",
    "new_obj = obj.drop(\"c\")\n",
    "new_obj\n",
    "obj.drop([\"d\", \"c\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n",
    "                    index=[\"Ohio\", \"Colorado\", \"Utah\", \"New York\"],\n",
    "                    columns=[\"one\", \"two\", \"three\", \"four\"])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop(index=[\"Colorado\", \"Ohio\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop(columns=[\"two\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop(\"two\", axis=1)\n",
    "data.drop([\"two\", \"four\"], axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series(np.arange(4.), index=[\"a\", \"b\", \"c\", \"d\"])\n",
    "obj\n",
    "obj[\"b\"]\n",
    "obj[1]\n",
    "obj[2:4]\n",
    "obj[[\"b\", \"a\", \"d\"]]\n",
    "obj[[1, 3]]\n",
    "obj[obj < 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.loc[[\"b\", \"a\", \"d\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])\n",
    "obj2 = pd.Series([1, 2, 3], index=[\"a\", \"b\", \"c\"])\n",
    "obj1\n",
    "obj2\n",
    "obj1[[0, 1, 2]]\n",
    "obj2[[0, 1, 2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj1.iloc[[0, 1, 2]]\n",
    "obj2.iloc[[0, 1, 2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj2.loc[\"b\":\"c\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj2.loc[\"b\":\"c\"] = 5\n",
    "obj2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n",
    "                    index=[\"Ohio\", \"Colorado\", \"Utah\", \"New York\"],\n",
    "                    columns=[\"one\", \"two\", \"three\", \"four\"])\n",
    "data\n",
    "data[\"two\"]\n",
    "data[[\"three\", \"one\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[:2]\n",
    "data[data[\"three\"] > 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "data < 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[data < 5] = 0\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "data\n",
    "data.loc[\"Colorado\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[[\"Colorado\", \"New York\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[\"Colorado\", [\"two\", \"three\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.iloc[2]\n",
    "data.iloc[[2, 1]]\n",
    "data.iloc[2, [3, 0, 1]]\n",
    "data.iloc[[1, 2], [3, 0, 1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[:\"Utah\", \"two\"]\n",
    "data.iloc[:, :3][data.three > 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.three >= 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "ser = pd.Series(np.arange(3.))\n",
    "ser\n",
    "ser[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "ser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "ser2 = pd.Series(np.arange(3.), index=[\"a\", \"b\", \"c\"])\n",
    "ser2[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "ser.iloc[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "ser[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[:, \"one\"] = 1\n",
    "data\n",
    "data.iloc[2] = 5\n",
    "data\n",
    "data.loc[data[\"four\"] > 5] = 3\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.three == 5][\"three\"] = 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.three == 5, \"three\"] = 6\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=[\"a\", \"c\", \"d\", \"e\"])\n",
    "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],\n",
    "               index=[\"a\", \"c\", \"e\", \"f\", \"g\"])\n",
    "s1\n",
    "s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 + s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list(\"bcd\"),\n",
    "                   index=[\"Ohio\", \"Texas\", \"Colorado\"])\n",
    "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list(\"bde\"),\n",
    "                   index=[\"Utah\", \"Ohio\", \"Texas\", \"Oregon\"])\n",
    "df1\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 + df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame({\"A\": [1, 2]})\n",
    "df2 = pd.DataFrame({\"B\": [3, 4]})\n",
    "df1\n",
    "df2\n",
    "df1 + df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),\n",
    "                   columns=list(\"abcd\"))\n",
    "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),\n",
    "                   columns=list(\"abcde\"))\n",
    "df2.loc[1, \"b\"] = np.nan\n",
    "df1\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 + df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.add(df2, fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "1 / df1\n",
    "df1.rdiv(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.reindex(columns=df2.columns, fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(12.).reshape((3, 4))\n",
    "arr\n",
    "arr[0]\n",
    "arr - arr[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),\n",
    "                     columns=list(\"bde\"),\n",
    "                     index=[\"Utah\", \"Ohio\", \"Texas\", \"Oregon\"])\n",
    "series = frame.iloc[0]\n",
    "frame\n",
    "series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame - series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "series2 = pd.Series(np.arange(3), index=[\"b\", \"e\", \"f\"])\n",
    "series2\n",
    "frame + series2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "series3 = frame[\"d\"]\n",
    "frame\n",
    "series3\n",
    "frame.sub(series3, axis=\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame(np.random.standard_normal((4, 3)),\n",
    "                     columns=list(\"bde\"),\n",
    "                     index=[\"Utah\", \"Ohio\", \"Texas\", \"Oregon\"])\n",
    "frame\n",
    "np.abs(frame)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f1(x):\n",
    "    return x.max() - x.min()\n",
    "\n",
    "frame.apply(f1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.apply(f1, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "def f2(x):\n",
    "    return pd.Series([x.min(), x.max()], index=[\"min\", \"max\"])\n",
    "frame.apply(f2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "def my_format(x):\n",
    "    return f\"{x:.2f}\"\n",
    "\n",
    "frame.applymap(my_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame[\"e\"].map(my_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series(np.arange(4), index=[\"d\", \"a\", \"b\", \"c\"])\n",
    "obj\n",
    "obj.sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame(np.arange(8).reshape((2, 4)),\n",
    "                     index=[\"three\", \"one\"],\n",
    "                     columns=[\"d\", \"a\", \"b\", \"c\"])\n",
    "frame\n",
    "frame.sort_index()\n",
    "frame.sort_index(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.sort_index(axis=\"columns\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([4, 7, -3, 2])\n",
    "obj.sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])\n",
    "obj.sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.sort_values(na_position=\"first\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame({\"b\": [4, 7, -3, 2], \"a\": [0, 1, 0, 1]})\n",
    "frame\n",
    "frame.sort_values(\"b\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.sort_values([\"a\", \"b\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n",
    "obj.rank()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.rank(method=\"first\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.rank(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame({\"b\": [4.3, 7, -3, 2], \"a\": [0, 1, 0, 1],\n",
    "                      \"c\": [-2, 5, 8, -2.5]})\n",
    "frame\n",
    "frame.rank(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series(np.arange(5), index=[\"a\", \"a\", \"b\", \"b\", \"c\"])\n",
    "obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.index.is_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj[\"a\"]\n",
    "obj[\"c\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.standard_normal((5, 3)),\n",
    "                  index=[\"a\", \"a\", \"b\", \"b\", \"c\"])\n",
    "df\n",
    "df.loc[\"b\"]\n",
    "df.loc[\"c\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],\n",
    "                   [np.nan, np.nan], [0.75, -1.3]],\n",
    "                  index=[\"a\", \"b\", \"c\", \"d\"],\n",
    "                  columns=[\"one\", \"two\"])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sum(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sum(axis=\"index\", skipna=False)\n",
    "df.sum(axis=\"columns\", skipna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.mean(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.idxmax()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.cumsum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([\"a\", \"a\", \"b\", \"c\"] * 4)\n",
    "obj.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "price = pd.read_pickle(\"examples/yahoo_price.pkl\")\n",
    "volume = pd.read_pickle(\"examples/yahoo_volume.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "returns = price.pct_change()\n",
    "returns.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "returns[\"MSFT\"].corr(returns[\"IBM\"])\n",
    "returns[\"MSFT\"].cov(returns[\"IBM\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "returns.corr()\n",
    "returns.cov()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "returns.corrwith(returns[\"IBM\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "returns.corrwith(volume)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = pd.Series([\"c\", \"a\", \"d\", \"a\", \"a\", \"b\", \"b\", \"c\", \"c\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "uniques = obj.unique()\n",
    "uniques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.value_counts(obj.to_numpy(), sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj\n",
    "mask = obj.isin([\"b\", \"c\"])\n",
    "mask\n",
    "obj[mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "to_match = pd.Series([\"c\", \"a\", \"b\", \"b\", \"c\", \"a\"])\n",
    "unique_vals = pd.Series([\"c\", \"b\", \"a\"])\n",
    "indices = pd.Index(unique_vals).get_indexer(to_match)\n",
    "indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\"Qu1\": [1, 3, 4, 3, 4],\n",
    "                     \"Qu2\": [2, 3, 1, 2, 3],\n",
    "                     \"Qu3\": [1, 5, 2, 4, 4]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"Qu1\"].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = data.apply(pd.value_counts).fillna(0)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\"a\": [1, 1, 1, 2, 2], \"b\": [0, 0, 1, 0, 0]})\n",
    "data\n",
    "data.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch06.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "pd.options.display.max_colwidth = 75\n",
    "pd.options.display.max_columns = 20\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/ex1.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"examples/ex1.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/ex2.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.read_csv(\"examples/ex2.csv\", header=None)\n",
    "pd.read_csv(\"examples/ex2.csv\", names=[\"a\", \"b\", \"c\", \"d\", \"message\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = [\"a\", \"b\", \"c\", \"d\", \"message\"]\n",
    "pd.read_csv(\"examples/ex2.csv\", names=names, index_col=\"message\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/csv_mindex.csv\n",
    "parsed = pd.read_csv(\"examples/csv_mindex.csv\",\n",
    "                     index_col=[\"key1\", \"key2\"])\n",
    "parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/ex3.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.read_csv(\"examples/ex3.txt\", sep=\"\\s+\")\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/ex4.csv\n",
    "pd.read_csv(\"examples/ex4.csv\", skiprows=[0, 2, 3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/ex5.csv\n",
    "result = pd.read_csv(\"examples/ex5.csv\")\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.isna(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.read_csv(\"examples/ex5.csv\", na_values=[\"NULL\"])\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "result2 = pd.read_csv(\"examples/ex5.csv\", keep_default_na=False)\n",
    "result2\n",
    "result2.isna()\n",
    "result3 = pd.read_csv(\"examples/ex5.csv\", keep_default_na=False,\n",
    "                      na_values=[\"NA\"])\n",
    "result3\n",
    "result3.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinels = {\"message\": [\"foo\", \"NA\"], \"something\": [\"two\"]}\n",
    "pd.read_csv(\"examples/ex5.csv\", na_values=sentinels,\n",
    "            keep_default_na=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.read_csv(\"examples/ex6.csv\")\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.read_csv(\"examples/ex6.csv\", nrows=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunker = pd.read_csv(\"examples/ex6.csv\", chunksize=1000)\n",
    "type(chunker)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunker = pd.read_csv(\"examples/ex6.csv\", chunksize=1000)\n",
    "\n",
    "tot = pd.Series([], dtype='int64')\n",
    "for piece in chunker:\n",
    "    tot = tot.add(piece[\"key\"].value_counts(), fill_value=0)\n",
    "\n",
    "tot = tot.sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "tot[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"examples/ex5.csv\")\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(\"examples/out.csv\")\n",
    "!cat examples/out.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "data.to_csv(sys.stdout, sep=\"|\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, na_rep=\"NULL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(sys.stdout, index=False, columns=[\"a\", \"b\", \"c\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/ex7.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "f = open(\"examples/ex7.csv\")\n",
    "reader = csv.reader(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "for line in reader:\n",
    "    print(line)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"examples/ex7.csv\") as f:\n",
    "    lines = list(csv.reader(f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "header, values = lines[0], lines[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
    "data_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj = \"\"\"\n",
    "{\"name\": \"Wes\",\n",
    " \"cities_lived\": [\"Akron\", \"Nashville\", \"New York\", \"San Francisco\"],\n",
    " \"pet\": null,\n",
    " \"siblings\": [{\"name\": \"Scott\", \"age\": 34, \"hobbies\": [\"guitars\", \"soccer\"]},\n",
    "              {\"name\": \"Katie\", \"age\": 42, \"hobbies\": [\"diving\", \"art\"]}]\n",
    "}\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "result = json.loads(obj)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "asjson = json.dumps(result)\n",
    "asjson"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "siblings = pd.DataFrame(result[\"siblings\"], columns=[\"name\", \"age\"])\n",
    "siblings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat examples/example.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_json(\"examples/example.json\")\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_json(sys.stdout)\n",
    "data.to_json(sys.stdout, orient=\"records\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "tables = pd.read_html(\"examples/fdic_failed_bank_list.html\")\n",
    "len(tables)\n",
    "failures = tables[0]\n",
    "failures.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "close_timestamps = pd.to_datetime(failures[\"Closing Date\"])\n",
    "close_timestamps.dt.year.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lxml import objectify\n",
    "\n",
    "path = \"datasets/mta_perf/Performance_MNR.xml\"\n",
    "with open(path) as f:\n",
    "    parsed = objectify.parse(f)\n",
    "root = parsed.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = []\n",
    "\n",
    "skip_fields = [\"PARENT_SEQ\", \"INDICATOR_SEQ\",\n",
    "               \"DESIRED_CHANGE\", \"DECIMAL_PLACES\"]\n",
    "\n",
    "for elt in root.INDICATOR:\n",
    "    el_data = {}\n",
    "    for child in elt.getchildren():\n",
    "        if child.tag in skip_fields:\n",
    "            continue\n",
    "        el_data[child.tag] = child.pyval\n",
    "    data.append(el_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf = pd.DataFrame(data)\n",
    "perf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "perf2 = pd.read_xml(path)\n",
    "perf2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.read_csv(\"examples/ex1.csv\")\n",
    "frame\n",
    "frame.to_pickle(\"examples/frame_pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.read_pickle(\"examples/frame_pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm examples/frame_pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "fec = pd.read_parquet('datasets/fec/fec.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "xlsx = pd.ExcelFile(\"examples/ex1.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "xlsx.sheet_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "xlsx.parse(sheet_name=\"Sheet1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "xlsx.parse(sheet_name=\"Sheet1\", index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.read_excel(\"examples/ex1.xlsx\", sheet_name=\"Sheet1\")\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "writer = pd.ExcelWriter(\"examples/ex2.xlsx\")\n",
    "frame.to_excel(writer, \"Sheet1\")\n",
    "writer.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.to_excel(\"examples/ex2.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm examples/ex2.xlsx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -f examples/mydata.h5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame({\"a\": np.random.standard_normal(100)})\n",
    "store = pd.HDFStore(\"examples/mydata.h5\")\n",
    "store[\"obj1\"] = frame\n",
    "store[\"obj1_col\"] = frame[\"a\"]\n",
    "store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "store[\"obj1\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "store.put(\"obj2\", frame, format=\"table\")\n",
    "store.select(\"obj2\", where=[\"index >= 10 and index <= 15\"])\n",
    "store.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.to_hdf(\"examples/mydata.h5\", \"obj3\", format=\"table\")\n",
    "pd.read_hdf(\"examples/mydata.h5\", \"obj3\", where=[\"index < 5\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.remove(\"examples/mydata.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "url = \"https://api.github.com/repos/pandas-dev/pandas/issues\"\n",
    "resp = requests.get(url)\n",
    "resp.raise_for_status()\n",
    "resp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = resp.json()\n",
    "data[0][\"title\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "issues = pd.DataFrame(data, columns=[\"number\", \"title\",\n",
    "                                     \"labels\", \"state\"])\n",
    "issues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "\n",
    "query = \"\"\"\n",
    "CREATE TABLE test\n",
    "(a VARCHAR(20), b VARCHAR(20),\n",
    " c REAL,        d INTEGER\n",
    ");\"\"\"\n",
    "\n",
    "con = sqlite3.connect(\"mydata.sqlite\")\n",
    "con.execute(query)\n",
    "con.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [(\"Atlanta\", \"Georgia\", 1.25, 6),\n",
    "        (\"Tallahassee\", \"Florida\", 2.6, 3),\n",
    "        (\"Sacramento\", \"California\", 1.7, 5)]\n",
    "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n",
    "\n",
    "con.executemany(stmt, data)\n",
    "con.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor = con.execute(\"SELECT * FROM test\")\n",
    "rows = cursor.fetchall()\n",
    "rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.description\n",
    "pd.DataFrame(rows, columns=[x[0] for x in cursor.description])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlalchemy as sqla\n",
    "db = sqla.create_engine(\"sqlite:///mydata.sqlite\")\n",
    "pd.read_sql(\"SELECT * FROM test\", db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm mydata.sqlite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch07.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_rows = 25\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_colwidth = 82\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "float_data = pd.Series([1.2, -3.5, np.nan, 0])\n",
    "float_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "float_data.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "string_data = pd.Series([\"aardvark\", np.nan, None, \"avocado\"])\n",
    "string_data\n",
    "string_data.isna()\n",
    "float_data = pd.Series([1, 2, None], dtype='float64')\n",
    "float_data\n",
    "float_data.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series([1, np.nan, 3.5, np.nan, 7])\n",
    "data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[data.notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],\n",
    "                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])\n",
    "data\n",
    "data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.dropna(how=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[4] = np.nan\n",
    "data\n",
    "data.dropna(axis=\"columns\", how=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.standard_normal((7, 3)))\n",
    "df.iloc[:4, 1] = np.nan\n",
    "df.iloc[:2, 2] = np.nan\n",
    "df\n",
    "df.dropna()\n",
    "df.dropna(thresh=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.fillna({1: 0.5, 2: 0})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.standard_normal((6, 3)))\n",
    "df.iloc[2:, 1] = np.nan\n",
    "df.iloc[4:, 2] = np.nan\n",
    "df\n",
    "df.fillna(method=\"ffill\")\n",
    "df.fillna(method=\"ffill\", limit=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series([1., np.nan, 3.5, np.nan, 7])\n",
    "data.fillna(data.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\"k1\": [\"one\", \"two\"] * 3 + [\"two\"],\n",
    "                     \"k2\": [1, 1, 2, 3, 3, 4, 4]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"v1\"] = range(7)\n",
    "data\n",
    "data.drop_duplicates(subset=[\"k1\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop_duplicates([\"k1\", \"k2\"], keep=\"last\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\"food\": [\"bacon\", \"pulled pork\", \"bacon\",\n",
    "                              \"pastrami\", \"corned beef\", \"bacon\",\n",
    "                              \"pastrami\", \"honey ham\", \"nova lox\"],\n",
    "                     \"ounces\": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "meat_to_animal = {\n",
    "  \"bacon\": \"pig\",\n",
    "  \"pulled pork\": \"pig\",\n",
    "  \"pastrami\": \"cow\",\n",
    "  \"corned beef\": \"cow\",\n",
    "  \"honey ham\": \"pig\",\n",
    "  \"nova lox\": \"salmon\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"animal\"] = data[\"food\"].map(meat_to_animal)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_animal(x):\n",
    "    return meat_to_animal[x]\n",
    "data[\"food\"].map(get_animal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace(-999, np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace([-999, -1000], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace([-999, -1000], [np.nan, 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.replace({-999: np.nan, -1000: 0})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n",
    "                    index=[\"Ohio\", \"Colorado\", \"New York\"],\n",
    "                    columns=[\"one\", \"two\", \"three\", \"four\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform(x):\n",
    "    return x[:4].upper()\n",
    "\n",
    "data.index.map(transform)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.index = data.index.map(transform)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.rename(index=str.title, columns=str.upper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.rename(index={\"OHIO\": \"INDIANA\"},\n",
    "            columns={\"three\": \"peekaboo\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = [18, 25, 35, 60, 100]\n",
    "age_categories = pd.cut(ages, bins)\n",
    "age_categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "age_categories.codes\n",
    "age_categories.categories\n",
    "age_categories.categories[0]\n",
    "pd.value_counts(age_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.cut(ages, bins, right=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_names = [\"Youth\", \"YoungAdult\", \"MiddleAged\", \"Senior\"]\n",
    "pd.cut(ages, bins, labels=group_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.random.uniform(size=20)\n",
    "pd.cut(data, 4, precision=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.random.standard_normal(1000)\n",
    "quartiles = pd.qcut(data, 4, precision=2)\n",
    "quartiles\n",
    "pd.value_counts(quartiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.random.standard_normal((1000, 4)))\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "col = data[2]\n",
    "col[col.abs() > 3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[(data.abs() > 3).any(axis=\"columns\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[data.abs() > 3] = np.sign(data) * 3\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.sign(data).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))\n",
    "df\n",
    "sampler = np.random.permutation(5)\n",
    "sampler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.take(sampler)\n",
    "df.iloc[sampler]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "column_sampler = np.random.permutation(7)\n",
    "column_sampler\n",
    "df.take(column_sampler, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.sample(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "choices = pd.Series([5, 7, -1, 6, 4])\n",
    "choices.sample(n=10, replace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n",
    "                   \"data1\": range(6)})\n",
    "df\n",
    "pd.get_dummies(df[\"key\"], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummies = pd.get_dummies(df[\"key\"], prefix=\"key\", dtype=float)\n",
    "df_with_dummy = df[[\"data1\"]].join(dummies)\n",
    "df_with_dummy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "mnames = [\"movie_id\", \"title\", \"genres\"]\n",
    "movies = pd.read_table(\"datasets/movielens/movies.dat\", sep=\"::\",\n",
    "                       header=None, names=mnames, engine=\"python\")\n",
    "movies[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummies = movies[\"genres\"].str.get_dummies(\"|\")\n",
    "dummies.iloc[:10, :6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_windic = movies.join(dummies.add_prefix(\"Genre_\"))\n",
    "movies_windic.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(12345) # to make the example repeatable\n",
    "values = np.random.uniform(size=10)\n",
    "values\n",
    "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n",
    "pd.get_dummies(pd.cut(values, bins))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([1, 2, 3, None])\n",
    "s\n",
    "s.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())\n",
    "s\n",
    "s.isna()\n",
    "s.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[3]\n",
    "s[3] is pd.NA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([1, 2, 3, None], dtype=\"Int64\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"A\": [1, 2, None, 4],\n",
    "                   \"B\": [\"one\", \"two\", \"three\", None],\n",
    "                   \"C\": [False, None, False, True]})\n",
    "df\n",
    "df[\"A\"] = df[\"A\"].astype(\"Int64\")\n",
    "df[\"B\"] = df[\"B\"].astype(\"string\")\n",
    "df[\"C\"] = df[\"C\"].astype(\"boolean\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "val = \"a,b,  guido\"\n",
    "val.split(\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "pieces = [x.strip() for x in val.split(\",\")]\n",
    "pieces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "first, second, third = pieces\n",
    "first + \"::\" + second + \"::\" + third"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"::\".join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"guido\" in val\n",
    "val.index(\",\")\n",
    "val.find(\":\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.index(\":\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.count(\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.replace(\",\", \"::\")\n",
    "val.replace(\",\", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "text = \"foo    bar\\t baz  \\tqux\"\n",
    "re.split(r\"\\s+\", text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex = re.compile(r\"\\s+\")\n",
    "regex.split(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Dave dave@google.com\n",
    "Steve steve@gmail.com\n",
    "Rob rob@gmail.com\n",
    "Ryan ryan@yahoo.com\"\"\"\n",
    "pattern = r\"[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\"\n",
    "\n",
    "# re.IGNORECASE makes the regex case insensitive\n",
    "regex = re.compile(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = regex.search(text)\n",
    "m\n",
    "text[m.start():m.end()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(regex.match(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(regex.sub(\"REDACTED\", text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n",
    "regex = re.compile(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = regex.match(\"wesm@bright.net\")\n",
    "m.groups()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(regex.sub(r\"Username: \\1, Domain: \\2, Suffix: \\3\", text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\"Dave\": \"dave@google.com\", \"Steve\": \"steve@gmail.com\",\n",
    "        \"Rob\": \"rob@gmail.com\", \"Wes\": np.nan}\n",
    "data = pd.Series(data)\n",
    "data\n",
    "data.isna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.str.contains(\"gmail\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_as_string_ext = data.astype('string')\n",
    "data_as_string_ext\n",
    "data_as_string_ext.str.contains(\"gmail\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n",
    "data.str.findall(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]\n",
    "matches\n",
    "matches.str.get(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.str[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.str.extract(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = pd.Series(['apple', 'orange', 'apple',\n",
    "                    'apple'] * 2)\n",
    "values\n",
    "pd.unique(values)\n",
    "pd.value_counts(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = pd.Series([0, 1, 0, 0] * 2)\n",
    "dim = pd.Series(['apple', 'orange'])\n",
    "values\n",
    "dim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "dim.take(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "fruits = ['apple', 'orange', 'apple', 'apple'] * 2\n",
    "N = len(fruits)\n",
    "rng = np.random.default_rng(seed=12345)\n",
    "df = pd.DataFrame({'fruit': fruits,\n",
    "                   'basket_id': np.arange(N),\n",
    "                   'count': rng.integers(3, 15, size=N),\n",
    "                   'weight': rng.uniform(0, 4, size=N)},\n",
    "                  columns=['basket_id', 'fruit', 'count', 'weight'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "fruit_cat = df['fruit'].astype('category')\n",
    "fruit_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = fruit_cat.array\n",
    "type(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "c.categories\n",
    "c.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict(enumerate(c.categories))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['fruit'] = df['fruit'].astype('category')\n",
    "df[\"fruit\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])\n",
    "my_categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = ['foo', 'bar', 'baz']\n",
    "codes = [0, 1, 2, 0, 0, 1]\n",
    "my_cats_2 = pd.Categorical.from_codes(codes, categories)\n",
    "my_cats_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "ordered_cat = pd.Categorical.from_codes(codes, categories,\n",
    "                                        ordered=True)\n",
    "ordered_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_cats_2.as_ordered()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "rng = np.random.default_rng(seed=12345)\n",
    "draws = rng.standard_normal(1000)\n",
    "draws[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = pd.qcut(draws, 4)\n",
    "bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\n",
    "bins\n",
    "bins.codes[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = pd.Series(bins, name='quartile')\n",
    "results = (pd.Series(draws)\n",
    "           .groupby(bins)\n",
    "           .agg(['count', 'min', 'max'])\n",
    "           .reset_index())\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "results['quartile']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 10_000_000\n",
    "labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = labels.astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels.memory_usage(deep=True)\n",
    "categories.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time _ = labels.astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit labels.value_counts()\n",
    "%timeit categories.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series(['a', 'b', 'c', 'd'] * 2)\n",
    "cat_s = s.astype('category')\n",
    "cat_s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s.cat.codes\n",
    "cat_s.cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "actual_categories = ['a', 'b', 'c', 'd', 'e']\n",
    "cat_s2 = cat_s.cat.set_categories(actual_categories)\n",
    "cat_s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s.value_counts()\n",
    "cat_s2.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s3 = cat_s[cat_s.isin(['a', 'b'])]\n",
    "cat_s3\n",
    "cat_s3.cat.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.get_dummies(cat_s, dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch08.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_colwidth = 80\n",
    "pd.options.display.max_columns = 20\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.Series(np.random.uniform(size=9),\n",
    "                 index=[[\"a\", \"a\", \"a\", \"b\", \"b\", \"c\", \"c\", \"d\", \"d\"],\n",
    "                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"b\"]\n",
    "data[\"b\":\"c\"]\n",
    "data.loc[[\"b\", \"d\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[:, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.unstack().stack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame(np.arange(12).reshape((4, 3)),\n",
    "                     index=[[\"a\", \"a\", \"b\", \"b\"], [1, 2, 1, 2]],\n",
    "                     columns=[[\"Ohio\", \"Ohio\", \"Colorado\"],\n",
    "                              [\"Green\", \"Red\", \"Green\"]])\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.index.names = [\"key1\", \"key2\"]\n",
    "frame.columns.names = [\"state\", \"color\"]\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.index.nlevels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame[\"Ohio\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.swaplevel(\"key1\", \"key2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.sort_index(level=1)\n",
    "frame.swaplevel(0, 1).sort_index(level=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.groupby(level=\"key2\").sum()\n",
    "frame.groupby(level=\"color\", axis=\"columns\").sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame = pd.DataFrame({\"a\": range(7), \"b\": range(7, 0, -1),\n",
    "                      \"c\": [\"one\", \"one\", \"one\", \"two\", \"two\",\n",
    "                            \"two\", \"two\"],\n",
    "                      \"d\": [0, 1, 2, 0, 1, 2, 3]})\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2 = frame.set_index([\"c\", \"d\"])\n",
    "frame2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame.set_index([\"c\", \"d\"], drop=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "frame2.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"a\", \"b\"],\n",
    "                    \"data1\": pd.Series(range(7), dtype=\"Int64\")})\n",
    "df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"d\"],\n",
    "                    \"data2\": pd.Series(range(3), dtype=\"Int64\")})\n",
    "df1\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(df1, df2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(df1, df2, on=\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = pd.DataFrame({\"lkey\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"a\", \"b\"],\n",
    "                    \"data1\": pd.Series(range(7), dtype=\"Int64\")})\n",
    "df4 = pd.DataFrame({\"rkey\": [\"a\", \"b\", \"d\"],\n",
    "                    \"data2\": pd.Series(range(3), dtype=\"Int64\")})\n",
    "pd.merge(df3, df4, left_on=\"lkey\", right_on=\"rkey\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(df1, df2, how=\"outer\")\n",
    "pd.merge(df3, df4, left_on=\"lkey\", right_on=\"rkey\", how=\"outer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n",
    "                    \"data1\": pd.Series(range(6), dtype=\"Int64\")})\n",
    "df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"a\", \"b\", \"d\"],\n",
    "                    \"data2\": pd.Series(range(5), dtype=\"Int64\")})\n",
    "df1\n",
    "df2\n",
    "pd.merge(df1, df2, on=\"key\", how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(df1, df2, how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "left = pd.DataFrame({\"key1\": [\"foo\", \"foo\", \"bar\"],\n",
    "                     \"key2\": [\"one\", \"two\", \"one\"],\n",
    "                     \"lval\": pd.Series([1, 2, 3], dtype='Int64')})\n",
    "right = pd.DataFrame({\"key1\": [\"foo\", \"foo\", \"bar\", \"bar\"],\n",
    "                      \"key2\": [\"one\", \"one\", \"one\", \"two\"],\n",
    "                      \"rval\": pd.Series([4, 5, 6, 7], dtype='Int64')})\n",
    "pd.merge(left, right, on=[\"key1\", \"key2\"], how=\"outer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(left, right, on=\"key1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(left, right, on=\"key1\", suffixes=(\"_left\", \"_right\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "left1 = pd.DataFrame({\"key\": [\"a\", \"b\", \"a\", \"a\", \"b\", \"c\"],\n",
    "                      \"value\": pd.Series(range(6), dtype=\"Int64\")})\n",
    "right1 = pd.DataFrame({\"group_val\": [3.5, 7]}, index=[\"a\", \"b\"])\n",
    "left1\n",
    "right1\n",
    "pd.merge(left1, right1, left_on=\"key\", right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(left1, right1, left_on=\"key\", right_index=True, how=\"outer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "lefth = pd.DataFrame({\"key1\": [\"Ohio\", \"Ohio\", \"Ohio\",\n",
    "                               \"Nevada\", \"Nevada\"],\n",
    "                      \"key2\": [2000, 2001, 2002, 2001, 2002],\n",
    "                      \"data\": pd.Series(range(5), dtype=\"Int64\")})\n",
    "righth_index = pd.MultiIndex.from_arrays(\n",
    "    [\n",
    "        [\"Nevada\", \"Nevada\", \"Ohio\", \"Ohio\", \"Ohio\", \"Ohio\"],\n",
    "        [2001, 2000, 2000, 2000, 2001, 2002]\n",
    "    ]\n",
    ")\n",
    "righth = pd.DataFrame({\"event1\": pd.Series([0, 2, 4, 6, 8, 10], dtype=\"Int64\",\n",
    "                                           index=righth_index),\n",
    "                       \"event2\": pd.Series([1, 3, 5, 7, 9, 11], dtype=\"Int64\",\n",
    "                                           index=righth_index)})\n",
    "lefth\n",
    "righth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.merge(lefth, righth, left_on=[\"key1\", \"key2\"], right_index=True)\n",
    "pd.merge(lefth, righth, left_on=[\"key1\", \"key2\"],\n",
    "         right_index=True, how=\"outer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],\n",
    "                     index=[\"a\", \"c\", \"e\"],\n",
    "                     columns=[\"Ohio\", \"Nevada\"]).astype(\"Int64\")\n",
    "right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n",
    "                      index=[\"b\", \"c\", \"d\", \"e\"],\n",
    "                      columns=[\"Missouri\", \"Alabama\"]).astype(\"Int64\")\n",
    "left2\n",
    "right2\n",
    "pd.merge(left2, right2, how=\"outer\", left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "left2.join(right2, how=\"outer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "left1.join(right1, on=\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n",
    "                       index=[\"a\", \"c\", \"e\", \"f\"],\n",
    "                       columns=[\"New York\", \"Oregon\"])\n",
    "another\n",
    "left2.join([right2, another])\n",
    "left2.join([right2, another], how=\"outer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = np.arange(12).reshape((3, 4))\n",
    "arr\n",
    "np.concatenate([arr, arr], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = pd.Series([0, 1], index=[\"a\", \"b\"], dtype=\"Int64\")\n",
    "s2 = pd.Series([2, 3, 4], index=[\"c\", \"d\", \"e\"], dtype=\"Int64\")\n",
    "s3 = pd.Series([5, 6], index=[\"f\", \"g\"], dtype=\"Int64\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1\n",
    "s2\n",
    "s3\n",
    "pd.concat([s1, s2, s3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat([s1, s2, s3], axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "s4 = pd.concat([s1, s3])\n",
    "s4\n",
    "pd.concat([s1, s4], axis=\"columns\")\n",
    "pd.concat([s1, s4], axis=\"columns\", join=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = pd.concat([s1, s1, s3], keys=[\"one\", \"two\", \"three\"])\n",
    "result\n",
    "result.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat([s1, s2, s3], axis=\"columns\", keys=[\"one\", \"two\", \"three\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=[\"a\", \"b\", \"c\"],\n",
    "                   columns=[\"one\", \"two\"])\n",
    "df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=[\"a\", \"c\"],\n",
    "                   columns=[\"three\", \"four\"])\n",
    "df1\n",
    "df2\n",
    "pd.concat([df1, df2], axis=\"columns\", keys=[\"level1\", \"level2\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat({\"level1\": df1, \"level2\": df2}, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat([df1, df2], axis=\"columns\", keys=[\"level1\", \"level2\"],\n",
    "          names=[\"upper\", \"lower\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame(np.random.standard_normal((3, 4)),\n",
    "                   columns=[\"a\", \"b\", \"c\", \"d\"])\n",
    "df2 = pd.DataFrame(np.random.standard_normal((2, 3)),\n",
    "                   columns=[\"b\", \"d\", \"a\"])\n",
    "df1\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat([df1, df2], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],\n",
    "              index=[\"f\", \"e\", \"d\", \"c\", \"b\", \"a\"])\n",
    "b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],\n",
    "              index=[\"a\", \"b\", \"c\", \"d\", \"e\", \"f\"])\n",
    "a\n",
    "b\n",
    "np.where(pd.isna(a), b, a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "a.combine_first(b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame({\"a\": [1., np.nan, 5., np.nan],\n",
    "                    \"b\": [np.nan, 2., np.nan, 6.],\n",
    "                    \"c\": range(2, 18, 4)})\n",
    "df2 = pd.DataFrame({\"a\": [5., 4., np.nan, 3., 7.],\n",
    "                    \"b\": [np.nan, 3., 4., 6., 8.]})\n",
    "df1\n",
    "df2\n",
    "df1.combine_first(df2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(np.arange(6).reshape((2, 3)),\n",
    "                    index=pd.Index([\"Ohio\", \"Colorado\"], name=\"state\"),\n",
    "                    columns=pd.Index([\"one\", \"two\", \"three\"],\n",
    "                    name=\"number\"))\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = data.stack()\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "result.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "result.unstack(level=0)\n",
    "result.unstack(level=\"state\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = pd.Series([0, 1, 2, 3], index=[\"a\", \"b\", \"c\", \"d\"], dtype=\"Int64\")\n",
    "s2 = pd.Series([4, 5, 6], index=[\"c\", \"d\", \"e\"], dtype=\"Int64\")\n",
    "data2 = pd.concat([s1, s2], keys=[\"one\", \"two\"])\n",
    "data2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2.unstack()\n",
    "data2.unstack().stack()\n",
    "data2.unstack().stack(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"left\": result, \"right\": result + 5},\n",
    "                  columns=pd.Index([\"left\", \"right\"], name=\"side\"))\n",
    "df\n",
    "df.unstack(level=\"state\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.unstack(level=\"state\").stack(level=\"side\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"examples/macrodata.csv\")\n",
    "data = data.loc[:, [\"year\", \"quarter\", \"realgdp\", \"infl\", \"unemp\"]]\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "periods = pd.PeriodIndex(year=data.pop(\"year\"),\n",
    "                         quarter=data.pop(\"quarter\"),\n",
    "                         name=\"date\")\n",
    "periods\n",
    "data.index = periods.to_timestamp(\"D\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.reindex(columns=[\"realgdp\", \"infl\", \"unemp\"])\n",
    "data.columns.name = \"item\"\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "long_data = (data.stack()\n",
    "             .reset_index()\n",
    "             .rename(columns={0: \"value\"}))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "long_data[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "pivoted = long_data.pivot(index=\"date\", columns=\"item\",\n",
    "                          values=\"value\")\n",
    "pivoted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "long_data.index.name = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "long_data[\"value2\"] = np.random.standard_normal(len(long_data))\n",
    "long_data[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "pivoted = long_data.pivot(index=\"date\", columns=\"item\")\n",
    "pivoted.head()\n",
    "pivoted[\"value\"].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "unstacked = long_data.set_index([\"date\", \"item\"]).unstack(level=\"item\")\n",
    "unstacked.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"key\": [\"foo\", \"bar\", \"baz\"],\n",
    "                   \"A\": [1, 2, 3],\n",
    "                   \"B\": [4, 5, 6],\n",
    "                   \"C\": [7, 8, 9]})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "melted = pd.melt(df, id_vars=\"key\")\n",
    "melted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "reshaped = melted.pivot(index=\"key\", columns=\"variable\",\n",
    "                        values=\"value\")\n",
    "reshaped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "reshaped.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.melt(df, id_vars=\"key\", value_vars=[\"A\", \"B\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.melt(df, value_vars=[\"A\", \"B\", \"C\"])\n",
    "pd.melt(df, value_vars=[\"key\", \"A\", \"B\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch09.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_colwidth = 80\n",
    "pd.options.display.max_columns = 20\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.arange(10)\n",
    "data\n",
    "plt.plot(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax1 = fig.add_subplot(2, 2, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax2 = fig.add_subplot(2, 2, 2)\n",
    "ax3 = fig.add_subplot(2, 2, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax3.plot(np.random.standard_normal(50).cumsum(), color=\"black\",\n",
    "         linestyle=\"dashed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax1.hist(np.random.standard_normal(100), bins=20, color=\"black\", alpha=0.3);\n",
    "ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.standard_normal(30));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 3)\n",
    "axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n",
    "for i in range(2):\n",
    "    for j in range(2):\n",
    "        axes[i, j].hist(np.random.standard_normal(500), bins=50,\n",
    "                        color=\"black\", alpha=0.5)\n",
    "fig.subplots_adjust(wspace=0, hspace=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax = fig.add_subplot()\n",
    "ax.plot(np.random.standard_normal(30).cumsum(), color=\"black\",\n",
    "        linestyle=\"dashed\", marker=\"o\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure()\n",
    "ax = fig.add_subplot()\n",
    "data = np.random.standard_normal(30).cumsum()\n",
    "ax.plot(data, color=\"black\", linestyle=\"dashed\", label=\"Default\");\n",
    "ax.plot(data, color=\"black\", linestyle=\"dashed\",\n",
    "        drawstyle=\"steps-post\", label=\"steps-post\");\n",
    "ax.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "ax.plot(np.random.standard_normal(1000).cumsum());"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "ticks = ax.set_xticks([0, 250, 500, 750, 1000])\n",
    "labels = ax.set_xticklabels([\"one\", \"two\", \"three\", \"four\", \"five\"],\n",
    "                            rotation=30, fontsize=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax.set_xlabel(\"Stages\")\n",
    "ax.set_title(\"My first matplotlib plot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "ax.plot(np.random.randn(1000).cumsum(), color=\"black\", label=\"one\");\n",
    "ax.plot(np.random.randn(1000).cumsum(), color=\"black\", linestyle=\"dashed\",\n",
    "        label=\"two\");\n",
    "ax.plot(np.random.randn(1000).cumsum(), color=\"black\", linestyle=\"dotted\",\n",
    "        label=\"three\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax.legend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "\n",
    "data = pd.read_csv(\"examples/spx.csv\", index_col=0, parse_dates=True)\n",
    "spx = data[\"SPX\"]\n",
    "\n",
    "spx.plot(ax=ax, color=\"black\")\n",
    "\n",
    "crisis_data = [\n",
    "    (datetime(2007, 10, 11), \"Peak of bull market\"),\n",
    "    (datetime(2008, 3, 12), \"Bear Stearns Fails\"),\n",
    "    (datetime(2008, 9, 15), \"Lehman Bankruptcy\")\n",
    "]\n",
    "\n",
    "for date, label in crisis_data:\n",
    "    ax.annotate(label, xy=(date, spx.asof(date) + 75),\n",
    "                xytext=(date, spx.asof(date) + 225),\n",
    "                arrowprops=dict(facecolor=\"black\", headwidth=4, width=2,\n",
    "                                headlength=4),\n",
    "                horizontalalignment=\"left\", verticalalignment=\"top\")\n",
    "\n",
    "# Zoom in on 2007-2010\n",
    "ax.set_xlim([\"1/1/2007\", \"1/1/2011\"])\n",
    "ax.set_ylim([600, 1800])\n",
    "\n",
    "ax.set_title(\"Important dates in the 2008\u20132009 financial crisis\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax.set_title(\"Important dates in the 2008\u20132009 financial crisis\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color=\"black\", alpha=0.3)\n",
    "circ = plt.Circle((0.7, 0.2), 0.15, color=\"blue\", alpha=0.3)\n",
    "pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n",
    "                   color=\"green\", alpha=0.5)\n",
    "ax.add_patch(rect)\n",
    "ax.add_patch(circ)\n",
    "ax.add_patch(pgon)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series(np.random.standard_normal(10).cumsum(), index=np.arange(0, 100, 10))\n",
    "s.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.standard_normal((10, 4)).cumsum(0),\n",
    "                  columns=[\"A\", \"B\", \"C\", \"D\"],\n",
    "                  index=np.arange(0, 100, 10))\n",
    "plt.style.use('grayscale')\n",
    "df.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 1)\n",
    "data = pd.Series(np.random.uniform(size=16), index=list(\"abcdefghijklmnop\"))\n",
    "data.plot.bar(ax=axes[0], color=\"black\", alpha=0.7)\n",
    "data.plot.barh(ax=axes[1], color=\"black\", alpha=0.7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(12348)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.uniform(size=(6, 4)),\n",
    "                  index=[\"one\", \"two\", \"three\", \"four\", \"five\", \"six\"],\n",
    "                  columns=pd.Index([\"A\", \"B\", \"C\", \"D\"], name=\"Genus\"))\n",
    "df\n",
    "df.plot.bar()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.plot.barh(stacked=True, alpha=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "tips = pd.read_csv(\"examples/tips.csv\")\n",
    "tips.head()\n",
    "party_counts = pd.crosstab(tips[\"day\"], tips[\"size\"])\n",
    "party_counts = party_counts.reindex(index=[\"Thur\", \"Fri\", \"Sat\", \"Sun\"])\n",
    "party_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "party_counts = party_counts.loc[:, 2:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize to sum to 1\n",
    "party_pcts = party_counts.div(party_counts.sum(axis=\"columns\"),\n",
    "                              axis=\"index\")\n",
    "party_pcts\n",
    "party_pcts.plot.bar(stacked=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "\n",
    "tips[\"tip_pct\"] = tips[\"tip\"] / (tips[\"total_bill\"] - tips[\"tip\"])\n",
    "tips.head()\n",
    "sns.barplot(x=\"tip_pct\", y=\"day\", data=tips, orient=\"h\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.barplot(x=\"tip_pct\", y=\"day\", hue=\"time\", data=tips, orient=\"h\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.close(\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_style(\"whitegrid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "tips[\"tip_pct\"].plot.hist(bins=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "tips[\"tip_pct\"].plot.density()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "comp1 = np.random.standard_normal(200)\n",
    "comp2 = 10 + 2 * np.random.standard_normal(200)\n",
    "values = pd.Series(np.concatenate([comp1, comp2]))\n",
    "\n",
    "sns.histplot(values, bins=100, color=\"black\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "macro = pd.read_csv(\"examples/macrodata.csv\")\n",
    "data = macro[[\"cpi\", \"m1\", \"tbilrate\", \"unemp\"]]\n",
    "trans_data = np.log(data).diff().dropna()\n",
    "trans_data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "ax = sns.regplot(x=\"m1\", y=\"unemp\", data=trans_data)\n",
    "ax.set_title(\"Changes in log(m1) versus log(unemp)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.pairplot(trans_data, diag_kind=\"kde\", plot_kws={\"alpha\": 0.2})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.catplot(x=\"day\", y=\"tip_pct\", hue=\"time\", col=\"smoker\",\n",
    "            kind=\"bar\", data=tips[tips.tip_pct < 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.catplot(x=\"day\", y=\"tip_pct\", row=\"time\",\n",
    "            col=\"smoker\",\n",
    "            kind=\"bar\", data=tips[tips.tip_pct < 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.catplot(x=\"tip_pct\", y=\"day\", kind=\"box\",\n",
    "            data=tips[tips.tip_pct < 0.5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

================================================
FILE: ch10.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
    "pd.options.display.max_columns = 20\n",
    "pd.options.display.max_rows = 20\n",
    "pd.options.display.max_colwidth = 80\n",
    "np.random.seed(12345)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc(\"figure\", figsize=(10, 6))\n",
    "np.set_printoptions(precision=4, suppress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"key1\" : [\"a\", \"a\", None, \"b\", \"b\", \"a\", None],\n",
    "                   \"key2\" : pd.Series([1, 2, 1, 2, 1, None, 1],\n",
    "                                      dtype=\"Int64\"),\n",
    "                   \"data1\" : np.random.standard_normal(7),\n",
    "                   \"data2\" : np.random.standard_normal(7)})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = df[\"data1\"].groupby(df[\"key1\"])\n",
    "grouped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "means = df[\"data1\"].groupby([df[\"key1\"], df[\"key2\"]]).mean()\n",
    "means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "means.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "states = np.array([\"OH\", \"CA\", \"CA\", \"OH\", \"OH\", \"CA\", \"OH\"])\n",
    "years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]\n",
    "df[\"data1\"].groupby([states, years]).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(\"key1\").mean()\n",
    "df.groupby(\"key2\").mean(numeric_only=True)\n",
    "df.groupby([\"key1\", \"key2\"]).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby([\"key1\", \"key2\"]).size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(\"key1\", dropna=False).size()\n",
    "df.groupby([\"key1\", \"key2\"], dropna=False).size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(\"key1\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, group in df.groupby(\"key1\"):\n",
    "    print(name)\n",
    "    print(group)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "for (k1, k2), group in df.groupby([\"key1\", \"key2\"]):\n",
    "    print((k1, k2))\n",
    "    print(group)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "pieces = {name: group for name, group in df.groupby(\"key1\")}\n",
    "pieces[\"b\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = df.groupby({\"key1\": \"key\", \"key2\": \"key\",\n",
    "                      \"data1\": \"data\", \"data2\": \"data\"}, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for group_key, group_values in grouped:\n",
    "    print(group_key)\n",
    "    print(group_values)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby([\"key1\", \"key2\"])[[\"data2\"]].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "s_grouped = df.groupby([\"key1\", \"key2\"])[\"data2\"]\n",
    "s_grouped\n",
    "s_grouped.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "people = pd.DataFrame(np.random.standard_normal((5, 5)),\n",
    "                      columns=[\"a\", \"b\", \"c\", \"d\", \"e\"],\n",
    "                      index=[\"Joe\", \"Steve\", \"Wanda\", \"Jill\", \"Trey\"])\n",
    "people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values\n",
    "people"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {\"a\": \"red\", \"b\": \"red\", \"c\": \"blue\",\n",
    "           \"d\": \"blue\", \"e\": \"red\", \"f\" : \"orange\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "by_column = people.groupby(mapping, axis=\"columns\")\n",
    "by_column.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "map_series = pd.Series(mapping)\n",
    "map_series\n",
    "people.groupby(map_series, axis=\"columns\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "people.groupby(len).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "key_list = [\"one\", \"one\", \"one\", \"two\", \"two\"]\n",
    "people.groupby([len, key_list]).min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = pd.MultiIndex.from_arrays([[\"US\", \"US\", \"US\", \"JP\", \"JP\"],\
Download .txt
gitextract_mai23lqn/

├── .gitignore
├── COPYING
├── README.md
├── appa.ipynb
├── appb.ipynb
├── ch02.ipynb
├── ch03.ipynb
├── ch04.ipynb
├── ch05.ipynb
├── ch06.ipynb
├── ch07.ipynb
├── ch08.ipynb
├── ch09.ipynb
├── ch10.ipynb
├── ch11.ipynb
├── ch12.ipynb
├── ch13.ipynb
├── datasets/
│   ├── babynames/
│   │   ├── yob1880.txt
│   │   ├── yob1881.txt
│   │   ├── yob1882.txt
│   │   ├── yob1883.txt
│   │   ├── yob1884.txt
│   │   ├── yob1885.txt
│   │   ├── yob1886.txt
│   │   ├── yob1887.txt
│   │   ├── yob1888.txt
│   │   ├── yob1889.txt
│   │   ├── yob1890.txt
│   │   ├── yob1891.txt
│   │   ├── yob1892.txt
│   │   ├── yob1893.txt
│   │   ├── yob1894.txt
│   │   ├── yob1895.txt
│   │   ├── yob1896.txt
│   │   ├── yob1897.txt
│   │   ├── yob1898.txt
│   │   ├── yob1899.txt
│   │   ├── yob1900.txt
│   │   ├── yob1901.txt
│   │   ├── yob1902.txt
│   │   ├── yob1903.txt
│   │   ├── yob1904.txt
│   │   ├── yob1905.txt
│   │   ├── yob1906.txt
│   │   ├── yob1907.txt
│   │   ├── yob1908.txt
│   │   ├── yob1909.txt
│   │   ├── yob1910.txt
│   │   ├── yob1911.txt
│   │   ├── yob1912.txt
│   │   ├── yob1913.txt
│   │   ├── yob1914.txt
│   │   ├── yob1915.txt
│   │   ├── yob1916.txt
│   │   ├── yob1917.txt
│   │   ├── yob1918.txt
│   │   ├── yob1919.txt
│   │   ├── yob1920.txt
│   │   ├── yob1921.txt
│   │   ├── yob1922.txt
│   │   ├── yob1923.txt
│   │   ├── yob1924.txt
│   │   ├── yob1925.txt
│   │   ├── yob1926.txt
│   │   ├── yob1927.txt
│   │   ├── yob1928.txt
│   │   ├── yob1929.txt
│   │   ├── yob1930.txt
│   │   ├── yob1931.txt
│   │   ├── yob1932.txt
│   │   ├── yob1933.txt
│   │   ├── yob1934.txt
│   │   ├── yob1935.txt
│   │   ├── yob1936.txt
│   │   ├── yob1937.txt
│   │   ├── yob1938.txt
│   │   ├── yob1939.txt
│   │   ├── yob1940.txt
│   │   ├── yob1941.txt
│   │   ├── yob1942.txt
│   │   ├── yob1943.txt
│   │   ├── yob1944.txt
│   │   ├── yob1945.txt
│   │   ├── yob1946.txt
│   │   ├── yob1947.txt
│   │   ├── yob1948.txt
│   │   ├── yob1949.txt
│   │   ├── yob1950.txt
│   │   ├── yob1951.txt
│   │   ├── yob1952.txt
│   │   ├── yob1953.txt
│   │   ├── yob1954.txt
│   │   ├── yob1955.txt
│   │   ├── yob1956.txt
│   │   ├── yob1957.txt
│   │   ├── yob1958.txt
│   │   ├── yob1959.txt
│   │   ├── yob1960.txt
│   │   ├── yob1961.txt
│   │   ├── yob1962.txt
│   │   ├── yob1963.txt
│   │   ├── yob1964.txt
│   │   ├── yob1965.txt
│   │   ├── yob1966.txt
│   │   ├── yob1967.txt
│   │   ├── yob1968.txt
│   │   ├── yob1969.txt
│   │   ├── yob1970.txt
│   │   ├── yob1971.txt
│   │   ├── yob1972.txt
│   │   ├── yob1973.txt
│   │   ├── yob1974.txt
│   │   ├── yob1975.txt
│   │   ├── yob1976.txt
│   │   ├── yob1977.txt
│   │   ├── yob1978.txt
│   │   ├── yob1979.txt
│   │   ├── yob1980.txt
│   │   ├── yob1981.txt
│   │   ├── yob1982.txt
│   │   ├── yob1983.txt
│   │   ├── yob1984.txt
│   │   ├── yob1985.txt
│   │   ├── yob1986.txt
│   │   ├── yob1987.txt
│   │   ├── yob1988.txt
│   │   ├── yob1989.txt
│   │   ├── yob1990.txt
│   │   ├── yob1991.txt
│   │   ├── yob1992.txt
│   │   ├── yob1993.txt
│   │   ├── yob1994.txt
│   │   ├── yob1995.txt
│   │   ├── yob1996.txt
│   │   ├── yob1997.txt
│   │   ├── yob1998.txt
│   │   ├── yob1999.txt
│   │   ├── yob2000.txt
│   │   ├── yob2001.txt
│   │   ├── yob2002.txt
│   │   ├── yob2003.txt
│   │   ├── yob2004.txt
│   │   ├── yob2005.txt
│   │   ├── yob2006.txt
│   │   ├── yob2007.txt
│   │   ├── yob2008.txt
│   │   ├── yob2009.txt
│   │   └── yob2010.txt
│   ├── bitly_usagov/
│   │   └── example.txt
│   ├── fec/
│   │   ├── P00000001-ALL.csv
│   │   └── fec.parquet
│   ├── haiti/
│   │   ├── Haiti.csv
│   │   └── PortAuPrince_Roads/
│   │       ├── PortAuPrince_Roads.dbf
│   │       ├── PortAuPrince_Roads.prj
│   │       ├── PortAuPrince_Roads.sbn
│   │       ├── PortAuPrince_Roads.sbx
│   │       ├── PortAuPrince_Roads.shp
│   │       ├── PortAuPrince_Roads.shx
│   │       └── PortAuPrince_Roads_README.txt
│   ├── movielens/
│   │   └── README
│   ├── mta_perf/
│   │   ├── Performance_LIBUS.xml
│   │   ├── Performance_LIBUS.xsd
│   │   ├── Performance_LIRR.xml
│   │   ├── Performance_LIRR.xsd
│   │   ├── Performance_MNR.xml
│   │   ├── Performance_MNR.xsd
│   │   ├── Performance_MTABUS.xml
│   │   ├── Performance_MTABUS.xsd
│   │   ├── Performance_NYCT.xml
│   │   ├── Performance_NYCT.xsd
│   │   ├── Performance_TBTA.xml
│   │   ├── Performance_TBTA.xsd
│   │   └── parse.py
│   ├── titanic/
│   │   ├── genderclassmodel.csv
│   │   ├── gendermodel.csv
│   │   ├── test.csv
│   │   └── train.csv
│   └── usda_food/
│       └── database.json
├── examples/
│   ├── array_ex.txt
│   ├── csv_mindex.csv
│   ├── ex1.csv
│   ├── ex1.xlsx
│   ├── ex2.csv
│   ├── ex3.txt
│   ├── ex4.csv
│   ├── ex5.csv
│   ├── ex6.csv
│   ├── ex7.csv
│   ├── example.json
│   ├── fdic_failed_bank_list.html
│   ├── ipython_bug.py
│   ├── macrodata.csv
│   ├── segismundo.txt
│   ├── spx.csv
│   ├── stock_px.csv
│   ├── test_file.csv
│   ├── tips.csv
│   ├── tseries.csv
│   ├── volume.csv
│   ├── yahoo_price.pkl
│   └── yahoo_volume.pkl
├── pyproject.toml
└── requirements.txt
Copy disabled (too large) Download .json
Condensed preview — 203 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (18,251K chars).
[
  {
    "path": ".gitignore",
    "chars": 18,
    "preview": ".ipynb_checkpoints"
  },
  {
    "path": "COPYING",
    "chars": 1138,
    "preview": "Code examples from \"Python for Data Analysis\", 3rd Edition\n\nThe MIT License (MIT)\n\nCopyright (c) 2022 Wes McKinney\n\nPerm"
  },
  {
    "path": "README.md",
    "chars": 3750,
    "preview": "# Python for Data Analysis, 3rd Edition\n\nMaterials and IPython notebooks for \"Python for Data Analysis, 3rd\nEdition\" by "
  },
  {
    "path": "appa.ipynb",
    "chars": 18320,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "appb.ipynb",
    "chars": 1868,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch02.ipynb",
    "chars": 12089,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch03.ipynb",
    "chars": 25308,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n"
  },
  {
    "path": "ch04.ipynb",
    "chars": 22417,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch05.ipynb",
    "chars": 29194,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch06.ipynb",
    "chars": 16500,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch07.ipynb",
    "chars": 26450,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch08.ipynb",
    "chars": 20734,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch09.ipynb",
    "chars": 14368,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch10.ipynb",
    "chars": 21850,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch11.ipynb",
    "chars": 27066,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch12.ipynb",
    "chars": 11519,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "ch13.ipynb",
    "chars": 31243,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "datasets/babynames/yob1880.txt",
    "chars": 24933,
    "preview": "Mary,F,7065\r\nAnna,F,2604\r\nEmma,F,2003\r\nElizabeth,F,1939\r\nMinnie,F,1746\r\nMargaret,F,1578\r\nIda,F,1472\r\nAlice,F,1414\r\nBerth"
  },
  {
    "path": "datasets/babynames/yob1881.txt",
    "chars": 24065,
    "preview": "Mary,F,6919\r\nAnna,F,2698\r\nEmma,F,2034\r\nElizabeth,F,1852\r\nMargaret,F,1658\r\nMinnie,F,1653\r\nIda,F,1439\r\nAnnie,F,1326\r\nBerth"
  },
  {
    "path": "datasets/babynames/yob1882.txt",
    "chars": 26559,
    "preview": "Mary,F,8149\r\nAnna,F,3143\r\nEmma,F,2303\r\nElizabeth,F,2187\r\nMinnie,F,2004\r\nMargaret,F,1821\r\nIda,F,1673\r\nAlice,F,1542\r\nBerth"
  },
  {
    "path": "datasets/babynames/yob1883.txt",
    "chars": 26003,
    "preview": "Mary,F,8012\r\nAnna,F,3306\r\nEmma,F,2367\r\nElizabeth,F,2255\r\nMinnie,F,2035\r\nMargaret,F,1881\r\nBertha,F,1681\r\nIda,F,1634\r\nAnni"
  },
  {
    "path": "datasets/babynames/yob1884.txt",
    "chars": 28670,
    "preview": "Mary,F,9217\r\nAnna,F,3860\r\nEmma,F,2587\r\nElizabeth,F,2549\r\nMinnie,F,2243\r\nMargaret,F,2143\r\nIda,F,1882\r\nClara,F,1852\r\nBerth"
  },
  {
    "path": "datasets/babynames/yob1885.txt",
    "chars": 28625,
    "preview": "Mary,F,9128\r\nAnna,F,3994\r\nEmma,F,2728\r\nElizabeth,F,2582\r\nMargaret,F,2204\r\nMinnie,F,2178\r\nClara,F,1910\r\nBertha,F,1860\r\nId"
  },
  {
    "path": "datasets/babynames/yob1886.txt",
    "chars": 29822,
    "preview": "Mary,F,9891\r\nAnna,F,4283\r\nEmma,F,2764\r\nElizabeth,F,2680\r\nMinnie,F,2372\r\nMargaret,F,2275\r\nIda,F,2049\r\nBertha,F,2001\r\nClar"
  },
  {
    "path": "datasets/babynames/yob1887.txt",
    "chars": 29532,
    "preview": "Mary,F,9888\r\nAnna,F,4227\r\nElizabeth,F,2681\r\nEmma,F,2647\r\nMargaret,F,2419\r\nMinnie,F,2215\r\nBertha,F,2037\r\nClara,F,1984\r\nFl"
  },
  {
    "path": "datasets/babynames/yob1888.txt",
    "chars": 33065,
    "preview": "Mary,F,11754\r\nAnna,F,4982\r\nElizabeth,F,3224\r\nEmma,F,3087\r\nMargaret,F,2904\r\nMinnie,F,2654\r\nBertha,F,2450\r\nFlorence,F,2444"
  },
  {
    "path": "datasets/babynames/yob1889.txt",
    "chars": 32297,
    "preview": "Mary,F,11649\r\nAnna,F,5062\r\nElizabeth,F,3058\r\nMargaret,F,2917\r\nEmma,F,2884\r\nMinnie,F,2624\r\nFlorence,F,2465\r\nEthel,F,2463\r"
  },
  {
    "path": "datasets/babynames/yob1890.txt",
    "chars": 33621,
    "preview": "Mary,F,12078\r\nAnna,F,5233\r\nElizabeth,F,3112\r\nMargaret,F,3100\r\nEmma,F,2980\r\nFlorence,F,2745\r\nEthel,F,2718\r\nMinnie,F,2650\r"
  },
  {
    "path": "datasets/babynames/yob1891.txt",
    "chars": 33186,
    "preview": "Mary,F,11704\r\nAnna,F,5099\r\nMargaret,F,3066\r\nElizabeth,F,3059\r\nEmma,F,2884\r\nFlorence,F,2715\r\nEthel,F,2689\r\nMinnie,F,2428\r"
  },
  {
    "path": "datasets/babynames/yob1892.txt",
    "chars": 36542,
    "preview": "Mary,F,13174\r\nAnna,F,5542\r\nElizabeth,F,3461\r\nMargaret,F,3435\r\nRuth,F,3291\r\nFlorence,F,3154\r\nEmma,F,3128\r\nEthel,F,3035\r\nH"
  },
  {
    "path": "datasets/babynames/yob1893.txt",
    "chars": 35434,
    "preview": "Mary,F,12784\r\nAnna,F,5695\r\nRuth,F,3658\r\nMargaret,F,3565\r\nElizabeth,F,3361\r\nHelen,F,3249\r\nFlorence,F,3231\r\nEthel,F,3119\r\n"
  },
  {
    "path": "datasets/babynames/yob1894.txt",
    "chars": 36817,
    "preview": "Mary,F,13151\r\nAnna,F,5565\r\nMargaret,F,3701\r\nHelen,F,3676\r\nElizabeth,F,3425\r\nRuth,F,3372\r\nEthel,F,3287\r\nFlorence,F,3233\r\n"
  },
  {
    "path": "datasets/babynames/yob1895.txt",
    "chars": 38232,
    "preview": "Mary,F,13446\r\nAnna,F,5949\r\nHelen,F,4023\r\nMargaret,F,3931\r\nElizabeth,F,3603\r\nRuth,F,3551\r\nFlorence,F,3471\r\nEthel,F,3391\r\n"
  },
  {
    "path": "datasets/babynames/yob1896.txt",
    "chars": 38747,
    "preview": "Mary,F,13811\r\nAnna,F,5860\r\nHelen,F,4392\r\nMargaret,F,4051\r\nRuth,F,3905\r\nEthel,F,3502\r\nElizabeth,F,3471\r\nFlorence,F,3323\r\n"
  },
  {
    "path": "datasets/babynames/yob1897.txt",
    "chars": 37936,
    "preview": "Mary,F,13412\r\nAnna,F,5429\r\nHelen,F,4518\r\nMargaret,F,4145\r\nRuth,F,3878\r\nElizabeth,F,3442\r\nFlorence,F,3369\r\nEthel,F,3182\r\n"
  },
  {
    "path": "datasets/babynames/yob1898.txt",
    "chars": 40937,
    "preview": "Mary,F,14406\r\nAnna,F,5773\r\nHelen,F,5230\r\nMargaret,F,4696\r\nRuth,F,4249\r\nFlorence,F,3790\r\nElizabeth,F,3659\r\nEthel,F,3531\r\n"
  },
  {
    "path": "datasets/babynames/yob1899.txt",
    "chars": 38141,
    "preview": "Mary,F,13172\r\nAnna,F,5115\r\nHelen,F,5048\r\nMargaret,F,4249\r\nRuth,F,3912\r\nFlorence,F,3314\r\nElizabeth,F,3287\r\nMarie,F,3156\r\n"
  },
  {
    "path": "datasets/babynames/yob1900.txt",
    "chars": 46898,
    "preview": "Mary,F,16710\r\nHelen,F,6343\r\nAnna,F,6115\r\nMargaret,F,5305\r\nRuth,F,4765\r\nElizabeth,F,4097\r\nFlorence,F,3920\r\nEthel,F,3896\r\n"
  },
  {
    "path": "datasets/babynames/yob1901.txt",
    "chars": 39584,
    "preview": "Mary,F,13137\r\nHelen,F,5247\r\nAnna,F,4923\r\nMargaret,F,4424\r\nRuth,F,3974\r\nElizabeth,F,3216\r\nMarie,F,3157\r\nFlorence,F,3131\r\n"
  },
  {
    "path": "datasets/babynames/yob1902.txt",
    "chars": 42284,
    "preview": "Mary,F,14485\r\nHelen,F,5967\r\nAnna,F,5288\r\nMargaret,F,5011\r\nRuth,F,4384\r\nElizabeth,F,3694\r\nFlorence,F,3509\r\nMarie,F,3423\r\n"
  },
  {
    "path": "datasets/babynames/yob1903.txt",
    "chars": 42679,
    "preview": "Mary,F,14275\r\nHelen,F,6129\r\nAnna,F,5098\r\nMargaret,F,5046\r\nRuth,F,4518\r\nElizabeth,F,3723\r\nMarie,F,3469\r\nFlorence,F,3468\r\n"
  },
  {
    "path": "datasets/babynames/yob1904.txt",
    "chars": 44819,
    "preview": "Mary,F,14962\r\nHelen,F,6489\r\nAnna,F,5330\r\nMargaret,F,5302\r\nRuth,F,4900\r\nElizabeth,F,3833\r\nMarie,F,3595\r\nFlorence,F,3572\r\n"
  },
  {
    "path": "datasets/babynames/yob1905.txt",
    "chars": 46015,
    "preview": "Mary,F,16067\r\nHelen,F,6811\r\nMargaret,F,5690\r\nAnna,F,5424\r\nRuth,F,5068\r\nElizabeth,F,4122\r\nDorothy,F,3939\r\nMildred,F,3758\r"
  },
  {
    "path": "datasets/babynames/yob1906.txt",
    "chars": 45845,
    "preview": "Mary,F,16371\r\nHelen,F,7176\r\nMargaret,F,6096\r\nAnna,F,5502\r\nRuth,F,5140\r\nDorothy,F,4326\r\nElizabeth,F,4321\r\nAlice,F,4192\r\nM"
  },
  {
    "path": "datasets/babynames/yob1907.txt",
    "chars": 49829,
    "preview": "Mary,F,17579\r\nHelen,F,7579\r\nMargaret,F,6713\r\nAnna,F,5575\r\nRuth,F,5573\r\nDorothy,F,4966\r\nElizabeth,F,4623\r\nMildred,F,4277\r"
  },
  {
    "path": "datasets/babynames/yob1908.txt",
    "chars": 50841,
    "preview": "Mary,F,18664\r\nHelen,F,8436\r\nMargaret,F,6976\r\nRuth,F,6179\r\nAnna,F,5858\r\nDorothy,F,5703\r\nElizabeth,F,4904\r\nMildred,F,4624\r"
  },
  {
    "path": "datasets/babynames/yob1909.txt",
    "chars": 53488,
    "preview": "Mary,F,19256\r\nHelen,F,9249\r\nMargaret,F,7358\r\nRuth,F,6508\r\nDorothy,F,6252\r\nAnna,F,5803\r\nElizabeth,F,5175\r\nMildred,F,5053\r"
  },
  {
    "path": "datasets/babynames/yob1910.txt",
    "chars": 58699,
    "preview": "Mary,F,22840\r\nHelen,F,10477\r\nMargaret,F,8226\r\nDorothy,F,7315\r\nRuth,F,7210\r\nAnna,F,6434\r\nElizabeth,F,5799\r\nMildred,F,5692"
  },
  {
    "path": "datasets/babynames/yob1911.txt",
    "chars": 61771,
    "preview": "Mary,F,24385\r\nHelen,F,11799\r\nMargaret,F,9276\r\nDorothy,F,8869\r\nRuth,F,7999\r\nAnna,F,6752\r\nElizabeth,F,6296\r\nMildred,F,6269"
  },
  {
    "path": "datasets/babynames/yob1912.txt",
    "chars": 80705,
    "preview": "Mary,F,32295\r\nHelen,F,16130\r\nDorothy,F,12642\r\nMargaret,F,12532\r\nRuth,F,11274\r\nMildred,F,8759\r\nAnna,F,8585\r\nElizabeth,F,8"
  },
  {
    "path": "datasets/babynames/yob1913.txt",
    "chars": 88666,
    "preview": "Mary,F,36631\r\nHelen,F,18879\r\nDorothy,F,14670\r\nMargaret,F,14484\r\nRuth,F,12601\r\nMildred,F,9917\r\nAnna,F,9681\r\nElizabeth,F,9"
  },
  {
    "path": "datasets/babynames/yob1914.txt",
    "chars": 101529,
    "preview": "Mary,F,45341\r\nHelen,F,23218\r\nDorothy,F,18779\r\nMargaret,F,17756\r\nRuth,F,15835\r\nAnna,F,11864\r\nMildred,F,11648\r\nElizabeth,F"
  },
  {
    "path": "datasets/babynames/yob1915.txt",
    "chars": 119483,
    "preview": "Mary,F,58184\r\nHelen,F,30865\r\nDorothy,F,25150\r\nMargaret,F,23052\r\nRuth,F,21881\r\nMildred,F,15248\r\nAnna,F,15122\r\nElizabeth,F"
  },
  {
    "path": "datasets/babynames/yob1916.txt",
    "chars": 123879,
    "preview": "Mary,F,61428\r\nHelen,F,32662\r\nDorothy,F,27415\r\nMargaret,F,24954\r\nRuth,F,23185\r\nMildred,F,15636\r\nAnna,F,15225\r\nElizabeth,F"
  },
  {
    "path": "datasets/babynames/yob1917.txt",
    "chars": 126782,
    "preview": "Mary,F,64270\r\nHelen,F,34241\r\nDorothy,F,28851\r\nMargaret,F,25561\r\nRuth,F,23562\r\nMildred,F,16258\r\nAnna,F,15161\r\nElizabeth,F"
  },
  {
    "path": "datasets/babynames/yob1918.txt",
    "chars": 133101,
    "preview": "Mary,F,67375\r\nHelen,F,36152\r\nDorothy,F,32031\r\nMargaret,F,27136\r\nRuth,F,25538\r\nMildred,F,17252\r\nVirginia,F,16419\r\nFrances"
  },
  {
    "path": "datasets/babynames/yob1919.txt",
    "chars": 132726,
    "preview": "Mary,F,65839\r\nHelen,F,33705\r\nDorothy,F,31733\r\nMargaret,F,26240\r\nRuth,F,24569\r\nMildred,F,17300\r\nVirginia,F,15636\r\nElizabe"
  },
  {
    "path": "datasets/babynames/yob1920.txt",
    "chars": 137750,
    "preview": "Mary,F,70974\r\nDorothy,F,36646\r\nHelen,F,35093\r\nMargaret,F,27998\r\nRuth,F,26105\r\nMildred,F,18064\r\nVirginia,F,17312\r\nElizabe"
  },
  {
    "path": "datasets/babynames/yob1921.txt",
    "chars": 139183,
    "preview": "Mary,F,73980\r\nDorothy,F,39082\r\nHelen,F,34812\r\nMargaret,F,28462\r\nRuth,F,25780\r\nVirginia,F,19025\r\nMildred,F,17877\r\nBetty,F"
  },
  {
    "path": "datasets/babynames/yob1922.txt",
    "chars": 138200,
    "preview": "Mary,F,72156\r\nDorothy,F,37711\r\nHelen,F,32501\r\nMargaret,F,26858\r\nRuth,F,23622\r\nBetty,F,20897\r\nVirginia,F,19145\r\nMildred,F"
  },
  {
    "path": "datasets/babynames/yob1923.txt",
    "chars": 136768,
    "preview": "Mary,F,71636\r\nDorothy,F,39023\r\nHelen,F,31493\r\nMargaret,F,26131\r\nBetty,F,25992\r\nRuth,F,23638\r\nVirginia,F,18331\r\nMildred,F"
  },
  {
    "path": "datasets/babynames/yob1924.txt",
    "chars": 139518,
    "preview": "Mary,F,73505\r\nDorothy,F,40005\r\nHelen,F,31199\r\nBetty,F,30595\r\nMargaret,F,26551\r\nRuth,F,23604\r\nVirginia,F,18622\r\nMildred,F"
  },
  {
    "path": "datasets/babynames/yob1925.txt",
    "chars": 136831,
    "preview": "Mary,F,70594\r\nDorothy,F,38571\r\nBetty,F,32816\r\nHelen,F,29159\r\nMargaret,F,24461\r\nRuth,F,22257\r\nVirginia,F,17487\r\nDoris,F,1"
  },
  {
    "path": "datasets/babynames/yob1926.txt",
    "chars": 134326,
    "preview": "Mary,F,67830\r\nDorothy,F,36608\r\nBetty,F,32947\r\nHelen,F,26872\r\nMargaret,F,23061\r\nRuth,F,20199\r\nDoris,F,16298\r\nVirginia,F,1"
  },
  {
    "path": "datasets/babynames/yob1927.txt",
    "chars": 133892,
    "preview": "Mary,F,70594\r\nDorothy,F,35980\r\nBetty,F,35414\r\nHelen,F,25302\r\nMargaret,F,21972\r\nRuth,F,19411\r\nDoris,F,16515\r\nVirginia,F,1"
  },
  {
    "path": "datasets/babynames/yob1928.txt",
    "chars": 130726,
    "preview": "Mary,F,66867\r\nBetty,F,36074\r\nDorothy,F,33726\r\nHelen,F,22922\r\nMargaret,F,20290\r\nRuth,F,17854\r\nDoris,F,16564\r\nBarbara,F,14"
  },
  {
    "path": "datasets/babynames/yob1929.txt",
    "chars": 126316,
    "preview": "Mary,F,63502\r\nBetty,F,36665\r\nDorothy,F,31467\r\nHelen,F,20994\r\nMargaret,F,19196\r\nDoris,F,16488\r\nBarbara,F,16033\r\nRuth,F,16"
  },
  {
    "path": "datasets/babynames/yob1930.txt",
    "chars": 125892,
    "preview": "Mary,F,64110\r\nBetty,F,38239\r\nDorothy,F,30390\r\nHelen,F,19907\r\nMargaret,F,18353\r\nBarbara,F,18287\r\nPatricia,F,15746\r\nJoan,F"
  },
  {
    "path": "datasets/babynames/yob1931.txt",
    "chars": 119362,
    "preview": "Mary,F,60303\r\nBetty,F,36082\r\nDorothy,F,26511\r\nBarbara,F,21788\r\nJoan,F,19102\r\nHelen,F,17653\r\nMargaret,F,17324\r\nPatricia,F"
  },
  {
    "path": "datasets/babynames/yob1932.txt",
    "chars": 120786,
    "preview": "Mary,F,59859\r\nBetty,F,34411\r\nBarbara,F,26310\r\nDorothy,F,24972\r\nJoan,F,21043\r\nPatricia,F,17988\r\nMargaret,F,16537\r\nHelen,F"
  },
  {
    "path": "datasets/babynames/yob1933.txt",
    "chars": 115824,
    "preview": "Mary,F,55477\r\nBetty,F,31520\r\nBarbara,F,26946\r\nDorothy,F,22039\r\nJoan,F,19287\r\nPatricia,F,18622\r\nMargaret,F,15238\r\nHelen,F"
  },
  {
    "path": "datasets/babynames/yob1934.txt",
    "chars": 118031,
    "preview": "Mary,F,56898\r\nBetty,F,31082\r\nBarbara,F,29222\r\nShirley,F,22837\r\nDorothy,F,21283\r\nPatricia,F,20844\r\nJoan,F,19463\r\nMargaret"
  },
  {
    "path": "datasets/babynames/yob1935.txt",
    "chars": 116221,
    "preview": "Mary,F,55055\r\nShirley,F,42343\r\nBarbara,F,30686\r\nBetty,F,28660\r\nPatricia,F,22876\r\nDorothy,F,19393\r\nJoan,F,18219\r\nMargaret"
  },
  {
    "path": "datasets/babynames/yob1936.txt",
    "chars": 114311,
    "preview": "Mary,F,54356\r\nShirley,F,35150\r\nBarbara,F,31672\r\nBetty,F,25862\r\nPatricia,F,23903\r\nDorothy,F,17665\r\nJoan,F,17065\r\nNancy,F,"
  },
  {
    "path": "datasets/babynames/yob1937.txt",
    "chars": 115279,
    "preview": "Mary,F,55635\r\nBarbara,F,34902\r\nPatricia,F,26838\r\nShirley,F,26808\r\nBetty,F,25313\r\nCarol,F,17331\r\nNancy,F,17063\r\nDorothy,F"
  },
  {
    "path": "datasets/babynames/yob1938.txt",
    "chars": 116279,
    "preview": "Mary,F,56199\r\nBarbara,F,39262\r\nPatricia,F,27547\r\nBetty,F,25497\r\nShirley,F,23762\r\nCarol,F,19421\r\nNancy,F,18943\r\nDorothy,F"
  },
  {
    "path": "datasets/babynames/yob1939.txt",
    "chars": 114819,
    "preview": "Mary,F,54903\r\nBarbara,F,37262\r\nPatricia,F,29699\r\nBetty,F,23637\r\nShirley,F,20442\r\nCarol,F,20164\r\nNancy,F,19723\r\nJudith,F,"
  },
  {
    "path": "datasets/babynames/yob1940.txt",
    "chars": 115631,
    "preview": "Mary,F,56203\r\nBarbara,F,36730\r\nPatricia,F,32658\r\nJudith,F,22376\r\nBetty,F,22067\r\nCarol,F,21757\r\nNancy,F,19730\r\nLinda,F,18"
  },
  {
    "path": "datasets/babynames/yob1941.txt",
    "chars": 117195,
    "preview": "Mary,F,58025\r\nBarbara,F,39534\r\nPatricia,F,36896\r\nCarol,F,24186\r\nLinda,F,23724\r\nJudith,F,23311\r\nBetty,F,20897\r\nNancy,F,20"
  },
  {
    "path": "datasets/babynames/yob1942.txt",
    "chars": 121600,
    "preview": "Mary,F,63236\r\nBarbara,F,44732\r\nPatricia,F,39454\r\nLinda,F,31607\r\nCarol,F,30266\r\nSandra,F,24988\r\nJudith,F,24785\r\nNancy,F,2"
  },
  {
    "path": "datasets/babynames/yob1943.txt",
    "chars": 121528,
    "preview": "Mary,F,66156\r\nBarbara,F,43422\r\nPatricia,F,39615\r\nLinda,F,38433\r\nCarol,F,31673\r\nSandra,F,25985\r\nJudith,F,25209\r\nSharon,F,"
  },
  {
    "path": "datasets/babynames/yob1944.txt",
    "chars": 118128,
    "preview": "Mary,F,62476\r\nBarbara,F,39186\r\nLinda,F,38408\r\nPatricia,F,36875\r\nCarol,F,30457\r\nSandra,F,26025\r\nNancy,F,23213\r\nJudith,F,2"
  },
  {
    "path": "datasets/babynames/yob1945.txt",
    "chars": 116618,
    "preview": "Mary,F,59285\r\nLinda,F,41485\r\nBarbara,F,38283\r\nPatricia,F,35849\r\nCarol,F,30404\r\nSandra,F,24703\r\nNancy,F,21467\r\nSharon,F,2"
  },
  {
    "path": "datasets/babynames/yob1946.txt",
    "chars": 125383,
    "preview": "Mary,F,67437\r\nLinda,F,52702\r\nPatricia,F,46272\r\nBarbara,F,45103\r\nCarol,F,34246\r\nSandra,F,31677\r\nNancy,F,28328\r\nSusan,F,28"
  },
  {
    "path": "datasets/babynames/yob1947.txt",
    "chars": 133917,
    "preview": "Linda,F,99651\r\nMary,F,71654\r\nPatricia,F,51269\r\nBarbara,F,48778\r\nSandra,F,34755\r\nCarol,F,33528\r\nNancy,F,32436\r\nSusan,F,31"
  },
  {
    "path": "datasets/babynames/yob1948.txt",
    "chars": 132465,
    "preview": "Linda,F,96185\r\nMary,F,68582\r\nBarbara,F,46811\r\nPatricia,F,46127\r\nSusan,F,35972\r\nSandra,F,31064\r\nNancy,F,29388\r\nCarol,F,28"
  },
  {
    "path": "datasets/babynames/yob1949.txt",
    "chars": 132602,
    "preview": "Linda,F,90952\r\nMary,F,66809\r\nPatricia,F,46315\r\nBarbara,F,42562\r\nSusan,F,37685\r\nSandra,F,30619\r\nNancy,F,29222\r\nCarol,F,26"
  },
  {
    "path": "datasets/babynames/yob1950.txt",
    "chars": 133205,
    "preview": "Linda,F,80408\r\nMary,F,65444\r\nPatricia,F,47910\r\nBarbara,F,41558\r\nSusan,F,38010\r\nNancy,F,29625\r\nDeborah,F,29073\r\nSandra,F,"
  },
  {
    "path": "datasets/babynames/yob1951.txt",
    "chars": 135349,
    "preview": "Linda,F,73858\r\nMary,F,65624\r\nPatricia,F,56394\r\nDeborah,F,42017\r\nBarbara,F,40529\r\nSusan,F,40195\r\nNancy,F,30317\r\nKaren,F,2"
  },
  {
    "path": "datasets/babynames/yob1952.txt",
    "chars": 137658,
    "preview": "Linda,F,67071\r\nMary,F,65699\r\nPatricia,F,53083\r\nDeborah,F,49796\r\nSusan,F,41343\r\nBarbara,F,39890\r\nNancy,F,31724\r\nKaren,F,2"
  },
  {
    "path": "datasets/babynames/yob1953.txt",
    "chars": 140015,
    "preview": "Mary,F,64325\r\nLinda,F,61243\r\nDeborah,F,52170\r\nPatricia,F,50979\r\nSusan,F,44264\r\nBarbara,F,38439\r\nDebra,F,36853\r\nNancy,F,3"
  },
  {
    "path": "datasets/babynames/yob1954.txt",
    "chars": 141564,
    "preview": "Mary,F,67988\r\nLinda,F,55378\r\nDeborah,F,54661\r\nPatricia,F,49139\r\nSusan,F,47156\r\nDebra,F,45877\r\nBarbara,F,36366\r\nKaren,F,3"
  },
  {
    "path": "datasets/babynames/yob1955.txt",
    "chars": 143552,
    "preview": "Mary,F,63159\r\nDeborah,F,52303\r\nLinda,F,51271\r\nDebra,F,50525\r\nSusan,F,47382\r\nPatricia,F,46203\r\nBarbara,F,33589\r\nKaren,F,3"
  },
  {
    "path": "datasets/babynames/yob1956.txt",
    "chars": 146612,
    "preview": "Mary,F,61752\r\nDebra,F,48303\r\nLinda,F,48067\r\nDeborah,F,47836\r\nSusan,F,46557\r\nPatricia,F,43333\r\nKaren,F,40045\r\nCynthia,F,3"
  },
  {
    "path": "datasets/babynames/yob1957.txt",
    "chars": 149179,
    "preview": "Mary,F,61090\r\nSusan,F,45946\r\nLinda,F,44497\r\nDebra,F,42715\r\nKaren,F,40585\r\nDeborah,F,40055\r\nCynthia,F,39305\r\nPatricia,F,3"
  },
  {
    "path": "datasets/babynames/yob1958.txt",
    "chars": 148618,
    "preview": "Mary,F,55832\r\nSusan,F,45158\r\nLinda,F,41895\r\nKaren,F,38449\r\nPatricia,F,37916\r\nDebra,F,35533\r\nDeborah,F,32942\r\nCynthia,F,3"
  },
  {
    "path": "datasets/babynames/yob1959.txt",
    "chars": 151965,
    "preview": "Mary,F,54469\r\nSusan,F,41602\r\nLinda,F,40407\r\nKaren,F,36763\r\nDonna,F,36459\r\nPatricia,F,35233\r\nDebra,F,31362\r\nCynthia,F,301"
  },
  {
    "path": "datasets/babynames/yob1960.txt",
    "chars": 154001,
    "preview": "Mary,F,51477\r\nSusan,F,39197\r\nLinda,F,37317\r\nKaren,F,36384\r\nDonna,F,34126\r\nLisa,F,33708\r\nPatricia,F,32108\r\nDebra,F,26738\r"
  },
  {
    "path": "datasets/babynames/yob1961.txt",
    "chars": 157443,
    "preview": "Mary,F,47651\r\nLisa,F,42685\r\nSusan,F,37517\r\nLinda,F,35558\r\nKaren,F,34684\r\nPatricia,F,28843\r\nDonna,F,28664\r\nCynthia,F,2437"
  },
  {
    "path": "datasets/babynames/yob1962.txt",
    "chars": 157847,
    "preview": "Lisa,F,46090\r\nMary,F,43488\r\nSusan,F,35743\r\nKaren,F,35185\r\nLinda,F,31458\r\nPatricia,F,26545\r\nDonna,F,25731\r\nCynthia,F,2402"
  },
  {
    "path": "datasets/babynames/yob1963.txt",
    "chars": 158855,
    "preview": "Lisa,F,56030\r\nMary,F,41553\r\nSusan,F,33991\r\nKaren,F,32483\r\nLinda,F,27708\r\nDonna,F,25393\r\nPatricia,F,25355\r\nLori,F,23904\r\n"
  },
  {
    "path": "datasets/babynames/yob1964.txt",
    "chars": 160451,
    "preview": "Lisa,F,54271\r\nMary,F,40978\r\nSusan,F,31511\r\nKaren,F,30235\r\nPatricia,F,26081\r\nKimberly,F,24125\r\nDonna,F,23803\r\nLinda,F,236"
  },
  {
    "path": "datasets/babynames/yob1965.txt",
    "chars": 155029,
    "preview": "Lisa,F,60265\r\nMary,F,34270\r\nKaren,F,32881\r\nKimberly,F,28831\r\nSusan,F,26329\r\nPatricia,F,23551\r\nDonna,F,19696\r\nLinda,F,193"
  },
  {
    "path": "datasets/babynames/yob1966.txt",
    "chars": 157349,
    "preview": "Lisa,F,56899\r\nKimberly,F,32226\r\nMary,F,28881\r\nMichelle,F,27149\r\nKaren,F,25442\r\nSusan,F,23767\r\nPatricia,F,20120\r\nTammy,F,"
  },
  {
    "path": "datasets/babynames/yob1967.txt",
    "chars": 160588,
    "preview": "Lisa,F,52428\r\nKimberly,F,33099\r\nMichelle,F,30813\r\nMary,F,25317\r\nSusan,F,22258\r\nKaren,F,21543\r\nAngela,F,19539\r\nTammy,F,18"
  },
  {
    "path": "datasets/babynames/yob1968.txt",
    "chars": 167356,
    "preview": "Lisa,F,49523\r\nMichelle,F,33206\r\nKimberly,F,31908\r\nJennifer,F,26841\r\nMelissa,F,21730\r\nMary,F,21722\r\nAngela,F,20661\r\nTammy"
  },
  {
    "path": "datasets/babynames/yob1969.txt",
    "chars": 177888,
    "preview": "Lisa,F,45036\r\nMichelle,F,34315\r\nJennifer,F,33699\r\nKimberly,F,33075\r\nMelissa,F,23024\r\nAmy,F,21468\r\nAngela,F,21047\r\nMary,F"
  },
  {
    "path": "datasets/babynames/yob1970.txt",
    "chars": 191360,
    "preview": "Jennifer,F,46151\r\nLisa,F,38951\r\nKimberly,F,34131\r\nMichelle,F,34046\r\nAmy,F,25209\r\nAngela,F,24921\r\nMelissa,F,23738\r\nTammy,"
  },
  {
    "path": "datasets/babynames/yob1971.txt",
    "chars": 197951,
    "preview": "Jennifer,F,56779\r\nMichelle,F,33157\r\nLisa,F,32906\r\nKimberly,F,30688\r\nAmy,F,26231\r\nAngela,F,25879\r\nMelissa,F,23868\r\nTammy,"
  },
  {
    "path": "datasets/babynames/yob1972.txt",
    "chars": 199410,
    "preview": "Jennifer,F,63603\r\nMichelle,F,29276\r\nLisa,F,27547\r\nKimberly,F,26293\r\nAmy,F,25868\r\nAngela,F,23554\r\nMelissa,F,22491\r\nStepha"
  },
  {
    "path": "datasets/babynames/yob1973.txt",
    "chars": 202912,
    "preview": "Jennifer,F,62447\r\nAmy,F,26962\r\nMichelle,F,26927\r\nKimberly,F,23530\r\nLisa,F,22660\r\nMelissa,F,22482\r\nAngela,F,20895\r\nHeathe"
  },
  {
    "path": "datasets/babynames/yob1974.txt",
    "chars": 210367,
    "preview": "Jennifer,F,63098\r\nAmy,F,29564\r\nMichelle,F,25825\r\nHeather,F,23187\r\nAngela,F,22795\r\nKimberly,F,22421\r\nMelissa,F,22165\r\nLis"
  },
  {
    "path": "datasets/babynames/yob1975.txt",
    "chars": 218922,
    "preview": "Jennifer,F,58176\r\nAmy,F,32246\r\nHeather,F,24303\r\nMelissa,F,24168\r\nAngela,F,23344\r\nMichelle,F,22663\r\nKimberly,F,20263\r\nLis"
  },
  {
    "path": "datasets/babynames/yob1976.txt",
    "chars": 224880,
    "preview": "Jennifer,F,59468\r\nAmy,F,31341\r\nMelissa,F,25092\r\nHeather,F,24198\r\nAngela,F,22046\r\nMichelle,F,19546\r\nKimberly,F,18963\r\nJes"
  },
  {
    "path": "datasets/babynames/yob1977.txt",
    "chars": 234872,
    "preview": "Jennifer,F,58951\r\nMelissa,F,26880\r\nAmy,F,26729\r\nJessica,F,24838\r\nHeather,F,23766\r\nAngela,F,20990\r\nMichelle,F,19539\r\nKimb"
  },
  {
    "path": "datasets/babynames/yob1978.txt",
    "chars": 235822,
    "preview": "Jennifer,F,56307\r\nMelissa,F,28323\r\nJessica,F,26102\r\nAmy,F,23219\r\nHeather,F,22265\r\nAmanda,F,20519\r\nAngela,F,20503\r\nSarah,"
  },
  {
    "path": "datasets/babynames/yob1979.txt",
    "chars": 246109,
    "preview": "Jennifer,F,56709\r\nMelissa,F,34045\r\nAmanda,F,31922\r\nJessica,F,27774\r\nAmy,F,21612\r\nSarah,F,21017\r\nHeather,F,20839\r\nAngela,"
  },
  {
    "path": "datasets/babynames/yob1980.txt",
    "chars": 251121,
    "preview": "Jennifer,F,58375\r\nAmanda,F,35817\r\nJessica,F,33914\r\nMelissa,F,31625\r\nSarah,F,25737\r\nHeather,F,19965\r\nNicole,F,19910\r\nAmy,"
  },
  {
    "path": "datasets/babynames/yob1981.txt",
    "chars": 251593,
    "preview": "Jennifer,F,57029\r\nJessica,F,42519\r\nAmanda,F,34366\r\nSarah,F,28163\r\nMelissa,F,28000\r\nAmy,F,20338\r\nNicole,F,20309\r\nStephani"
  },
  {
    "path": "datasets/babynames/yob1982.txt",
    "chars": 254584,
    "preview": "Jennifer,F,57096\r\nJessica,F,45425\r\nAmanda,F,34209\r\nSarah,F,28470\r\nMelissa,F,25851\r\nNicole,F,21699\r\nStephanie,F,20854\r\nEl"
  },
  {
    "path": "datasets/babynames/yob1983.txt",
    "chars": 250692,
    "preview": "Jennifer,F,54325\r\nJessica,F,45271\r\nAmanda,F,33735\r\nAshley,F,33280\r\nSarah,F,27214\r\nMelissa,F,23466\r\nNicole,F,22391\r\nSteph"
  },
  {
    "path": "datasets/babynames/yob1984.txt",
    "chars": 252513,
    "preview": "Jennifer,F,50546\r\nJessica,F,45841\r\nAshley,F,38757\r\nAmanda,F,33893\r\nSarah,F,25860\r\nStephanie,F,23012\r\nNicole,F,22256\r\nMel"
  },
  {
    "path": "datasets/babynames/yob1985.txt",
    "chars": 260173,
    "preview": "Jessica,F,48340\r\nAshley,F,46995\r\nJennifer,F,42645\r\nAmanda,F,39042\r\nSarah,F,24868\r\nStephanie,F,23233\r\nNicole,F,22957\r\nHea"
  },
  {
    "path": "datasets/babynames/yob1986.txt",
    "chars": 267762,
    "preview": "Jessica,F,52643\r\nAshley,F,49672\r\nAmanda,F,40513\r\nJennifer,F,36172\r\nSarah,F,28122\r\nStephanie,F,22632\r\nNicole,F,21284\r\nBri"
  },
  {
    "path": "datasets/babynames/yob1987.txt",
    "chars": 278085,
    "preview": "Jessica,F,55983\r\nAshley,F,54826\r\nAmanda,F,41784\r\nJennifer,F,32687\r\nSarah,F,27876\r\nStephanie,F,22395\r\nBrittany,F,22215\r\nN"
  },
  {
    "path": "datasets/babynames/yob1989.txt",
    "chars": 309368,
    "preview": "Jessica,F,47888\r\nAshley,F,47585\r\nBrittany,F,37786\r\nAmanda,F,36832\r\nSarah,F,27788\r\nSamantha,F,24794\r\nJennifer,F,23996\r\nSt"
  },
  {
    "path": "datasets/babynames/yob1991.txt",
    "chars": 327121,
    "preview": "Ashley,F,43482\r\nJessica,F,43396\r\nBrittany,F,29091\r\nAmanda,F,28884\r\nSamantha,F,25648\r\nSarah,F,25214\r\nStephanie,F,22763\r\nJ"
  },
  {
    "path": "datasets/babynames/yob1992.txt",
    "chars": 331344,
    "preview": "Ashley,F,38449\r\nJessica,F,38342\r\nAmanda,F,25030\r\nBrittany,F,24972\r\nSarah,F,24625\r\nSamantha,F,24405\r\nEmily,F,21834\r\nSteph"
  },
  {
    "path": "datasets/babynames/yob1993.txt",
    "chars": 337663,
    "preview": "Jessica,F,34973\r\nAshley,F,34844\r\nSarah,F,24220\r\nSamantha,F,23659\r\nEmily,F,23589\r\nBrittany,F,21726\r\nTaylor,F,21266\r\nAmand"
  },
  {
    "path": "datasets/babynames/yob1994.txt",
    "chars": 338089,
    "preview": "Jessica,F,32111\r\nAshley,F,30272\r\nEmily,F,24148\r\nSamantha,F,22818\r\nSarah,F,22259\r\nTaylor,F,20731\r\nBrittany,F,18897\r\nAmand"
  },
  {
    "path": "datasets/babynames/yob1996.txt",
    "chars": 342951,
    "preview": "Emily,F,25144\r\nJessica,F,24180\r\nAshley,F,23676\r\nSarah,F,21012\r\nSamantha,F,20541\r\nTaylor,F,19147\r\nHannah,F,18585\r\nAlexis,"
  },
  {
    "path": "datasets/babynames/yob1997.txt",
    "chars": 349976,
    "preview": "Emily,F,25730\r\nJessica,F,21043\r\nAshley,F,20890\r\nSarah,F,20674\r\nHannah,F,20581\r\nSamantha,F,20169\r\nTaylor,F,19503\r\nAlexis,"
  }
]

// ... and 71 more files (download for full content)

About this extraction

This page contains the full source code of the wesm/pydata-book GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 203 files (238.9 MB), approximately 4.0M tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!