Copy disabled (too large)
Download .txt
Showing preview only (15,826K chars total). Download the full file to get everything.
Repository: wesm/pydata-book
Branch: 3rd-edition
Commit: a892791ce5a3
Files: 203
Total size: 238.9 MB
Directory structure:
gitextract_mai23lqn/
├── .gitignore
├── COPYING
├── README.md
├── appa.ipynb
├── appb.ipynb
├── ch02.ipynb
├── ch03.ipynb
├── ch04.ipynb
├── ch05.ipynb
├── ch06.ipynb
├── ch07.ipynb
├── ch08.ipynb
├── ch09.ipynb
├── ch10.ipynb
├── ch11.ipynb
├── ch12.ipynb
├── ch13.ipynb
├── datasets/
│ ├── babynames/
│ │ ├── yob1880.txt
│ │ ├── yob1881.txt
│ │ ├── yob1882.txt
│ │ ├── yob1883.txt
│ │ ├── yob1884.txt
│ │ ├── yob1885.txt
│ │ ├── yob1886.txt
│ │ ├── yob1887.txt
│ │ ├── yob1888.txt
│ │ ├── yob1889.txt
│ │ ├── yob1890.txt
│ │ ├── yob1891.txt
│ │ ├── yob1892.txt
│ │ ├── yob1893.txt
│ │ ├── yob1894.txt
│ │ ├── yob1895.txt
│ │ ├── yob1896.txt
│ │ ├── yob1897.txt
│ │ ├── yob1898.txt
│ │ ├── yob1899.txt
│ │ ├── yob1900.txt
│ │ ├── yob1901.txt
│ │ ├── yob1902.txt
│ │ ├── yob1903.txt
│ │ ├── yob1904.txt
│ │ ├── yob1905.txt
│ │ ├── yob1906.txt
│ │ ├── yob1907.txt
│ │ ├── yob1908.txt
│ │ ├── yob1909.txt
│ │ ├── yob1910.txt
│ │ ├── yob1911.txt
│ │ ├── yob1912.txt
│ │ ├── yob1913.txt
│ │ ├── yob1914.txt
│ │ ├── yob1915.txt
│ │ ├── yob1916.txt
│ │ ├── yob1917.txt
│ │ ├── yob1918.txt
│ │ ├── yob1919.txt
│ │ ├── yob1920.txt
│ │ ├── yob1921.txt
│ │ ├── yob1922.txt
│ │ ├── yob1923.txt
│ │ ├── yob1924.txt
│ │ ├── yob1925.txt
│ │ ├── yob1926.txt
│ │ ├── yob1927.txt
│ │ ├── yob1928.txt
│ │ ├── yob1929.txt
│ │ ├── yob1930.txt
│ │ ├── yob1931.txt
│ │ ├── yob1932.txt
│ │ ├── yob1933.txt
│ │ ├── yob1934.txt
│ │ ├── yob1935.txt
│ │ ├── yob1936.txt
│ │ ├── yob1937.txt
│ │ ├── yob1938.txt
│ │ ├── yob1939.txt
│ │ ├── yob1940.txt
│ │ ├── yob1941.txt
│ │ ├── yob1942.txt
│ │ ├── yob1943.txt
│ │ ├── yob1944.txt
│ │ ├── yob1945.txt
│ │ ├── yob1946.txt
│ │ ├── yob1947.txt
│ │ ├── yob1948.txt
│ │ ├── yob1949.txt
│ │ ├── yob1950.txt
│ │ ├── yob1951.txt
│ │ ├── yob1952.txt
│ │ ├── yob1953.txt
│ │ ├── yob1954.txt
│ │ ├── yob1955.txt
│ │ ├── yob1956.txt
│ │ ├── yob1957.txt
│ │ ├── yob1958.txt
│ │ ├── yob1959.txt
│ │ ├── yob1960.txt
│ │ ├── yob1961.txt
│ │ ├── yob1962.txt
│ │ ├── yob1963.txt
│ │ ├── yob1964.txt
│ │ ├── yob1965.txt
│ │ ├── yob1966.txt
│ │ ├── yob1967.txt
│ │ ├── yob1968.txt
│ │ ├── yob1969.txt
│ │ ├── yob1970.txt
│ │ ├── yob1971.txt
│ │ ├── yob1972.txt
│ │ ├── yob1973.txt
│ │ ├── yob1974.txt
│ │ ├── yob1975.txt
│ │ ├── yob1976.txt
│ │ ├── yob1977.txt
│ │ ├── yob1978.txt
│ │ ├── yob1979.txt
│ │ ├── yob1980.txt
│ │ ├── yob1981.txt
│ │ ├── yob1982.txt
│ │ ├── yob1983.txt
│ │ ├── yob1984.txt
│ │ ├── yob1985.txt
│ │ ├── yob1986.txt
│ │ ├── yob1987.txt
│ │ ├── yob1988.txt
│ │ ├── yob1989.txt
│ │ ├── yob1990.txt
│ │ ├── yob1991.txt
│ │ ├── yob1992.txt
│ │ ├── yob1993.txt
│ │ ├── yob1994.txt
│ │ ├── yob1995.txt
│ │ ├── yob1996.txt
│ │ ├── yob1997.txt
│ │ ├── yob1998.txt
│ │ ├── yob1999.txt
│ │ ├── yob2000.txt
│ │ ├── yob2001.txt
│ │ ├── yob2002.txt
│ │ ├── yob2003.txt
│ │ ├── yob2004.txt
│ │ ├── yob2005.txt
│ │ ├── yob2006.txt
│ │ ├── yob2007.txt
│ │ ├── yob2008.txt
│ │ ├── yob2009.txt
│ │ └── yob2010.txt
│ ├── bitly_usagov/
│ │ └── example.txt
│ ├── fec/
│ │ ├── P00000001-ALL.csv
│ │ └── fec.parquet
│ ├── haiti/
│ │ ├── Haiti.csv
│ │ └── PortAuPrince_Roads/
│ │ ├── PortAuPrince_Roads.dbf
│ │ ├── PortAuPrince_Roads.prj
│ │ ├── PortAuPrince_Roads.sbn
│ │ ├── PortAuPrince_Roads.sbx
│ │ ├── PortAuPrince_Roads.shp
│ │ ├── PortAuPrince_Roads.shx
│ │ └── PortAuPrince_Roads_README.txt
│ ├── movielens/
│ │ └── README
│ ├── mta_perf/
│ │ ├── Performance_LIBUS.xml
│ │ ├── Performance_LIBUS.xsd
│ │ ├── Performance_LIRR.xml
│ │ ├── Performance_LIRR.xsd
│ │ ├── Performance_MNR.xml
│ │ ├── Performance_MNR.xsd
│ │ ├── Performance_MTABUS.xml
│ │ ├── Performance_MTABUS.xsd
│ │ ├── Performance_NYCT.xml
│ │ ├── Performance_NYCT.xsd
│ │ ├── Performance_TBTA.xml
│ │ ├── Performance_TBTA.xsd
│ │ └── parse.py
│ ├── titanic/
│ │ ├── genderclassmodel.csv
│ │ ├── gendermodel.csv
│ │ ├── test.csv
│ │ └── train.csv
│ └── usda_food/
│ └── database.json
├── examples/
│ ├── array_ex.txt
│ ├── csv_mindex.csv
│ ├── ex1.csv
│ ├── ex1.xlsx
│ ├── ex2.csv
│ ├── ex3.txt
│ ├── ex4.csv
│ ├── ex5.csv
│ ├── ex6.csv
│ ├── ex7.csv
│ ├── example.json
│ ├── fdic_failed_bank_list.html
│ ├── ipython_bug.py
│ ├── macrodata.csv
│ ├── segismundo.txt
│ ├── spx.csv
│ ├── stock_px.csv
│ ├── test_file.csv
│ ├── tips.csv
│ ├── tseries.csv
│ ├── volume.csv
│ ├── yahoo_price.pkl
│ └── yahoo_volume.pkl
├── pyproject.toml
└── requirements.txt
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.ipynb_checkpoints
================================================
FILE: COPYING
================================================
Code examples from "Python for Data Analysis", 3rd Edition
The MIT License (MIT)
Copyright (c) 2022 Wes McKinney
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# Python for Data Analysis, 3rd Edition
Materials and IPython notebooks for "Python for Data Analysis, 3rd
Edition" by Wes McKinney, published by O'Reilly Media. Book content
including updates and errata fixes can be [found for free on my
website][6].
[Buy the book on Amazon][1]
Follow Wes on Twitter: [@wesmckinn](https://twitter.com/wesmckinn)
## Setup Instructions
### Option 1: Using uv (Recommended)
[uv](https://github.com/astral-sh/uv) is a fast Python package installer and resolver. To get started:
1. Install uv if you haven't already:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
2. Start Jupyter Notebook with all dependencies:
```bash
uv run jupyter notebook
```
That's it! uv will automatically create a virtual environment and install all required packages from `pyproject.toml`.
### Option 2: Using Conda
1. Create a new conda environment:
```bash
conda create -n pydata-book python=3.11
conda activate pydata-book
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Start Jupyter Notebook:
```bash
jupyter notebook
```
**Note:** This project uses pandas 2.0.3 to ensure compatibility with the notebooks.
## 2nd Edition Readers
If you are reading the 2nd Edition (published in 2017), please find the
reorganized book materials on the [`2nd-edition` branch][5].
## 1st Edition Readers
If you are reading the 1st Edition (published in 2012), please find the
reorganized book materials on the [`1st-edition` branch][2].
## IPython Notebooks:
* [Chapter 2: Python Language Basics, IPython, and Jupyter Notebooks](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch02.ipynb)
* [Chapter 3: Built-in Data Structures, Functions, and Files](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch03.ipynb)
* [Chapter 4: NumPy Basics: Arrays and Vectorized Computation](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch04.ipynb)
* [Chapter 5: Getting Started with pandas](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch05.ipynb)
* [Chapter 6: Data Loading, Storage, and File Formats](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch06.ipynb)
* [Chapter 7: Data Cleaning and Preparation](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch07.ipynb)
* [Chapter 8: Data Wrangling: Join, Combine, and Reshape](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch08.ipynb)
* [Chapter 9: Plotting and Visualization](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch09.ipynb)
* [Chapter 10: Data Aggregation and Group Operations](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch10.ipynb)
* [Chapter 11: Time Series](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch11.ipynb)
* [Chapter 12: Introduction to Modeling Libraries in Python](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch12.ipynb)
* [Chapter 13: Data Analysis Examples](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/ch13.ipynb)
* [Appendix A: Advanced NumPy](https://nbviewer.org/github/wesm/pydata-book/blob/3rd-edition/appa.ipynb)
## License
### Code
The code in this repository, including all code samples in the notebooks listed
above, is released under the [MIT license](COPYING). Read more at the
[Open Source Initiative](https://opensource.org/licenses/MIT).
[1]: https://amzn.to/3DyLaJc
[2]: https://github.com/wesm/pydata-book/tree/1st-edition
[5]: https://github.com/wesm/pydata-book/tree/2nd-edition
[6]: https://wesmckinney.com/book/
================================================
FILE: appa.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"plt.rc('figure', figsize=(10, 6))\n",
"PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
"pd.options.display.max_columns = 20\n",
"pd.options.display.max_rows = 20\n",
"pd.options.display.max_colwidth = 80\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"rng = np.random.default_rng(seed=12345)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"np.ones((10, 5)).shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"np.ones((3, 4, 5), dtype=np.float64).strides"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"ints = np.ones(10, dtype=np.uint16)\n",
"floats = np.ones(10, dtype=np.float32)\n",
"np.issubdtype(ints.dtype, np.integer)\n",
"np.issubdtype(floats.dtype, np.floating)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"np.float64.mro()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"np.issubdtype(ints.dtype, np.number)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(8)\n",
"arr\n",
"arr.reshape((4, 2))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"arr.reshape((4, 2)).reshape((2, 4))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(15)\n",
"arr.reshape((5, -1))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"other_arr = np.ones((3, 5))\n",
"other_arr.shape\n",
"arr.reshape(other_arr.shape)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(15).reshape((5, 3))\n",
"arr\n",
"arr.ravel()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"arr.flatten()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(12).reshape((3, 4))\n",
"arr\n",
"arr.ravel()\n",
"arr.ravel('F')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n",
"arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n",
"np.concatenate([arr1, arr2], axis=0)\n",
"np.concatenate([arr1, arr2], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"np.vstack((arr1, arr2))\n",
"np.hstack((arr1, arr2))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((5, 2))\n",
"arr\n",
"first, second, third = np.split(arr, [1, 3])\n",
"first\n",
"second\n",
"third"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(6)\n",
"arr1 = arr.reshape((3, 2))\n",
"arr2 = rng.standard_normal((3, 2))\n",
"np.r_[arr1, arr2]\n",
"np.c_[np.r_[arr1, arr2], arr]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"np.c_[1:6, -10:-5]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(3)\n",
"arr\n",
"arr.repeat(3)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"arr.repeat([2, 3, 4])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((2, 2))\n",
"arr\n",
"arr.repeat(2, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"arr.repeat([2, 3], axis=0)\n",
"arr.repeat([2, 3], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"arr\n",
"np.tile(arr, 2)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"arr\n",
"np.tile(arr, (2, 1))\n",
"np.tile(arr, (3, 2))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(10) * 100\n",
"inds = [7, 1, 2, 6]\n",
"arr[inds]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"arr.take(inds)\n",
"arr.put(inds, 42)\n",
"arr\n",
"arr.put(inds, [40, 41, 42, 43])\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"inds = [2, 0, 2, 1]\n",
"arr = rng.standard_normal((2, 4))\n",
"arr\n",
"arr.take(inds, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(5)\n",
"arr\n",
"arr * 4"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((4, 3))\n",
"arr.mean(0)\n",
"demeaned = arr - arr.mean(0)\n",
"demeaned\n",
"demeaned.mean(0)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"arr\n",
"row_means = arr.mean(1)\n",
"row_means.shape\n",
"row_means.reshape((4, 1))\n",
"demeaned = arr - row_means.reshape((4, 1))\n",
"demeaned.mean(1)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"arr - arr.mean(1)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"arr - arr.mean(1).reshape((4, 1))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"arr = np.zeros((4, 4))\n",
"arr_3d = arr[:, np.newaxis, :]\n",
"arr_3d.shape\n",
"arr_1d = rng.standard_normal(3)\n",
"arr_1d[:, np.newaxis]\n",
"arr_1d[np.newaxis, :]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((3, 4, 5))\n",
"depth_means = arr.mean(2)\n",
"depth_means\n",
"depth_means.shape\n",
"demeaned = arr - depth_means[:, :, np.newaxis]\n",
"demeaned.mean(2)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"arr = np.zeros((4, 3))\n",
"arr[:] = 5\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"col = np.array([1.28, -0.42, 0.44, 1.6])\n",
"arr[:] = col[:, np.newaxis]\n",
"arr\n",
"arr[:2] = [[-1.37], [0.509]]\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(10)\n",
"np.add.reduce(arr)\n",
"arr.sum()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"my_rng = np.random.default_rng(12346) # for reproducibility\n",
"arr = my_rng.standard_normal((5, 5))\n",
"arr\n",
"arr[::2].sort(1) # sort a few rows\n",
"arr[:, :-1] < arr[:, 1:]\n",
"np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(15).reshape((3, 5))\n",
"np.add.accumulate(arr, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(3).repeat([1, 2, 2])\n",
"arr\n",
"np.multiply.outer(arr, np.arange(5))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"x, y = rng.standard_normal((3, 4)), rng.standard_normal(5)\n",
"result = np.subtract.outer(x, y)\n",
"result.shape"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(10)\n",
"np.add.reduceat(arr, [0, 5, 8])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"arr = np.multiply.outer(np.arange(4), np.arange(5))\n",
"arr\n",
"np.add.reduceat(arr, [0, 2, 4], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"def add_elements(x, y):\n",
" return x + y\n",
"add_them = np.frompyfunc(add_elements, 2, 1)\n",
"add_them(np.arange(8), np.arange(8))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"add_them = np.vectorize(add_elements, otypes=[np.float64])\n",
"add_them(np.arange(8), np.arange(8))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal(10000)\n",
"%timeit add_them(arr, arr)\n",
"%timeit np.add(arr, arr)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"dtype = [('x', np.float64), ('y', np.int32)]\n",
"sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n",
"sarr"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"sarr[0]\n",
"sarr[0]['y']"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"sarr['x']"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"dtype = [('x', np.int64, 3), ('y', np.int32)]\n",
"arr = np.zeros(4, dtype=dtype)\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"arr[0]['x']"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"arr['x']"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]\n",
"data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n",
"data['x']\n",
"data['y']\n",
"data['x']['a']"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal(6)\n",
"arr.sort()\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((3, 5))\n",
"arr\n",
"arr[:, 0].sort() # Sort first column values in place\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal(5)\n",
"arr\n",
"np.sort(arr)\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((3, 5))\n",
"arr\n",
"arr.sort(axis=1)\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"arr[:, ::-1]"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"values = np.array([5, 0, 1, 3, 2])\n",
"indexer = values.argsort()\n",
"indexer\n",
"values[indexer]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((3, 5))\n",
"arr[0] = values\n",
"arr\n",
"arr[:, arr[0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n",
"last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n",
"sorter = np.lexsort((first_name, last_name))\n",
"sorter\n",
"list(zip(last_name[sorter], first_name[sorter]))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"values = np.array(['2:first', '2:second', '1:first', '1:second',\n",
" '1:third'])\n",
"key = np.array([2, 2, 1, 1, 1])\n",
"indexer = key.argsort(kind='mergesort')\n",
"indexer\n",
"values.take(indexer)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"rng = np.random.default_rng(12345)\n",
"arr = rng.standard_normal(20)\n",
"arr\n",
"np.partition(arr, 3)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"indices = np.argpartition(arr, 3)\n",
"indices\n",
"arr.take(indices)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([0, 1, 7, 12, 15])\n",
"arr.searchsorted(9)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"arr.searchsorted([0, 8, 11, 16])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([0, 0, 0, 1, 1, 1, 1])\n",
"arr.searchsorted([0, 1])\n",
"arr.searchsorted([0, 1], side='right')"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"data = np.floor(rng.uniform(0, 10000, size=50))\n",
"bins = np.array([0, 100, 1000, 5000, 10000])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"labels = bins.searchsorted(data)\n",
"labels"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"pd.Series(data).groupby(labels).mean()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def mean_distance(x, y):\n",
" nx = len(x)\n",
" result = 0.0\n",
" count = 0\n",
" for i in range(nx):\n",
" result += x[i] - y[i]\n",
" count += 1\n",
" return result / count"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"mmap = np.memmap('mymmap', dtype='float64', mode='w+',\n",
" shape=(10000, 10000))\n",
"mmap"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"section = mmap[:5]"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"section[:] = rng.standard_normal((5, 10000))\n",
"mmap.flush()\n",
"mmap\n",
"del mmap"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n",
"mmap"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"%xdel mmap\n",
"!rm mymmap"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"arr_c = np.ones((100, 10000), order='C')\n",
"arr_f = np.ones((100, 10000), order='F')\n",
"arr_c.flags\n",
"arr_f.flags\n",
"arr_f.flags.f_contiguous"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"%timeit arr_c.sum(1)\n",
"%timeit arr_f.sum(1)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"arr_f.copy('C').flags"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"arr_c[:50].flags.contiguous\n",
"arr_c[:, :50].flags"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"%xdel arr_c\n",
"%xdel arr_f"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: appb.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"plt.rc('figure', figsize=(10, 6))\n",
"PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
"pd.options.display.max_columns = 20\n",
"pd.options.display.max_rows = 20\n",
"pd.options.display.max_colwidth = 80\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# a very large list of strings\n",
"strings = ['foo', 'foobar', 'baz', 'qux',\n",
" 'python', 'Guido Van Rossum'] * 100000\n",
"\n",
"method1 = [x for x in strings if x.startswith('foo')]\n",
"\n",
"method2 = [x for x in strings if x[:3] == 'foo']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%time method1 = [x for x in strings if x.startswith('foo')]\n",
"%time method2 = [x for x in strings if x[:3] == 'foo']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch02.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"np.random.seed(12345)\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"data = [np.random.standard_normal() for i in range(7)]\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"a = [1, 2, 3]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"b = a\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"a.append(4)\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def append_element(some_list, element):\n",
" some_list.append(element)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data = [1, 2, 3]\n",
"append_element(data, 4)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"a = 5\n",
"type(a)\n",
"a = \"foo\"\n",
"type(a)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"\"5\" + 5"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"a = 4.5\n",
"b = 2\n",
"# String formatting, to be visited later\n",
"print(f\"a is {type(a)}, b is {type(b)}\")\n",
"a / b"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"a = 5\n",
"isinstance(a, int)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"a = 5; b = 4.5\n",
"isinstance(a, (int, float))\n",
"isinstance(b, (int, float))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"a = \"foo\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"getattr(a, \"split\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def isiterable(obj):\n",
" try:\n",
" iter(obj)\n",
" return True\n",
" except TypeError: # not iterable\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"isiterable(\"a string\")\n",
"isiterable([1, 2, 3])\n",
"isiterable(5)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"5 - 7\n",
"12 + 21.5\n",
"5 <= 2"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"a = [1, 2, 3]\n",
"b = a\n",
"c = list(a)\n",
"a is b\n",
"a is not c"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"a == c"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"a = None\n",
"a is None"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"a_list = [\"foo\", 2, [4, 5]]\n",
"a_list[2] = (3, 4)\n",
"a_list"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"a_tuple = (3, 5, (4, 5))\n",
"a_tuple[1] = \"four\""
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"ival = 17239871\n",
"ival ** 6"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"fval = 7.243\n",
"fval2 = 6.78e-5"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"3 / 2"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"3 // 2"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"c = \"\"\"\n",
"This is a longer string that\n",
"spans multiple lines\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"c.count(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"a = \"this is a string\"\n",
"a[10] = \"f\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"b = a.replace(\"string\", \"longer string\")\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"a = 5.6\n",
"s = str(a)\n",
"print(s)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"s = \"python\"\n",
"list(s)\n",
"s[:3]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"s = \"12\\\\34\"\n",
"print(s)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"s = r\"this\\has\\no\\special\\characters\"\n",
"s"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"a = \"this is the first half \"\n",
"b = \"and this is the second half\"\n",
"a + b"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"template = \"{0:.2f} {1:s} are worth US${2:d}\""
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"template.format(88.46, \"Argentine Pesos\", 1)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"amount = 10\n",
"rate = 88.46\n",
"currency = \"Pesos\"\n",
"result = f\"{amount} {currency} is worth US${amount / rate}\""
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"f\"{amount} {currency} is worth US${amount / rate:.2f}\""
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"val = \"espa\u00f1ol\"\n",
"val"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"val_utf8 = val.encode(\"utf-8\")\n",
"val_utf8\n",
"type(val_utf8)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"val_utf8.decode(\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"val.encode(\"latin1\")\n",
"val.encode(\"utf-16\")\n",
"val.encode(\"utf-16le\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"True and True\n",
"False or True"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"int(False)\n",
"int(True)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"a = True\n",
"b = False\n",
"not a\n",
"not b"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"s = \"3.14159\"\n",
"fval = float(s)\n",
"type(fval)\n",
"int(fval)\n",
"bool(fval)\n",
"bool(0)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"a = None\n",
"a is None\n",
"b = 5\n",
"b is not None"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime, date, time\n",
"dt = datetime(2011, 10, 29, 20, 30, 21)\n",
"dt.day\n",
"dt.minute"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"dt.date()\n",
"dt.time()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"dt.strftime(\"%Y-%m-%d %H:%M\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"datetime.strptime(\"20091031\", \"%Y%m%d\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"dt_hour = dt.replace(minute=0, second=0)\n",
"dt_hour"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"dt"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"dt2 = datetime(2011, 11, 15, 22, 30)\n",
"delta = dt2 - dt\n",
"delta\n",
"type(delta)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"dt\n",
"dt + delta"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"# Compound condition: the message prints when either comparison holds.\n",
"a = 5; b = 7\n",
"c = 8; d = 4\n",
"if a < b or c > d:\n",
"    print(\"Made it\")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"4 > 3 > 2 > 1"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# For each i, print (i, j) for j = 0..i; break leaves only the inner loop.\n",
"for i in range(4):\n",
"    for j in range(4):\n",
"        if j > i:\n",
"            break\n",
"        print((i, j))\n"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"range(10)\n",
"list(range(10))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"list(range(0, 20, 2))\n",
"list(range(5, 0, -1))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"seq = [1, 2, 3, 4]\n",
"# Index-based iteration: i runs over the valid positions of seq.\n",
"for i in range(len(seq)):\n",
"    print(f\"element {i}: {seq[i]}\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Sum every integer below 100,000 that is a multiple of 3 or 5.\n",
"total = 0\n",
"for i in range(100_000):\n",
"    # % is the modulo operator\n",
"    if i % 3 == 0 or i % 5 == 0:\n",
"        total += i\n",
"print(total)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch03.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tup = (4, 5, 6)\n",
"tup"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tup = 4, 5, 6\n",
"tup"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"tuple([4, 0, 2])\n",
"tup = tuple('string')\n",
"tup"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"tup[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"nested_tup = (4, 5, 6), (7, 8)\n",
"nested_tup\n",
"nested_tup[0]\n",
"nested_tup[1]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"tup = tuple(['foo', [1, 2], True])\n",
"tup[2] = False"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"tup[1].append(3)\n",
"tup"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"(4, None, 'foo') + (6, 0) + ('bar',)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"('foo', 'bar') * 4"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"tup = (4, 5, 6)\n",
"a, b, c = tup\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"tup = 4, 5, (6, 7)\n",
"a, b, (c, d) = tup\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"a, b = 1, 2\n",
"a\n",
"b\n",
"b, a = a, b\n",
"a\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]\n",
"# Each 3-tuple is unpacked into a, b, c on every iteration.\n",
"for a, b, c in seq:\n",
"    print(f'a={a}, b={b}, c={c}')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"values = 1, 2, 3, 4, 5\n",
"a, b, *rest = values\n",
"a\n",
"b\n",
"rest"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"a, b, *_ = values"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"a = (1, 2, 2, 2, 3, 4, 2)\n",
"a.count(2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"a_list = [2, 3, 7, None]\n",
"\n",
"tup = (\"foo\", \"bar\", \"baz\")\n",
"b_list = list(tup)\n",
"b_list\n",
"b_list[1] = \"peekaboo\"\n",
"b_list"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"gen = range(10)\n",
"gen\n",
"list(gen)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"b_list.append(\"dwarf\")\n",
"b_list"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"b_list.insert(1, \"red\")\n",
"b_list"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"b_list.pop(2)\n",
"b_list"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"b_list.append(\"foo\")\n",
"b_list\n",
"b_list.remove(\"foo\")\n",
"b_list"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"\"dwarf\" in b_list"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"\"dwarf\" not in b_list"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"[4, None, \"foo\"] + [7, 8, (2, 3)]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"x = [4, None, \"foo\"]\n",
"x.extend([7, 8, (2, 3)])\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"a = [7, 2, 5, 1, 3]\n",
"a.sort()\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"b = [\"saw\", \"small\", \"He\", \"foxes\", \"six\"]\n",
"b.sort(key=len)\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"seq = [7, 2, 3, 7, 5, 6, 0, 1]\n",
"seq[1:5]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"seq[3:5] = [6, 3]\n",
"seq"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"seq[:5]\n",
"seq[3:]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"seq[-4:]\n",
"seq[-6:-2]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"seq[::2]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"seq[::-1]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"empty_dict = {}\n",
"d1 = {\"a\": \"some value\", \"b\": [1, 2, 3, 4]}\n",
"d1"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"d1[7] = \"an integer\"\n",
"d1\n",
"d1[\"b\"]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"\"b\" in d1"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"d1[5] = \"some value\"\n",
"d1\n",
"d1[\"dummy\"] = \"another value\"\n",
"d1\n",
"del d1[5]\n",
"d1\n",
"ret = d1.pop(\"dummy\")\n",
"ret\n",
"d1"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"list(d1.keys())\n",
"list(d1.values())"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"list(d1.items())"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"d1.update({\"b\": \"foo\", \"c\": 12})\n",
"d1"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"tuples = zip(range(5), reversed(range(5)))\n",
"tuples\n",
"mapping = dict(tuples)\n",
"mapping"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"words = [\"apple\", \"bat\", \"bar\", \"atom\", \"book\"]\n",
"by_letter = {}\n",
"\n",
"# Group the words by first letter, creating each list on first use.\n",
"for word in words:\n",
"    letter = word[0]\n",
"    if letter not in by_letter:\n",
"        by_letter[letter] = [word]\n",
"    else:\n",
"        by_letter[letter].append(word)\n",
"\n",
"by_letter"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"by_letter = {}\n",
"for word in words:\n",
"    letter = word[0]\n",
"    # setdefault returns the existing list, or inserts and returns a new one.\n",
"    by_letter.setdefault(letter, []).append(word)\n",
"by_letter"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"# Missing keys are initialised with list(), so append works on first access.\n",
"by_letter = defaultdict(list)\n",
"for word in words:\n",
"    by_letter[word[0]].append(word)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"hash(\"string\")\n",
"hash((1, 2, (2, 3)))\n",
"hash((1, 2, [2, 3])) # fails because lists are mutable"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"d = {}\n",
"d[tuple([1, 2, 3])] = 5\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"set([2, 2, 2, 1, 3, 3])\n",
"{2, 2, 2, 1, 3, 3}"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"a = {1, 2, 3, 4, 5}\n",
"b = {3, 4, 5, 6, 7, 8}"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"a.union(b)\n",
"a | b"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"a.intersection(b)\n",
"a & b"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"c = a.copy()\n",
"c |= b\n",
"c\n",
"d = a.copy()\n",
"d &= b\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"my_data = [1, 2, 3, 4]\n",
"my_set = {tuple(my_data)}\n",
"my_set"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"a_set = {1, 2, 3, 4, 5}\n",
"{1, 2, 3}.issubset(a_set)\n",
"a_set.issuperset({1, 2, 3})"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"{1, 2, 3} == {3, 2, 1}"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"sorted([7, 1, 2, 6, 0, 3, 2])\n",
"sorted(\"horse race\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"seq1 = [\"foo\", \"bar\", \"baz\"]\n",
"seq2 = [\"one\", \"two\", \"three\"]\n",
"zipped = zip(seq1, seq2)\n",
"list(zipped)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"seq3 = [False, True]\n",
"list(zip(seq1, seq2, seq3))"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# enumerate numbers the (a, b) pairs produced by zip, starting at 0.\n",
"for index, (a, b) in enumerate(zip(seq1, seq2)):\n",
"    print(f\"{index}: {a}, {b}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"list(reversed(range(10)))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"strings = [\"a\", \"as\", \"bat\", \"car\", \"dove\", \"python\"]\n",
"[x.upper() for x in strings if len(x) > 2]"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"unique_lengths = {len(x) for x in strings}\n",
"unique_lengths"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"set(map(len, strings))"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"loc_mapping = {value: index for index, value in enumerate(strings)}\n",
"loc_mapping"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"all_data = [[\"John\", \"Emily\", \"Michael\", \"Mary\", \"Steven\"],\n",
" [\"Maria\", \"Juan\", \"Javier\", \"Natalia\", \"Pilar\"]]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# Collect, across every sub-list, the names with at least two 'a' characters.\n",
"names_of_interest = []\n",
"for names in all_data:\n",
"    enough_as = [name for name in names if name.count(\"a\") >= 2]\n",
"    names_of_interest.extend(enough_as)\n",
"names_of_interest"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"result = [name for names in all_data for name in names\n",
" if name.count(\"a\") >= 2]\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]\n",
"flattened = [x for tup in some_tuples for x in tup]\n",
"flattened"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"flattened = []\n",
"\n",
"# Nested loops: the outer walks the tuples, the inner walks each tuple's items.\n",
"for tup in some_tuples:\n",
"    for x in tup:\n",
"        flattened.append(x)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"[[x for x in tup] for tup in some_tuples]"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"def my_function(x, y):\n",
"    \"\"\"Return the sum of x and y.\"\"\"\n",
"    return x + y"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"my_function(1, 2)\n",
"result = my_function(1, 2)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def function_without_return(x):\n",
"    \"\"\"Print x; with no return statement the call evaluates to None.\"\"\"\n",
"    print(x)\n",
"\n",
"result = function_without_return(\"hello!\")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"def my_function2(x, y, z=1.5):\n",
"    \"\"\"Return z * (x + y) when z > 1, otherwise z / (x + y).\"\"\"\n",
"    if z > 1:\n",
"        return z * (x + y)\n",
"    else:\n",
"        return z / (x + y)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"my_function2(5, 6, z=0.7)\n",
"my_function2(3.14, 7, 3.5)\n",
"my_function2(10, 20)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"a = []\n",
"def func():\n",
"    \"\"\"Append 0 through 4 to the module-level list a.\"\"\"\n",
"    for i in range(5):\n",
"        a.append(i)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"func()\n",
"a\n",
"func()\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"a = None\n",
"def bind_a_variable():\n",
"    \"\"\"Rebind the module-level name a to a new empty list.\"\"\"\n",
"    global a\n",
"    a = []\n",
"bind_a_variable()\n",
"print(a)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"states = [\" Alabama \", \"Georgia!\", \"Georgia\", \"georgia\", \"FlOrIda\",\n",
" \"south carolina##\", \"West virginia?\"]"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def clean_strings(strings):\n",
"    \"\"\"Return a new list with each string stripped, de-punctuated and title-cased.\"\"\"\n",
"    result = []\n",
"    for value in strings:\n",
"        value = value.strip()\n",
"        # Drop every '!', '#' and '?' character.\n",
"        value = re.sub(\"[!#?]\", \"\", value)\n",
"        value = value.title()\n",
"        result.append(value)\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"clean_strings(states)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def remove_punctuation(value):\n",
"    \"\"\"Return value with every '!', '#' and '?' removed.\"\"\"\n",
"    return re.sub(\"[!#?]\", \"\", value)\n",
"\n",
"clean_ops = [str.strip, remove_punctuation, str.title]\n",
"\n",
"def clean_strings(strings, ops):\n",
"    \"\"\"Apply each callable in ops, in order, to every string; return the results.\"\"\"\n",
"    result = []\n",
"    for value in strings:\n",
"        for func in ops:\n",
"            value = func(value)\n",
"        # Append once per string, after all ops have run.\n",
"        result.append(value)\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"clean_strings(states, clean_ops)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"# map applies remove_punctuation to each element of states in turn.\n",
"for x in map(remove_punctuation, states):\n",
"    print(x)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"def short_function(x):\n",
"    \"\"\"Return x multiplied by 2.\"\"\"\n",
"    return x * 2\n",
"\n",
"# The same computation written as an anonymous function.\n",
"equiv_anon = lambda x: x * 2"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"def apply_to_list(some_list, f):\n",
"    \"\"\"Return a list holding f(x) for each x in some_list.\"\"\"\n",
"    return [f(x) for x in some_list]\n",
"\n",
"ints = [4, 0, 1, 5, 6]\n",
"apply_to_list(ints, lambda x: x * 2)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"strings = [\"foo\", \"card\", \"bar\", \"aaaa\", \"abab\"]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"strings.sort(key=lambda x: len(set(x)))\n",
"strings"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"some_dict = {\"a\": 1, \"b\": 2, \"c\": 3}\n",
"# Iterating over a dict yields its keys.\n",
"for key in some_dict:\n",
"    print(key)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"dict_iterator = iter(some_dict)\n",
"dict_iterator"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"list(dict_iterator)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"def squares(n=10):\n",
"    \"\"\"Yield the squares of 1 through n; the message prints once iteration starts.\"\"\"\n",
"    print(f\"Generating squares from 1 to {n ** 2}\")\n",
"    for i in range(1, n + 1):\n",
"        yield i ** 2"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"gen = squares()\n",
"gen"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"# Print each value followed by a space rather than a newline.\n",
"for x in gen:\n",
"    print(x, end=\" \")"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"gen = (x ** 2 for x in range(100))\n",
"gen"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"sum(x ** 2 for x in range(100))\n",
"dict((i, i ** 2) for i in range(5))"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"def first_letter(x):\n",
"    \"\"\"Return the first character of x.\"\"\"\n",
"    return x[0]\n",
"\n",
"names = [\"Alan\", \"Adam\", \"Wes\", \"Will\", \"Albert\", \"Steven\"]\n",
"\n",
"# groupby only groups consecutive items, so \"Albert\" starts a second \"A\" group.\n",
"# The loop variable is called group so the names list is not overwritten.\n",
"for letter, group in itertools.groupby(names, first_letter):\n",
"    print(letter, list(group))  # group is an iterator"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"float(\"1.2345\")\n",
"float(\"something\")"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"def attempt_float(x):\n",
"    \"\"\"Return float(x), or x unchanged if the conversion raises anything.\"\"\"\n",
"    try:\n",
"        return float(x)\n",
"    except:\n",
"        # Bare except: every exception from float(x) is suppressed.\n",
"        return x"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"attempt_float(\"1.2345\")\n",
"attempt_float(\"something\")"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"float((1, 2))"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"def attempt_float(x):\n",
"    \"\"\"Return float(x), or x unchanged when float raises ValueError.\"\"\"\n",
"    try:\n",
"        return float(x)\n",
"    except ValueError:\n",
"        # Only ValueError is handled; other exceptions propagate to the caller.\n",
"        return x"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"attempt_float((1, 2))"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"def attempt_float(x):\n",
"    \"\"\"Return float(x), or x unchanged on TypeError or ValueError.\"\"\"\n",
"    try:\n",
"        return float(x)\n",
"    except (TypeError, ValueError):\n",
"        return x"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"path = \"examples/segismundo.txt\"\n",
"f = open(path, encoding=\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"lines = [x.rstrip() for x in open(path, encoding=\"utf-8\")]\n",
"lines"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"# The with-block closes f when the block exits.\n",
"with open(path, encoding=\"utf-8\") as f:\n",
"    lines = [x.rstrip() for x in f]"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"f1 = open(path)\n",
"f1.read(10)\n",
"f2 = open(path, mode=\"rb\") # Binary mode\n",
"f2.read(10)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"f1.tell()\n",
"f2.tell()"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.getdefaultencoding()"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"f1.seek(3)\n",
"f1.read(1)\n",
"f1.tell()"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"f1.close()\n",
"f2.close()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"path\n",
"\n",
"# Copy the non-blank lines of path into tmp.txt. The source file is opened in\n",
"# the same with-statement so that it is closed as well as the output handle.\n",
"with open(path) as source, open(\"tmp.txt\", mode=\"w\") as handle:\n",
"    handle.writelines(x for x in source if len(x) > 1)\n",
"\n",
"with open(\"tmp.txt\") as f:\n",
"    lines = f.readlines()\n",
"\n",
"lines"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.remove(\"tmp.txt\")"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"# Text mode: read(10) returns up to 10 characters.\n",
"with open(path) as f:\n",
"    chars = f.read(10)\n",
"\n",
"chars\n",
"len(chars)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"# Binary mode: read(10) returns up to 10 raw bytes.\n",
"with open(path, mode=\"rb\") as f:\n",
"    data = f.read(10)\n",
"\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"data.decode(\"utf-8\")\n",
"data[:4].decode(\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"sink_path = \"sink.txt\"\n",
"# Write the text back out as ISO-8859-1; mode \"x\" fails if sink.txt already exists.\n",
"with open(path) as source:\n",
"    with open(sink_path, \"x\", encoding=\"iso-8859-1\") as sink:\n",
"        sink.write(source.read())\n",
"\n",
"with open(sink_path, encoding=\"iso-8859-1\") as f:\n",
"    print(f.read(10))"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"os.remove(sink_path)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"f = open(path, encoding='utf-8')\n",
"f.read(5)\n",
"f.seek(4)\n",
"f.read(1)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch04.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"my_arr = np.arange(1_000_000)\n",
"my_list = list(range(1_000_000))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%timeit my_arr2 = my_arr * 2\n",
"%timeit my_list2 = [x * 2 for x in my_list]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"data = np.array([[1.5, -0.1, 3], [0, -3, 6.5]])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"data * 10\n",
"data + data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data.shape\n",
"data.dtype"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data1 = [6, 7.5, 8, 0, 1]\n",
"arr1 = np.array(data1)\n",
"arr1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]\n",
"arr2 = np.array(data2)\n",
"arr2"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"arr2.ndim\n",
"arr2.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"arr1.dtype\n",
"arr2.dtype"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"np.zeros(10)\n",
"np.zeros((3, 6))\n",
"np.empty((2, 3, 2))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"np.arange(15)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"arr1 = np.array([1, 2, 3], dtype=np.float64)\n",
"arr2 = np.array([1, 2, 3], dtype=np.int32)\n",
"arr1.dtype\n",
"arr2.dtype"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([1, 2, 3, 4, 5])\n",
"arr.dtype\n",
"float_arr = arr.astype(np.float64)\n",
"float_arr\n",
"float_arr.dtype"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])\n",
"arr\n",
"arr.astype(np.int32)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# np.bytes_ is the byte-string dtype; the np.string_ alias was removed in NumPy 2.0.\n",
"numeric_strings = np.array([\"1.25\", \"-9.6\", \"42\"], dtype=np.bytes_)\n",
"numeric_strings.astype(float)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"int_array = np.arange(10)\n",
"calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)\n",
"int_array.astype(calibers.dtype)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"zeros_uint32 = np.zeros(8, dtype=\"u4\")\n",
"zeros_uint32"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([[1., 2., 3.], [4., 5., 6.]])\n",
"arr\n",
"arr * arr\n",
"arr - arr"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"1 / arr\n",
"arr ** 2"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])\n",
"arr2\n",
"arr2 > arr"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(10)\n",
"arr\n",
"arr[5]\n",
"arr[5:8]\n",
"arr[5:8] = 12\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"arr_slice = arr[5:8]\n",
"arr_slice"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"arr_slice[1] = 12345\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"arr_slice[:] = 64\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n",
"arr2d[2]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"arr2d[0][2]\n",
"arr2d[0, 2]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])\n",
"arr3d"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"arr3d[0]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"old_values = arr3d[0].copy()\n",
"arr3d[0] = 42\n",
"arr3d\n",
"arr3d[0] = old_values\n",
"arr3d"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"arr3d[1, 0]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"x = arr3d[1]\n",
"x\n",
"x[0]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"arr\n",
"arr[1:6]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"arr2d\n",
"arr2d[:2]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"arr2d[:2, 1:]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"lower_dim_slice = arr2d[1, :2]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"lower_dim_slice.shape"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"arr2d[:2, 2]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"arr2d[:, :1]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"arr2d[:2, 1:] = 0\n",
"arr2d"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"names = np.array([\"Bob\", \"Joe\", \"Will\", \"Bob\", \"Will\", \"Joe\", \"Joe\"])\n",
"data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2],\n",
" [-12, -4], [3, 4]])\n",
"names\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"names == \"Bob\""
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"data[names == \"Bob\"]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"data[names == \"Bob\", 1:]\n",
"data[names == \"Bob\", 1]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"names != \"Bob\"\n",
"~(names == \"Bob\")\n",
"data[~(names == \"Bob\")]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"cond = names == \"Bob\"\n",
"data[~cond]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"mask = (names == \"Bob\") | (names == \"Will\")\n",
"mask\n",
"data[mask]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"data[data < 0] = 0\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"data[names != \"Joe\"] = 7\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"arr = np.zeros((8, 4))\n",
"for i in range(8):\n",
" arr[i] = i\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"arr[[4, 3, 0, 6]]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"arr[[-3, -5, -7]]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(32).reshape((8, 4))\n",
"arr\n",
"arr[[1, 5, 7, 2], [0, 3, 1, 2]]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"arr[[1, 5, 7, 2], [0, 3, 1, 2]]\n",
"arr[[1, 5, 7, 2], [0, 3, 1, 2]] = 0\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(15).reshape((3, 5))\n",
"arr\n",
"arr.T"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([[0, 1, 0], [1, 2, -2], [6, 3, 2], [-1, 0, -1], [1, 0, 1]])\n",
"arr\n",
"np.dot(arr.T, arr)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"arr.T @ arr"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"arr\n",
"arr.swapaxes(0, 1)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"samples = np.random.standard_normal(size=(4, 4))\n",
"samples"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"from random import normalvariate\n",
"N = 1_000_000\n",
"%timeit samples = [normalvariate(0, 1) for _ in range(N)]\n",
"%timeit np.random.standard_normal(N)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"rng = np.random.default_rng(seed=12345)\n",
"data = rng.standard_normal((2, 3))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"type(rng)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(10)\n",
"arr\n",
"np.sqrt(arr)\n",
"np.exp(arr)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"x = rng.standard_normal(8)\n",
"y = rng.standard_normal(8)\n",
"x\n",
"y\n",
"np.maximum(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal(7) * 5\n",
"arr\n",
"remainder, whole_part = np.modf(arr)\n",
"remainder\n",
"whole_part"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"arr\n",
"out = np.zeros_like(arr)\n",
"np.add(arr, 1)\n",
"np.add(arr, 1, out=out)\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"points = np.arange(-5, 5, 0.01) # 1000 equally spaced points\n",
"xs, ys = np.meshgrid(points, points)\n",
"ys"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"z = np.sqrt(xs ** 2 + ys ** 2)\n",
"z"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"plt.imshow(z, cmap=plt.cm.gray, extent=[-5, 5, -5, 5])\n",
"plt.colorbar()\n",
"plt.title(\"Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values\")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"plt.draw()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])\n",
"yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])\n",
"cond = np.array([True, False, True, True, False])"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"result = [(x if c else y)\n",
" for x, y, c in zip(xarr, yarr, cond)]\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"result = np.where(cond, xarr, yarr)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((4, 4))\n",
"arr\n",
"arr > 0\n",
"np.where(arr > 0, 2, -2)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"np.where(arr > 0, 2, arr) # set only positive values to 2"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((5, 4))\n",
"arr\n",
"arr.mean()\n",
"np.mean(arr)\n",
"arr.sum()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"arr.mean(axis=1)\n",
"arr.sum(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])\n",
"arr.cumsum()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"arr.cumsum(axis=0)\n",
"arr.cumsum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal(100)\n",
"(arr > 0).sum() # Number of positive values\n",
"(arr <= 0).sum() # Number of non-positive values"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"bools = np.array([False, False, True, False])\n",
"bools.any()\n",
"bools.all()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal(6)\n",
"arr\n",
"arr.sort()\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"arr = rng.standard_normal((5, 3))\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"arr.sort(axis=0)\n",
"arr\n",
"arr.sort(axis=1)\n",
"arr"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"arr2 = np.array([5, -10, 7, 1, 0, -3])\n",
"sorted_arr2 = np.sort(arr2)\n",
"sorted_arr2"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"names = np.array([\"Bob\", \"Will\", \"Joe\", \"Bob\", \"Will\", \"Joe\", \"Joe\"])\n",
"np.unique(names)\n",
"ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])\n",
"np.unique(ints)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"sorted(set(names))"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"values = np.array([6, 0, 0, 3, 2, 5, 6])\n",
"np.in1d(values, [2, 3, 6])"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(10)\n",
"np.save(\"some_array\", arr)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"np.load(\"some_array.npy\")"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"np.savez(\"array_archive.npz\", a=arr, b=arr)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"arch = np.load(\"array_archive.npz\")\n",
"arch[\"b\"]"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"np.savez_compressed(\"arrays_compressed.npz\", a=arr, b=arr)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"!rm some_array.npy\n",
"!rm array_archive.npz\n",
"!rm arrays_compressed.npz"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"x = np.array([[1., 2., 3.], [4., 5., 6.]])\n",
"y = np.array([[6., 23.], [-1, 7], [8, 9]])\n",
"x\n",
"y\n",
"x.dot(y)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"np.dot(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"x @ np.ones(3)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"from numpy.linalg import inv, qr\n",
"X = rng.standard_normal((5, 5))\n",
"mat = X.T @ X\n",
"inv(mat)\n",
"mat @ inv(mat)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"position = 0\n",
"walk = [position]\n",
"nsteps = 1000\n",
"for _ in range(nsteps):\n",
" step = 1 if random.randint(0, 1) else -1\n",
" position += step\n",
" walk.append(position)\n"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"plt.plot(walk[:100])"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"nsteps = 1000\n",
"rng = np.random.default_rng(seed=12345) # fresh random generator\n",
"draws = rng.integers(0, 2, size=nsteps)\n",
"steps = np.where(draws == 0, 1, -1)\n",
"walk = steps.cumsum()"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"walk.min()\n",
"walk.max()"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"(np.abs(walk) >= 10).argmax()"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"nwalks = 5000\n",
"nsteps = 1000\n",
"draws = rng.integers(0, 2, size=(nwalks, nsteps)) # 0 or 1\n",
"steps = np.where(draws > 0, 1, -1)\n",
"walks = steps.cumsum(axis=1)\n",
"walks"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"walks.max()\n",
"walks.min()"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"hits30 = (np.abs(walks) >= 30).any(axis=1)\n",
"hits30\n",
"hits30.sum() # Number that hit 30 or -30"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"crossing_times = (np.abs(walks[hits30]) >= 30).argmax(axis=1)\n",
"crossing_times"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"crossing_times.mean()"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"draws = 0.25 * rng.standard_normal((nwalks, nsteps))"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch05.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from pandas import Series, DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
"pd.options.display.max_rows = 20\n",
"pd.options.display.max_columns = 20\n",
"pd.options.display.max_colwidth = 80\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([4, 7, -5, 3])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"obj.array\n",
"obj.index"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"obj2 = pd.Series([4, 7, -5, 3], index=[\"d\", \"b\", \"a\", \"c\"])\n",
"obj2\n",
"obj2.index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"obj2[\"a\"]\n",
"obj2[\"d\"] = 6\n",
"obj2[[\"c\", \"a\", \"d\"]]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"obj2[obj2 > 0]\n",
"obj2 * 2\n",
"import numpy as np\n",
"np.exp(obj2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"\"b\" in obj2\n",
"\"e\" in obj2"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"sdata = {\"Ohio\": 35000, \"Texas\": 71000, \"Oregon\": 16000, \"Utah\": 5000}\n",
"obj3 = pd.Series(sdata)\n",
"obj3"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"obj3.to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"states = [\"California\", \"Ohio\", \"Oregon\", \"Texas\"]\n",
"obj4 = pd.Series(sdata, index=states)\n",
"obj4"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"pd.isna(obj4)\n",
"pd.notna(obj4)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"obj4.isna()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"obj3\n",
"obj4\n",
"obj3 + obj4"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"obj4.name = \"population\"\n",
"obj4.index.name = \"state\"\n",
"obj4"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"obj\n",
"obj.index = [\"Bob\", \"Steve\", \"Jeff\", \"Ryan\"]\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"data = {\"state\": [\"Ohio\", \"Ohio\", \"Ohio\", \"Nevada\", \"Nevada\", \"Nevada\"],\n",
" \"year\": [2000, 2001, 2002, 2001, 2002, 2003],\n",
" \"pop\": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n",
"frame = pd.DataFrame(data)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"frame"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"frame.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"frame.tail()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame(data, columns=[\"year\", \"state\", \"pop\"])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"frame2 = pd.DataFrame(data, columns=[\"year\", \"state\", \"pop\", \"debt\"])\n",
"frame2\n",
"frame2.columns"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"frame2[\"state\"]\n",
"frame2.year"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"frame2.loc[1]\n",
"frame2.iloc[2]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"frame2[\"debt\"] = 16.5\n",
"frame2\n",
"frame2[\"debt\"] = np.arange(6.)\n",
"frame2"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"val = pd.Series([-1.2, -1.5, -1.7], index=[\"two\", \"four\", \"five\"])\n",
"frame2[\"debt\"] = val\n",
"frame2"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"frame2[\"eastern\"] = frame2[\"state\"] == \"Ohio\"\n",
"frame2"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"del frame2[\"eastern\"]\n",
"frame2.columns"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"populations = {\"Ohio\": {2000: 1.5, 2001: 1.7, 2002: 3.6},\n",
" \"Nevada\": {2001: 2.4, 2002: 2.9}}"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"frame3 = pd.DataFrame(populations)\n",
"frame3"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"frame3.T"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame(populations, index=[2001, 2002, 2003])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"pdata = {\"Ohio\": frame3[\"Ohio\"][:-1],\n",
" \"Nevada\": frame3[\"Nevada\"][:2]}\n",
"pd.DataFrame(pdata)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"frame3.index.name = \"year\"\n",
"frame3.columns.name = \"state\"\n",
"frame3"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"frame3.to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"frame2.to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series(np.arange(3), index=[\"a\", \"b\", \"c\"])\n",
"index = obj.index\n",
"index\n",
"index[1:]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"labels = pd.Index(np.arange(3))\n",
"labels\n",
"obj2 = pd.Series([1.5, -2.5, 0], index=labels)\n",
"obj2\n",
"obj2.index is labels"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"frame3\n",
"frame3.columns\n",
"\"Ohio\" in frame3.columns\n",
"2003 in frame3.index"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"pd.Index([\"foo\", \"foo\", \"bar\", \"bar\"])"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=[\"d\", \"b\", \"a\", \"c\"])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"obj2 = obj.reindex([\"a\", \"b\", \"c\", \"d\", \"e\"])\n",
"obj2"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"obj3 = pd.Series([\"blue\", \"purple\", \"yellow\"], index=[0, 2, 4])\n",
"obj3\n",
"obj3.reindex(np.arange(6), method=\"ffill\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame(np.arange(9).reshape((3, 3)),\n",
" index=[\"a\", \"c\", \"d\"],\n",
" columns=[\"Ohio\", \"Texas\", \"California\"])\n",
"frame\n",
"frame2 = frame.reindex(index=[\"a\", \"b\", \"c\", \"d\"])\n",
"frame2"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"states = [\"Texas\", \"Utah\", \"California\"]\n",
"frame.reindex(columns=states)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"frame.reindex(states, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"frame.loc[[\"a\", \"d\", \"c\"], [\"California\", \"Texas\"]]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series(np.arange(5.), index=[\"a\", \"b\", \"c\", \"d\", \"e\"])\n",
"obj\n",
"new_obj = obj.drop(\"c\")\n",
"new_obj\n",
"obj.drop([\"d\", \"c\"])"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n",
" index=[\"Ohio\", \"Colorado\", \"Utah\", \"New York\"],\n",
" columns=[\"one\", \"two\", \"three\", \"four\"])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"data.drop(index=[\"Colorado\", \"Ohio\"])"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"data.drop(columns=[\"two\"])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"data.drop(\"two\", axis=1)\n",
"data.drop([\"two\", \"four\"], axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series(np.arange(4.), index=[\"a\", \"b\", \"c\", \"d\"])\n",
"obj\n",
"obj[\"b\"]\n",
"obj[1]\n",
"obj[2:4]\n",
"obj[[\"b\", \"a\", \"d\"]]\n",
"obj[[1, 3]]\n",
"obj[obj < 2]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"obj.loc[[\"b\", \"a\", \"d\"]]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])\n",
"obj2 = pd.Series([1, 2, 3], index=[\"a\", \"b\", \"c\"])\n",
"obj1\n",
"obj2\n",
"obj1[[0, 1, 2]]\n",
"obj2[[0, 1, 2]]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"obj1.iloc[[0, 1, 2]]\n",
"obj2.iloc[[0, 1, 2]]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"obj2.loc[\"b\":\"c\"]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"obj2.loc[\"b\":\"c\"] = 5\n",
"obj2"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n",
" index=[\"Ohio\", \"Colorado\", \"Utah\", \"New York\"],\n",
" columns=[\"one\", \"two\", \"three\", \"four\"])\n",
"data\n",
"data[\"two\"]\n",
"data[[\"three\", \"one\"]]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"data[:2]\n",
"data[data[\"three\"] > 5]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"data < 5"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"data[data < 5] = 0\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"data\n",
"data.loc[\"Colorado\"]"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"data.loc[[\"Colorado\", \"New York\"]]"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"data.loc[\"Colorado\", [\"two\", \"three\"]]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"data.iloc[2]\n",
"data.iloc[[2, 1]]\n",
"data.iloc[2, [3, 0, 1]]\n",
"data.iloc[[1, 2], [3, 0, 1]]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"data.loc[:\"Utah\", \"two\"]\n",
"data.iloc[:, :3][data.three > 5]"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"data.loc[data.three >= 2]"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"ser = pd.Series(np.arange(3.))\n",
"ser\n",
"ser[-1]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"ser"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"ser2 = pd.Series(np.arange(3.), index=[\"a\", \"b\", \"c\"])\n",
"ser2[-1]"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"ser.iloc[-1]"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"ser[:2]"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"data.loc[:, \"one\"] = 1\n",
"data\n",
"data.iloc[2] = 5\n",
"data\n",
"data.loc[data[\"four\"] > 5] = 3\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"data.loc[data.three == 5][\"three\"] = 6"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"data.loc[data.three == 5, \"three\"] = 6\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=[\"a\", \"c\", \"d\", \"e\"])\n",
"s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],\n",
" index=[\"a\", \"c\", \"e\", \"f\", \"g\"])\n",
"s1\n",
"s2"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"s1 + s2"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list(\"bcd\"),\n",
" index=[\"Ohio\", \"Texas\", \"Colorado\"])\n",
"df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list(\"bde\"),\n",
" index=[\"Utah\", \"Ohio\", \"Texas\", \"Oregon\"])\n",
"df1\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"df1 + df2"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame({\"A\": [1, 2]})\n",
"df2 = pd.DataFrame({\"B\": [3, 4]})\n",
"df1\n",
"df2\n",
"df1 + df2"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),\n",
" columns=list(\"abcd\"))\n",
"df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),\n",
" columns=list(\"abcde\"))\n",
"df2.loc[1, \"b\"] = np.nan\n",
"df1\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"df1 + df2"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"df1.add(df2, fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"1 / df1\n",
"df1.rdiv(1)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"df1.reindex(columns=df2.columns, fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(12.).reshape((3, 4))\n",
"arr\n",
"arr[0]\n",
"arr - arr[0]"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),\n",
" columns=list(\"bde\"),\n",
" index=[\"Utah\", \"Ohio\", \"Texas\", \"Oregon\"])\n",
"series = frame.iloc[0]\n",
"frame\n",
"series"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"frame - series"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"series2 = pd.Series(np.arange(3), index=[\"b\", \"e\", \"f\"])\n",
"series2\n",
"frame + series2"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"series3 = frame[\"d\"]\n",
"frame\n",
"series3\n",
"frame.sub(series3, axis=\"index\")"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame(np.random.standard_normal((4, 3)),\n",
" columns=list(\"bde\"),\n",
" index=[\"Utah\", \"Ohio\", \"Texas\", \"Oregon\"])\n",
"frame\n",
"np.abs(frame)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"def f1(x):\n",
" return x.max() - x.min()\n",
"\n",
"frame.apply(f1)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"frame.apply(f1, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"def f2(x):\n",
" return pd.Series([x.min(), x.max()], index=[\"min\", \"max\"])\n",
"frame.apply(f2)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"def my_format(x):\n",
" return f\"{x:.2f}\"\n",
"\n",
"frame.applymap(my_format)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"frame[\"e\"].map(my_format)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series(np.arange(4), index=[\"d\", \"a\", \"b\", \"c\"])\n",
"obj\n",
"obj.sort_index()"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame(np.arange(8).reshape((2, 4)),\n",
" index=[\"three\", \"one\"],\n",
" columns=[\"d\", \"a\", \"b\", \"c\"])\n",
"frame\n",
"frame.sort_index()\n",
"frame.sort_index(axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"frame.sort_index(axis=\"columns\", ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([4, 7, -3, 2])\n",
"obj.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])\n",
"obj.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"obj.sort_values(na_position=\"first\")"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame({\"b\": [4, 7, -3, 2], \"a\": [0, 1, 0, 1]})\n",
"frame\n",
"frame.sort_values(\"b\")"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"frame.sort_values([\"a\", \"b\"])"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n",
"obj.rank()"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"obj.rank(method=\"first\")"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"obj.rank(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame({\"b\": [4.3, 7, -3, 2], \"a\": [0, 1, 0, 1],\n",
" \"c\": [-2, 5, 8, -2.5]})\n",
"frame\n",
"frame.rank(axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series(np.arange(5), index=[\"a\", \"a\", \"b\", \"b\", \"c\"])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"obj.index.is_unique"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"obj[\"a\"]\n",
"obj[\"c\"]"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.random.standard_normal((5, 3)),\n",
" index=[\"a\", \"a\", \"b\", \"b\", \"c\"])\n",
"df\n",
"df.loc[\"b\"]\n",
"df.loc[\"c\"]"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],\n",
" [np.nan, np.nan], [0.75, -1.3]],\n",
" index=[\"a\", \"b\", \"c\", \"d\"],\n",
" columns=[\"one\", \"two\"])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"df.sum()"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"df.sum(axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"df.sum(axis=\"index\", skipna=False)\n",
"df.sum(axis=\"columns\", skipna=False)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"df.mean(axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"df.idxmax()"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"df.cumsum()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([\"a\", \"a\", \"b\", \"c\"] * 4)\n",
"obj.describe()"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
"price = pd.read_pickle(\"examples/yahoo_price.pkl\")\n",
"volume = pd.read_pickle(\"examples/yahoo_volume.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
"returns = price.pct_change()\n",
"returns.tail()"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"returns[\"MSFT\"].corr(returns[\"IBM\"])\n",
"returns[\"MSFT\"].cov(returns[\"IBM\"])"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"returns.corr()\n",
"returns.cov()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"returns.corrwith(returns[\"IBM\"])"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"returns.corrwith(volume)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"obj = pd.Series([\"c\", \"a\", \"d\", \"a\", \"a\", \"b\", \"b\", \"c\", \"c\"])"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"uniques = obj.unique()\n",
"uniques"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"obj.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"pd.Series(obj.to_numpy()).value_counts(sort=False)"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"obj\n",
"mask = obj.isin([\"b\", \"c\"])\n",
"mask\n",
"obj[mask]"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"to_match = pd.Series([\"c\", \"a\", \"b\", \"b\", \"c\", \"a\"])\n",
"unique_vals = pd.Series([\"c\", \"b\", \"a\"])\n",
"indices = pd.Index(unique_vals).get_indexer(to_match)\n",
"indices"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame({\"Qu1\": [1, 3, 4, 3, 4],\n",
" \"Qu2\": [2, 3, 1, 2, 3],\n",
" \"Qu3\": [1, 5, 2, 4, 4]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"data[\"Qu1\"].value_counts().sort_index()"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"result = data.apply(pd.Series.value_counts).fillna(0)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame({\"a\": [1, 1, 1, 2, 2], \"b\": [0, 0, 1, 0, 0]})\n",
"data\n",
"data.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch06.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"pd.options.display.max_colwidth = 75\n",
"pd.options.display.max_columns = 20\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/ex1.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"examples/ex1.csv\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/ex2.csv"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"pd.read_csv(\"examples/ex2.csv\", header=None)\n",
"pd.read_csv(\"examples/ex2.csv\", names=[\"a\", \"b\", \"c\", \"d\", \"message\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"names = [\"a\", \"b\", \"c\", \"d\", \"message\"]\n",
"pd.read_csv(\"examples/ex2.csv\", names=names, index_col=\"message\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/csv_mindex.csv\n",
"parsed = pd.read_csv(\"examples/csv_mindex.csv\",\n",
" index_col=[\"key1\", \"key2\"])\n",
"parsed"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/ex3.txt"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"result = pd.read_csv(\"examples/ex3.txt\", sep=r\"\\s+\")\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/ex4.csv\n",
"pd.read_csv(\"examples/ex4.csv\", skiprows=[0, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/ex5.csv\n",
"result = pd.read_csv(\"examples/ex5.csv\")\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"pd.isna(result)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"result = pd.read_csv(\"examples/ex5.csv\", na_values=[\"NULL\"])\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"result2 = pd.read_csv(\"examples/ex5.csv\", keep_default_na=False)\n",
"result2\n",
"result2.isna()\n",
"result3 = pd.read_csv(\"examples/ex5.csv\", keep_default_na=False,\n",
" na_values=[\"NA\"])\n",
"result3\n",
"result3.isna()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"sentinels = {\"message\": [\"foo\", \"NA\"], \"something\": [\"two\"]}\n",
"pd.read_csv(\"examples/ex5.csv\", na_values=sentinels,\n",
" keep_default_na=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_rows = 10"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"result = pd.read_csv(\"examples/ex6.csv\")\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"pd.read_csv(\"examples/ex6.csv\", nrows=5)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"chunker = pd.read_csv(\"examples/ex6.csv\", chunksize=1000)\n",
"type(chunker)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"chunker = pd.read_csv(\"examples/ex6.csv\", chunksize=1000)\n",
"\n",
"tot = pd.Series([], dtype='int64')\n",
"for piece in chunker:\n",
" tot = tot.add(piece[\"key\"].value_counts(), fill_value=0)\n",
"\n",
"tot = tot.sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"tot[:10]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(\"examples/ex5.csv\")\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"data.to_csv(\"examples/out.csv\")\n",
"!cat examples/out.csv"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"data.to_csv(sys.stdout, sep=\"|\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"data.to_csv(sys.stdout, na_rep=\"NULL\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"data.to_csv(sys.stdout, index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"data.to_csv(sys.stdout, index=False, columns=[\"a\", \"b\", \"c\"])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/ex7.csv"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"f = open(\"examples/ex7.csv\")\n",
"reader = csv.reader(f)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"for line in reader:\n",
" print(line)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"with open(\"examples/ex7.csv\") as f:\n",
" lines = list(csv.reader(f))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"header, values = lines[0], lines[1:]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
"data_dict"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"obj = \"\"\"\n",
"{\"name\": \"Wes\",\n",
" \"cities_lived\": [\"Akron\", \"Nashville\", \"New York\", \"San Francisco\"],\n",
" \"pet\": null,\n",
" \"siblings\": [{\"name\": \"Scott\", \"age\": 34, \"hobbies\": [\"guitars\", \"soccer\"]},\n",
" {\"name\": \"Katie\", \"age\": 42, \"hobbies\": [\"diving\", \"art\"]}]\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"result = json.loads(obj)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"asjson = json.dumps(result)\n",
"asjson"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"siblings = pd.DataFrame(result[\"siblings\"], columns=[\"name\", \"age\"])\n",
"siblings"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"!cat examples/example.json"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_json(\"examples/example.json\")\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"data.to_json(sys.stdout)\n",
"data.to_json(sys.stdout, orient=\"records\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"tables = pd.read_html(\"examples/fdic_failed_bank_list.html\")\n",
"len(tables)\n",
"failures = tables[0]\n",
"failures.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"close_timestamps = pd.to_datetime(failures[\"Closing Date\"])\n",
"close_timestamps.dt.year.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"from lxml import objectify\n",
"\n",
"path = \"datasets/mta_perf/Performance_MNR.xml\"\n",
"with open(path) as f:\n",
" parsed = objectify.parse(f)\n",
"root = parsed.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"data = []\n",
"\n",
"skip_fields = [\"PARENT_SEQ\", \"INDICATOR_SEQ\",\n",
" \"DESIRED_CHANGE\", \"DECIMAL_PLACES\"]\n",
"\n",
"for elt in root.INDICATOR:\n",
" el_data = {}\n",
" for child in elt.getchildren():\n",
" if child.tag in skip_fields:\n",
" continue\n",
" el_data[child.tag] = child.pyval\n",
" data.append(el_data)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"perf = pd.DataFrame(data)\n",
"perf.head()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"perf2 = pd.read_xml(path)\n",
"perf2.head()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.read_csv(\"examples/ex1.csv\")\n",
"frame\n",
"frame.to_pickle(\"examples/frame_pickle\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"pd.read_pickle(\"examples/frame_pickle\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"!rm examples/frame_pickle"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"fec = pd.read_parquet('datasets/fec/fec.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"xlsx = pd.ExcelFile(\"examples/ex1.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"xlsx.sheet_names"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"xlsx.parse(sheet_name=\"Sheet1\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"xlsx.parse(sheet_name=\"Sheet1\", index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.read_excel(\"examples/ex1.xlsx\", sheet_name=\"Sheet1\")\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"writer = pd.ExcelWriter(\"examples/ex2.xlsx\")\n",
"frame.to_excel(writer, sheet_name=\"Sheet1\")\n",
"writer.close()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"frame.to_excel(\"examples/ex2.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"!rm examples/ex2.xlsx"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"!rm -f examples/mydata.h5"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame({\"a\": np.random.standard_normal(100)})\n",
"store = pd.HDFStore(\"examples/mydata.h5\")\n",
"store[\"obj1\"] = frame\n",
"store[\"obj1_col\"] = frame[\"a\"]\n",
"store"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"store[\"obj1\"]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"store.put(\"obj2\", frame, format=\"table\")\n",
"store.select(\"obj2\", where=[\"index >= 10 and index <= 15\"])\n",
"store.close()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"frame.to_hdf(\"examples/mydata.h5\", key=\"obj3\", format=\"table\")\n",
"pd.read_hdf(\"examples/mydata.h5\", \"obj3\", where=[\"index < 5\"])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.remove(\"examples/mydata.h5\")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"url = \"https://api.github.com/repos/pandas-dev/pandas/issues\"\n",
"resp = requests.get(url)\n",
"resp.raise_for_status()\n",
"resp"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"data = resp.json()\n",
"data[0][\"title\"]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"issues = pd.DataFrame(data, columns=[\"number\", \"title\",\n",
" \"labels\", \"state\"])\n",
"issues"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"\n",
"query = \"\"\"\n",
"CREATE TABLE test\n",
"(a VARCHAR(20), b VARCHAR(20),\n",
" c REAL, d INTEGER\n",
");\"\"\"\n",
"\n",
"con = sqlite3.connect(\"mydata.sqlite\")\n",
"con.execute(query)\n",
"con.commit()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"data = [(\"Atlanta\", \"Georgia\", 1.25, 6),\n",
" (\"Tallahassee\", \"Florida\", 2.6, 3),\n",
" (\"Sacramento\", \"California\", 1.7, 5)]\n",
"stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n",
"\n",
"con.executemany(stmt, data)\n",
"con.commit()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"cursor = con.execute(\"SELECT * FROM test\")\n",
"rows = cursor.fetchall()\n",
"rows"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"cursor.description\n",
"pd.DataFrame(rows, columns=[x[0] for x in cursor.description])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"import sqlalchemy as sqla\n",
"db = sqla.create_engine(\"sqlite:///mydata.sqlite\")\n",
"pd.read_sql(\"SELECT * FROM test\", db)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"!rm mydata.sqlite"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch07.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
"pd.options.display.max_rows = 25\n",
"pd.options.display.max_columns = 20\n",
"pd.options.display.max_colwidth = 82\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"float_data = pd.Series([1.2, -3.5, np.nan, 0])\n",
"float_data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"float_data.isna()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"string_data = pd.Series([\"aardvark\", np.nan, None, \"avocado\"])\n",
"string_data\n",
"string_data.isna()\n",
"float_data = pd.Series([1, 2, None], dtype='float64')\n",
"float_data\n",
"float_data.isna()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data = pd.Series([1, np.nan, 3.5, np.nan, 7])\n",
"data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data[data.notna()]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],\n",
" [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])\n",
"data\n",
"data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"data.dropna(how=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data[4] = np.nan\n",
"data\n",
"data.dropna(axis=\"columns\", how=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.random.standard_normal((7, 3)))\n",
"df.iloc[:4, 1] = np.nan\n",
"df.iloc[:2, 2] = np.nan\n",
"df\n",
"df.dropna()\n",
"df.dropna(thresh=2)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df.fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df.fillna({1: 0.5, 2: 0})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.random.standard_normal((6, 3)))\n",
"df.iloc[2:, 1] = np.nan\n",
"df.iloc[4:, 2] = np.nan\n",
"df\n",
"df.ffill()\n",
"df.ffill(limit=2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"data = pd.Series([1., np.nan, 3.5, np.nan, 7])\n",
"data.fillna(data.mean())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame({\"k1\": [\"one\", \"two\"] * 3 + [\"two\"],\n",
" \"k2\": [1, 1, 2, 3, 3, 4, 4]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"data.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"data.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"data[\"v1\"] = range(7)\n",
"data\n",
"data.drop_duplicates(subset=[\"k1\"])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"data.drop_duplicates([\"k1\", \"k2\"], keep=\"last\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame({\"food\": [\"bacon\", \"pulled pork\", \"bacon\",\n",
" \"pastrami\", \"corned beef\", \"bacon\",\n",
" \"pastrami\", \"honey ham\", \"nova lox\"],\n",
" \"ounces\": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"meat_to_animal = {\n",
" \"bacon\": \"pig\",\n",
" \"pulled pork\": \"pig\",\n",
" \"pastrami\": \"cow\",\n",
" \"corned beef\": \"cow\",\n",
" \"honey ham\": \"pig\",\n",
" \"nova lox\": \"salmon\"\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"data[\"animal\"] = data[\"food\"].map(meat_to_animal)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def get_animal(x):\n",
" return meat_to_animal[x]\n",
"data[\"food\"].map(get_animal)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"data = pd.Series([1., -999., 2., -999., -1000., 3.])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"data.replace(-999, np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"data.replace([-999, -1000], np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"data.replace([-999, -1000], [np.nan, 0])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"data.replace({-999: np.nan, -1000: 0})"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n",
" index=[\"Ohio\", \"Colorado\", \"New York\"],\n",
" columns=[\"one\", \"two\", \"three\", \"four\"])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def transform(x):\n",
" return x[:4].upper()\n",
"\n",
"data.index.map(transform)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"data.index = data.index.map(transform)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"data.rename(index=str.title, columns=str.upper)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"data.rename(index={\"OHIO\": \"INDIANA\"},\n",
" columns={\"three\": \"peekaboo\"})"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"bins = [18, 25, 35, 60, 100]\n",
"age_categories = pd.cut(ages, bins)\n",
"age_categories"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"age_categories.codes\n",
"age_categories.categories\n",
"age_categories.categories[0]\n",
"pd.Series(age_categories).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"pd.cut(ages, bins, right=False)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"group_names = [\"Youth\", \"YoungAdult\", \"MiddleAged\", \"Senior\"]\n",
"pd.cut(ages, bins, labels=group_names)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"data = np.random.uniform(size=20)\n",
"pd.cut(data, 4, precision=2)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"data = np.random.standard_normal(1000)\n",
"quartiles = pd.qcut(data, 4, precision=2)\n",
"quartiles\n",
"pd.Series(quartiles).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(np.random.standard_normal((1000, 4)))\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"col = data[2]\n",
"col[col.abs() > 3]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"data[(data.abs() > 3).any(axis=\"columns\")]"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"data[data.abs() > 3] = np.sign(data) * 3\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"np.sign(data).head()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))\n",
"df\n",
"sampler = np.random.permutation(5)\n",
"sampler"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"df.take(sampler)\n",
"df.iloc[sampler]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"column_sampler = np.random.permutation(7)\n",
"column_sampler\n",
"df.take(column_sampler, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"df.sample(n=3)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"choices = pd.Series([5, 7, -1, 6, 4])\n",
"choices.sample(n=10, replace=True)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n",
" \"data1\": range(6)})\n",
"df\n",
"pd.get_dummies(df[\"key\"], dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"dummies = pd.get_dummies(df[\"key\"], prefix=\"key\", dtype=float)\n",
"df_with_dummy = df[[\"data1\"]].join(dummies)\n",
"df_with_dummy"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"mnames = [\"movie_id\", \"title\", \"genres\"]\n",
"movies = pd.read_table(\"datasets/movielens/movies.dat\", sep=\"::\",\n",
" header=None, names=mnames, engine=\"python\")\n",
"movies[:10]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"dummies = movies[\"genres\"].str.get_dummies(\"|\")\n",
"dummies.iloc[:10, :6]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"movies_windic = movies.join(dummies.add_prefix(\"Genre_\"))\n",
"movies_windic.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(12345) # to make the example repeatable\n",
"values = np.random.uniform(size=10)\n",
"values\n",
"bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n",
"pd.get_dummies(pd.cut(values, bins))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series([1, 2, 3, None])\n",
"s\n",
"s.dtype"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())\n",
"s\n",
"s.isna()\n",
"s.dtype"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"s[3]\n",
"s[3] is pd.NA"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series([1, 2, 3, None], dtype=\"Int64\")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())\n",
"s"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"A\": [1, 2, None, 4],\n",
" \"B\": [\"one\", \"two\", \"three\", None],\n",
" \"C\": [False, None, False, True]})\n",
"df\n",
"df[\"A\"] = df[\"A\"].astype(\"Int64\")\n",
"df[\"B\"] = df[\"B\"].astype(\"string\")\n",
"df[\"C\"] = df[\"C\"].astype(\"boolean\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"val = \"a,b, guido\"\n",
"val.split(\",\")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"pieces = [x.strip() for x in val.split(\",\")]\n",
"pieces"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"first, second, third = pieces\n",
"first + \"::\" + second + \"::\" + third"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"\"::\".join(pieces)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"\"guido\" in val\n",
"val.index(\",\")\n",
"val.find(\":\")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"val.index(\":\")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"val.count(\",\")"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"val.replace(\",\", \"::\")\n",
"val.replace(\",\", \"\")"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"text = \"foo bar\\t baz \\tqux\"\n",
"re.split(r\"\\s+\", text)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"regex = re.compile(r\"\\s+\")\n",
"regex.split(text)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"regex.findall(text)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"text = \"\"\"Dave dave@google.com\n",
"Steve steve@gmail.com\n",
"Rob rob@gmail.com\n",
"Ryan ryan@yahoo.com\"\"\"\n",
"pattern = r\"[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\"\n",
"\n",
"# re.IGNORECASE makes the regex case insensitive\n",
"regex = re.compile(pattern, flags=re.IGNORECASE)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"regex.findall(text)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"m = regex.search(text)\n",
"m\n",
"text[m.start():m.end()]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"print(regex.match(text))"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"print(regex.sub(\"REDACTED\", text))"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n",
"regex = re.compile(pattern, flags=re.IGNORECASE)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"m = regex.match(\"wesm@bright.net\")\n",
"m.groups()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"regex.findall(text)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"print(regex.sub(r\"Username: \\1, Domain: \\2, Suffix: \\3\", text))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"data = {\"Dave\": \"dave@google.com\", \"Steve\": \"steve@gmail.com\",\n",
" \"Rob\": \"rob@gmail.com\", \"Wes\": np.nan}\n",
"data = pd.Series(data)\n",
"data\n",
"data.isna()"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"data.str.contains(\"gmail\")"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"data_as_string_ext = data.astype('string')\n",
"data_as_string_ext\n",
"data_as_string_ext.str.contains(\"gmail\")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"pattern = r\"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})\"\n",
"data.str.findall(pattern, flags=re.IGNORECASE)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]\n",
"matches\n",
"matches.str.get(1)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"data.str[:5]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"data.str.extract(pattern, flags=re.IGNORECASE)"
]
},
{
 "cell_type": "code",
 "execution_count": 92,
 "metadata": {},
 "outputs": [],
 "source": [
  "values = pd.Series(['apple', 'orange', 'apple',\n",
  "                    'apple'] * 2)\n",
  "values\n",
  "pd.unique(values)\n",
  "# Series.value_counts replaces the deprecated top-level pd.value_counts\n",
  "values.value_counts()"
 ]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"values = pd.Series([0, 1, 0, 0] * 2)\n",
"dim = pd.Series(['apple', 'orange'])\n",
"values\n",
"dim"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"dim.take(values)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"fruits = ['apple', 'orange', 'apple', 'apple'] * 2\n",
"N = len(fruits)\n",
"rng = np.random.default_rng(seed=12345)\n",
"df = pd.DataFrame({'fruit': fruits,\n",
" 'basket_id': np.arange(N),\n",
" 'count': rng.integers(3, 15, size=N),\n",
" 'weight': rng.uniform(0, 4, size=N)},\n",
" columns=['basket_id', 'fruit', 'count', 'weight'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"fruit_cat = df['fruit'].astype('category')\n",
"fruit_cat"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"c = fruit_cat.array\n",
"type(c)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"c.categories\n",
"c.codes"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"dict(enumerate(c.categories))"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"df['fruit'] = df['fruit'].astype('category')\n",
"df[\"fruit\"]"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])\n",
"my_categories"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"categories = ['foo', 'bar', 'baz']\n",
"codes = [0, 1, 2, 0, 0, 1]\n",
"my_cats_2 = pd.Categorical.from_codes(codes, categories)\n",
"my_cats_2"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"ordered_cat = pd.Categorical.from_codes(codes, categories,\n",
" ordered=True)\n",
"ordered_cat"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"my_cats_2.as_ordered()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"rng = np.random.default_rng(seed=12345)\n",
"draws = rng.standard_normal(1000)\n",
"draws[:5]"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"bins = pd.qcut(draws, 4)\n",
"bins"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\n",
"bins\n",
"bins.codes[:10]"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"bins = pd.Series(bins, name='quartile')\n",
"results = (pd.Series(draws)\n",
" .groupby(bins)\n",
" .agg(['count', 'min', 'max'])\n",
" .reset_index())\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"results['quartile']"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"N = 10_000_000\n",
"labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"categories = labels.astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"labels.memory_usage(deep=True)\n",
"categories.memory_usage(deep=True)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"%time _ = labels.astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"%timeit labels.value_counts()\n",
"%timeit categories.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series(['a', 'b', 'c', 'd'] * 2)\n",
"cat_s = s.astype('category')\n",
"cat_s"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"cat_s.cat.codes\n",
"cat_s.cat.categories"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"actual_categories = ['a', 'b', 'c', 'd', 'e']\n",
"cat_s2 = cat_s.cat.set_categories(actual_categories)\n",
"cat_s2"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"cat_s.value_counts()\n",
"cat_s2.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"cat_s3 = cat_s[cat_s.isin(['a', 'b'])]\n",
"cat_s3\n",
"cat_s3.cat.remove_unused_categories()"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"pd.get_dummies(cat_s, dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch08.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"pd.options.display.max_rows = 20\n",
"pd.options.display.max_colwidth = 80\n",
"pd.options.display.max_columns = 20\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data = pd.Series(np.random.uniform(size=9),\n",
" index=[[\"a\", \"a\", \"a\", \"b\", \"b\", \"c\", \"c\", \"d\", \"d\"],\n",
" [1, 2, 3, 1, 3, 1, 2, 2, 3]])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data.index"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data[\"b\"]\n",
"data[\"b\":\"c\"]\n",
"data.loc[[\"b\", \"d\"]]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"data.loc[:, 2]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data.unstack()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data.unstack().stack()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame(np.arange(12).reshape((4, 3)),\n",
" index=[[\"a\", \"a\", \"b\", \"b\"], [1, 2, 1, 2]],\n",
" columns=[[\"Ohio\", \"Ohio\", \"Colorado\"],\n",
" [\"Green\", \"Red\", \"Green\"]])\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"frame.index.names = [\"key1\", \"key2\"]\n",
"frame.columns.names = [\"state\", \"color\"]\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"frame.index.nlevels"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"frame[\"Ohio\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"frame.swaplevel(\"key1\", \"key2\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"frame.sort_index(level=1)\n",
"frame.swaplevel(0, 1).sort_index(level=0)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"frame.groupby(level=\"key2\").sum()\n",
"frame.groupby(level=\"color\", axis=\"columns\").sum()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"frame = pd.DataFrame({\"a\": range(7), \"b\": range(7, 0, -1),\n",
" \"c\": [\"one\", \"one\", \"one\", \"two\", \"two\",\n",
" \"two\", \"two\"],\n",
" \"d\": [0, 1, 2, 0, 1, 2, 3]})\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"frame2 = frame.set_index([\"c\", \"d\"])\n",
"frame2"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"frame.set_index([\"c\", \"d\"], drop=False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"frame2.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"a\", \"b\"],\n",
" \"data1\": pd.Series(range(7), dtype=\"Int64\")})\n",
"df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"d\"],\n",
" \"data2\": pd.Series(range(3), dtype=\"Int64\")})\n",
"df1\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(df1, df2)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(df1, df2, on=\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"df3 = pd.DataFrame({\"lkey\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"a\", \"b\"],\n",
" \"data1\": pd.Series(range(7), dtype=\"Int64\")})\n",
"df4 = pd.DataFrame({\"rkey\": [\"a\", \"b\", \"d\"],\n",
" \"data2\": pd.Series(range(3), dtype=\"Int64\")})\n",
"pd.merge(df3, df4, left_on=\"lkey\", right_on=\"rkey\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(df1, df2, how=\"outer\")\n",
"pd.merge(df3, df4, left_on=\"lkey\", right_on=\"rkey\", how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame({\"key\": [\"b\", \"b\", \"a\", \"c\", \"a\", \"b\"],\n",
" \"data1\": pd.Series(range(6), dtype=\"Int64\")})\n",
"df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"a\", \"b\", \"d\"],\n",
" \"data2\": pd.Series(range(5), dtype=\"Int64\")})\n",
"df1\n",
"df2\n",
"pd.merge(df1, df2, on=\"key\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(df1, df2, how=\"inner\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"left = pd.DataFrame({\"key1\": [\"foo\", \"foo\", \"bar\"],\n",
" \"key2\": [\"one\", \"two\", \"one\"],\n",
" \"lval\": pd.Series([1, 2, 3], dtype='Int64')})\n",
"right = pd.DataFrame({\"key1\": [\"foo\", \"foo\", \"bar\", \"bar\"],\n",
" \"key2\": [\"one\", \"one\", \"one\", \"two\"],\n",
" \"rval\": pd.Series([4, 5, 6, 7], dtype='Int64')})\n",
"pd.merge(left, right, on=[\"key1\", \"key2\"], how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(left, right, on=\"key1\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(left, right, on=\"key1\", suffixes=(\"_left\", \"_right\"))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"left1 = pd.DataFrame({\"key\": [\"a\", \"b\", \"a\", \"a\", \"b\", \"c\"],\n",
" \"value\": pd.Series(range(6), dtype=\"Int64\")})\n",
"right1 = pd.DataFrame({\"group_val\": [3.5, 7]}, index=[\"a\", \"b\"])\n",
"left1\n",
"right1\n",
"pd.merge(left1, right1, left_on=\"key\", right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(left1, right1, left_on=\"key\", right_index=True, how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"lefth = pd.DataFrame({\"key1\": [\"Ohio\", \"Ohio\", \"Ohio\",\n",
" \"Nevada\", \"Nevada\"],\n",
" \"key2\": [2000, 2001, 2002, 2001, 2002],\n",
" \"data\": pd.Series(range(5), dtype=\"Int64\")})\n",
"righth_index = pd.MultiIndex.from_arrays(\n",
" [\n",
" [\"Nevada\", \"Nevada\", \"Ohio\", \"Ohio\", \"Ohio\", \"Ohio\"],\n",
" [2001, 2000, 2000, 2000, 2001, 2002]\n",
" ]\n",
")\n",
"righth = pd.DataFrame({\"event1\": pd.Series([0, 2, 4, 6, 8, 10], dtype=\"Int64\",\n",
" index=righth_index),\n",
" \"event2\": pd.Series([1, 3, 5, 7, 9, 11], dtype=\"Int64\",\n",
" index=righth_index)})\n",
"lefth\n",
"righth"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"pd.merge(lefth, righth, left_on=[\"key1\", \"key2\"], right_index=True)\n",
"pd.merge(lefth, righth, left_on=[\"key1\", \"key2\"],\n",
" right_index=True, how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],\n",
" index=[\"a\", \"c\", \"e\"],\n",
" columns=[\"Ohio\", \"Nevada\"]).astype(\"Int64\")\n",
"right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n",
" index=[\"b\", \"c\", \"d\", \"e\"],\n",
" columns=[\"Missouri\", \"Alabama\"]).astype(\"Int64\")\n",
"left2\n",
"right2\n",
"pd.merge(left2, right2, how=\"outer\", left_index=True, right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"left2.join(right2, how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"left1.join(right1, on=\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n",
" index=[\"a\", \"c\", \"e\", \"f\"],\n",
" columns=[\"New York\", \"Oregon\"])\n",
"another\n",
"left2.join([right2, another])\n",
"left2.join([right2, another], how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(12).reshape((3, 4))\n",
"arr\n",
"np.concatenate([arr, arr], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"s1 = pd.Series([0, 1], index=[\"a\", \"b\"], dtype=\"Int64\")\n",
"s2 = pd.Series([2, 3, 4], index=[\"c\", \"d\", \"e\"], dtype=\"Int64\")\n",
"s3 = pd.Series([5, 6], index=[\"f\", \"g\"], dtype=\"Int64\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"s1\n",
"s2\n",
"s3\n",
"pd.concat([s1, s2, s3])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"pd.concat([s1, s2, s3], axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"s4 = pd.concat([s1, s3])\n",
"s4\n",
"pd.concat([s1, s4], axis=\"columns\")\n",
"pd.concat([s1, s4], axis=\"columns\", join=\"inner\")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"result = pd.concat([s1, s1, s3], keys=[\"one\", \"two\", \"three\"])\n",
"result\n",
"result.unstack()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"pd.concat([s1, s2, s3], axis=\"columns\", keys=[\"one\", \"two\", \"three\"])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=[\"a\", \"b\", \"c\"],\n",
" columns=[\"one\", \"two\"])\n",
"df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=[\"a\", \"c\"],\n",
" columns=[\"three\", \"four\"])\n",
"df1\n",
"df2\n",
"pd.concat([df1, df2], axis=\"columns\", keys=[\"level1\", \"level2\"])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"pd.concat({\"level1\": df1, \"level2\": df2}, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"pd.concat([df1, df2], axis=\"columns\", keys=[\"level1\", \"level2\"],\n",
" names=[\"upper\", \"lower\"])"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame(np.random.standard_normal((3, 4)),\n",
" columns=[\"a\", \"b\", \"c\", \"d\"])\n",
"df2 = pd.DataFrame(np.random.standard_normal((2, 3)),\n",
" columns=[\"b\", \"d\", \"a\"])\n",
"df1\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"pd.concat([df1, df2], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],\n",
" index=[\"f\", \"e\", \"d\", \"c\", \"b\", \"a\"])\n",
"b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],\n",
" index=[\"a\", \"b\", \"c\", \"d\", \"e\", \"f\"])\n",
"a\n",
"b\n",
"np.where(pd.isna(a), b, a)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"a.combine_first(b)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame({\"a\": [1., np.nan, 5., np.nan],\n",
" \"b\": [np.nan, 2., np.nan, 6.],\n",
" \"c\": range(2, 18, 4)})\n",
"df2 = pd.DataFrame({\"a\": [5., 4., np.nan, 3., 7.],\n",
" \"b\": [np.nan, 3., 4., 6., 8.]})\n",
"df1\n",
"df2\n",
"df1.combine_first(df2)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(np.arange(6).reshape((2, 3)),\n",
" index=pd.Index([\"Ohio\", \"Colorado\"], name=\"state\"),\n",
" columns=pd.Index([\"one\", \"two\", \"three\"],\n",
" name=\"number\"))\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"result = data.stack()\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"result.unstack()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"result.unstack(level=0)\n",
"result.unstack(level=\"state\")"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"s1 = pd.Series([0, 1, 2, 3], index=[\"a\", \"b\", \"c\", \"d\"], dtype=\"Int64\")\n",
"s2 = pd.Series([4, 5, 6], index=[\"c\", \"d\", \"e\"], dtype=\"Int64\")\n",
"data2 = pd.concat([s1, s2], keys=[\"one\", \"two\"])\n",
"data2"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"data2.unstack()\n",
"data2.unstack().stack()\n",
"data2.unstack().stack(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"left\": result, \"right\": result + 5},\n",
" columns=pd.Index([\"left\", \"right\"], name=\"side\"))\n",
"df\n",
"df.unstack(level=\"state\")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"df.unstack(level=\"state\").stack(level=\"side\")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(\"examples/macrodata.csv\")\n",
"data = data.loc[:, [\"year\", \"quarter\", \"realgdp\", \"infl\", \"unemp\"]]\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"periods = pd.PeriodIndex(year=data.pop(\"year\"),\n",
" quarter=data.pop(\"quarter\"),\n",
" name=\"date\")\n",
"periods\n",
"data.index = periods.to_timestamp(\"D\")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"data = data.reindex(columns=[\"realgdp\", \"infl\", \"unemp\"])\n",
"data.columns.name = \"item\"\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"long_data = (data.stack()\n",
" .reset_index()\n",
" .rename(columns={0: \"value\"}))"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"long_data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"pivoted = long_data.pivot(index=\"date\", columns=\"item\",\n",
" values=\"value\")\n",
"pivoted.head()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"long_data.index.name = None"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"long_data[\"value2\"] = np.random.standard_normal(len(long_data))\n",
"long_data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"pivoted = long_data.pivot(index=\"date\", columns=\"item\")\n",
"pivoted.head()\n",
"pivoted[\"value\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"unstacked = long_data.set_index([\"date\", \"item\"]).unstack(level=\"item\")\n",
"unstacked.head()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key\": [\"foo\", \"bar\", \"baz\"],\n",
" \"A\": [1, 2, 3],\n",
" \"B\": [4, 5, 6],\n",
" \"C\": [7, 8, 9]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"melted = pd.melt(df, id_vars=\"key\")\n",
"melted"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"reshaped = melted.pivot(index=\"key\", columns=\"variable\",\n",
" values=\"value\")\n",
"reshaped"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"reshaped.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"pd.melt(df, id_vars=\"key\", value_vars=[\"A\", \"B\"])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"pd.melt(df, value_vars=[\"A\", \"B\", \"C\"])\n",
"pd.melt(df, value_vars=[\"key\", \"A\", \"B\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch09.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
"pd.options.display.max_rows = 20\n",
"pd.options.display.max_colwidth = 80\n",
"pd.options.display.max_columns = 20\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = np.arange(10)\n",
"data\n",
"plt.plot(data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"ax1 = fig.add_subplot(2, 2, 1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"ax2 = fig.add_subplot(2, 2, 2)\n",
"ax3 = fig.add_subplot(2, 2, 3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"ax3.plot(np.random.standard_normal(50).cumsum(), color=\"black\",\n",
" linestyle=\"dashed\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"ax1.hist(np.random.standard_normal(100), bins=20, color=\"black\", alpha=0.3);\n",
"ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.standard_normal(30));"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"fig, axes = plt.subplots(2, 3)\n",
"axes"
]
},
{
 "cell_type": "code",
 "execution_count": 11,
 "metadata": {},
 "outputs": [],
 "source": [
  "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n",
  "# Fill every subplot of the 2x2 grid with its own histogram\n",
  "for i in range(2):\n",
  "    for j in range(2):\n",
  "        axes[i, j].hist(np.random.standard_normal(500), bins=50,\n",
  "                        color=\"black\", alpha=0.5)\n",
  "fig.subplots_adjust(wspace=0, hspace=0)"
 ]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"ax = fig.add_subplot()\n",
"ax.plot(np.random.standard_normal(30).cumsum(), color=\"black\",\n",
" linestyle=\"dashed\", marker=\"o\");"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure()\n",
"ax = fig.add_subplot()\n",
"data = np.random.standard_normal(30).cumsum()\n",
"ax.plot(data, color=\"black\", linestyle=\"dashed\", label=\"Default\");\n",
"ax.plot(data, color=\"black\", linestyle=\"dashed\",\n",
" drawstyle=\"steps-post\", label=\"steps-post\");\n",
"ax.legend()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots()\n",
"ax.plot(np.random.standard_normal(1000).cumsum());"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"ticks = ax.set_xticks([0, 250, 500, 750, 1000])\n",
"labels = ax.set_xticklabels([\"one\", \"two\", \"three\", \"four\", \"five\"],\n",
" rotation=30, fontsize=8)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"ax.set_xlabel(\"Stages\")\n",
"ax.set_title(\"My first matplotlib plot\")"
]
},
{
 "cell_type": "code",
 "execution_count": 19,
 "metadata": {},
 "outputs": [],
 "source": [
  "fig, ax = plt.subplots()\n",
  "# standard_normal is the spelling used by the other cells in this notebook;\n",
  "# it draws the same values as the legacy randn alias\n",
  "ax.plot(np.random.standard_normal(1000).cumsum(), color=\"black\", label=\"one\");\n",
  "ax.plot(np.random.standard_normal(1000).cumsum(), color=\"black\",\n",
  "        linestyle=\"dashed\", label=\"two\");\n",
  "ax.plot(np.random.standard_normal(1000).cumsum(), color=\"black\",\n",
  "        linestyle=\"dotted\", label=\"three\");"
 ]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"ax.legend()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"fig, ax = plt.subplots()\n",
"\n",
"data = pd.read_csv(\"examples/spx.csv\", index_col=0, parse_dates=True)\n",
"spx = data[\"SPX\"]\n",
"\n",
"spx.plot(ax=ax, color=\"black\")\n",
"\n",
"crisis_data = [\n",
" (datetime(2007, 10, 11), \"Peak of bull market\"),\n",
" (datetime(2008, 3, 12), \"Bear Stearns Fails\"),\n",
" (datetime(2008, 9, 15), \"Lehman Bankruptcy\")\n",
"]\n",
"\n",
"for date, label in crisis_data:\n",
" ax.annotate(label, xy=(date, spx.asof(date) + 75),\n",
" xytext=(date, spx.asof(date) + 225),\n",
" arrowprops=dict(facecolor=\"black\", headwidth=4, width=2,\n",
" headlength=4),\n",
" horizontalalignment=\"left\", verticalalignment=\"top\")\n",
"\n",
"# Zoom in on 2007-2010\n",
"ax.set_xlim([\"1/1/2007\", \"1/1/2011\"])\n",
"ax.set_ylim([600, 1800])\n",
"\n",
"ax.set_title(\"Important dates in the 2008\u20132009 financial crisis\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"ax.set_title(\"Important dates in the 2008\u20132009 financial crisis\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(12, 6))\n",
"rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color=\"black\", alpha=0.3)\n",
"circ = plt.Circle((0.7, 0.2), 0.15, color=\"blue\", alpha=0.3)\n",
"pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n",
" color=\"green\", alpha=0.5)\n",
"ax.add_patch(rect)\n",
"ax.add_patch(circ)\n",
"ax.add_patch(pgon)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series(np.random.standard_normal(10).cumsum(), index=np.arange(0, 100, 10))\n",
"s.plot()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.random.standard_normal((10, 4)).cumsum(0),\n",
" columns=[\"A\", \"B\", \"C\", \"D\"],\n",
" index=np.arange(0, 100, 10))\n",
"plt.style.use('grayscale')\n",
"df.plot()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"fig, axes = plt.subplots(2, 1)\n",
"data = pd.Series(np.random.uniform(size=16), index=list(\"abcdefghijklmnop\"))\n",
"data.plot.bar(ax=axes[0], color=\"black\", alpha=0.7)\n",
"data.plot.barh(ax=axes[1], color=\"black\", alpha=0.7)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(12348)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(np.random.uniform(size=(6, 4)),\n",
" index=[\"one\", \"two\", \"three\", \"four\", \"five\", \"six\"],\n",
" columns=pd.Index([\"A\", \"B\", \"C\", \"D\"], name=\"Genus\"))\n",
"df\n",
"df.plot.bar()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"df.plot.barh(stacked=True, alpha=0.5)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"tips = pd.read_csv(\"examples/tips.csv\")\n",
"tips.head()\n",
"party_counts = pd.crosstab(tips[\"day\"], tips[\"size\"])\n",
"party_counts = party_counts.reindex(index=[\"Thur\", \"Fri\", \"Sat\", \"Sun\"])\n",
"party_counts"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"party_counts = party_counts.loc[:, 2:5]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Normalize to sum to 1\n",
"party_pcts = party_counts.div(party_counts.sum(axis=\"columns\"),\n",
" axis=\"index\")\n",
"party_pcts\n",
"party_pcts.plot.bar(stacked=True)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"\n",
"tips[\"tip_pct\"] = tips[\"tip\"] / (tips[\"total_bill\"] - tips[\"tip\"])\n",
"tips.head()\n",
"sns.barplot(x=\"tip_pct\", y=\"day\", data=tips, orient=\"h\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"sns.barplot(x=\"tip_pct\", y=\"day\", hue=\"time\", data=tips, orient=\"h\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"plt.close(\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"sns.set_style(\"whitegrid\")"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"tips[\"tip_pct\"].plot.hist(bins=50)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"tips[\"tip_pct\"].plot.density()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"comp1 = np.random.standard_normal(200)\n",
"comp2 = 10 + 2 * np.random.standard_normal(200)\n",
"values = pd.Series(np.concatenate([comp1, comp2]))\n",
"\n",
"sns.histplot(values, bins=100, color=\"black\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"macro = pd.read_csv(\"examples/macrodata.csv\")\n",
"data = macro[[\"cpi\", \"m1\", \"tbilrate\", \"unemp\"]]\n",
"trans_data = np.log(data).diff().dropna()\n",
"trans_data.tail()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"ax = sns.regplot(x=\"m1\", y=\"unemp\", data=trans_data)\n",
"ax.set_title(\"Changes in log(m1) versus log(unemp)\")"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"sns.pairplot(trans_data, diag_kind=\"kde\", plot_kws={\"alpha\": 0.2})"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"sns.catplot(x=\"day\", y=\"tip_pct\", hue=\"time\", col=\"smoker\",\n",
" kind=\"bar\", data=tips[tips.tip_pct < 1])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"sns.catplot(x=\"day\", y=\"tip_pct\", row=\"time\",\n",
" col=\"smoker\",\n",
" kind=\"bar\", data=tips[tips.tip_pct < 1])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"sns.catplot(x=\"tip_pct\", y=\"day\", kind=\"box\",\n",
" data=tips[tips.tip_pct < 0.5])"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
================================================
FILE: ch10.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
"pd.options.display.max_columns = 20\n",
"pd.options.display.max_rows = 20\n",
"pd.options.display.max_colwidth = 80\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc(\"figure\", figsize=(10, 6))\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({\"key1\" : [\"a\", \"a\", None, \"b\", \"b\", \"a\", None],\n",
" \"key2\" : pd.Series([1, 2, 1, 2, 1, None, 1],\n",
" dtype=\"Int64\"),\n",
" \"data1\" : np.random.standard_normal(7),\n",
" \"data2\" : np.random.standard_normal(7)})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"grouped = df[\"data1\"].groupby(df[\"key1\"])\n",
"grouped"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"grouped.mean()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"means = df[\"data1\"].groupby([df[\"key1\"], df[\"key2\"]]).mean()\n",
"means"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"means.unstack()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"states = np.array([\"OH\", \"CA\", \"CA\", \"OH\", \"OH\", \"CA\", \"OH\"])\n",
"years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]\n",
"df[\"data1\"].groupby([states, years]).mean()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df.groupby(\"key1\").mean()\n",
"df.groupby(\"key2\").mean(numeric_only=True)\n",
"df.groupby([\"key1\", \"key2\"]).mean()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df.groupby([\"key1\", \"key2\"]).size()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df.groupby(\"key1\", dropna=False).size()\n",
"df.groupby([\"key1\", \"key2\"], dropna=False).size()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df.groupby(\"key1\").count()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"for name, group in df.groupby(\"key1\"):\n",
" print(name)\n",
" print(group)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"for (k1, k2), group in df.groupby([\"key1\", \"key2\"]):\n",
" print((k1, k2))\n",
" print(group)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"pieces = {name: group for name, group in df.groupby(\"key1\")}\n",
"pieces[\"b\"]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"grouped = df.groupby({\"key1\": \"key\", \"key2\": \"key\",\n",
" \"data1\": \"data\", \"data2\": \"data\"}, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"for group_key, group_values in grouped:\n",
" print(group_key)\n",
" print(group_values)\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"df.groupby([\"key1\", \"key2\"])[[\"data2\"]].mean()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"s_grouped = df.groupby([\"key1\", \"key2\"])[\"data2\"]\n",
"s_grouped\n",
"s_grouped.mean()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"people = pd.DataFrame(np.random.standard_normal((5, 5)),\n",
" columns=[\"a\", \"b\", \"c\", \"d\", \"e\"],\n",
" index=[\"Joe\", \"Steve\", \"Wanda\", \"Jill\", \"Trey\"])\n",
"people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values\n",
"people"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"mapping = {\"a\": \"red\", \"b\": \"red\", \"c\": \"blue\",\n",
" \"d\": \"blue\", \"e\": \"red\", \"f\" : \"orange\"}"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"by_column = people.groupby(mapping, axis=\"columns\")\n",
"by_column.sum()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"map_series = pd.Series(mapping)\n",
"map_series\n",
"people.groupby(map_series, axis=\"columns\").count()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"people.groupby(len).sum()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"key_list = [\"one\", \"one\", \"one\", \"two\", \"two\"]\n",
"people.groupby([len, key_list]).min()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"columns = pd.MultiIndex.from_arrays([[\"US\", \"US\", \"US\", \"JP\", \"JP\"],\
gitextract_mai23lqn/ ├── .gitignore ├── COPYING ├── README.md ├── appa.ipynb ├── appb.ipynb ├── ch02.ipynb ├── ch03.ipynb ├── ch04.ipynb ├── ch05.ipynb ├── ch06.ipynb ├── ch07.ipynb ├── ch08.ipynb ├── ch09.ipynb ├── ch10.ipynb ├── ch11.ipynb ├── ch12.ipynb ├── ch13.ipynb ├── datasets/ │ ├── babynames/ │ │ ├── yob1880.txt │ │ ├── yob1881.txt │ │ ├── yob1882.txt │ │ ├── yob1883.txt │ │ ├── yob1884.txt │ │ ├── yob1885.txt │ │ ├── yob1886.txt │ │ ├── yob1887.txt │ │ ├── yob1888.txt │ │ ├── yob1889.txt │ │ ├── yob1890.txt │ │ ├── yob1891.txt │ │ ├── yob1892.txt │ │ ├── yob1893.txt │ │ ├── yob1894.txt │ │ ├── yob1895.txt │ │ ├── yob1896.txt │ │ ├── yob1897.txt │ │ ├── yob1898.txt │ │ ├── yob1899.txt │ │ ├── yob1900.txt │ │ ├── yob1901.txt │ │ ├── yob1902.txt │ │ ├── yob1903.txt │ │ ├── yob1904.txt │ │ ├── yob1905.txt │ │ ├── yob1906.txt │ │ ├── yob1907.txt │ │ ├── yob1908.txt │ │ ├── yob1909.txt │ │ ├── yob1910.txt │ │ ├── yob1911.txt │ │ ├── yob1912.txt │ │ ├── yob1913.txt │ │ ├── yob1914.txt │ │ ├── yob1915.txt │ │ ├── yob1916.txt │ │ ├── yob1917.txt │ │ ├── yob1918.txt │ │ ├── yob1919.txt │ │ ├── yob1920.txt │ │ ├── yob1921.txt │ │ ├── yob1922.txt │ │ ├── yob1923.txt │ │ ├── yob1924.txt │ │ ├── yob1925.txt │ │ ├── yob1926.txt │ │ ├── yob1927.txt │ │ ├── yob1928.txt │ │ ├── yob1929.txt │ │ ├── yob1930.txt │ │ ├── yob1931.txt │ │ ├── yob1932.txt │ │ ├── yob1933.txt │ │ ├── yob1934.txt │ │ ├── yob1935.txt │ │ ├── yob1936.txt │ │ ├── yob1937.txt │ │ ├── yob1938.txt │ │ ├── yob1939.txt │ │ ├── yob1940.txt │ │ ├── yob1941.txt │ │ ├── yob1942.txt │ │ ├── yob1943.txt │ │ ├── yob1944.txt │ │ ├── yob1945.txt │ │ ├── yob1946.txt │ │ ├── yob1947.txt │ │ ├── yob1948.txt │ │ ├── yob1949.txt │ │ ├── yob1950.txt │ │ ├── yob1951.txt │ │ ├── yob1952.txt │ │ ├── yob1953.txt │ │ ├── yob1954.txt │ │ ├── yob1955.txt │ │ ├── yob1956.txt │ │ ├── yob1957.txt │ │ ├── yob1958.txt │ │ ├── yob1959.txt │ │ ├── yob1960.txt │ │ ├── yob1961.txt │ │ ├── yob1962.txt │ │ ├── yob1963.txt │ │ ├── 
yob1964.txt │ │ ├── yob1965.txt │ │ ├── yob1966.txt │ │ ├── yob1967.txt │ │ ├── yob1968.txt │ │ ├── yob1969.txt │ │ ├── yob1970.txt │ │ ├── yob1971.txt │ │ ├── yob1972.txt │ │ ├── yob1973.txt │ │ ├── yob1974.txt │ │ ├── yob1975.txt │ │ ├── yob1976.txt │ │ ├── yob1977.txt │ │ ├── yob1978.txt │ │ ├── yob1979.txt │ │ ├── yob1980.txt │ │ ├── yob1981.txt │ │ ├── yob1982.txt │ │ ├── yob1983.txt │ │ ├── yob1984.txt │ │ ├── yob1985.txt │ │ ├── yob1986.txt │ │ ├── yob1987.txt │ │ ├── yob1988.txt │ │ ├── yob1989.txt │ │ ├── yob1990.txt │ │ ├── yob1991.txt │ │ ├── yob1992.txt │ │ ├── yob1993.txt │ │ ├── yob1994.txt │ │ ├── yob1995.txt │ │ ├── yob1996.txt │ │ ├── yob1997.txt │ │ ├── yob1998.txt │ │ ├── yob1999.txt │ │ ├── yob2000.txt │ │ ├── yob2001.txt │ │ ├── yob2002.txt │ │ ├── yob2003.txt │ │ ├── yob2004.txt │ │ ├── yob2005.txt │ │ ├── yob2006.txt │ │ ├── yob2007.txt │ │ ├── yob2008.txt │ │ ├── yob2009.txt │ │ └── yob2010.txt │ ├── bitly_usagov/ │ │ └── example.txt │ ├── fec/ │ │ ├── P00000001-ALL.csv │ │ └── fec.parquet │ ├── haiti/ │ │ ├── Haiti.csv │ │ └── PortAuPrince_Roads/ │ │ ├── PortAuPrince_Roads.dbf │ │ ├── PortAuPrince_Roads.prj │ │ ├── PortAuPrince_Roads.sbn │ │ ├── PortAuPrince_Roads.sbx │ │ ├── PortAuPrince_Roads.shp │ │ ├── PortAuPrince_Roads.shx │ │ └── PortAuPrince_Roads_README.txt │ ├── movielens/ │ │ └── README │ ├── mta_perf/ │ │ ├── Performance_LIBUS.xml │ │ ├── Performance_LIBUS.xsd │ │ ├── Performance_LIRR.xml │ │ ├── Performance_LIRR.xsd │ │ ├── Performance_MNR.xml │ │ ├── Performance_MNR.xsd │ │ ├── Performance_MTABUS.xml │ │ ├── Performance_MTABUS.xsd │ │ ├── Performance_NYCT.xml │ │ ├── Performance_NYCT.xsd │ │ ├── Performance_TBTA.xml │ │ ├── Performance_TBTA.xsd │ │ └── parse.py │ ├── titanic/ │ │ ├── genderclassmodel.csv │ │ ├── gendermodel.csv │ │ ├── test.csv │ │ └── train.csv │ └── usda_food/ │ └── database.json ├── examples/ │ ├── array_ex.txt │ ├── csv_mindex.csv │ ├── ex1.csv │ ├── ex1.xlsx │ ├── ex2.csv │ ├── ex3.txt │ ├── ex4.csv │ ├── 
ex5.csv │ ├── ex6.csv │ ├── ex7.csv │ ├── example.json │ ├── fdic_failed_bank_list.html │ ├── ipython_bug.py │ ├── macrodata.csv │ ├── segismundo.txt │ ├── spx.csv │ ├── stock_px.csv │ ├── test_file.csv │ ├── tips.csv │ ├── tseries.csv │ ├── volume.csv │ ├── yahoo_price.pkl │ └── yahoo_volume.pkl ├── pyproject.toml └── requirements.txt
Copy disabled (too large)
Download .json
Condensed preview — 203 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (18,251K chars).
[
{
"path": ".gitignore",
"chars": 18,
"preview": ".ipynb_checkpoints"
},
{
"path": "COPYING",
"chars": 1138,
"preview": "Code examples from \"Python for Data Analysis\", 3rd Edition\n\nThe MIT License (MIT)\n\nCopyright (c) 2022 Wes McKinney\n\nPerm"
},
{
"path": "README.md",
"chars": 3750,
"preview": "# Python for Data Analysis, 3rd Edition\n\nMaterials and IPython notebooks for \"Python for Data Analysis, 3rd\nEdition\" by "
},
{
"path": "appa.ipynb",
"chars": 18320,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "appb.ipynb",
"chars": 1868,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch02.ipynb",
"chars": 12089,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch03.ipynb",
"chars": 25308,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": []\n"
},
{
"path": "ch04.ipynb",
"chars": 22417,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch05.ipynb",
"chars": 29194,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch06.ipynb",
"chars": 16500,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch07.ipynb",
"chars": 26450,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch08.ipynb",
"chars": 20734,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch09.ipynb",
"chars": 14368,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch10.ipynb",
"chars": 21850,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch11.ipynb",
"chars": 27066,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch12.ipynb",
"chars": 11519,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "ch13.ipynb",
"chars": 31243,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "datasets/babynames/yob1880.txt",
"chars": 24933,
"preview": "Mary,F,7065\r\nAnna,F,2604\r\nEmma,F,2003\r\nElizabeth,F,1939\r\nMinnie,F,1746\r\nMargaret,F,1578\r\nIda,F,1472\r\nAlice,F,1414\r\nBerth"
},
{
"path": "datasets/babynames/yob1881.txt",
"chars": 24065,
"preview": "Mary,F,6919\r\nAnna,F,2698\r\nEmma,F,2034\r\nElizabeth,F,1852\r\nMargaret,F,1658\r\nMinnie,F,1653\r\nIda,F,1439\r\nAnnie,F,1326\r\nBerth"
},
{
"path": "datasets/babynames/yob1882.txt",
"chars": 26559,
"preview": "Mary,F,8149\r\nAnna,F,3143\r\nEmma,F,2303\r\nElizabeth,F,2187\r\nMinnie,F,2004\r\nMargaret,F,1821\r\nIda,F,1673\r\nAlice,F,1542\r\nBerth"
},
{
"path": "datasets/babynames/yob1883.txt",
"chars": 26003,
"preview": "Mary,F,8012\r\nAnna,F,3306\r\nEmma,F,2367\r\nElizabeth,F,2255\r\nMinnie,F,2035\r\nMargaret,F,1881\r\nBertha,F,1681\r\nIda,F,1634\r\nAnni"
},
{
"path": "datasets/babynames/yob1884.txt",
"chars": 28670,
"preview": "Mary,F,9217\r\nAnna,F,3860\r\nEmma,F,2587\r\nElizabeth,F,2549\r\nMinnie,F,2243\r\nMargaret,F,2143\r\nIda,F,1882\r\nClara,F,1852\r\nBerth"
},
{
"path": "datasets/babynames/yob1885.txt",
"chars": 28625,
"preview": "Mary,F,9128\r\nAnna,F,3994\r\nEmma,F,2728\r\nElizabeth,F,2582\r\nMargaret,F,2204\r\nMinnie,F,2178\r\nClara,F,1910\r\nBertha,F,1860\r\nId"
},
{
"path": "datasets/babynames/yob1886.txt",
"chars": 29822,
"preview": "Mary,F,9891\r\nAnna,F,4283\r\nEmma,F,2764\r\nElizabeth,F,2680\r\nMinnie,F,2372\r\nMargaret,F,2275\r\nIda,F,2049\r\nBertha,F,2001\r\nClar"
},
{
"path": "datasets/babynames/yob1887.txt",
"chars": 29532,
"preview": "Mary,F,9888\r\nAnna,F,4227\r\nElizabeth,F,2681\r\nEmma,F,2647\r\nMargaret,F,2419\r\nMinnie,F,2215\r\nBertha,F,2037\r\nClara,F,1984\r\nFl"
},
{
"path": "datasets/babynames/yob1888.txt",
"chars": 33065,
"preview": "Mary,F,11754\r\nAnna,F,4982\r\nElizabeth,F,3224\r\nEmma,F,3087\r\nMargaret,F,2904\r\nMinnie,F,2654\r\nBertha,F,2450\r\nFlorence,F,2444"
},
{
"path": "datasets/babynames/yob1889.txt",
"chars": 32297,
"preview": "Mary,F,11649\r\nAnna,F,5062\r\nElizabeth,F,3058\r\nMargaret,F,2917\r\nEmma,F,2884\r\nMinnie,F,2624\r\nFlorence,F,2465\r\nEthel,F,2463\r"
},
{
"path": "datasets/babynames/yob1890.txt",
"chars": 33621,
"preview": "Mary,F,12078\r\nAnna,F,5233\r\nElizabeth,F,3112\r\nMargaret,F,3100\r\nEmma,F,2980\r\nFlorence,F,2745\r\nEthel,F,2718\r\nMinnie,F,2650\r"
},
{
"path": "datasets/babynames/yob1891.txt",
"chars": 33186,
"preview": "Mary,F,11704\r\nAnna,F,5099\r\nMargaret,F,3066\r\nElizabeth,F,3059\r\nEmma,F,2884\r\nFlorence,F,2715\r\nEthel,F,2689\r\nMinnie,F,2428\r"
},
{
"path": "datasets/babynames/yob1892.txt",
"chars": 36542,
"preview": "Mary,F,13174\r\nAnna,F,5542\r\nElizabeth,F,3461\r\nMargaret,F,3435\r\nRuth,F,3291\r\nFlorence,F,3154\r\nEmma,F,3128\r\nEthel,F,3035\r\nH"
},
{
"path": "datasets/babynames/yob1893.txt",
"chars": 35434,
"preview": "Mary,F,12784\r\nAnna,F,5695\r\nRuth,F,3658\r\nMargaret,F,3565\r\nElizabeth,F,3361\r\nHelen,F,3249\r\nFlorence,F,3231\r\nEthel,F,3119\r\n"
},
{
"path": "datasets/babynames/yob1894.txt",
"chars": 36817,
"preview": "Mary,F,13151\r\nAnna,F,5565\r\nMargaret,F,3701\r\nHelen,F,3676\r\nElizabeth,F,3425\r\nRuth,F,3372\r\nEthel,F,3287\r\nFlorence,F,3233\r\n"
},
{
"path": "datasets/babynames/yob1895.txt",
"chars": 38232,
"preview": "Mary,F,13446\r\nAnna,F,5949\r\nHelen,F,4023\r\nMargaret,F,3931\r\nElizabeth,F,3603\r\nRuth,F,3551\r\nFlorence,F,3471\r\nEthel,F,3391\r\n"
},
{
"path": "datasets/babynames/yob1896.txt",
"chars": 38747,
"preview": "Mary,F,13811\r\nAnna,F,5860\r\nHelen,F,4392\r\nMargaret,F,4051\r\nRuth,F,3905\r\nEthel,F,3502\r\nElizabeth,F,3471\r\nFlorence,F,3323\r\n"
},
{
"path": "datasets/babynames/yob1897.txt",
"chars": 37936,
"preview": "Mary,F,13412\r\nAnna,F,5429\r\nHelen,F,4518\r\nMargaret,F,4145\r\nRuth,F,3878\r\nElizabeth,F,3442\r\nFlorence,F,3369\r\nEthel,F,3182\r\n"
},
{
"path": "datasets/babynames/yob1898.txt",
"chars": 40937,
"preview": "Mary,F,14406\r\nAnna,F,5773\r\nHelen,F,5230\r\nMargaret,F,4696\r\nRuth,F,4249\r\nFlorence,F,3790\r\nElizabeth,F,3659\r\nEthel,F,3531\r\n"
},
{
"path": "datasets/babynames/yob1899.txt",
"chars": 38141,
"preview": "Mary,F,13172\r\nAnna,F,5115\r\nHelen,F,5048\r\nMargaret,F,4249\r\nRuth,F,3912\r\nFlorence,F,3314\r\nElizabeth,F,3287\r\nMarie,F,3156\r\n"
},
{
"path": "datasets/babynames/yob1900.txt",
"chars": 46898,
"preview": "Mary,F,16710\r\nHelen,F,6343\r\nAnna,F,6115\r\nMargaret,F,5305\r\nRuth,F,4765\r\nElizabeth,F,4097\r\nFlorence,F,3920\r\nEthel,F,3896\r\n"
},
{
"path": "datasets/babynames/yob1901.txt",
"chars": 39584,
"preview": "Mary,F,13137\r\nHelen,F,5247\r\nAnna,F,4923\r\nMargaret,F,4424\r\nRuth,F,3974\r\nElizabeth,F,3216\r\nMarie,F,3157\r\nFlorence,F,3131\r\n"
},
{
"path": "datasets/babynames/yob1902.txt",
"chars": 42284,
"preview": "Mary,F,14485\r\nHelen,F,5967\r\nAnna,F,5288\r\nMargaret,F,5011\r\nRuth,F,4384\r\nElizabeth,F,3694\r\nFlorence,F,3509\r\nMarie,F,3423\r\n"
},
{
"path": "datasets/babynames/yob1903.txt",
"chars": 42679,
"preview": "Mary,F,14275\r\nHelen,F,6129\r\nAnna,F,5098\r\nMargaret,F,5046\r\nRuth,F,4518\r\nElizabeth,F,3723\r\nMarie,F,3469\r\nFlorence,F,3468\r\n"
},
{
"path": "datasets/babynames/yob1904.txt",
"chars": 44819,
"preview": "Mary,F,14962\r\nHelen,F,6489\r\nAnna,F,5330\r\nMargaret,F,5302\r\nRuth,F,4900\r\nElizabeth,F,3833\r\nMarie,F,3595\r\nFlorence,F,3572\r\n"
},
{
"path": "datasets/babynames/yob1905.txt",
"chars": 46015,
"preview": "Mary,F,16067\r\nHelen,F,6811\r\nMargaret,F,5690\r\nAnna,F,5424\r\nRuth,F,5068\r\nElizabeth,F,4122\r\nDorothy,F,3939\r\nMildred,F,3758\r"
},
{
"path": "datasets/babynames/yob1906.txt",
"chars": 45845,
"preview": "Mary,F,16371\r\nHelen,F,7176\r\nMargaret,F,6096\r\nAnna,F,5502\r\nRuth,F,5140\r\nDorothy,F,4326\r\nElizabeth,F,4321\r\nAlice,F,4192\r\nM"
},
{
"path": "datasets/babynames/yob1907.txt",
"chars": 49829,
"preview": "Mary,F,17579\r\nHelen,F,7579\r\nMargaret,F,6713\r\nAnna,F,5575\r\nRuth,F,5573\r\nDorothy,F,4966\r\nElizabeth,F,4623\r\nMildred,F,4277\r"
},
{
"path": "datasets/babynames/yob1908.txt",
"chars": 50841,
"preview": "Mary,F,18664\r\nHelen,F,8436\r\nMargaret,F,6976\r\nRuth,F,6179\r\nAnna,F,5858\r\nDorothy,F,5703\r\nElizabeth,F,4904\r\nMildred,F,4624\r"
},
{
"path": "datasets/babynames/yob1909.txt",
"chars": 53488,
"preview": "Mary,F,19256\r\nHelen,F,9249\r\nMargaret,F,7358\r\nRuth,F,6508\r\nDorothy,F,6252\r\nAnna,F,5803\r\nElizabeth,F,5175\r\nMildred,F,5053\r"
},
{
"path": "datasets/babynames/yob1910.txt",
"chars": 58699,
"preview": "Mary,F,22840\r\nHelen,F,10477\r\nMargaret,F,8226\r\nDorothy,F,7315\r\nRuth,F,7210\r\nAnna,F,6434\r\nElizabeth,F,5799\r\nMildred,F,5692"
},
{
"path": "datasets/babynames/yob1911.txt",
"chars": 61771,
"preview": "Mary,F,24385\r\nHelen,F,11799\r\nMargaret,F,9276\r\nDorothy,F,8869\r\nRuth,F,7999\r\nAnna,F,6752\r\nElizabeth,F,6296\r\nMildred,F,6269"
},
{
"path": "datasets/babynames/yob1912.txt",
"chars": 80705,
"preview": "Mary,F,32295\r\nHelen,F,16130\r\nDorothy,F,12642\r\nMargaret,F,12532\r\nRuth,F,11274\r\nMildred,F,8759\r\nAnna,F,8585\r\nElizabeth,F,8"
},
{
"path": "datasets/babynames/yob1913.txt",
"chars": 88666,
"preview": "Mary,F,36631\r\nHelen,F,18879\r\nDorothy,F,14670\r\nMargaret,F,14484\r\nRuth,F,12601\r\nMildred,F,9917\r\nAnna,F,9681\r\nElizabeth,F,9"
},
{
"path": "datasets/babynames/yob1914.txt",
"chars": 101529,
"preview": "Mary,F,45341\r\nHelen,F,23218\r\nDorothy,F,18779\r\nMargaret,F,17756\r\nRuth,F,15835\r\nAnna,F,11864\r\nMildred,F,11648\r\nElizabeth,F"
},
{
"path": "datasets/babynames/yob1915.txt",
"chars": 119483,
"preview": "Mary,F,58184\r\nHelen,F,30865\r\nDorothy,F,25150\r\nMargaret,F,23052\r\nRuth,F,21881\r\nMildred,F,15248\r\nAnna,F,15122\r\nElizabeth,F"
},
{
"path": "datasets/babynames/yob1916.txt",
"chars": 123879,
"preview": "Mary,F,61428\r\nHelen,F,32662\r\nDorothy,F,27415\r\nMargaret,F,24954\r\nRuth,F,23185\r\nMildred,F,15636\r\nAnna,F,15225\r\nElizabeth,F"
},
{
"path": "datasets/babynames/yob1917.txt",
"chars": 126782,
"preview": "Mary,F,64270\r\nHelen,F,34241\r\nDorothy,F,28851\r\nMargaret,F,25561\r\nRuth,F,23562\r\nMildred,F,16258\r\nAnna,F,15161\r\nElizabeth,F"
},
{
"path": "datasets/babynames/yob1918.txt",
"chars": 133101,
"preview": "Mary,F,67375\r\nHelen,F,36152\r\nDorothy,F,32031\r\nMargaret,F,27136\r\nRuth,F,25538\r\nMildred,F,17252\r\nVirginia,F,16419\r\nFrances"
},
{
"path": "datasets/babynames/yob1919.txt",
"chars": 132726,
"preview": "Mary,F,65839\r\nHelen,F,33705\r\nDorothy,F,31733\r\nMargaret,F,26240\r\nRuth,F,24569\r\nMildred,F,17300\r\nVirginia,F,15636\r\nElizabe"
},
{
"path": "datasets/babynames/yob1920.txt",
"chars": 137750,
"preview": "Mary,F,70974\r\nDorothy,F,36646\r\nHelen,F,35093\r\nMargaret,F,27998\r\nRuth,F,26105\r\nMildred,F,18064\r\nVirginia,F,17312\r\nElizabe"
},
{
"path": "datasets/babynames/yob1921.txt",
"chars": 139183,
"preview": "Mary,F,73980\r\nDorothy,F,39082\r\nHelen,F,34812\r\nMargaret,F,28462\r\nRuth,F,25780\r\nVirginia,F,19025\r\nMildred,F,17877\r\nBetty,F"
},
{
"path": "datasets/babynames/yob1922.txt",
"chars": 138200,
"preview": "Mary,F,72156\r\nDorothy,F,37711\r\nHelen,F,32501\r\nMargaret,F,26858\r\nRuth,F,23622\r\nBetty,F,20897\r\nVirginia,F,19145\r\nMildred,F"
},
{
"path": "datasets/babynames/yob1923.txt",
"chars": 136768,
"preview": "Mary,F,71636\r\nDorothy,F,39023\r\nHelen,F,31493\r\nMargaret,F,26131\r\nBetty,F,25992\r\nRuth,F,23638\r\nVirginia,F,18331\r\nMildred,F"
},
{
"path": "datasets/babynames/yob1924.txt",
"chars": 139518,
"preview": "Mary,F,73505\r\nDorothy,F,40005\r\nHelen,F,31199\r\nBetty,F,30595\r\nMargaret,F,26551\r\nRuth,F,23604\r\nVirginia,F,18622\r\nMildred,F"
},
{
"path": "datasets/babynames/yob1925.txt",
"chars": 136831,
"preview": "Mary,F,70594\r\nDorothy,F,38571\r\nBetty,F,32816\r\nHelen,F,29159\r\nMargaret,F,24461\r\nRuth,F,22257\r\nVirginia,F,17487\r\nDoris,F,1"
},
{
"path": "datasets/babynames/yob1926.txt",
"chars": 134326,
"preview": "Mary,F,67830\r\nDorothy,F,36608\r\nBetty,F,32947\r\nHelen,F,26872\r\nMargaret,F,23061\r\nRuth,F,20199\r\nDoris,F,16298\r\nVirginia,F,1"
},
{
"path": "datasets/babynames/yob1927.txt",
"chars": 133892,
"preview": "Mary,F,70594\r\nDorothy,F,35980\r\nBetty,F,35414\r\nHelen,F,25302\r\nMargaret,F,21972\r\nRuth,F,19411\r\nDoris,F,16515\r\nVirginia,F,1"
},
{
"path": "datasets/babynames/yob1928.txt",
"chars": 130726,
"preview": "Mary,F,66867\r\nBetty,F,36074\r\nDorothy,F,33726\r\nHelen,F,22922\r\nMargaret,F,20290\r\nRuth,F,17854\r\nDoris,F,16564\r\nBarbara,F,14"
},
{
"path": "datasets/babynames/yob1929.txt",
"chars": 126316,
"preview": "Mary,F,63502\r\nBetty,F,36665\r\nDorothy,F,31467\r\nHelen,F,20994\r\nMargaret,F,19196\r\nDoris,F,16488\r\nBarbara,F,16033\r\nRuth,F,16"
},
{
"path": "datasets/babynames/yob1930.txt",
"chars": 125892,
"preview": "Mary,F,64110\r\nBetty,F,38239\r\nDorothy,F,30390\r\nHelen,F,19907\r\nMargaret,F,18353\r\nBarbara,F,18287\r\nPatricia,F,15746\r\nJoan,F"
},
{
"path": "datasets/babynames/yob1931.txt",
"chars": 119362,
"preview": "Mary,F,60303\r\nBetty,F,36082\r\nDorothy,F,26511\r\nBarbara,F,21788\r\nJoan,F,19102\r\nHelen,F,17653\r\nMargaret,F,17324\r\nPatricia,F"
},
{
"path": "datasets/babynames/yob1932.txt",
"chars": 120786,
"preview": "Mary,F,59859\r\nBetty,F,34411\r\nBarbara,F,26310\r\nDorothy,F,24972\r\nJoan,F,21043\r\nPatricia,F,17988\r\nMargaret,F,16537\r\nHelen,F"
},
{
"path": "datasets/babynames/yob1933.txt",
"chars": 115824,
"preview": "Mary,F,55477\r\nBetty,F,31520\r\nBarbara,F,26946\r\nDorothy,F,22039\r\nJoan,F,19287\r\nPatricia,F,18622\r\nMargaret,F,15238\r\nHelen,F"
},
{
"path": "datasets/babynames/yob1934.txt",
"chars": 118031,
"preview": "Mary,F,56898\r\nBetty,F,31082\r\nBarbara,F,29222\r\nShirley,F,22837\r\nDorothy,F,21283\r\nPatricia,F,20844\r\nJoan,F,19463\r\nMargaret"
},
{
"path": "datasets/babynames/yob1935.txt",
"chars": 116221,
"preview": "Mary,F,55055\r\nShirley,F,42343\r\nBarbara,F,30686\r\nBetty,F,28660\r\nPatricia,F,22876\r\nDorothy,F,19393\r\nJoan,F,18219\r\nMargaret"
},
{
"path": "datasets/babynames/yob1936.txt",
"chars": 114311,
"preview": "Mary,F,54356\r\nShirley,F,35150\r\nBarbara,F,31672\r\nBetty,F,25862\r\nPatricia,F,23903\r\nDorothy,F,17665\r\nJoan,F,17065\r\nNancy,F,"
},
{
"path": "datasets/babynames/yob1937.txt",
"chars": 115279,
"preview": "Mary,F,55635\r\nBarbara,F,34902\r\nPatricia,F,26838\r\nShirley,F,26808\r\nBetty,F,25313\r\nCarol,F,17331\r\nNancy,F,17063\r\nDorothy,F"
},
{
"path": "datasets/babynames/yob1938.txt",
"chars": 116279,
"preview": "Mary,F,56199\r\nBarbara,F,39262\r\nPatricia,F,27547\r\nBetty,F,25497\r\nShirley,F,23762\r\nCarol,F,19421\r\nNancy,F,18943\r\nDorothy,F"
},
{
"path": "datasets/babynames/yob1939.txt",
"chars": 114819,
"preview": "Mary,F,54903\r\nBarbara,F,37262\r\nPatricia,F,29699\r\nBetty,F,23637\r\nShirley,F,20442\r\nCarol,F,20164\r\nNancy,F,19723\r\nJudith,F,"
},
{
"path": "datasets/babynames/yob1940.txt",
"chars": 115631,
"preview": "Mary,F,56203\r\nBarbara,F,36730\r\nPatricia,F,32658\r\nJudith,F,22376\r\nBetty,F,22067\r\nCarol,F,21757\r\nNancy,F,19730\r\nLinda,F,18"
},
{
"path": "datasets/babynames/yob1941.txt",
"chars": 117195,
"preview": "Mary,F,58025\r\nBarbara,F,39534\r\nPatricia,F,36896\r\nCarol,F,24186\r\nLinda,F,23724\r\nJudith,F,23311\r\nBetty,F,20897\r\nNancy,F,20"
},
{
"path": "datasets/babynames/yob1942.txt",
"chars": 121600,
"preview": "Mary,F,63236\r\nBarbara,F,44732\r\nPatricia,F,39454\r\nLinda,F,31607\r\nCarol,F,30266\r\nSandra,F,24988\r\nJudith,F,24785\r\nNancy,F,2"
},
{
"path": "datasets/babynames/yob1943.txt",
"chars": 121528,
"preview": "Mary,F,66156\r\nBarbara,F,43422\r\nPatricia,F,39615\r\nLinda,F,38433\r\nCarol,F,31673\r\nSandra,F,25985\r\nJudith,F,25209\r\nSharon,F,"
},
{
"path": "datasets/babynames/yob1944.txt",
"chars": 118128,
"preview": "Mary,F,62476\r\nBarbara,F,39186\r\nLinda,F,38408\r\nPatricia,F,36875\r\nCarol,F,30457\r\nSandra,F,26025\r\nNancy,F,23213\r\nJudith,F,2"
},
{
"path": "datasets/babynames/yob1945.txt",
"chars": 116618,
"preview": "Mary,F,59285\r\nLinda,F,41485\r\nBarbara,F,38283\r\nPatricia,F,35849\r\nCarol,F,30404\r\nSandra,F,24703\r\nNancy,F,21467\r\nSharon,F,2"
},
{
"path": "datasets/babynames/yob1946.txt",
"chars": 125383,
"preview": "Mary,F,67437\r\nLinda,F,52702\r\nPatricia,F,46272\r\nBarbara,F,45103\r\nCarol,F,34246\r\nSandra,F,31677\r\nNancy,F,28328\r\nSusan,F,28"
},
{
"path": "datasets/babynames/yob1947.txt",
"chars": 133917,
"preview": "Linda,F,99651\r\nMary,F,71654\r\nPatricia,F,51269\r\nBarbara,F,48778\r\nSandra,F,34755\r\nCarol,F,33528\r\nNancy,F,32436\r\nSusan,F,31"
},
{
"path": "datasets/babynames/yob1948.txt",
"chars": 132465,
"preview": "Linda,F,96185\r\nMary,F,68582\r\nBarbara,F,46811\r\nPatricia,F,46127\r\nSusan,F,35972\r\nSandra,F,31064\r\nNancy,F,29388\r\nCarol,F,28"
},
{
"path": "datasets/babynames/yob1949.txt",
"chars": 132602,
"preview": "Linda,F,90952\r\nMary,F,66809\r\nPatricia,F,46315\r\nBarbara,F,42562\r\nSusan,F,37685\r\nSandra,F,30619\r\nNancy,F,29222\r\nCarol,F,26"
},
{
"path": "datasets/babynames/yob1950.txt",
"chars": 133205,
"preview": "Linda,F,80408\r\nMary,F,65444\r\nPatricia,F,47910\r\nBarbara,F,41558\r\nSusan,F,38010\r\nNancy,F,29625\r\nDeborah,F,29073\r\nSandra,F,"
},
{
"path": "datasets/babynames/yob1951.txt",
"chars": 135349,
"preview": "Linda,F,73858\r\nMary,F,65624\r\nPatricia,F,56394\r\nDeborah,F,42017\r\nBarbara,F,40529\r\nSusan,F,40195\r\nNancy,F,30317\r\nKaren,F,2"
},
{
"path": "datasets/babynames/yob1952.txt",
"chars": 137658,
"preview": "Linda,F,67071\r\nMary,F,65699\r\nPatricia,F,53083\r\nDeborah,F,49796\r\nSusan,F,41343\r\nBarbara,F,39890\r\nNancy,F,31724\r\nKaren,F,2"
},
{
"path": "datasets/babynames/yob1953.txt",
"chars": 140015,
"preview": "Mary,F,64325\r\nLinda,F,61243\r\nDeborah,F,52170\r\nPatricia,F,50979\r\nSusan,F,44264\r\nBarbara,F,38439\r\nDebra,F,36853\r\nNancy,F,3"
},
{
"path": "datasets/babynames/yob1954.txt",
"chars": 141564,
"preview": "Mary,F,67988\r\nLinda,F,55378\r\nDeborah,F,54661\r\nPatricia,F,49139\r\nSusan,F,47156\r\nDebra,F,45877\r\nBarbara,F,36366\r\nKaren,F,3"
},
{
"path": "datasets/babynames/yob1955.txt",
"chars": 143552,
"preview": "Mary,F,63159\r\nDeborah,F,52303\r\nLinda,F,51271\r\nDebra,F,50525\r\nSusan,F,47382\r\nPatricia,F,46203\r\nBarbara,F,33589\r\nKaren,F,3"
},
{
"path": "datasets/babynames/yob1956.txt",
"chars": 146612,
"preview": "Mary,F,61752\r\nDebra,F,48303\r\nLinda,F,48067\r\nDeborah,F,47836\r\nSusan,F,46557\r\nPatricia,F,43333\r\nKaren,F,40045\r\nCynthia,F,3"
},
{
"path": "datasets/babynames/yob1957.txt",
"chars": 149179,
"preview": "Mary,F,61090\r\nSusan,F,45946\r\nLinda,F,44497\r\nDebra,F,42715\r\nKaren,F,40585\r\nDeborah,F,40055\r\nCynthia,F,39305\r\nPatricia,F,3"
},
{
"path": "datasets/babynames/yob1958.txt",
"chars": 148618,
"preview": "Mary,F,55832\r\nSusan,F,45158\r\nLinda,F,41895\r\nKaren,F,38449\r\nPatricia,F,37916\r\nDebra,F,35533\r\nDeborah,F,32942\r\nCynthia,F,3"
},
{
"path": "datasets/babynames/yob1959.txt",
"chars": 151965,
"preview": "Mary,F,54469\r\nSusan,F,41602\r\nLinda,F,40407\r\nKaren,F,36763\r\nDonna,F,36459\r\nPatricia,F,35233\r\nDebra,F,31362\r\nCynthia,F,301"
},
{
"path": "datasets/babynames/yob1960.txt",
"chars": 154001,
"preview": "Mary,F,51477\r\nSusan,F,39197\r\nLinda,F,37317\r\nKaren,F,36384\r\nDonna,F,34126\r\nLisa,F,33708\r\nPatricia,F,32108\r\nDebra,F,26738\r"
},
{
"path": "datasets/babynames/yob1961.txt",
"chars": 157443,
"preview": "Mary,F,47651\r\nLisa,F,42685\r\nSusan,F,37517\r\nLinda,F,35558\r\nKaren,F,34684\r\nPatricia,F,28843\r\nDonna,F,28664\r\nCynthia,F,2437"
},
{
"path": "datasets/babynames/yob1962.txt",
"chars": 157847,
"preview": "Lisa,F,46090\r\nMary,F,43488\r\nSusan,F,35743\r\nKaren,F,35185\r\nLinda,F,31458\r\nPatricia,F,26545\r\nDonna,F,25731\r\nCynthia,F,2402"
},
{
"path": "datasets/babynames/yob1963.txt",
"chars": 158855,
"preview": "Lisa,F,56030\r\nMary,F,41553\r\nSusan,F,33991\r\nKaren,F,32483\r\nLinda,F,27708\r\nDonna,F,25393\r\nPatricia,F,25355\r\nLori,F,23904\r\n"
},
{
"path": "datasets/babynames/yob1964.txt",
"chars": 160451,
"preview": "Lisa,F,54271\r\nMary,F,40978\r\nSusan,F,31511\r\nKaren,F,30235\r\nPatricia,F,26081\r\nKimberly,F,24125\r\nDonna,F,23803\r\nLinda,F,236"
},
{
"path": "datasets/babynames/yob1965.txt",
"chars": 155029,
"preview": "Lisa,F,60265\r\nMary,F,34270\r\nKaren,F,32881\r\nKimberly,F,28831\r\nSusan,F,26329\r\nPatricia,F,23551\r\nDonna,F,19696\r\nLinda,F,193"
},
{
"path": "datasets/babynames/yob1966.txt",
"chars": 157349,
"preview": "Lisa,F,56899\r\nKimberly,F,32226\r\nMary,F,28881\r\nMichelle,F,27149\r\nKaren,F,25442\r\nSusan,F,23767\r\nPatricia,F,20120\r\nTammy,F,"
},
{
"path": "datasets/babynames/yob1967.txt",
"chars": 160588,
"preview": "Lisa,F,52428\r\nKimberly,F,33099\r\nMichelle,F,30813\r\nMary,F,25317\r\nSusan,F,22258\r\nKaren,F,21543\r\nAngela,F,19539\r\nTammy,F,18"
},
{
"path": "datasets/babynames/yob1968.txt",
"chars": 167356,
"preview": "Lisa,F,49523\r\nMichelle,F,33206\r\nKimberly,F,31908\r\nJennifer,F,26841\r\nMelissa,F,21730\r\nMary,F,21722\r\nAngela,F,20661\r\nTammy"
},
{
"path": "datasets/babynames/yob1969.txt",
"chars": 177888,
"preview": "Lisa,F,45036\r\nMichelle,F,34315\r\nJennifer,F,33699\r\nKimberly,F,33075\r\nMelissa,F,23024\r\nAmy,F,21468\r\nAngela,F,21047\r\nMary,F"
},
{
"path": "datasets/babynames/yob1970.txt",
"chars": 191360,
"preview": "Jennifer,F,46151\r\nLisa,F,38951\r\nKimberly,F,34131\r\nMichelle,F,34046\r\nAmy,F,25209\r\nAngela,F,24921\r\nMelissa,F,23738\r\nTammy,"
},
{
"path": "datasets/babynames/yob1971.txt",
"chars": 197951,
"preview": "Jennifer,F,56779\r\nMichelle,F,33157\r\nLisa,F,32906\r\nKimberly,F,30688\r\nAmy,F,26231\r\nAngela,F,25879\r\nMelissa,F,23868\r\nTammy,"
},
{
"path": "datasets/babynames/yob1972.txt",
"chars": 199410,
"preview": "Jennifer,F,63603\r\nMichelle,F,29276\r\nLisa,F,27547\r\nKimberly,F,26293\r\nAmy,F,25868\r\nAngela,F,23554\r\nMelissa,F,22491\r\nStepha"
},
{
"path": "datasets/babynames/yob1973.txt",
"chars": 202912,
"preview": "Jennifer,F,62447\r\nAmy,F,26962\r\nMichelle,F,26927\r\nKimberly,F,23530\r\nLisa,F,22660\r\nMelissa,F,22482\r\nAngela,F,20895\r\nHeathe"
},
{
"path": "datasets/babynames/yob1974.txt",
"chars": 210367,
"preview": "Jennifer,F,63098\r\nAmy,F,29564\r\nMichelle,F,25825\r\nHeather,F,23187\r\nAngela,F,22795\r\nKimberly,F,22421\r\nMelissa,F,22165\r\nLis"
},
{
"path": "datasets/babynames/yob1975.txt",
"chars": 218922,
"preview": "Jennifer,F,58176\r\nAmy,F,32246\r\nHeather,F,24303\r\nMelissa,F,24168\r\nAngela,F,23344\r\nMichelle,F,22663\r\nKimberly,F,20263\r\nLis"
},
{
"path": "datasets/babynames/yob1976.txt",
"chars": 224880,
"preview": "Jennifer,F,59468\r\nAmy,F,31341\r\nMelissa,F,25092\r\nHeather,F,24198\r\nAngela,F,22046\r\nMichelle,F,19546\r\nKimberly,F,18963\r\nJes"
},
{
"path": "datasets/babynames/yob1977.txt",
"chars": 234872,
"preview": "Jennifer,F,58951\r\nMelissa,F,26880\r\nAmy,F,26729\r\nJessica,F,24838\r\nHeather,F,23766\r\nAngela,F,20990\r\nMichelle,F,19539\r\nKimb"
},
{
"path": "datasets/babynames/yob1978.txt",
"chars": 235822,
"preview": "Jennifer,F,56307\r\nMelissa,F,28323\r\nJessica,F,26102\r\nAmy,F,23219\r\nHeather,F,22265\r\nAmanda,F,20519\r\nAngela,F,20503\r\nSarah,"
},
{
"path": "datasets/babynames/yob1979.txt",
"chars": 246109,
"preview": "Jennifer,F,56709\r\nMelissa,F,34045\r\nAmanda,F,31922\r\nJessica,F,27774\r\nAmy,F,21612\r\nSarah,F,21017\r\nHeather,F,20839\r\nAngela,"
},
{
"path": "datasets/babynames/yob1980.txt",
"chars": 251121,
"preview": "Jennifer,F,58375\r\nAmanda,F,35817\r\nJessica,F,33914\r\nMelissa,F,31625\r\nSarah,F,25737\r\nHeather,F,19965\r\nNicole,F,19910\r\nAmy,"
},
{
"path": "datasets/babynames/yob1981.txt",
"chars": 251593,
"preview": "Jennifer,F,57029\r\nJessica,F,42519\r\nAmanda,F,34366\r\nSarah,F,28163\r\nMelissa,F,28000\r\nAmy,F,20338\r\nNicole,F,20309\r\nStephani"
},
{
"path": "datasets/babynames/yob1982.txt",
"chars": 254584,
"preview": "Jennifer,F,57096\r\nJessica,F,45425\r\nAmanda,F,34209\r\nSarah,F,28470\r\nMelissa,F,25851\r\nNicole,F,21699\r\nStephanie,F,20854\r\nEl"
},
{
"path": "datasets/babynames/yob1983.txt",
"chars": 250692,
"preview": "Jennifer,F,54325\r\nJessica,F,45271\r\nAmanda,F,33735\r\nAshley,F,33280\r\nSarah,F,27214\r\nMelissa,F,23466\r\nNicole,F,22391\r\nSteph"
},
{
"path": "datasets/babynames/yob1984.txt",
"chars": 252513,
"preview": "Jennifer,F,50546\r\nJessica,F,45841\r\nAshley,F,38757\r\nAmanda,F,33893\r\nSarah,F,25860\r\nStephanie,F,23012\r\nNicole,F,22256\r\nMel"
},
{
"path": "datasets/babynames/yob1985.txt",
"chars": 260173,
"preview": "Jessica,F,48340\r\nAshley,F,46995\r\nJennifer,F,42645\r\nAmanda,F,39042\r\nSarah,F,24868\r\nStephanie,F,23233\r\nNicole,F,22957\r\nHea"
},
{
"path": "datasets/babynames/yob1986.txt",
"chars": 267762,
"preview": "Jessica,F,52643\r\nAshley,F,49672\r\nAmanda,F,40513\r\nJennifer,F,36172\r\nSarah,F,28122\r\nStephanie,F,22632\r\nNicole,F,21284\r\nBri"
},
{
"path": "datasets/babynames/yob1987.txt",
"chars": 278085,
"preview": "Jessica,F,55983\r\nAshley,F,54826\r\nAmanda,F,41784\r\nJennifer,F,32687\r\nSarah,F,27876\r\nStephanie,F,22395\r\nBrittany,F,22215\r\nN"
},
{
"path": "datasets/babynames/yob1989.txt",
"chars": 309368,
"preview": "Jessica,F,47888\r\nAshley,F,47585\r\nBrittany,F,37786\r\nAmanda,F,36832\r\nSarah,F,27788\r\nSamantha,F,24794\r\nJennifer,F,23996\r\nSt"
},
{
"path": "datasets/babynames/yob1991.txt",
"chars": 327121,
"preview": "Ashley,F,43482\r\nJessica,F,43396\r\nBrittany,F,29091\r\nAmanda,F,28884\r\nSamantha,F,25648\r\nSarah,F,25214\r\nStephanie,F,22763\r\nJ"
},
{
"path": "datasets/babynames/yob1992.txt",
"chars": 331344,
"preview": "Ashley,F,38449\r\nJessica,F,38342\r\nAmanda,F,25030\r\nBrittany,F,24972\r\nSarah,F,24625\r\nSamantha,F,24405\r\nEmily,F,21834\r\nSteph"
},
{
"path": "datasets/babynames/yob1993.txt",
"chars": 337663,
"preview": "Jessica,F,34973\r\nAshley,F,34844\r\nSarah,F,24220\r\nSamantha,F,23659\r\nEmily,F,23589\r\nBrittany,F,21726\r\nTaylor,F,21266\r\nAmand"
},
{
"path": "datasets/babynames/yob1994.txt",
"chars": 338089,
"preview": "Jessica,F,32111\r\nAshley,F,30272\r\nEmily,F,24148\r\nSamantha,F,22818\r\nSarah,F,22259\r\nTaylor,F,20731\r\nBrittany,F,18897\r\nAmand"
},
{
"path": "datasets/babynames/yob1996.txt",
"chars": 342951,
"preview": "Emily,F,25144\r\nJessica,F,24180\r\nAshley,F,23676\r\nSarah,F,21012\r\nSamantha,F,20541\r\nTaylor,F,19147\r\nHannah,F,18585\r\nAlexis,"
},
{
"path": "datasets/babynames/yob1997.txt",
"chars": 349976,
"preview": "Emily,F,25730\r\nJessica,F,21043\r\nAshley,F,20890\r\nSarah,F,20674\r\nHannah,F,20581\r\nSamantha,F,20169\r\nTaylor,F,19503\r\nAlexis,"
}
]
// ... and 71 more files (download for full content)
About this extraction
This page contains the full source code of the wesm/pydata-book GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 203 files (238.9 MB), approximately 4.0M tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.