Full Code of jphall663/GWU_data_mining for AI

master 1b760e3870d1 cached
141 files
48.6 MB
2.7M tokens
50 symbols
1 requests
Copy disabled (too large) Download .txt
Showing preview only (10,761K chars total). Download the full file to get everything.
Repository: jphall663/GWU_data_mining
Branch: master
Commit: 1b760e3870d1
Files: 141
Total size: 48.6 MB

Directory structure:
gitextract_uh8insif/

├── .gitattributes
├── .gitignore
├── 00_intro_and_history/
│   ├── 00_intro_and_history.md
│   ├── notes/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   └── spring_2017_announcements/
│       └── spring_2017_announcements.md
├── 01_basic_data_prep/
│   ├── 01_basic_data_prep.md
│   ├── assignment/
│   │   └── .gitignore
│   ├── notes/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   └── src/
│       ├── notebooks/
│       │   ├── py/
│       │   │   ├── .gitignore
│       │   │   └── Py_Part_0_pandas_numpy.ipynb
│       │   ├── r/
│       │   │   ├── .gitignore
│       │   │   ├── R_Part_0_Basics_dplyr_and_ggplot2.ipynb
│       │   │   └── R_Part_1_data.table.ipynb
│       │   └── sas/
│       │       ├── SAS_Part_0_Base_SAS_PROC_SGPLOT.ipynb
│       │       └── SAS_Part_1_PROC_SQL.ipynb
│       └── raw/
│           ├── py/
│           │   ├── Py_Part_0_pandas_numpy.py
│           │   ├── pyspark_example.py
│           │   ├── scratch.csv
│           │   ├── scratch2.csv
│           │   └── scratch3.csv
│           ├── r/
│           │   ├── .gitignore
│           │   ├── R_Part_0_Basics_dplyr_and_ggplot2.r
│           │   └── R_Part_1_data.table.r
│           └── sas/
│               ├── .gitignore
│               ├── SAS_Part_0_Base_SAS_PROC_SGPLOT.sas
│               └── SAS_Part_1_PROC_SQL.sas
├── 02_analytical_data_prep/
│   ├── 02_analytical_data_prep.md
│   ├── data/
│   │   ├── loan.csv
│   │   └── loans.sas7bdat
│   ├── notes/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   ├── .gitignore
│   │   ├── DataPreperation.py
│   │   ├── data_sets/
│   │   │   └── kaggle_house/
│   │   │       ├── test.csv
│   │   │       └── train.csv
│   │   ├── housing.html
│   │   ├── housing.ipynb
│   │   ├── py_part_2_discretization.ipynb
│   │   ├── py_part_2_encoding.ipynb
│   │   ├── py_part_2_feature_extraction.ipynb
│   │   ├── py_part_2_feature_selection.ipynb
│   │   ├── py_part_2_impute.ipynb
│   │   ├── py_part_2_over_sample.ipynb
│   │   ├── py_part_2_standardize.ipynb
│   │   ├── py_part_2_target_encode_categorical.ipynb
│   │   ├── py_part_2_target_encode_numeric.ipynb
│   │   └── py_part_2_winsorize.ipynb
│   └── xml/
│       └── 02_analytical_data_prep.xml
├── 03_regression/
│   ├── .gitignore
│   ├── 03_regression.md
│   ├── assignment/
│   │   └── .gitignore
│   ├── data/
│   │   ├── .gitignore
│   │   ├── loan_clean.csv
│   │   ├── test.csv
│   │   └── train.csv
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   ├── .gitignore
│   │   ├── py_part_3_kaggle_starter.ipynb
│   │   ├── py_part_3_linear_regression_gradient_descent.ipynb
│   │   ├── py_part_3_penalized_linear_regression.ipynb
│   │   ├── py_part_3_penalized_logistic_regression.ipynb
│   │   ├── spark_kaggle_starter/
│   │   │   ├── README.md
│   │   │   ├── feature_combiner.py
│   │   │   ├── get_type_lists.py
│   │   │   ├── logging_lib/
│   │   │   │   ├── LICENSE.md
│   │   │   │   ├── LoggingController.py
│   │   │   │   ├── MarkdownBuilder.py
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── example.py
│   │   │   │   └── markdown_preview_github.css
│   │   │   ├── main.py
│   │   │   ├── spark_controler/
│   │   │   │   ├── LICENSE.md
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ec2_instance_data_dict.py
│   │   │   │   ├── emr_controller.py
│   │   │   │   ├── files/
│   │   │   │   │   ├── setup.sh
│   │   │   │   │   └── terminate_idle_cluster.sh
│   │   │   │   ├── resource_calculator/
│   │   │   │   │   └── C2FO-Spark-Config-Cheatsheet.xlsx
│   │   │   │   └── scripts/
│   │   │   │       ├── bootstrap_actions.sh
│   │   │   │       ├── deep_learning_install_complete.sh
│   │   │   │       ├── pyspark_quick_setup.sh
│   │   │   │       └── terminate_idle_cluster.sh
│   │   │   ├── spark_main.py
│   │   │   └── target_encoder.py
│   │   └── target_encoder.py
│   ├── xlsx/
│   │   └── assessment_workbook.xlsx
│   └── xml/
│       ├── 03_linear_regression.xml
│       └── 03_logistic_regression.xml
├── 04_decision_trees/
│   ├── 04_decision_trees.md
│   ├── data/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   ├── py_part_4_decision_tree_ensembles.ipynb
│   │   └── py_part_4_kaggle_xgboost.ipynb
│   └── xml/
│       └── 04_decision_trees.xml
├── 05_neural_networks/
│   ├── 05_neural_networks.md
│   ├── assignment/
│   │   └── .gitignore
│   ├── data/
│   │   └── .gitignore
│   ├── quiz/
│   │   ├── .gitignore
│   │   └── sample/
│   │       └── .gitignore
│   ├── src/
│   │   ├── .gitignore
│   │   ├── py_part_5_MNIST_DNN.ipynb
│   │   ├── py_part_5_MNIST_autoencoder.ipynb
│   │   ├── py_part_5_MNIST_data_augmentation.ipynb
│   │   ├── py_part_5_MNIST_keras_lenet.ipynb
│   │   ├── py_part_5_basic_mlp_example.ipynb
│   │   └── py_part_5_neural_networks.ipynb
│   └── xml/
│       └── 05_neural_networks.xml
├── 06_clustering/
│   ├── 06_clustering.md
│   ├── assignment/
│   │   └── key/
│   │       └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   └── py_part_6_clustering.ipynb
│   └── xml/
│       └── 06_clustering.xml
├── 07_association_rules/
│   ├── 07_association_rules.md
│   ├── assignment/
│   │   ├── .gitignore
│   │   └── assignment_7.docx
│   ├── quiz/
│   │   └── .gitignore
│   └── xml/
│       └── 07_association_rules.xml
├── 08_text_mining/
│   ├── 08_text_mining.md
│   ├── quiz/
│   │   ├── .gitignore
│   │   └── sample/
│   │       ├── .gitignore
│   │       └── Quiz_8.docx
│   └── xml/
│       └── 08_text_mining.xml
├── 09_matrix_factorization/
│   ├── 09_matrix_factorization.md
│   └── src/
│       ├── py_part_9_iris_pca.ipynb
│       └── py_part_9_kaggle_GLRM_example.ipynb
├── 10_model_interpretability/
│   ├── 10_model_interpretability.md
│   ├── quiz/
│   │   └── .gitignore
│   └── src/
│       ├── dt_surrogate.ipynb
│       ├── lime.ipynb
│       ├── loco.ipynb
│       ├── mono_xgboost.ipynb
│       ├── pdp_ice.ipynb
│       └── sensitivity_analysis.ipynb
├── README.md
├── anaconda_py35_h2o_xgboost_graphviz/
│   └── Dockerfile
├── cold_call.py
└── requirements.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
*.sas7bdat filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.csv filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text


================================================
FILE: .gitignore
================================================
*.DS_Store
.idea*
*.ipynb_checkpoints
interpreting_ml
FAQ
RosterInformation.xlsx
env


================================================
FILE: 00_intro_and_history/00_intro_and_history.md
================================================
## Section 00: Intro and History

#### Class Notes

* *Introduction to Data Mining* - [chapter 1 notes](http://www-users.cs.umn.edu/~kumar/dmbook/dmslides/chap1_intro.pdf)
* *Advanced Business Analytics* - chapter 1 notes (available on [Blackboard](https://blackboard.gwu.edu))
* [Instructor notes](notes/00_instructor_notes.pdf)
* [More Thoughts on Data Mining](https://github.com/jphall663/nafsa_2018_slides/blob/master/main.pdf)

#### Required Reading

* [*A Very Short History of Data Science*](http://www.forbes.com/sites/gilpress/2013/05/28/a-very-short-history-of-data-science/)
* *Introduction to Data Mining* - chapter 1
* [*The Evolution of Analytics*](http://www.oreilly.com/data/free/the-evolution-of-analytics.csp) (see Blackboard electronic reserves too)

#### [Sample Quiz](quiz/sample/quiz_0.pdf)

#### [Quiz key](quiz/key/quiz_0_key.pdf)

#### Example Data Sets

* Structured data
  * Analytical base table/'Tidy' data - [UCI Adult data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data)
  * Time series data - [Historical stock data for DJIA 30 companies](https://www.kaggle.com/szrlee/stock-time-series-20050101-to-20171231/data)
  * Transactional data - [State of Oklahoma credit card purchases](https://catalog.data.gov/dataset/purchase-card-pcard-fiscal-year-2014/resource/4105c297-84dc-4f25-9061-c4e2ad38f7d2)
* Semi-structured data - [Web visitor interest logs](https://www.kaggle.com/yburger/web-visitor-interests)
* Unstructured data
  * Text data - [Hillary Clinton's emails](https://www.kaggle.com/kaggle/hillary-clinton-emails)
  * Image data - [CIFAR-10 data](https://www.kaggle.com/c/cifar-10)

#### Supplementary References

* [*Statistical Modeling: the Two Cultures*](http://www.stat.uchicago.edu/~lekheng/courses/191f09/breiman.pdf)
* [*Fifty Years of Data Science*](http://courses.csail.mit.edu/18.337/2015/docs/50YearsDataScience.pdf)
* [*The Future of Data Analysis*](https://projecteuclid.org/euclid.aoms/1177704711)
* [*Data Science: An Action Plan for Expanding the Technical Areas of the Field of Statistics*](https://utexas.instructure.com/files/35465950/download)

***

* [H2O Algorithm Overview](notes/h2o_algos.pdf)
* [Quora answer for good machine learning references](https://www.quora.com/What-are-some-of-the-best-research-papers-books-for-Machine-learning)
* [An Empirical Comparison of Supervised Learning Algorithms](http://www.eecs.wsu.edu/~holder/courses/CptS570/fall07/present/CaruanaICML06.pdf)


================================================
FILE: 00_intro_and_history/notes/.gitignore
================================================
*.pptx

================================================
FILE: 00_intro_and_history/quiz/.gitignore
================================================
key


================================================
FILE: 00_intro_and_history/spring_2017_announcements/spring_2017_announcements.md
================================================
## Section 00 Announcements

1. Attend class **if possible**:
  * In general, you only need to attend the class (e.g. Thursday or Friday) you registered for, **not both classes**.
  * As of now Thursday's class (1/19) is still on schedule.
  * Friday's class (1/20) is **cancelled** because the university is closed (as some of you correctly pointed out).
  * Neither section's classroom is capable of video recordings.
  * I intend to teach the same material for both classes.

2. Read the class [syllabus](https://github.com/jphall663/GWU_data_mining/blob/master/README.md).

3. Study all the class notes and required reading materials listed on the [Section 00 page](https://github.com/jphall663/GWU_data_mining/blob/master/00_intro_and_history/00_intro_and_history.md) in preparation for a quiz the following week (1/26-1/27). You should also have a look at the [sample quiz](https://github.com/jphall663/GWU_data_mining/blob/master/00_intro_and_history/sample_quiz/quiz_0.pdf). The *Advanced Business Analytics* materials have been posted to the Electronic Reserves section of Blackboard.

4. Pick your group for the semester project. Please speak with your classmates about which group you will be joining and either send a representative from your group to Thursday's class or contact me about your group members by email (one email per group please, and include your section number: Thursday=11, Friday=12).

5. Begin installing software over the next few weeks:
  * Register for [SAS on Demand for Academics](https://odamid.oda.sas.com/SASODAControlCenter/enroll.html?enroll=f0c0602b-d3c3-4676-b44c-c378f14fac91) which will give you access to cloud versions of SAS software suitable for in class use.
  * You may need a local install of SAS to complete some assignments and you may need to contact the [GWU Instructional Technology Lab](https://itl.gwu.edu/sas-software-distribution) for information or assistance regarding installing this software. I would try to start installing SAS and SAS Enterprise Miner soon.
  * For Python and h2o.ai:
    * First, install [Anaconda Python](https://www.continuum.io/downloads).
    * Then install the [h2o.ai library for Python](http://h2o-release.s3.amazonaws.com/h2o/rel-tutte/2/index.html). (See the 'INSTALL IN PYTHON' tab **only**).
  * If you have difficulties installing software, we can discuss them in class or during office hours.


================================================
FILE: 01_basic_data_prep/01_basic_data_prep.md
================================================
## Section 01: Basic Data Prep

#### Basic data operations

A great deal of work in data mining projects is spent on data munging. Some of the basic operations are illustrated and defined below. Code examples are provided in common languages.

![alt text](basic_data_operations.png)

**Subset/Select/Filter/Slice Rows** - Selecting rows or reducing the number of rows in a data set by some criterion.

**Subset/Select/Slice Columns** - Selecting columns(/variables) or reducing the number of columns(/variables) in a data set by some criterion.

**Sort/Arrange/Order By** - Arranging the rows of a data set in sequential order based on the values of one or more variables.

**Group By** - Grouping the rows of a data set together based on the values of one or more variables.

**Transpose** - Rearranging a data set such that the row and column(/variable) values are switched.

**Merge/Bind** - Combining data sets side-by-side regardless of the values of any variable(s).

**Join/Bind** - Combining data sets side-by-side based on matching values of variables in both data sets.

**Append/Bind** - Stacking data sets bottom-to-top regardless of the values of any variable(s).

#### Code examples
* [Python Pandas](01_basic_data_prep.md#python-pandas---view-notebook) - [view notebook](src/notebooks/py/Py_Part_0_pandas_numpy.ipynb)
* R
  * [Basics, dplyr, and ggplot](01_basic_data_prep.md#r-basics-dplyr-and-ggplot---view-notebook) - [view notebook](src/notebooks/r/R_Part_0_Basics_dplyr_and_ggplot2.ipynb)
  * [data.table](01_basic_data_prep.md#r-datatable---view-notebook) - [view notebook](src/notebooks/r/R_Part_1_data.table.ipynb)
* SAS
  * [Base SAS and PROC SGPLOT](01_basic_data_prep.md#base-sas-and-proc-sgplot---clonedownload-notebook) - [clone/download notebook](src/notebooks/sas)
  * [PROC SQL](01_basic_data_prep.md#sas-proc-sql---clonedownload-notebook) - [clone/download notebook](src/notebooks/sas)

#### Class Notes:
* [Instructor Notes](notes/01_instructor_notes.pdf)
* *Introduction to Data Mining* - [chapter 2 notes](https://www-users.cs.umn.edu/~kumar/dmbook/dmslides/chap2_data.pdf)

#### Required Reading

* *Introduction to Data Mining* - chapter 2, sections 2.1-2.3
* [*Tidy Data*](https://www.jstatsoft.org/article/view/v059i10)

#### [Sample Quiz](quiz/sample/quiz_1.pdf)

#### [Quiz key](quiz/key/quiz_1_key.pdf)

#### [Assignment](assignment/assignment_1.pdf)

#### [Assignment Key](assignment/key)

#### Supplementary References
* Simple [benchmark](https://github.com/szilard/benchm-databases) of data processing tools by [@szilard](https://github.com/szilard)

***

#### Python Pandas - [view notebook](src/notebooks/py/Py_Part_0_pandas_numpy.ipynb)
```python

"""
Copyright (C) 2017 - 2023 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

"""

#%% standard output ###########################################################
# print is the primary function used to write to the console in Python
# print is a *function* in Python 3
# print is a *statement* in Python 2

print('Hello World!') # Python 3
# print 'Hello World!'  # Python 2 only - this form is a SyntaxError under
#                       # Python 3, so it is left commented out to keep the
#                       # script runnable

# in an interactive session (REPL, notebook cell, IDE console) an object on a
# line by itself, with no functions or operators, is also printed to the console
x = 'Hello World!'
x

#%% importing libraries #######################################################
# python contains many libraries, often called modules
# modules are:
# * nearly always free and open source
# * installed using many different methods - a package manager like conda,
#     readily available through the Anaconda release of Python
#     (https://www.continuum.io/downloads) - is often a good solution for
#     installing and managing packages/modules
# * of relatively high and uniform quality, but licensing can vary
# * imported using the import statement

# import packages
# packages can be aliased using the as statement

import string                   # module with string utilities
import pandas as pd             # module with many utilities for dataframes
import numpy as np              # module with numeric and math utilities
import matplotlib.pyplot as plt # module for plotting

#%% generating a sample data set ##############################################

# set the number of rows and columns for the sample data set
n_rows = 1000
n_vars = 2

### create lists of strings that will become column names
# lists are:
# * a common data structure in python
# * surrounded by square brackets []
# * can contain different data types as list elements
# * often created by a specific type of pythonic syntax, list comprehensions
# * indexed from 0, unlike SAS or R
# * sliceable using numeric indices

# list comprehension
# str() converts to string
# range(arg1, arg2) generates the integers from arg1 up to, but not
#   including, arg2
num_col_names = ['numeric' + str(i+1) for i in range(0, n_vars)]
num_col_names

# type() can be used to determine the class of an object in python
type(num_col_names)

# anonymous functions
# the lambda keyword is used to define simple anonymous functions
# map() is very similar to lapply() in R
# it applies a function to the elements of a list
# in Python 3 map() returns a lazy, single-use iterator (it returned a list in
#   Python 2), so it is wrapped in list() to get a reusable, printable list
char_col_names = list(map(lambda j: 'char' + str(j+1), range(0, n_vars)))
char_col_names

# string.ascii_uppercase is a string constant of uppercase letters
print(string.ascii_uppercase)

# another list comprehension
# slice first seven letters of the string
# each letter is repeated 8 times, e.g. 'AAAAAAAA'
text_draw = [(letter * 8) for letter in string.ascii_uppercase[:7]]
text_draw

# create random numerical columns directly using numpy
# the numerical columns will originally be a 2-D numpy array
randoms = np.random.randn(n_rows, n_vars)
randoms[0:5]
type(randoms)

# create numerical columns of Pandas dataframe from numpy array
# notice that a key is generated automatically
num_cols = pd.DataFrame(randoms, columns=num_col_names)
num_cols.head()
type(num_cols)

# create random character columns as a Pandas dataframe
# use numpy sampling function choice() to generate a numpy array of random text
# create Pandas dataframe from numpy 2-D array
char_cols = pd.DataFrame(np.random.choice(text_draw, (n_rows, n_vars)),
                         columns=char_col_names)
char_cols.head()

# use Pandas concat() to join the numeric and character columns
# axis=1 places the two dataframes side-by-side
scratch_df = pd.concat([num_cols, char_cols], axis=1)
scratch_df.head()

#%% plotting variables in a dataframe #########################################
# pandas has several builtin plotting utilities
# pandas plot.hist() method to plot a histogram of numeric1

# pandas allows slicing a dataframe by integer position using iloc[]
# (the older ix[] indexer was deprecated in pandas 0.20 and removed in 1.0)
# iloc[:, 0] means all rows of the 0th column - or numeric1
scratch_df.iloc[:, 0].plot.hist(title='Histogram of Numeric1')


# use pandas scatter() method to plot numeric1 vs. numeric2
scratch_df.plot.scatter(x='numeric1', y='numeric2',
                        title='Numeric1 vs. Numeric2')

#%% subsetting pandas dataframes ##############################################
# each expression below demonstrates one way of selecting part of scratch_df;
# nothing here modifies scratch_df

### by columns

# subsetting by index
# one column returns a Pandas series
# a Pandas series is like a single column vector
scratch_df.iloc[:, 0].head()
type(scratch_df.iloc[:, 0])

# more than one column makes a dataframe
# iloc enables location by integer position
scratch_df.iloc[:, 0:2].head()
type(scratch_df.iloc[:, 0:2])

# subsetting by variable name
# bracket syntax and attribute (dot) syntax select the same column
scratch_df['numeric1'].head()
scratch_df.numeric1.head()

# loc[] allows for location by column or row label
scratch_df.loc[:, 'numeric1'].head()

# loc can accept lists as an input
scratch_df.loc[:, ['numeric1', 'numeric2']].head()

### by rows

# subsetting by index
# a plain slice inside [] selects rows, not columns
scratch_df[0:3]

# selecting by index
# positional slice: rows at positions 0 through 4
scratch_df.iloc[0:5, :]

# select by row label
# here index/key values 0:5 are returned
# (label-based slices include both endpoints, unlike positional slices)
scratch_df.loc[0:5, :]

### boolean subsetting
# a boolean Series inside [] keeps only the rows where it is True

scratch_df[scratch_df.numeric2 > 0].head()
scratch_df[scratch_df.char1 == 'AAAAAAAA'].head()
scratch_df[scratch_df.char1.isin(['AAAAAAAA', 'BBBBBBBB'])].head()
# chain a boolean filter with a label-based selection: of the rows where
# numeric2 > 0, keep those with index labels 5 through 10 and return char2
scratch_df[scratch_df.numeric2 > 0].loc[5:10, 'char2']

#%% updating the dataframe ####################################################

# must use .copy(); a plain assignment (scratch_df2 = scratch_df) would only
# create a second name for the same dataframe object, so the updates below
# would also change scratch_df
scratch_df2 = scratch_df.copy()

# pandas supports in place overwrites of data
# overwrite last 500 rows of char1 with ZZZZZZZZ
# (index labels 500 onward of the 1000-row sample data set)
scratch_df2.loc[500:, 'char1'] = 'ZZZZZZZZ'
scratch_df2.tail()

# iat[] allows for fast location of specific indices
# here the single cell at row position 0, column position 0 is overwritten
scratch_df2.iat[0, 0] = 1000
scratch_df2.head()

#%% sorting the dataframe #####################################################
# the sort results below are either displayed or assigned to a new name;
# scratch_df2 itself is never reassigned in this section

# sort by values of one variable
scratch_df2.sort_values(by='char1').head()

# sort by values of multiple variables and specify sort order
# ascending=[False, True] pairs up with by=: char1 descending, then numeric1
# ascending within each char1 value
scratch_df3 = scratch_df2.sort_values(by=['char1', 'numeric1'],
                                      ascending=[False, True]).copy()
scratch_df3.head()

# sort by the value of the dataframe index
scratch_df2.sort_index().head()

#%% adding data to the dataframe ##############################################
# pandas concat() supports numerous types of joins and merges
# pandas merge() supports joins and merges using more SQL-like syntax
# i.e. merge(left, right, on=)
# pandas concat() also supports stacking dataframes top-to-bottom
#   (the older DataFrame.append() method was deprecated in pandas 1.4 and
#   removed in pandas 2.0)

# create a toy dataframe to join/merge onto scratch_df
# only the two character columns of the sorted dataframe are kept and renamed
scratch_df3 = scratch_df3.drop(['numeric1', 'numeric2'] , axis=1)
scratch_df3.columns = ['char3', 'char4']
scratch_df3.tail()

# default outer join on indices
# indices are not in identical, matching order
# with the default axis=0 the rows are stacked and the columns are unioned
# this will create 2000 row x 6 column dataset
scratch_df4 = pd.concat([scratch_df, scratch_df3])
scratch_df4

# outer join on matching columns
# axis=1 specifies to join on columns
# this performs the expected join
scratch_df5 = pd.concat([scratch_df, scratch_df3], axis=1)
scratch_df5.head()
scratch_df5.shape

# append
# stack scratch_df on top of itself; pd.concat() replaces the removed
#   scratch_df.append(scratch_df) call and produces the same result
scratch_df6 = pd.concat([scratch_df, scratch_df])
scratch_df6.shape

#%% comparing dataframes ######################################################
# Use Pandas equals() to compare dataframes
# Row order is not ignored

# a dataframe compared with itself
scratch_df.equals(scratch_df)
# the same data with its rows re-ordered by char1
scratch_df.equals(scratch_df.sort_values(by='char1'))
# the copy whose char1 values and first cell were overwritten earlier
scratch_df.equals(scratch_df2)

#%% summarizing dataframes ####################################################
# Pandas offers several straightforward summarization functions

# numeric_only=True restricts the mean to the numeric columns
# since pandas 2.0, mean() no longer silently skips the character columns and
#   raises an error instead
scratch_df.mean(numeric_only=True)
scratch_df.mode()
scratch_df.describe()

#%% by group processing #######################################################
# use pandas groupby() to create groups for subsequent processing

# use summary function size() on groups created by groupby()
# counts holds the number of rows for each distinct value of char1
counts = scratch_df.groupby('char1').size()
# start a new matplotlib figure before drawing the bar chart
plt.figure()
counts.plot.bar(title='Frequency of char1 values (Histogram of char1)')

# groupby the values of more than one variable
# mean() is computed within each (char1, char2) combination
group_means = scratch_df.groupby(['char1', 'char2']).mean()
group_means

#%% transposing a table #######################################################
# transposing a matrix simply switches row and columns values
# transposing a dataframe is more complex because of metadata associated with
#   variable names and row indices

# pandas .T performs a transpose
# iloc[:, 0:5] then shows only the first five columns of the transposed table
scratch_df.T.iloc[:, 0:5]

# often, instead of simply transposing, a data set will need to be reformatted
#   in a melt/stack -> column split -> cast action described in Hadley
#   Wickham's *Tidy Data*:
#   https://www.jstatsoft.org/article/view/v059i10
#
# see the stack and unstack methods for Pandas dataframes

#%% exporting and importing a dataframe
# many to_* methods available for exporting dataframes to other formats
# many read_* methods available for creating dataframes from other formats

# export to csv
# the file is written to the current working directory
# NOTE: to_csv() also writes the row index by default
scratch_df.to_csv('scratch.csv')

# import from csv
# NOTE: without index_col=0 the saved index is read back as an ordinary,
#   unnamed column, so scratch_df7 has one more column than scratch_df
scratch_df7 = pd.read_csv('scratch.csv')

```

#### R Basics, dplyr, and ggplot - [view notebook](src/notebooks/r/R_Part_0_Basics_dplyr_and_ggplot2.ipynb)
```r

###############################################################################
# Copyright (C) 2017 - 2023 J. Patrick Hall, jphall@gwu.edu
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

### standard output ###########################################################
# two primary R core functions are used to print information to the console
#   print() and cat()
# print is a generic function that responds differently to different classes
#   of R objects
# note that '.' is just a character, it does not denote object membership
#   as in Java and Python
# cat() simply attempts to print string literals
# an object with no functions or operators is also printed to the console

x <- 'Hello World!'
print(x)
cat(x)
x

class(x) <- 'some.class'
print(x)
cat(x)
x

### import packages ###########################################################

# R contains thousands of packages for many different purposes
# Packages are:
#   - nearly always free and open source
#   - installed using the install.packages() function or a GUI command
#   - of varying quality and licensing
#   - loaded using the library() function, after being installed

library(dplyr)    # popular package for data wrangling with consistent syntax
library(ggplot2)  # popular package for plotting with consistent syntax

# suppress warnings about versions and object masking
# using suppressPackageStartupMessages()
# suppressPackageStartupMessages(library(dplyr))
# suppressPackageStartupMessages(library(ggplot2))

### working directory #########################################################

# enter the directory location of this file within single quotes
# '<-' is the preferred assignment operator in R
# '/' is the safest directory separator character to use

git_dir <- '/path/to/GWU_data_mining/01_basic_data_prep/src/raw/r'

# set the working directory
# the working directory is where files are written to and read from by default
# setwd() sets the working directory
# getwd() prints the current working directory
setwd(git_dir)
getwd()

### generate a sample data set ################################################

# set the number of rows and columns for the sample data set
n_rows <- 1000
n_vars <- 5

# create a key variable
# a key variable has a unique value for each row of a data set
# seq() generates values from a number (default = 1), to another number, by
#   a certain value (default = 1)
# many types of data structures in R have key variables (a.k.a. row names) by
#   default
key <- seq(n_rows)

# show the first five elements
# most data structures in R can be 'sliced', i.e. using numeric indices
#   to select a subset of items
key[1:5]

# create lists of strings that will become column names
# paste() concatenates strings with a separator character in between them
num_vars <- paste('numeric', seq_len(n_vars), sep = '')
num_vars

char_vars <- paste('char', seq_len(n_vars), sep = '')
char_vars

# initialize a data.frame with the key variable
scratch_df <- data.frame(INDEX = key)

# add n_var numeric columns, each with n_row rows, to the data.frame
# each column contains random uniform numeric values generated by runif()
# replicate() replicates n_row length lists of numeric values n_vars times
scratch_df[, num_vars] <- replicate(n_vars, runif(n_rows))

# head() displays the top of a data structure
head(scratch_df)

# add n_var character columns, each with n_row rows, to the data.frame
# create a list of strings from which to generate random text variables
# sapply() applies a function to a sequence of values
# LETTERS is a character vector containing uppercase letters
# an anonymous function is defined that replicates a value 8 times with no
#   separator character
# replicate() generates n_vars columns, each holding n_rows values sampled
#   randomly (with replacement) from text_draw using the sample() function
text_draw <- sapply(LETTERS[1:7],
                    FUN = function(x) paste(rep(x, 8), collapse = ""))
text_draw

scratch_df[, char_vars] <- replicate(n_vars,
                                     sample(text_draw, n_rows, replace = TRUE))
head(scratch_df)

# convert from standard data.frame to dplyr table
# dplyr is a popular, intuitive, and efficient package for manipulating data sets
# R has many data types: http://www.statmethods.net/input/datatypes.html
scratch_tbl <- tbl_df(scratch_df)

# use the dplyr::glimpse function to see a summary of the generated data set
glimpse(scratch_tbl)

### plotting variables in the table ###########################################
# ggplot allows you to overlay graphics using the '+' operator
# plot univariate densities of numeric1 and char1 using the geom_bar()
#   components
# ggtitle() adds a title
# coord_flip rotates the bar chart

ggplot(scratch_tbl, aes(numeric1)) +
  geom_bar(stat = "bin", fill = "blue", bins = 100) +
  ggtitle('Histogram of Numeric1')

ggplot(scratch_tbl, aes(char1)) +
  geom_bar(aes(fill=char1)) +
  ggtitle('Histogram of Char1') +
  coord_flip()

### subsetting the table ######################################################

# subset variables using dplyr::select
# subset a range of variables with similar names and numeric suffixes
# subset all the variables whose names begin with 'char'
# subset variables by their names
num_vars <- select(scratch_tbl, num_range('numeric', 1:n_vars))
head(num_vars)

char_vars <- select(scratch_tbl, starts_with('char'))
head(char_vars)

mixed_vars <- select(scratch_tbl, one_of('numeric1', 'char1'))
head(mixed_vars)

# subset rows using multiple dplyr functions
# subset rows using their numeric indices
# subset top rows based on the value of a certain variable
# subset rows where a certain variable has a certain value
some_rows <- slice(scratch_tbl, 1:10)
some_rows

sorted_top_rows <- top_n(scratch_tbl, 10, numeric1)
sorted_top_rows

AAAAAAAA_rows <- filter(scratch_tbl, char1 == 'AAAAAAAA')
head(AAAAAAAA_rows)

### updating the table ########################################################
# dplyr, as a best practice, does not support in-place overwrites of data

# transform (a base R function, not part of dplyr) enables the creation of new
#   variables from existing variables
scratch_tbl2 <- transform(scratch_tbl,
                          new_numeric = round(numeric1, 1))
head(scratch_tbl2)

# dplyr::mutate enables the creation of new variables from existing
#   variables and computed variables
scratch_tbl2 <- mutate(scratch_tbl,
                       new_numeric = round(numeric1, 1),
                       new_numeric2 = new_numeric * 10)
head(scratch_tbl2)

# dplyr::transmute enables the creation of new variables from existing
#   variables and computed variables, but keeps only newly created variables
scratch_tbl2 <- transmute(scratch_tbl,
                          new_numeric = round(numeric1, 1),
                          new_numeric2 = new_numeric * 10)
head(scratch_tbl2)

### sorting the table #########################################################
# sort tables using dplyr::arrange
# sort by one variable
# sort by two variables

sorted <- arrange(char_vars, char1)
head(sorted)

sorted2 <- arrange(char_vars, char1, char2)
head(sorted2)

### adding data to the table ##################################################
# add data to a table using dplyr::bind_rows/bind_cols and dplyr::left_join
# bind smashes tables together
# join combines tables based on matching values of a shared variable

bindr <- bind_rows(sorted, sorted2)
nrow(bindr)

bindc <- bind_cols(sorted, sorted2)
ncol(bindc)

# create two tables to join on a key variable
sorted_left <- arrange(select(scratch_tbl, one_of('INDEX', 'char1')), char1)
right <- select(scratch_tbl, one_of('INDEX', 'numeric1'))

# Perform join
# joined table contains `char1` from the left table
#   and `numeric1` from the right table
#  matched by the value of `INDEX`
joined <- left_join(sorted_left, right, by = 'INDEX')
head(joined)

### comparing tables ##########################################################
# comparing tables using dplyr::all.equal
# dplyr::all.equal will test tables for equality despite the order of rows
#   and/or columns
# very useful for keeping track of changes to important tables

# Create a table for comparison
test <- select(scratch_tbl, one_of('INDEX', 'numeric1', 'char1'))

# Compare
print(all.equal(joined, test, ignore_row_order = FALSE))
print(all.equal(joined, test, ignore_col_order = FALSE))
print(all.equal(joined, test))

### summarizing tables ########################################################
# combine rows of tables into summary values with dplyr::summarise and
#   dplyr::summarise_each
# summarize one variable using summarise, avg is the name of the created var
# summarize many variables using summarise_each, funs() defines the summary
#   function

ave <- summarise(num_vars, avg = mean(numeric1))
ave

all_aves <-summarise_each(num_vars, funs(mean))
all_aves

### by group processing #######################################################
# By groups allow you to divide and process a data set based on the values of
#   one or more variables
# dplyr::group_by groups a data set together based on the values of a certain
#   variable
# operations can then be applied to groups
grouped <- group_by(joined, char1)
grouped <- summarise(grouped, avg = mean(numeric1))
grouped

### Transposing a table #######################################################
# Transposing a matrix simply switches row and columns values
# Transposing a data.frame or dplyr table is more complex because of metadata
#   associated with variable names and row indices

transposed = t(scratch_tbl)
glimpse(transposed)

# Often, instead of simply transposing, a data set will need to be reformatted
# in a melt/stack-column split-cast action described in Hadley Wickham's
# 'Tidy Data' https://www.jstatsoft.org/article/view/v059i10
# see also tidyr::gather() and tidyr::spread()

### exporting and importing the table #########################################
# the R core function write.table enables writing text files
# the similar R core function read.table enables reading text files

# export
# use the sep option to specify the column delimiter character
# row.names = FALSE indicates not to save the row number to the text file
filename <- paste(git_dir, 'scratch.csv', sep = '/')
write.table(scratch_tbl, file = filename, quote = FALSE, sep = ',',
            row.names = FALSE)

# import
import <- read.table(filename, header = TRUE, sep = ',')
```

#### R data.table - [view notebook](src/notebooks/r/R_Part_1_data.table.ipynb)
```r

###############################################################################
# Copyright (C) 2017 - 2023 J. Patrick Hall, jphall@gwu.edu
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

### data.table is an efficient package for manipulating data sets #############
# data.table is implemented in optimized C and often attempts to update
#   items by reference to avoid copying large amounts of data
# data.table is a subclass of data.frame and generally accepts data.frame
#   syntax
# general form of a data.table is dt[i, j, by]
#   i is row index, indexed from 1 ...
#   j is col index, indexed from 1 ...
#   by is by-group var name

library(data.table)

# enter the directory location of this file within single quotes
git_dir <- '/path/to/GWU_data_mining/01_basic_data_prep/src/raw/r'

# set the working directory
setwd(git_dir)
getwd()

### generate a sample data set ################################################

# set the number of rows and columns for the sample data set
n_rows <- 1000
n_vars <- 3

# create a key variable with one unique integer per row
# seq_len() matches the column-name code below and, unlike seq(), yields an
#   empty vector rather than c(1, 0) when the count is 0
key <- seq_len(n_rows)

# create lists of strings that will become column names
num_vars <- paste('numeric', seq_len(n_vars), sep = '')
char_vars <- paste('char', seq_len(n_vars), sep = '')

# create a list of strings from which to generate random text variables
# each of the letters A-G is repeated 8 times, e.g. 'AAAAAAAA'
text_draw <- sapply(LETTERS[1:7],
                    FUN = function(x) paste(rep(x, 8), collapse = ""))

# create a sample data.table
# columns are the key, n_vars uniform random numeric variables, and n_vars
#   character variables drawn with replacement from text_draw
scratch_dt <- data.table(key,
                         replicate(n_vars, runif(n_rows)),
                         replicate(n_vars, sample(text_draw, n_rows,
                                                  replace = TRUE)))

# the data.table::set* family of methods in data.table always updates items
#   by reference for efficiency
setnames(scratch_dt, c('key', num_vars, char_vars))
scratch_dt

### plotting ##################################################################
# data.table enables simple plotting for numeric variables

scratch_dt[,plot(numeric1, numeric2)]

### subsetting the table ######################################################

### by column

# selecting a single column results in a vector
class(scratch_dt[,char1])
length(scratch_dt[,char1])

# multiple columns can be selected

# specifying multiple columns by a vector results in a concatenated vector
class(scratch_dt[,c(numeric1, char1)])
length(scratch_dt[,c(numeric1, char1)])

# specifying multiple columns by list results in a data.table
class(scratch_dt[,list(numeric1, char1)])
scratch_dt[,list(numeric1, char1)]

# '.' is an alias for 'list'
class(scratch_dt[,.(numeric1, char1)] )
scratch_dt[,.(numeric1, char1)]

# computed columns
scratch_dt[1:5, round(numeric1, 1)] # compute standalone vector
scratch_dt[, .(new_numeric = round(numeric1, 1))] # assign name

### by row

scratch_dt[3:5] # use numeric indices/slicing
scratch_dt[3:5,]
scratch_dt[char1 == 'DDDDDDDD']
scratch_dt[char1 %in% c('DDDDDDDD', 'EEEEEEEE')]

# .N holds the number of rows; used as a row index it selects the last row
scratch_dt[.N]
scratch_dt[,.N]

### sorting the table #########################################################

# data.table::setorder reorders the rows of a data.table by reference
# setorder() returns the table it modified, so `sorted` is another name for
#   scratch_dt here rather than an independent copy
sorted <- setorder(scratch_dt, char1)
sorted

# using order() in the row index does NOT modify scratch_dt; it returns a
#   new data.table whose rows are sorted
sorted <- scratch_dt[order(char1)]
sorted

# sort orders can be specified by using order()
# a leading '-' sorts that variable in descending order
sorted2 <- scratch_dt[order(char1, -numeric1)]
sorted2

# data.table::setkey reorders rows by reference by the specified key
#  variable (here called 'key') and sets the variable to the key of the
#  data.table for future operations
# subsetting and selecting by the key variable will be more efficient
#  in future operations
# setkey() also returns the table it modified, so `sorted3` is another name
#  for scratch_dt
sorted3 <- setkey(scratch_dt, key)
sorted3

### updating the table ########################################################

# update rows by reference using the := operator
# data.table supports overwrite of data
# := modifies scratch_dt itself and returns that same table, so scratch_dt2
#   is a second name for scratch_dt (and for sorted3), not a copy
# use data.table::copy() when an independent table is needed
scratch_dt2 <- scratch_dt[key > 500, char1 := 'ZZZZZZZZ']
scratch_dt2

# create new columns by reference using the := operator
# because scratch_dt2 refers to the same table as scratch_dt, new_numeric
#   is added to scratch_dt as well
scratch_dt2[, new_numeric := round(numeric1, 1)]
scratch_dt2  

### adding data to the table ##################################################

# use data.table::rbindlist to stack data.tables vertically
bindr <- rbindlist(list(sorted, sorted2))
nrow(bindr)

# data.table::merge joins tables side-by-side using a common key variable
# sorted and sorted2 have no prespecified key (i.e. set by using
#   data.table::setkey), so the join variable is named with the by argument
# The suffix '.x' is added by default to left table variable names that also
#   appear in the right table
# The suffix '.y' is added by default to the matching right table variable
#   names
joined1 <- merge(sorted, sorted2, by = c('key'))
joined1

# joining data.tables with prespecified keys does not require that a key be
#   specified when data.table::merge is called
# Add a key to the scratch_dt2 table
# selecting columns with .() creates a new table, so from here on scratch_dt2
#   is no longer the same table as scratch_dt
scratch_dt2 <- setkey(scratch_dt2[,.(key, char1, new_numeric)], key)
scratch_dt2

# Now sorted3 and scratch_dt2 can be joined without specifying a key
joined2 <- merge(sorted3, scratch_dt2)
joined2

### by group processing #######################################################
# by groups allow you to divide and process a data set based on the values
#   of a certain variable
# general form of a data.table is dt[i, j, by]
#   by is by group variable name

# sum new_numeric within each level of char1, first over all rows and then
#   over only the first 500 rows
scratch_dt2[, sum(new_numeric), by = char1]
scratch_dt2[1:500, sum(new_numeric), by = char1]

# .N returns the number of rows in each by group
scratch_dt2[, .N, by = char1]

# by groups can also be a list
# scratch_dt (not scratch_dt2) is used here because it still contains char2;
#   it contains new_numeric because that column was added by reference above
scratch_dt[, mean(new_numeric), by = .(char1, char2)]

# .SD represents all the variables except the by variable(s)
# for scratch_dt2 that is key and new_numeric
scratch_dt2[, lapply(.SD, sum), by = char1]

# .N can be used to find the first and last rows of each by group
scratch_dt2[, .SD[c(1, .N)], by = char1]

### operations can be chained #################################################

# chaining
scratch_dt2[, .(new_numeric2 = sum(new_numeric)), by = char1][new_numeric2 > 40]

# no chaining
scratch_dt3 <- scratch_dt2[, .(new_numeric2 = sum(new_numeric)), by = char1]
scratch_dt3[new_numeric2 > 40]

### Transposing a table #######################################################
# Transposing a matrix simply switches row and columns values
# Transposing a data.frame or data.table is more complex because of metadata
#   associated with variable names and row indices

transposed = t(scratch_dt)
str(transposed)

# Often, instead of simply transposing, a data set will need to be reformatted
# in a melt/stack-column split-cast action described in Hadley Wickham's
# 'Tidy Data' https://www.jstatsoft.org/article/view/v059i10
# see also dcast.data.table and melt.data.table

### exporting and importing the table #########################################
# fread and fwrite allow for optimized file i/o
# fwrite only available in data.table version > 1.9.7
# available from http://Rdatatable.github.io/data.table

# use fwrite to write a file
fwrite(scratch_dt, 'scratch_dt.csv')

# use fread to read a file
scratch_dt <- fread('scratch_dt.csv')
head(scratch_dt)
```

#### Base SAS and PROC SGPLOT - [clone/download notebook](src/notebooks/sas)
```sas

******************************************************************************;
* Copyright (C) 2015 by SAS Institute Inc., Cary, NC 27513 USA               *;
*                                                                            *;
* Licensed under the Apache License, Version 2.0 (the "License");            *;
* you may not use this file except in compliance with the License.           *;
* You may obtain a copy of the License at                                    *;
*                                                                            *;
*   http://www.apache.org/licenses/LICENSE-2.0                               *;
*                                                                            *;
* Unless required by applicable law or agreed to in writing, software        *;
* distributed under the License is distributed on an "AS IS" BASIS,          *;
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   *;
* See the License for the specific language governing permissions and        *;
* limitations under the License.                                             *;
******************************************************************************;

******************************************************************************;
* NOTE: examples are meant for the free SAS University Edition               *;
* to install see: http://www.sas.com/en_us/software/university-edition.html  *;
******************************************************************************;

******************************************************************************;
* SECTION 1: Hello World! - Standard SAS Output                              *;
******************************************************************************;

* the _null_ data step allows you to execute commands;
* or read a data set without creating a new data set;
data _null_;
	put 'Hello world!';
run;

* print the value of a variable to the log;
* VERY useful for debugging;
data _null_;
	x = 'Hello world!';
	put x;
	put x=;
run;

* file print writes to the open standard output;
* usually html or listing;
data _null_;
	file print;
	put 'Hello world!';
run;

* logging information levels;
* use these prefixes to print color-coded information to the log;
data _null_;
	put 'NOTE: Hello world!';
	put 'WARNING: Hello world!';
	put 'ERROR: Hello world!';
run;

* you can also use the put macro statement;
* SAS macro statements are often used for program flow control around DATA;
*   step statements and SAS procedures;
* This tutorial will only use simple macro statements;
%put Hello world!;
%put NOTE: Hello world!;
%put WARNING: Hello world!;
%put ERROR: Hello world!;

%put 'Hello world!'; /* macro variables are ALWAYS strings */

* the macro preprocessor resolves macro variables as text literals;
* before data step code is executed;
%let x = Hello world!;
%put &x;
%put '&x'; /* single quotes PREVENT macro resolution */
%put "&x"; /* double quotes ALLOW macro resolution */

******************************************************************************;
* SECTION 2 - SAS data sets                                                  *;
******************************************************************************;

*** sas data sets ************************************************************;

* the sas data set is the primary data structure in the SAS language;
* now you will make one called scratch;
* The size of data set is more typically defined by the size of the SAS data
*   set(s) from which it is created;

%let n_rows = 1000; /* define number of rows */
%let n_vars = 5;    /* define number of character and numeric variables */

* options mprint; /* to see the macro variables resolve uncomment this line */
data scratch;

  /* data sets can be made permanent by creating them in a library */
  /* syntax: data <library>.<table> */
  /* a library is like a database */
  /* a library is usually directly mapped to a filesystem directory */  
	/* since you did not specify a permanent library on the data statement */
	/* the scratch set will be created in the temporary library work */
	/* it will be deleted when you leave SAS */

	/* SAS is strongly typed - it is safest to declare variables */
	/* using a length statement - especially for character variables */
	/* $ denotes a character variable */

	/* arrays are a data structure that can exist during the data step */
	/* they are a reference to a group of variables */
	/* horizontally across a data set */
	/* $ denotes a character array */
	/* do loops are often used in conjunction with arrays */
	/* SAS arrays are indexed from 1, like R data structures */

	/* a key is a variable with a unique value for each row */

	/* mod() is the modulo function */
	/* the %eval() macro function performs math operations */
	/* before text substitution */

	/* the drop statement removes variables from the output data set */

	/* since you are not reading from a pre-existing data set */
	/* you must output rows explicitly using the output statement */

	length key 8 char1-char&n_vars $ 8 numeric1-numeric&n_vars 8;
	text_draw = 'AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD EEEEEEEE FFFFFFFF GGGGGGGG';
	array c $ char1-char&n_vars;
	array n numeric1-numeric&n_vars;
	do i=1 to &n_rows;
		key = i;
		do j=1 to %eval(&n_vars);
			/* assign a random value from text_draw */
			/* to each element of the array c */
			c[j] = scan(text_draw, floor(7*ranuni(12345)+1), ' ');
			/* assign a random numeric value to each element of the n array */
			/* ranuni() requires a seed value */
			n[j] = ranuni(%eval(&n_rows*&n_vars));
		end;
	  if mod(i, %eval(&n_rows/10)) = 0 then put 'Processing line ' i '...';
		drop i j text_draw;
		output;
	end;
	put 'Done.';
run;

* (obs=) option enables setting the number of rows to print;
proc print data=scratch (obs=5); run;

*** basic data analysis ******************************************************;

* use proc contents to understand basic information about a data set;
proc contents data=scratch;
run;

* use proc freq to analyze categorical data;
proc freq
	/* nlevels counts the discrete levels in each variable */
	/* the colon operator expands to include variable names with prefix char */
	data=scratch nlevels;
	/* request frequency bar charts for each variable */
	tables char: / plots=freqplot(type=bar);
run;

* use proc univariate to analyze numeric data;
proc univariate
	data=scratch;
	/* request univariate statistics for variables names with prefix 'numeric' */
	var numeric:;
	/* request histograms for the same variables */
	histogram numeric:;
	/* inset basic statistics on the histograms */
	inset min max mean / position=ne;
run;

*** basic data manipulation **************************************************;

* subsetting columns;
* create scratch2 set;
data scratch2;
	/* set statement reads from a pre-existing data set */
	/* no output statement is required - this is more typical */
	/* using data set options: keep, drop, etc. is often more efficient than */
	/* corresponding data step statements */
	/* : notation */
	set scratch(keep=numeric:);
run;

* print first five rows;
proc print data=scratch2(obs=5); run;

* overwrite scratch2 set;
data scratch2;
    /* ranges of vars specified using var<N> - var<M> syntax */
	set scratch(keep=char1-char&n_vars);
run;

* print first five rows;
proc print data=scratch2(obs=5); run;

* overwrite scratch2 set;
data scratch2;
	/* by name */
	set scratch(keep=key numeric1 char1);
run;

* print first five rows;
proc print data=scratch2(obs=5); run;

* subsetting and modifying columns;
* select two columns and modify them with data step functions;
* overwrite scratch2 set;
data scratch2;
	/* use length statement to ensure correct length of trans_char1 */
	/* the lag function saves the value from the row above */
	/* lag will create a numeric missing value in the first row */
	/* tranwrd finds and replaces character values */
	set scratch(keep=key char1 numeric1
		rename=(char1=new_char1 numeric1=new_numeric1));
 	length trans_char1 $8;
	lag_numeric1 = lag(new_numeric1);
	trans_char1 = tranwrd(new_char1, 'GGGGGGGG', 'foo');
run;

* print first five rows;
* notice that '.' represents numeric missing in SAS;
proc print data=scratch2(obs=5); run;

* subsetting rows;
* select only the first row and impute the missing value;
* create scratch3 set;
data scratch3;
	/* the where data set option can subset rows of data sets */
	/* there are MANY other ways to do this ... */
	set scratch2 (where=(key=1));
	lag_numeric1 = 0;
run;

* print;
proc print data=scratch3; run;

* subsetting rows;
* remove the problematic first row containing the missing value;
* from scratch2 set;
data scratch2;
	set scratch2;
	if key > 1;
run;

* print first five rows;
proc print data=scratch2(obs=5); run;

* combining data sets top-to-bottom;
* add scratch3 to the bottom of scratch2;
proc append
	base=scratch2  /* proc append does not read the base set */
	data=scratch3; /* for performance reasons base set should be largest */
run;

* sorting data sets;
* sort scratch2 in place;
proc sort
	/* no out= option is given, so the sorted rows replace scratch2 */
	data=scratch2;
	by key; /* you must specify a variable to sort by */
run;

* print first five rows;
proc print data=scratch2(obs=5); run;

* sorting data sets;
* create the new scratch4 set;
proc sort
	data=scratch2
	out=scratch4; /* specifying an out set creates a new data set */
	by new_char1 new_numeric1; /* you can sort by many variables */
run;

* print first five rows;
proc print data=scratch4(obs=5); run;

* combining data sets side-by-side;
* to create messy scratch5 set;
data scratch5;
	/* merge simply attaches two or more data sets together side-by-side*/
	/* it overwrites common variables - be careful */
	merge scratch scratch4;
run;

* print first five rows;
proc print data=scratch5(obs=5); run;

* combining data sets side-by-side;
* join columns to scratch from scratch2 when key variable matches;
* to create scratch6 correctly;
data scratch6;
	/* merging with a by variable is safer */
	/* it requires that both sets be sorted */
	/* then rows are matched when key values are equal */
	/* very similar to SQL join */
	merge scratch scratch2;
	by key;
run;

* print first five rows;
proc print data=scratch6(obs=5); run;

* don't forget PROC SQL;
* nearly all common SQL statements and functions are supported by PROC SQL;
* join columns to scratch from scratch2 when key variable matches;
* to create scratch7 correctly;
proc sql noprint; /* noprint suppresses procedure output */
	create table scratch7 as
	select *
	from scratch
	join scratch2
	on scratch.key = scratch2.key;
quit;

* print first five rows;
proc print data=scratch7(obs=5); run;

* comparing data sets;
* results from data step merge with by variable and PROC SQL join;
* should be equal;
proc compare base=scratch6 compare=scratch7;
run;

* export data set;
* to default directory;
* to create a csv file;
proc export
	data=scratch7
	/* likely the correct directory for SAS University Edition */
	outfile='/folders/myfolders/sasuser.v94/scratch.csv'
	/* create a csv */
	dbms=csv
	/* replace an existing file with that name */
	replace;
run;

* import data set;
* from default directory;
* from the csv file;
* to overwrite scratch7 set;
proc import
	/* import from scratch.csv */
	/* likely the correct directory for SAS University Edition */
	datafile='/folders/myfolders/sasuser.v94/scratch.csv'
	/* create a sas table in the work library */
	out=scratch7
	/* from a csv file */
	dbms=csv
	/* replace an existing data set with that name */
	replace;
run;

* by group processing;
* by variables can be used in the data step;
* the data set must be sorted;
* create scratch8 summary set;
data scratch8;
	set scratch4;
	by new_char1 new_numeric1;
	retain count 0; /* retained variables are remembered from row-to-row */
	if last.new_char1 then do; /* first. and last. can be used with by vars */
		count + 1; /* shorthand to increment a retained variable */
		output; /* output the last row of a sorted by group */
	end;
run;

* using PROC PRINT without the data= option prints the most recent set;
proc print; run;

* by group processing;
* by variables can be used efficiently in most procedures;
* the data set must be sorted;
proc univariate
	data=scratch4;
	var lag_numeric1;
	histogram lag_numeric1;
	inset min max mean / position=ne;
	by new_char1;
run;

* transpose;
proc transpose
	data=scratch
	out=scratch8;
run;

* print;
proc print; var _NAME_ col1-col5; run;

* transposing a sas data set can be a complex process;
* because of metadata associated with variable names;

* often, instead of simply transposing, a data set will need to be reformatted;
* in a melt/stack - column split - cast action described in Tidy Data by;
* Hadley Wickham: https://www.jstatsoft.org/article/view/v059i10;
* see also:;
*  https://github.com/sassoftware/enlighten-apply/tree/master/SAS_UE_TidyData;

******************************************************************************;
* SECTION 3 - generating analytical graphics                                 *;
******************************************************************************;

*** histograms using PROC SGPLOT *********************************************;

proc sgplot
	/* sashelp.iris is a sample data set */
	/* binwidth - bin width in terms of histogram variable */
	/* datalabel - display counts or percents for each bin */
	/* showbins - use bins to determine x-axis tickmarks */
	data=sashelp.iris;
	histogram petalwidth /
		binwidth=2
		datalabel=count
		showbins;
run;

*** bubble plots using PROC SGPLOT *******************************************;

proc sgplot
	/* group - color by a categorical variable */
	/* lineattrs - sets the bubble outline color and other outline attributes */
	data=sashelp.iris;
	bubble x=petalwidth y=petallength size=sepallength /
		group=species
		lineattrs=(color=grey);
run;

*** scatter plot with regression information using PROC SGPLOT ***************;

proc sgplot
	/* clm - confidence limits for mean predicted values */
	/* cli - prediction limits for individual predicted values */
	/* alpha - set threshold for clm and cli limits */
	data=sashelp.iris;
	reg x=petalwidth y=petallength /
	clm cli alpha=0.1;
run;

*** stacked bar chart using PROC SGPLOT **************************************;

proc sgplot
	/* sashelp.cars is a sample data set */
	/* vbar variable on x-axis */
	/* group - splits vertical bars */
	/* add title */
	data=sashelp.cars;
	vbar type / group=origin;
	title 'Car Types by Country of Origin';
run;
```

#### SAS PROC SQL - [clone/download notebook](src/notebooks/sas)
```sas

******************************************************************************;
* Copyright (C) 2017 - 2023 by J. Patrick Hall, jphall@gwu.edu                      *;
*                                                                            *;
* Permission is hereby granted, free of charge, to any person obtaining a    *;
* copy of this software and associated documentation files (the "Software"), *;
* to deal in the Software without restriction, including without limitation  *;
* the rights to use, copy, modify, merge, publish, distribute, sublicense,   *;
* and/or sell copies of the Software, and to permit persons to whom the      *;
* Software is furnished to do so, subject to the following conditions:       *;
*                                                                            *;
* The above copyright notice and this permission notice shall be included    *;
* in all copies or substantial portions of the Software.                     *;
*                                                                            *;
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS    *;
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,*;
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *;
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *;
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING    *;
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER        *;
* DEALINGS IN THE SOFTWARE.                                                  *;
******************************************************************************;

******************************************************************************;
* simple SQL operations demonstrated using SAS PROC SQL                      *;
* a *VERY BASIC* introduction to SQL                                         *;
******************************************************************************;

******************************************************************************;
* NOTE: examples are meant for the free SAS University Edition               *;
* to install see: http://www.sas.com/en_us/software/university-edition.html  *;
* Refer to part 0                                                            *;
******************************************************************************;

*** simulate some small example tables using SAS data step *******************;
* table1 has a primary key called key and two numeric variables: x1 and x2;
* table1 is located in the SAS work library, it could be called work.table1;
data table1;
	do key=1 to 20;
		x1 = key * 10;
		x2 = key + 10;
		output;
	end;
run;
proc print; run;

* table2 has a primary key called key and two character variables: x3 and x4;
* table2 is located in the SAS work library, it could be called work.table2;
data table2;
	do key=2 to 20 by 2;
		x3 = scan('a b c d e f g h i j', key/2);
		x4 = scan('k l m n o p q r s t', key/2);
		output;
	end;
run;
proc print; run;

******************************************************************************;
* SAS PROC SQL allows users to execute valid SQL statements;
* often called queries, from SAS;
* in a more typical SQL environment the proc sql and quit statements;
* would be unnecessary and unrecognized in a query;

proc sql;

 	* display basic information about table1 in the SAS log;
 	* in SQL parlance work is the database and table1 is the table;
	describe table work.table1;

quit;

proc sql;

	* display the variable x1 from table1;
	select x1 from work.table1;

quit; 	

* the NOPRINT option can be used to suppress output;
* very important for large tables;
proc sql /* noprint */;

	* create table3 in the work library/database;
	* x1 from table1 will be named x5 in the new table;
	* the SQL statement as creates a temporary name or alias;
	create table table3 as
	select key, x1 as x5
	from table1;

quit;

proc sql;

	* a where clause is used to subset rows of a table;
	* the order by statement sorts displayed results or created tables;
	* desc refers to descending sort order;
	create table table4 as
	select key, x2 as x6
	from table1
	where key <= 10
	order by x6 desc;

quit;

proc sql;

	* insert can be used to add data to a table;
	insert into table1
	values (21, 210, 31);

quit;

proc sql;

	* update can be used to change the value of previously existing data;
	update table1
	set key = 6, x1 = 60, x2 = 16
	where key = 7;

quit;

proc sql; 	

	* an inner join only retains rows from both tables;
	* where key values match;
	create table table5 as
	select *
	from table1
	join table2
	on table1.key = table2.key;

quit;

proc sql;

	* left joins retain all the rows from one table;
	* and only retain rows where key values match from the other table;
	* aliases can also be used for tables;
	create table table6 as
	select *
	from table1 as t1 /* left table */
	left join table2 as t2 /* right table */
	on t1.key = t2.key;

quit;

proc sql;

	* the where statement cannot be used with aggregate functions;
	* instead use the having statement;
	* where sum_x1 > 100 would cause errors in this query;
	create table table7 as
	select key, sum(x1) as sum_x1
	from table1
	group by key
	having sum_x1 > 100;

quit;

proc sql;

	* a subquery is a query embedded in another query;
	select *
	from
	(select key, x1, x2
	from table1
	where key <= 10);

quit;
```


================================================
FILE: 01_basic_data_prep/assignment/.gitignore
================================================
raw
assignment_1.docx
key


================================================
FILE: 01_basic_data_prep/notes/.gitignore
================================================
*.pptx


================================================
FILE: 01_basic_data_prep/quiz/.gitignore
================================================
key


================================================
FILE: 01_basic_data_prep/src/notebooks/py/.gitignore
================================================
.ipynb_checkpoints
scratch.csv

================================================
FILE: 01_basic_data_prep/src/notebooks/py/Py_Part_0_pandas_numpy.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# License \n",
    "***\n",
    "Copyright (C) 2017 J. Patrick Hall, jphall@gwu.edu\n",
    "\n",
    "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n",
    "\n",
    "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n",
    "\n",
    "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "# Python: Part 0 - Pandas and Numpy\n",
    "\n",
    "## 1. Standard output\n",
    "`print` is the primary function used to write to the console in Python\n",
    "* `print` is a *function* in Python 3\n",
    "* `print` is a *statement* in Python 2 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello World!\n",
      "Hello World!\n"
     ]
    }
   ],
   "source": [
    "print('Hello World!') # Python 3\n",
    "print 'Hello World!'  # Python 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello World!'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# An object with no functions or operators is also printed to the console\n",
    "x = 'Hello World!'\n",
    "x"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 2. Importing libraries \n",
    "\n",
    "Python contains many libraries, often called *modules*, for different purposes\n",
    "\n",
    "Modules are:\n",
    "* Nearly always free and open source\n",
    "* Installed using many different methods - a package manager like `conda`, readily available through the Anaconda release of Python (https://www.continuum.io/downloads) - is often a good solution for installing and managing packages/modules \n",
    "* Of relatively high and uniform quality, but licensing can vary\n",
    "* Imported using the `import` statement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import packages\n",
    "import string                   # module with string utilities\n",
    "import pandas as pd             # large module with many utilities for dataframes, here aliased as 'pd' \n",
    "import numpy as np              # large module with many numeric and mathematical utilities, here aliased as 'np'\n",
    "import matplotlib.pyplot as plt # module for plotting\n",
    "\n",
    "# \"magic\" syntax to display matplotlib graphics in a notebook\n",
    "# magic statements start with '%' and are often used to control notebook behavior\n",
    "%matplotlib inline             "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 3. Generating a sample data set\n",
    "#### Set the number of rows and columns for the sample data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_rows = 1000\n",
    "n_vars = 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create lists of strings that will become column names\n",
    "* Lists are a common data structure in Python\n",
    "* Lists are surrounded by square brackets [] and contain different data types as list elements\n",
    "* Lists can be created using a specific type of Pythonic syntax, called list comprehensions\n",
    "* Lists in Python are indexed from 0, unlike SAS or R\n",
    "* Lists in Python, and other data structures, can be sliced using numeric indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['numeric1', 'numeric2']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list comprehension\n",
    "# str() converts to string\n",
    "# range() creates a list of values from arg1 up to, but not including, arg2\n",
    "num_col_names = ['numeric' + str(i+1) for i in range(0, n_vars)]  \n",
    "num_col_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(num_col_names) # type() can be used to determine the class of an object in Python"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Python supports anonymous functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['char1', 'char2']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# anonymous functions\n",
    "# the lambda statement is used to define simple anonymous functions\n",
    "# map() is very similar to lapply() in R - it applies a function to the elements of a list\n",
    "char_col_names = map(lambda j: 'char' + str(j+1), range(0, n_vars)) \n",
    "char_col_names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create a list of text elements from which to sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['AAAAAAAA',\n",
       " 'BBBBBBBB',\n",
       " 'CCCCCCCC',\n",
       " 'DDDDDDDD',\n",
       " 'EEEEEEEE',\n",
       " 'FFFFFFFF',\n",
       " 'GGGGGGGG']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# string.ascii_uppercase is a string constant of uppercase letters\n",
    "print(string.ascii_uppercase)\n",
    "\n",
    "# another list comprehension\n",
    "# slice first seven letters of the string\n",
    "text_draw = [(letter * 8) for letter in string.ascii_uppercase[:7]] \n",
    "text_draw"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create random numerical columns directly using numpy\n",
    "The numerical columns will originally be a 2-D numpy array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.846671  ,  1.84830227],\n",
       "       [-0.70740383, -1.00412281],\n",
       "       [-0.09483552, -0.25116307],\n",
       "       [-0.12577991, -1.23737785],\n",
       "       [ 0.38218289, -1.7115725 ]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "randoms = np.random.randn(n_rows, n_vars)\n",
    "randoms[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.ndarray"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(randoms)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create numerical columns of Pandas dataframe from numpy array\n",
    "Notice that a key is generated automatically "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2\n",
       "0 -1.846671  1.848302\n",
       "1 -0.707404 -1.004123\n",
       "2 -0.094836 -0.251163\n",
       "3 -0.125780 -1.237378\n",
       "4  0.382183 -1.711572"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_cols = pd.DataFrame(randoms, columns=num_col_names)\n",
    "num_cols.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(num_cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create random character columns as a Pandas dataframe\n",
    "* Use a numpy sampling function `choice()` to generate a 2-D numpy array of random text values\n",
    "* Create Pandas dataframe from numpy 2-D array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      char1     char2\n",
       "0  EEEEEEEE  AAAAAAAA\n",
       "1  DDDDDDDD  DDDDDDDD\n",
       "2  AAAAAAAA  DDDDDDDD\n",
       "3  DDDDDDDD  FFFFFFFF\n",
       "4  CCCCCCCC  FFFFFFFF"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "char_cols = pd.DataFrame(np.random.choice(text_draw, (n_rows, n_vars)), \n",
    "                         columns=char_col_names)\n",
    "char_cols.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Use Pandas `concat()` function to join the numeric and character columns into a new dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2     char1     char2\n",
       "0 -1.846671  1.848302  EEEEEEEE  AAAAAAAA\n",
       "1 -0.707404 -1.004123  DDDDDDDD  DDDDDDDD\n",
       "2 -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "3 -0.125780 -1.237378  DDDDDDDD  FFFFFFFF\n",
       "4  0.382183 -1.711572  CCCCCCCC  FFFFFFFF"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df = pd.concat([num_cols, char_cols], axis=1)\n",
    "scratch_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 4. Plotting variables in a dataframe\n",
    "Pandas has several builtin plotting utilities\n",
    "#### Use Pandas `hist()` method to plot a histogram of numeric1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xc3b33c8>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEKCAYAAAASByJ7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGaFJREFUeJzt3X2UZHV95/H3RxARMSBqYHhQUGGz+AQqE41mHdeA6CLg\n2SziwxGVTTyaVdZ1NzJg4iSuiboRzSbR1RXdUQOKEFlc9cBAaPXEjUQFBAcENg4yCIMPCKOgDsx3\n/7i/lpqe6u6ama6+3T3v1zl1pu6t+/Dtnq77qd/v/ureVBWSJD2o7wIkSQuDgSBJAgwESVJjIEiS\nAANBktQYCJIkwEDQmCW5Nsm/6ruOPiV5SZJbkmxM8tS+69lWSV6R5OK+69D4GQjabknWJXn+lHmv\nTvKVyemqelJVfXmW7RycZHOSpfr3+BfAG6rq4VV19dQX28/+rSQZmPdfk3xsXqucRlX9bVW9YLbl\nkjwpycVJfpBk83zUprm1VN+Amh/VHnMlsy+yHRtNdhnHdkfcd4DHAGtnWXQZcPLA9IL4xug2/u5+\nCXwKOHVM5WjMDATNtS0OZK0V8a/b8+VJvp7kriS3J/mLtthkC+InrVvlN9N5W1t/Q5LVSX5tYLuv\nSnJzkh8OLDe5n1VJzk/yiSR3AackOSrJ/01yZ5LvJ/mrJA8e2N7mJK9PcmOSu5P8aZLHt3V+kuRT\ng8tP+RmH1prkIcBGYBfg6iQ3zvB7ew/wJwMH4MHWwookt8zwe12V5DPt5727tTYOTbKy1XNzkqMH\n1t0rydnt97A+yTsmW2ethfcPSc5K8kNg1dRWX5InJlmT5Eft/3ElQFXdUFUfY/bw0wJlIGhHTf1U\nP3V6MCD+EnhfVe0FPA74TJv/2+3fvVq3yteA1wCnACvasnsCfw2Q5HDgb4CX0X2y3gvYf8p+jwc+\n0/Z1DnA/cBrwSOBZwPOBN0xZ5xjgSOCZwFuB/9n28Rjgye35MENrrapfVNWebZmnVNWh06wP8Fng\nbuDVMywzaGoL4jjg48AjgCuBNW3+/sA7gA8NLPu/6D7NP57u5z0G+PcDry8H/h/w68A7B3eS5OHA\npcAX6H73TwAuG7FmLXAGgnZEgAvbp+47k9xJd6Cerrvjl8ChSR5VVfe0A//kdqZ6BfDeqlpXVT8D\nVgInt0/QvwtcVFVfrapNwB8P2edXq+oigKr6eVV9s6quqKrNVXUz8GHguVPWeU9V/bSq1gLXAF9s\n+78b+CLdwXOY6WrdlvfXZuCPgD+ariUyiy9X1Zqquh84ny743tWmPw0c3Fot+wIvBN5cVfdW1Q+A\n97Nld9X3q+pv2u/q51P2c1x7/X1V9cv2+7piO+rVAmQgaEcUcEJVPWLyQfepe7pzAacChwHXJbki\nyb+ZYdvLgJsHpr8H7Ars215b/6siqu4FfjRl/fWDE0kOS/J/ktzWupHeSXfQHLRh4Pm9Q6b3ZLiZ\nah1ZVX2x1f06tv0cwh0Dz+8FflgPXLny3vbvnsBjgQcDtw2E+P8AHj2w/hbdU1McBPzzNtamRcJA\n0Fyb9sRwVd1UVS+vqkcD7wbOT/JQhh/8vg8cPDD9GOA+4HbgNuDAX+2w28bUg/vUbX6Qrm/7Ca0b\n6Uzm7u9/ulo3DF16ZmcCZwB7DMz72eB0ayU9mu1zC/AL4JEDQb5XVT15YJmZwuh7dN1iWoIMBM2b\nJK9MMnkgu4vuwLMZ+EH79/EDi58LvDndkNQ9gT8DPlVVm4ELgBcneVaS3YBVzD5CaU+6E7z3JPkN\n4PWjlDzN86lmqnWbVNWXgGvpzklMugHYPcmLWnfS24CHbOu22/ZvAy4Bzkry8CQPaifPR/2uyOeB\nZUlOS/KQto3lky8m2R3YrT1/SDuxrkXCQNBcm2ko6guAa5NsBN4HnNxOvN5D14XzD60bYznwUeAT\ndCOQ/hm4B3gjQFV9uz3/FN2n8410XS
a/mKGG/wy8nO7E7YfbuoPLDKt56uvT/VzT1jrDtqfbD3QH\n/H0m51fVXXRdcR+h61L6KVt26wyrbabpV9EdtNcCP6Y7ub/fLNuarGUjcDTwYrqW2g10J9NJcjDd\nz35tW/5e4LqhP7EWpIzrBjntk8KX6D7J7Ab876pamWQfupNcjwXWASdV1U/aOiuB19KNCHlTVV0y\nluK0pLRP5XfSdQfdPNvykoYbWwuhjU54XlUdATwFeF6S5wCnA2uq6jC64Wqnw6+GEr4UOBw4FvjA\nNo7S0E4kyYuT7JHkYXTfBP6WYSDtmLEecFtXAHQthF3oPsUdD6xu81cDJ7bnJwDnVtWmqloH3EQ3\nHloa5njg1vZ4PFsOm5S0HcYaCO2E1VV0oy0ub32/+1bV5OiLDTwwNG9/thwquB44YJz1afGqqt9r\nI2T2rqqjq2qmbwFLGsGu49x4G2VxRJK9gIuTPG/K65VkppMYC+J6LpK0MxhrIEyqqruSfB54OrAh\nyX5VdXuSZTzwhZpb6b70MunANm8LswSIJGkaVTXz8OyqGssDeBSwd3v+ULohec+nu4jXW9v80+m+\nXg/dyeSr6M43HEJ3LZUM2W6Nq+Y5/vlX9V3DUqlzMdRonda50B+jHDvH2UJYBqxuI4UeBHyiqi5L\nciVwXpJTacNOW6Vrk5xHNzb6Prrrx9sakKR5MrZAqKprgKcNmf9j4HemWefP6L7lKUmaZ47zH5+J\nvgsY0UTfBYxgou8CRjTRdwEjmui7gBFN9F3AiCb6LmCujO2byuOSpGq2EyOSpC2Mcuy0hSBJAgwE\nSVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaC\nJAkwECRJjYEgSQIMBElSYyBIkgADQZLU7Np3AdJilKT62O9sN0mXdoSBIG23+c4Es0DjZZeRJAkw\nECRJjYEgSQIMBElSM7ZASHJQksuTfDvJtUne1OavSrI+yZXt8cKBdVYmuTHJ9UmOGVdtkqStpWo8\nIyWS7AfsV1VXJdkT+AZwInASsLGqzpqy/OHAOcBRwAHApcBhVbV5ynLl0Dv1rRt2Ov+jjPzb1/Ya\n5dg5thZCVd1eVVe15z8FrqM70MPw8XMnAOdW1aaqWgfcBCwfV32SpC3NyzmEJAcDRwL/2Ga9McnV\nSc5Osnebtz+wfmC19TwQIJKkMRt7ILTuovOB01pL4YPAIcARwG3Ae2dYvZdvg0rSzmis31RO8mDg\nAuCTVXUhQFXdMfD6R4DPtclbgYMGVj+wzRu23VUDkxNVNTF3VUvS4pdkBbBim9YZ40nlAKuBH1XV\nmwfmL6uq29rzNwNHVdXLB04qL+eBk8pPqCkFelJZC4EnlbXYjHLsHGcL4dnAK4FvJbmyzTsDeFmS\nI+jeTd8FXgdQVWuTnAesBe4D3jA1DCRJ4zO2FsK42ELQQmALQYtNr8NOJUmLi4EgSQIMBElSYyBI\nkgADQZLUGAiSJMBAkCQ1Y710hTRfuu8FSNoRBoKWkPnMBL8fpqXHLiNJEmAgSJIaA0GSBBgIkqTG\nQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJg\nIEiSGgNBkgQYCJKkZmyBkOSgJJcn+XaSa5O8qc3fJ8maJDckuSTJ3gPrrExyY5LrkxwzrtokSVtL\nVY1nw8l+wH5VdVWSPYFvACcCrwF+WFXvSfJW4BFVdXqSw4FzgKOAA4BLgcOqavOU7VZVZSxFa9FK\nUjCev+Vp9sj87q/bp3/72l6jHDvH1kKoqtur6qr2/KfAdXQH+uOB1W2x1XQhAXACcG5VbaqqdcBN\nwPJx1SdJ2tK8nENIcjBwJPA1YN+q2tBe2gDs257vD6wfWG09XYBIkubBruPeQesuugA4rao2Jg+0\nWK
qquqb+tIa+lmTVwOREVU3MQamStGQkWQGs2JZ1xhoISR5MFwafqKoL2+wNSfarqtuTLAPuaPNv\nBQ4aWP3ANm8rVbVqTCVL0pLQPihPTE4nefts64xzlFGAs4G1VfX+gZcuAk5pz08BLhyYf3KS3ZIc\nAhwKXDGu+iRJWxrnKKPnAF8GvsUDXT8r6Q7y5wGPAdYBJ1XVT9o6ZwCvBe6j62K6eMh2HWWkrTjK\nSJrZKMfOsQXCuBgIGsZAkGbW67BTSdLiYiBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAkYI\nhCRPno9CJEn9GqWF8MEk/5TkDUn2GntFkqRezBoIVfUc4BV01x76ZpJzvb2lJC09I1/LKMmudHc3\n++/AXXRhckZVXTC+8obW4bWMtJWd5VpG88332tIxyrFz1vshJHkq8GrgOGANcFxVfTPJ/sA/0t3v\nQNK8mO/Q085k1hZCki/R3dfg/Kq6Z8prr6qqj4+xvmH12ELQVnaeFsL8/oy+15aOObn8dbsF5r1V\ndX+b3gXYvap+NmeVbgMDQcMYCOPZn++1pWOuLn99KfDQgek96LqOJElLyCiBsHtV/XRyoqo20oWC\nJGkJGSUQfpbk6ZMTSZ4B3Du+kiRJfZh1lBHwH4HzktzWppcBLx1fSZKkPoz0PYQkuwH/gu6M1neq\natO4C5uhFk8qayueVB7P/nyvLR1zMsqobei3gEPoWhQFMN/DTQdqMRC0FQNhPPvzvbZ0zNUX0z4J\nPA64Crh/4KVeAkGSNB6jnEN4OnB4jXqNC0nSojTKKKNr6U4kS5KWsFFaCI8G1ia5AvhFm1dVdfz4\nypIkzbdRAmFV+7d44GpXdh9J0hIz6iijg4EnVNWlSfYAdq2qu8dc23S1OMpIW3GU0Xj253tt6ZiT\naxkl+X3gM8CH2qwDgc/ueHmSpIVklJPKfwA8B7gboKpuAH59lI0n+WiSDUmuGZi3Ksn6JFe2xwsH\nXluZ5MYk13tXNkmaX6MEwi+qavJk8uSd00Ztt34MOHbKvALOqqoj2+OLbbuH010S4/C2zgeSjFKf\nJGkOjHLA/VKSM4E9khxN1330uVE2XlVfAe4c8tKwfqwTgHOralNVrQNuApaPsh9J0o4bJRBOB34A\nXAO8DvgC8LYd3O8bk1yd5Owke7d5+wPrB5ZZDxywg/uRJI1o1mGn7U5pH26PufBB4E/b83cA7wVO\nnW73w2YmWTUwOVFVE3NUmyQtCUlWACu2ZZ1RrmX03SGzq6oety07GljxjoFtf4QHup9uBQ4aWPTA\nNm/YNlZtz74laWfRPihPTE4nefts64zyxbSjBp7vDvwu8MhtrO1Xkiyrqsl7K7yErisK4CLgnCRn\n0XUVHQpcsb37kSRtm5G+mLbVSsk3q+ppIyx3LvBc4FHABuDtdE2YI+i6g74LvK6qNrTlzwBeC9wH\nnFZVFw/Zpl9M01b8Ytp49ud7bemYk/shtNtnTi70IOAZwOur6qlzUuU2MhA0jIEwnv35Xls65uR+\nCHQnfSf/Cu8D1gEn7VhpkqSFZru6jPpkC0HD2EIYz/58ry0dc3XHtLew9V/hr656WlVnbWd9kqQF\nZNQ7ph1FNwoowHHAPwE3jLEuSdI8G+Wk8leAF1XVxjb9cOALVfXb81DfsHrsMtJW7DIaz/58ry0d\nc3L5a7orm24amN7EiFc7lSQtHqN0GX0cuCLJ39F9RDkRWD3WqiRJ827UO6Y9ne6eCABfrqorx1rV\nzLXYZaSt2GU0nv35Xls65qrLCGAPYGNV/SWwPskhO1ydJGlBGeUWmquAP6S7DDbAbsAnx1iTJKkH\no7QQXkJ385qfAVTVrcDDx1mUJGn+jXoLzc2TE0keNsZ6JEk9GWWU0WeSfAjYO8nv012N9CPjLUuL\nXXeSV9JiMuMooyShu2nNbwDHtNkXV9WaeahtupocZbQILP1RP44y
0uKyw5e/boFwTVU9aa6L214G\nwuJgICyFfRoIS8kODzutLi2+kWT5nFYmSVpwRrmW0XeAJwA300Ya0WXFU8Zc23T12EJYBGwhLIV9\n2kJYSnbo8tdJHlNV3wNeQPdX6B+GJC1h07YQklxZVUe25xdU1b+d18qmYQthcbCFsBT2aQthKZnL\nS1c8bg7qkSQtYKMGgiRpiZupy+h+4J42+VDg3oGXq6p+bcy1DWWX0eJgl9FS2KddRkvJDp1Urqpd\n5r4kSdJCZZeRJAkwECRJjYEgSQIMBElSYyBIkoAxB0KSjybZkOSagXn7JFmT5IYklyTZe+C1lUlu\nTHJ9kmOGb1WSNA7jbiF8DDh2yrzTgTVVdRhwWZsmyeHAS4HD2zofSGILRpLmyVgPuFX1FeDOKbOP\nB1a356uBE9vzE4Bzq2pTVa0DbgK87LYkzZM+PoHvW1Ub2vMNwL7t+f7A+oHl1gMHzGdhkrQzG+We\nymNTVTXLvXeHvpZk1cDkRFVNzGVdkrTYJVkBrNiWdfoIhA1J9quq25MsA+5o82+lu3/zpAPbvK1U\n1arxlihJi1v7oDwxOZ3k7bOt00eX0UXAKe35KcCFA/NPTrJbkkOAQ4EreqhPknZKY20hJDkXeC7w\nqCS3AH8MvAs4L8mpwDrgJICqWpvkPGAtcB/whprt/p6SxmqWLt0559VV+zXrPZUXGi9/vTh4+eul\nsE8vt72UzOUd0yRJS5yBIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElS\nYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJ\nMBAkSY2BIEkCDARJUmMgSJIA2LWvHSdZB9wN3A9sqqrlSfYBPg08FlgHnFRVP+mrRknamfTZQihg\nRVUdWVXL27zTgTVVdRhwWZuWJM2DvruMMmX6eGB1e74aOHF+y5GknVffLYRLk3w9ye+1eftW1Yb2\nfAOwbz+lSdLOp7dzCMCzq+q2JI8G1iS5fvDFqqok1VNtkrTT6S0Qquq29u8PknwWWA5sSLJfVd2e\nZBlwx7B1k6wamJyoqolx1ytJi0mSFcCKbVqnav4/hCfZA9ilqjYmeRhwCfAnwO8AP6qqdyc5Hdi7\nqk6fsm5V1dRzD1pgutbdfP5thaW9vz72Of/78709PqMcO/tqIewLfDbJZA1/W1WXJPk6cF6SU2nD\nTnuqT5J2Or20EHaELYTt08/5mKX9adYWwtzvz/f2+CzkFoJ6Md8HE0mLSd/fQ5AkLRAGgiQJMBAk\nSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElS49VOJS0Y832Zdi+3\nvSUDQdIC4iXa+2SXkSQJMBAkSY2BIEkCPIfQi37ubyxJMzMQetPHDdolaXp2GUmSAANBktQYCJIk\nwECQJDUGgiQJMBAkSY2BIEkCFmAgJDk2yfVJbkzy1r7rkbR0Jan5fvT9M89kQQVCkl2AvwaOBQ4H\nXpbkX/Zb1fZJsqLvGkYz0XcBI5jou4ARTfRdwIgm+i5gRBPzsI+ag8fl27DswragAgFYDtxUVeuq\nahPwKeCEnmvaXiv6LmA0E30XMIKJvgsY0UTfBYxoou8CRjTRdwEjmui7gDmz0C5dcQBwy8D0euA3\nx7nDJIcCLx7Dpp+V5D+NYbuSNBYLLRD6aFM9GXjvmLZ9zJi2K0lzLlULp18ryTOBVVV1bJteCWyu\nqncPLLNwCpakRWS2W4YutEDYFfgO8Hzg+8AVwMuq6rpeC5OkncCC6jKqqvuS/AfgYmAX4GzDQJLm\nx4JqIUiS+rPQhp2OLMlbkmxOsk/ftQyT5B1Jrk5yVZLLkhzUd03DJPlvSa5rtf5dkr36rmmYJP8u\nybeT3J/kaX3XM9Vi+EJlko8m
2ZDkmr5rmUmSg5Jc3v6/r03ypr5rmirJ7km+1t7fa5P8ed81zSTJ\nLkmuTPK5mZZblIHQDq5HAzf3XcsM3lNVT62qI4ALgbf3XdA0LgGeWFVPBW4AVvZcz3SuAV4CfLnv\nQqZaRF+o/BhdjQvdJuDNVfVE4JnAHyy032dV/Rx4Xnt/PwV4XpLn9FzWTE4D1jLLSM5FGQjAWcAf\n9l3ETKpq48DknsAP+6plJlW1pqo2t8mvAQf2Wc90qur6qrqh7zqmsSi+UFlVXwHu7LuO2VTV7VV1\nVXv+U+A6YP9+q9paVd3Tnu5Gd87zxz2WM60kBwIvAj7CLPfSXXSBkOQEYH1VfavvWmaT5J1Jvgec\nAryr73pG8FrgC30XsQgN+0LlAT3VsqQkORg4ku7DyoKS5EFJrgI2AJdX1dq+a5rG+4D/AmyebcEF\nNcpoUpI1wH5DXjqTrktj8Atfvd09foY6z6iqz1XVmcCZSU6n+095zbwW2MxWZ1vmTOCXVXXOvBY3\nYJQ6FyhHZoxBkj2B84HTWkthQWkt6yPaebeLk6yoqomey9pCkuOAO6rqylGur7YgA6Gqjh42P8mT\ngEOAq5NA173xjSTLq+qOeSwRmL7OIc6hx0/es9WZ5NV0Tcrnz0tB09iG3+dCcyswOGjgILpWgrZT\nkgcDFwCfrKoL+65nJlV1V5LPA89g4V3Y6LeA45O8CNgd+LUkH6+qVw1beFF1GVXVtVW1b1UdUlWH\n0L3pntZHGMymXSNp0gnAlX3VMpMkx9I1J09oJ8oWg95ahdP4OnBokoOT7Aa8FLio55oWrXSf9s4G\n1lbV+/uuZ5gkj0qyd3v+ULpBLgvuPV5VZ1TVQe14eTLw99OFASyyQBhiITfV/zzJNa2PcQXwlp7r\nmc5f0Z30XtOGpX2g74KGSfKSJLfQjTr5fJIv9l3TpKq6D5j8QuVa4NML8QuVSc4FvgocluSWJL10\nYY7g2cAr6UbuXNkeC2101DLg79v7+2vA56rqsp5rGsWMx0y/mCZJAhZ/C0GSNEcMBEkSYCBIkhoD\nQZIEGAiSpMZAkCQBBoIkqTEQJEkA/H/6x1EDVGX6JwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x33c1dd8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Pandas allows slicing a dataframe by index using ix[]\n",
    "# ix[:, 0] means all rows of the 0th column - or numeric1\n",
    "scratch_df.ix[:, 0].plot.hist(title='Histogram of Numeric1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Use Pandas `scatter()` method to plot numeric1 vs. numeric2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xc4f7a20>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Anaconda\\lib\\site-packages\\matplotlib\\collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  if self._edgecolors == str('face'):\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEZCAYAAAB/6SUgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJztvXucJldZ7/t9Opee6e709CUhFyYXSCADSTAGt3swSEYx\nGY5HQTJ6PAoYgsSDgJ0wHQ17QGRLDLghIQTUnEScxK2o20u4qOSyMaNujno+TIgEQmCQECARcCZk\nhsxkZjLdz/5j1epaVbXqrXov3fW+/T6/z2d9ut+qVauetarqWWs9V1FVDAaDwTBcGGmaAIPBYDCs\nPIz5GwwGwxDCmL/BYDAMIYz5GwwGwxDCmL/BYDAMIYz5GwwGwxDCmL9hVUBEflhEHmqajtUAETlN\nRL4nItI0LYblgzF/Qy2IyNdE5NsiMhYce72I3NskXR6q+o+quqGqnoicJCIfF5FHRWRRRE5bCfpy\nNOwQkadEZH1w7MdE5OGVpiUGVf26qh6nFU5AIrJRRO4RkT0i8h0R+R8ictJK0WnoDsb8De1gBLiy\naSLyEJGj26i+CPwtsGWZyKmL/cCvN0xDAW2O5RRwM3B6Ur4HbF8Ougy9hzF/Q10o8D7gahFZlz8p\nImckK+mR4NgOEfnF5P/XisinReQGEfmuiHxFRH5IRC4Xka8nu4pfCK4dFZH3icgjIvItEfk9EVmT\nnNskIt8UkV8TkX8HPpwc+0Zw/aki8lfJinS3iHwQQFW/o6o3A5+p6rCIXCMif5479gER+UDQp38T\nkX0i8lUR+fk2xvIm4OdE5Nkl914Mz4nIbSLyrlz/fzXp32Mi8lMi8uMi8uVkJf7W4FoRkbcmY75b\nRP5MRKaTc/65vU5EHgH+p4icHj5LEZkRke3JbulxEbkjGcs7VfUvVfVJVX0K+B3gwppjYGgYxvwN\n7eAzwA7g6pr1NSkePwj8KzAD/AnwP4ALgDOBVwMfCsRK7wHOAr4v+ftM4B1BWycC08BpwP8T3lRE\njgL+GngYtyJ9JvCnNWkO8SfAj4vIRNDuzwB/LCLjwAeAl6nqJPAi4P422n4UuBX4rzXr58fyRGAU\nOBk3Lr8PvAr4fuCHgXeIyOlJ3Tng5cBLkvrfxTHqEC8BNgCbgbys/78Da4DnA88Abiih8SXA52v2\nx9A0VNWKlcqCY6Q/CpwDPAEcD7weuDc5fwZOpDISXHMv8Lrk/9cCXw7OnZfUPyE4tht4AY75PAk8\nOzj3IuCryf+bgEPAscH5TcA3grrfCWmJ9Ofo5P6nVfT7H4HXJP9fDHwl+X8cx0QvBda2OZb3Aq9L\nxvAJHFP9MeDhoM5irv/bgXcFfT0ASPL7uKT+fwrqfwZ4efL/F4EfDc6dDBzGLf78czsjOL/0LJO6\nC8C6ij69ANgDXNj0u2qlXrGVv6EtqOoXcKvqt5JdidbBt4P/n0ra+4/csQngBGAM2JmIiL4LfBLH\nLD3+Q1UPl9znVOARVV1sk74YPgL8XPL/zwN/nNC9H/hZ4A3AYyLy1yJydjsNq+pu4EPAb9L+WO7R\nhOuSjCXF8Z1I/j8duCMYyweBI7jdg8c3iONU4HFV3VtGiIichdOjzKnqp9vrhqEpGPM3dILfAK7A\niVM89id/x4JjnVp+7MYxr+er6nRSptSJVzxaMctvAKclYppu8RfAJhF5JvBTuMnAEaB6t6peguvn\nQzgxTrt4L/AjwAtzxw+QHcuTaX+C8Pg6Tjw1HZQxVf33oE5Z298AZmJ6HoBEtHQP8Juq+scd0mdo\nAMb8DW1DVf8N+DMCy59kBf8o8BoROUpEXoeT5XfS/iKOkd4oIicAiMgzReSSmk38/8C/A+8RkTER\nWSMiP+RPJorjNcnPNV6RXELLf+D0HLfhxE5fStp4hoi8IpH9P42b/Bba6KYk7e8FrgeuyZ2/H3hV\nMpYvw8nTO8XNwHXerFVEThCRl9e5MJ
kgPgn8rohMicgxIvKSpJ1nAn8HfEhVb+mCPkMDMOZv6BS/\niVuZhivGK4Bfxa3cnw+EIoC8wpLI7xDXAF8B/llE9uJWl8+tuFYBVHUB+EmcovjruNXr/xXUOwDs\nS+o/RLprKcNHgJcSrPpx385bcBPeHpyS9ZdhyeHsexVthvR/ACeGCY9dmfThuzhx0x0tro/9DvEB\n4OPA3SKyD/gnnPK91bXhsdfgJriHcKKlueT464FnAe8U5xT2vaR9wwBAUrFhQwS4rflngG+q6k82\nSozBYDAMCfph5X8lTgFlKcUMBoNhhdAo8xfn3v7jOBtliyNiMBgMK4SmV/7vx8mIe2GSZzAYDIaa\naIz5i8hPAN9R1c9iq36DwWBYUTSm8BWR63BWBEdwZneTwF+qahjfxfQABoPB0AFUteWiunFrHwAR\nuQi4Om/tIyJa1YF+gIi8U1Xf2TQdVTA6e4tBoHMQaASjs9eowzublvmHaH4WMhgMhiFBO7G7lw2q\n+vfA3zdNh8FgMAwL+mnlP8jY0TQBNbGjaQJqYkfTBNTEjqYJqIEdTRNQEzuaJqAmdjRNQK/QFzL/\nMgyKzN9gMBj6CYMm8zcYDAbDCsGYv8HQxxCRzSKzd7sim5umx7B6YGKfAYVjBDPz7tfj16vqXc1S\nZOg13DOevANuWuuOzD0F+15pz9pQhTq8sy+sfQztIWUKN3im8GIRMaaw6jAz757xZf7AWtg6D9hz\nNnQNY/4DCWMKBoOhOxjzNxj6Fo9fD3MvBkKxz/WNkmRYNTCZ/wDCZMHDA9PtGDpBHd5pzH9AYUzB\nYDCUwZi/wWAwDCHMyctgMBgMURjzNxgGCOb0ZegVTOxjMAwITNFvqAtz8jIYVhXMv8PQO5jYx2Aw\nGIYQtvI3GAYG5vRl6B1M5m8wDBDMv8NQB2bnbzAYDEOIvrbzF5E1IvIvInK/iDwoIu9uihaDwWAY\nNjQm81fVgyLyI6p6QESOBv6XiLxYVf9XUzQZDAbDsKBRax9VPZD8eyxwFPB4g+QYDAbD0KBR5i8i\nIyJyP/Bt4F5VfbBJegwGg2FY0PTKf1FVzwfWAy8RkU1N0mMwGAzDgr6w81fVvSLyN8APADvCcyLy\nzuDnDlXNnDcYDIZhR7Jw3tTWNU2ZeorI8cARVX1CRNbiXNT/q6p+Kqhjpp4Gg8HQJvo9ts/JwO0i\nMoITP/33kPEbDAaDYflgTl4Gg8GwytDXTl6G9mBx3A0GQy9hK/8BgMVxNxgM7aDfZf6G2rA47gaD\nobcwsY/BYDAMIWzlPxAYrjjuFrbYYFh+mMx/QDAsDNH0GwZD97B4/oa+RdlkJjJ7N9xwcarfuB3Y\neo/qnkuaodRgGDyYqaehL5Gu7m+42JXJO4bVfNVMeA1NwWT+hgbQynppePQbwSTo+/piEel7Edew\niCBXO4z5G1YMAdO4AB6I1lHVu0TklclkAOxbxcylPRPefmC6gzphGYow5m9YEUSYRnLmPPKr+4SR\nGDMJ0D9M13xOVguM+RtWCAWmAWzdA9zXzeq+H1bDnaMdEZcxXUNvYczf0CTu68aKp39Ww51hMEVc\nw6OTWe0wU0/DimA57Pf70Sx0uXYi/eT/MNi7reGAxfYx9A2qVrmrgaEs506kn3YJppNZHbCVv6Fx\ndLqq7afVsKOn/3Yi/YTlnOBXw+Khl7CVv2FA0Jkys59Ww61gjGl5d0WDrvtpCsb8DQON/hJBRJWh\nOzphTKtvwlhOayWzhOoExvwNfYD+tSBphwnHdiKdMCZbyaZYfZNgH0FVGyvAqcC9wBeAzwNzufPa\nJH1WVvRd2Awzd7vC5qbpSWmaPAC3qSuTB9qlzfXnNgVNym0KM3f3+pp+L52MZd1revGcVlupwzub\nJvAk4Pzk/wngS8Dz2umAleEpyzVBlLXbCyYcYUwHYXxnqz6sRubfyfNrZxz6cfHQ8FhrVZ1GxT6q\n+i
3gW8n/T4rIF4FTgC82SZeh/7BcopDlFrGoEwVdC1u3gh4Lh9bA/3tB7F6piOPQLMwdAkaTen0j\nBusGuoz6meVse9Wi6RkqmKnOAB4BJtqZvawMR1mu1XC83alkZT6+063UuxH75Ff+xyvcWehDsd7Y\nwZSO/l7Jsqw7ss7FOctF1yCUOryzLxS+IjIB/AVwpao+mTv3zuDnDlXdsYKkGVYxkpX2BXAzTgK5\nFE7/fLghyXXxxkNw1X0wsqddU9Kk/T8uxjS6JbxXgoJieBS27qnyEWhaIdqvjm3DpjQXkU3AprYu\n6oMZ6hjcdu2qTmYvK5nxWrUrHXqs1Cu2d7zCvMLkgvurXe0w0vY3anFnsbHQh84Uw80rOuvQ3cR7\nuVr1Jm28f1pVp9GVv4gI8GHgQVW9sUlaBh2drHSaXjW2A+25Q1dZlNFD34VPnwVbgF9aOtP+WPn2\nTyJ7j7lDsPAFd6+wD52Yu/a/ffuwrcAHCg3PTi8GFoH7gc8m5WXtzF5W/Fi1t9KhD1aNFe9G16vF\nVm2Uy/pDGf/xieydbe2bKYbt35ms9md2l13naJ3a6eqM73S/W49BP6xuq96jpmjs9/d7BfqvlXWa\nJrLbDljxY9Uu82+ecbR47j2wr69iSrHz4zvLlb/LJ5IpqVs54fQLg2t/kl2Z96wXC4hBLcb8h6i0\nywj6m/l3Rlv2Y48x8tay6LL7ugmgOCm0R0/bz2J3nTHodwbXLxPUsJU6vLMvrH1WCoMk424X2rZM\nvH9DKnSCiGx5sSxPsIcGtuHu+kOzyXUj8GngoUV4YgeMb3FpJ29OrnwQWKh8n7SF7Xn22sXZer1M\n66XXz0TvXYWV+hbafy8NK4amZ6huZ6822lp1KxC6XPV1e/3y9qsbGbv61bm2L3bZojCZu250F8wE\nx2bUHYuJasrEH5mxzol0xg5G/Am2legfNrcanzrPdDV+C1YKz1gr6zRNZLcdqN9W/4o5Ohybgf+A\nWzGqdiem+PM9V52idWoB2Fbv2ku12M70vnrHphbKGXLmWUXMSX3Ih7S/7tjGhKY7AzFUmXiqTHeQ\nb3d1fQtWot+WVtUZKrHP6kL/m/m1QpUJoFa46+fFFkBOjHU18Ee4hfLtI7B1E3Bdh9Qerndsw0j8\neRSe1UgqQvIYXXLoEpHNLjHM6OlwIfA+14y75gI49Eicztg7cdW7Uoc1N8ZOVGQYdgwR819dMu7B\nR+eTV2zigH2vdGXrPHABXD5b9KItw+M74KqXws0j8EycfN9j7inYdwPMvZ3su5M/tghXjNS8IU6f\ncLtnykvvoohsg8mQYQNfA/4eNwkwC3MTzvM4H/vHT4YhYhOSfQsGhkfsk7TXlzLuzvsyuGKfbkQP\nVde2MzaRugvA9qKopPju5I5td2KfjZp4CrcS+5SIY9icio7Cvk0fLjdBzV9fJV4KYwn137fQr3QN\nWqnDOxsnstsODHMZ5A+ljEHX6VMvQwrE21q3KzXxbB1+uaQvBR1D/X7FwkFUm32m7Y/vDALC1XJO\n65f3aNAXNP1UjPlbWenn1aaSNm9nH2Oiztu1eF1nTKJ4z0orIXWK46LSOGhrd2/iAc3c7do5UfMT\nSfsOa0u7gW2Ovpndefo7GcvlnChMEd3LsUQr6zRNZLcdsNIfpVOGnGUmMWeqjdG2OmFC5eKXycBK\nZ1pTRn6nOhPLDCMuEbHEQzV3NobzmrdSatXfNq1/OvbArWqv24nBmH9Pv0etrNM0kd12YBhKNx/V\nSm3pO/lwi8xuWottXNoRE4j1u5xJhiaVFwY0xMw+vcw81tZStM5S09Kq51FPPFTXM7mOeKwd5l9e\nt+6urd77YGKfbksd3jlE1j6DiW6iIvZ/RMWZebh8rTPJ/G2cR25oaXMNcDtJsrfaKOt3uYnj/m3w\n4B3whrVZGh5r674uC+nNOKuf33uHyPQWlwfAedBG6HqpyMT9sH9b
3WcSt3R6/Nq49U7M+iePXln+\nRM1ZL4AH/7ZuH9W8gVcWTc9Q3c5eq70sp1VMj59VB165UzuLCs55dQ5UU4mlSvvihU7EILk2E0uc\nsUdz3r5KRgwTs6y5M9k9ZPQGC2mb88mO4lJNdzw+a9fUTvd/+2Ka2JjUfSZV41nnGcdFdn7nFBfd\nWVm+Uod3Nk5ktx1Y7WVQmH/yvGqLmFzdsYMshU3I0hhnxvUsWKrFE3VpjDHrvEmpb2t8Z6qsjVns\nTC24kBChDmFKYULdGHgT0XLdQXK/WgHfOnkmNdsoMVEdO1js24Wajp/J71eyGPNfBaWTFXUvrl3+\nfnkGfa2msv46liwxO/iY2WP3/S4PrxCzUhrb5fpxrmb1Bp7GjQrTR9xfz9znFdYFfT8xYJbZvqV9\n8hNEtm/lK/8i429/kq472d6Z0L5RYYOvnzxjY/4r+32hlXWaJrLbDqyGUi3G6H+Fb/t9DlfnxWQn\nAd25lW7UDr5WqOMOmF4+sNrhyM7jIIwezjLkWU3FPvPJ73MDhnhi0OfYJFGV5jE7XiUMOk+nV8Dm\njrdOFN+50vjSXJ86sf7qn/d10Iox/wEoTa7Ou/3Qup+UWsnf/bmYTmAyGkCt4n7b2rkuztDWLcRl\n2+sjTPxchXHNruxP0HQHsTGZFGLioaJjWdkupJzWmHhoo2a9fgumrJFVffl9WzxLhYs0EJWVZjDr\nl29htRVj/gNQupXLd8qAu/3QevGhlosl8qvcAoMqyJ0r2t6WFRfVSauYfy7zCZOPhVqIMf+Z3TD5\naLkSdFpd+OiCH0GZI1Y0vHOLdyjC/EMFrCa/tyicmZQtmXev6r6R8d4Np2tWST6mqcd0J2G5TVTU\nSTHmPwClO4VuN/qA9u6bZ9TxFWEx3kz9frSyW/erx7jCsXpcfMwdL5fOeNC2sIDxO4X5gKGF//tV\n7mieiSsuPlDJ6vvEhNGuV5jY20rsUv6s0kxi9cQ+XtTkx8NPWoW+7Kq4777W456fZFvvLHr1LVgp\nPAutrNMwgX8AfBt4oNMODHpZSQbe6bVFGscOwuRilrHMa1k8+w76H7XoqTtWRQXkem1tiVPa720w\nvpAyMH/NvLqV+0zy/7w6q52TgrGYXHBK4LwFzMk5JlytiK0ndy/TcUztzJnNJnSN73QK6Hy700fK\nJ+El0VGLQHmhWKzcSS7+HhRorQjIZ7qBFt+VVtZpmMAfBr5/mJl/0s8ORTed5ZZN71k38mV+lR81\nZdRO4tuUMbY4M2ttepmld4OmVkR+te5FN/XGLBV7xPoba8ePkf89tstNlF6JO6YpYxtbCMUhZc+j\nXaYY78PUTuc7MZ5rI5aQZn14jxa7hzoLhXaU8/W9g7tZMA1L6XvmnxB5xrAz/87HbnxndmV5vMJ4\nLeafjG/NUAKhovREdcrMmDig/V1Ie6vaib1k0im6/ub6sS3OsL3oKN5GvN/T+1w712pWVDS2kO4i\nMjb5ud9TgXNXKLIa3+ksh5YmhYPZhPN+gp3al3P6WnT01wubwJKuo8yHYHRXUexzrUYm4d2RvtYQ\nEY7vpJiesiOLom7qD2Mx5r/KS92VcIfPpcTU8jZ1K8hoztkOA7vViVjpV+15JrQul0vXi57iIoc6\nY5au+MMJ79rk3hN7YfJwem5G3S5jSuEYTXca08l1qQw9aXsbTD2dioz8BDSx1/2+ULN+D3mGXTQD\nbTGuuUl7Xp1i16/ex3e6CWD6sBP3bMmNl9sRdbPSrrfAMObfe96AVtbpAyJbMn/gnUHZ1DS9/VSW\na/tbvXWfyq+227ajL94vfl3KrMOVd0b8UOIHEFfsZieTeH7fFrLuElPPJYa8CM/Q4mp6KSTEtuy5\nsB/T+9252HiHoqSLogwvP4blfdiYTCijh7MT3NjB7KSWtyhaPhl7u++xiX2iY7Ipxyu18po+INpW\n/t2NX88/yhqmlivmrOOuL3OG
mjyQFZeoZv0AvBJ6KiMmocLmP84412tqf9+KOccik87sdu1O7C1a\nSPkdihebxXYsvv8z6nYYeTv/GDOMTVKz4WQTEdPF6ItNMPWS3Czne7yck9FqKMb8rXT4TFqaWq6I\ns06W0ayLhXTw3q3bHcNdr05sMXmAJTPLsgQmMcaY9q8o9klXwcW+5cUyMQsaT2t+1e93H5OL6SR2\npzpnMF9vVp0oaKOmYSOyFjfxCWl8Z04cpjl5fixGUGncoOKOaSYcb2O+fVb6nvkDf4KLm3sI+AZw\nebsdGPayPCv/bh3AeuG4ljEtPZw1LV0S4eTEKJMK3NXKcsRdF5tMNubbTqxkZnbnFaxJG/sdQx3P\n3X/ioDs2H9K0LT4m0+r0BV5p7ZmrF/9s1OxKfylAWhDWwetlQksrP1n7lI7RaKExHU2p3iYufvOJ\nb5Y3sJyVjr5hrazTNJHddmCYy3LKPrv5aOsy/7J7lK9kC6afkZVqbOW9tFLe5sQrU5qdNI5PVsU+\nKNn4zgh9OasdvzPwyt3JZCW8tFNQGN8Pxz7q6IyJWc7NMVi2xT2IvZx+XluEf9bU5yAzIbYV+K31\nMykTv7Xyk2g/FIeVnny/WlmnaSK77cAwl+W2euh0AohMSgfzMuJsnaxsvv7kEWX+EQblrXpCRnSC\nOjHKSQnTrsybq+lq3lsUhVEsT48wxdBBrMCUM2abLJllxvwHpve5ennTycKktT8+eXT/TiQ7ociO\naV3enHVz6/pmlbMSpQ7vtExehii6yQKmmYxMi7Nw5Bz43QvCdljK/PRNnKP3jSPABTD3t3Dwq/DG\nQ8Cou+aNh+DYWZHpnfA0cAzu75Hvwtxseuc5YD0uA5iHzwb2TuCmkSDTFLAdOADcC9xANgvVW94J\n7z8mW//jwPuAm0dcxq9rcBnIAK4C7sLxcY/pEbiWbBtb98DiI8mYTLo+v/FjcPSxcKMUs5nNAUe+\nDQe2JWM2CicBt+D+vhv4HV93DE5k+fDUAlw5kv6+Gnh6Ecbe4eiC7PN97ki0GUN/oGL2eB7wUmAi\nd/xl/TJ7DXNhWcU+vdlVlLWTypBjkS03BruF2Gp3Xp3oZoPCUUfc/96RaVLhPIXTNPU6Dlfr4X38\nqvz8CA0nRo55q54tmoZsztPtdwnHLLhdRcwvoSpM9bzCqcnYhB65Y7uKcve8wvl8bRW7iI53c/45\neuVzaBG0MUe7V7bnaZ1c6NX7aaWSN2hlnRYXz+GSkn4UeAT4qeDcZ/ulA8Ne2vmY26vbmdy++LtV\n+IYw4Fp4finy5b64GMOfX59jwgWzVHUMc0lOn7PS8U5jJysFr9/z8owrN5HEvJzP1DRSZqig9g5d\nE+pCPJ+kWQVtbAzOjBybXCSa+ezS3HXxqKXdLBbS51hmipof/zCkhldcs73p72VYSh3e2Urs80vA\nC1X1SRE5A/gLETlDVW9sd3dhaAZOdOOTeD++AybfXl+MU53YuygaeuNL4GhSEcAbXwIL34CrFuGB\nETgPmDsEC7OOroWvwoVnxcU03wLOPq51D9cDXw5+34ITy1wWHNv6sOvLzDwsfhHefBacO+nENB8A\n3pDUm8OJgWaT6x8G3osTF30TuBj48CKw39XffFyW7iuBc5P/7wdukiwdv4obyvcG9wM3Jp87lPwe\nTc+dFunv0QJrIscfw43Z1cDxuLF78CnY96rs8y0kWV+bJEuvkSTdvw9XrHX3eQD4NPDQohMH3X6M\nS14fjv/XgA8DG4BfBG79WRH50/J3zrCiaDFzfCH3ewL3krwfuL9fZq9hLrRYyUXOLWRXm9ViHFp6\n3vqYL+EqcIOmIoFr86vwhTTQmT82dtCJdObVraRDMY0XZ+RX817s471jQ0VqbAU9tis+Dq2cqcJV\nvhc/hdY1/p4+LPO0pqEdypTOz4gcW7+0Ok/HeioJuTCxt2jGGrXmCcZ97HAr56s6u7kaz/zuJC
bQ\nYtZLerSmOKu+v4iVrniDVtZpcfG9wPm5Y8cAfwgs9ksHhrm0+pjj5za2/PDbeC7JxBJ+3HdqKoLx\nDLAw2cTiBIUmnNsd05tedJOBl2X7uD5T+2Bif1GOPpaYU07shckj2YkiZtO/TuNWNes1nbiyYpNi\nZNMtuQlgRuEyTSfB/ISVDwvtxUetktOPJeKl03Jj6c1LNyb3L3oxt35urfQBdUNmR+X5OfqrY/u3\nmmysdMMb0Mo6LS4+FTgpclyAF/dLB4a5tM/824+3H7+vZ4QXBQy/zAY8Q1uFB+nYwfgqv+BhG9q3\nH87tJhKG6Zl0NAJp0G64gt6iWfPNJXv/SEyec4MJIGzj2qTtsYUgaufTsGZ/0dST7eXMMnxmsYlq\n7CAtvJhz31GwqxjbFXNcq36fQiY9tbMsXHOOmQeTQVmMKIvRszy8Aa2sU6ORFwGTwe9J4D/3SweG\nubQp9qmV/rDmPXP28hs0rpgtTDYVHqQxBjF9JO5h6/uRj+0TTjp3qlOyxpi8P79R02QvY5pV/E4e\npJACcokuLd89LDmU7XS2936n4RXU52o8jHMsaunUTrejyfeh6PFc7x3xE2rxmtbK+bzHdczaqaVB\nQO1wErYb6Alv0Mo6NRq5HxgJfh+FWfv0TamW0cZDFHR+v7IdRVm4gG49SH3S9Hbk2J5ZhxE+/Wre\ni2RCBnuhZjN+hW1NH45b3ow9GpftT6vrN9uShC2aZbyTmsbomXg6jdkTxvNZChOdjKvfFfmdRDTW\nUYmXbVkgunwIiJjYyYtmygLdhaGnJ/MirMi7mG87NnHnYxLZbqBDvqCVdWo0UlDuAp/rlw5YaTl+\nPfcDKGG2sXAIHQRyy4t9ZjUWqiB7r7K4NWWrzTCJ+W3qdgahriIWP39ci6ag4zsTsU1IX8LIp/c5\nxhgTOW1UN5nMazox5HUnfoLyO4S8bDwamK4N5n+RZkVqVRN27H7epNRPqqN5xXpkZxEzC64TjdS8\ngjv49rWyTo1G7sDZnh0DHIuzaftov3TASqvx6334h6oJpc4EUL76Z7MTcaxPGJRfIWfpL66ER592\nyuCs/JuCfiBMyjJ9BMYWUwYc3mNjUN87Mm1I6DlTXSiHqX2OlqMfdfSeqU7hGzJTb98fJo7xzD+0\n1/cTklea59sIdQ8zu53SuxDQbnv5mObFPhs0OxHGxjeU248dLCaYGdM0llHnTLs4IbR+Z00kVPs7\n1co6NRo5Efgz4DtJ+RPgGf3SASutxm95Yv+0Zt51Vn91LU7KlIr59JWhaaYPfOYZ1/jOlHmdqVmm\n6eXvhXsXtcTKAAAgAElEQVSoc8S6LGH4x2vqoFVYLS86he5tkbZiCuHRhHF6j+RLk3vMaDqZ5emZ\n2Os8g0NT2DF13rwxpzKvq8gHpVuKw7+7nPkXlLCJaeyMuknuVHUOcBsV1hxxbfl2u8sq5579+E5K\ngsEtx052tZaeMP9+74CVluO3oh9LncmmXh0/ucRzwJaLc2Ievlu0aEEUinU2aNyT14eQWJcc26jl\nDHOdZhPG+3NRHUZy/PyEgfv7jmlqvhleM59j7OFu5MxW9ykw0KKoLB8+Oq+E9m35qKXhGBYmtsPZ\n/qQZzNp/V8uS8CxvIMPVVOrwzlIPXxG5RlV/W0Q+GDmtqjoXOW7oIXIeutdrm56RmgmwBrCv7Tba\nw+JsdZ1qJHTj+r74BXgTMLrH0y8y/QjOFTeHmIfvr0WO3QJLAdi+A1yO8079Es6YbRr4IOCd2a/G\n2TmU4XnAKcD/Sdbr96FI3RFSr+KrccHZPC2/AXwuOe7xB1r0Fr4FeDkuDcbtJfc5biTryfum6+Co\n5wUe3ofgQ1+B0TPhDZIcAxYibY1TDFD3LuCm8Fji4RvW2boJuC7SYAR57+PzRmDrnuV9X4cbrcI7\nPJj8/UzuuAC6POQYPLqJqhkiqd/ymrqTTKt67tzYOVnGNX
coHxIiCRPwEtJQBoU6ad8vXwt/A3wb\nWPhKWuOJbTD3Sdy7iGNaV+DCDeRxbORYGA7hJ3F+i0eAH01+zwPXk2Vkv44L0fALZPt4JfB6XDiI\n85J2b0nu8dRXYO6ZpCEyEjrLJqL9wLOBPcn9jgV0gcJ3+ljS1uEDcNVD8NQkbD0rPb8VeEauz6On\n50I7jMLWabhBsseuwoXy8DRfgwvP0A7uwk0EXCAim4vvSacLmuqQI4Y2ULF1OAq4vp+3Lqu1dLvF\npUIxRoVoJdJG3gwwF4/e03ttIo5YrzC6K37fUFk7lpdPJ+apXuySEeMECcXHdzoZ+5kKz1f3/8Te\nuII3DDY2nbQ7vr8Yx9+LVMoc1iaTa70/wAnJ/zHx0lIGse0sJXMZWyxXLo9pNjrpCZqKoPKirAkN\nlK0LwF2pHmFj0tbo4UCkdDCbTMb3cfpwLOQHS6I173MQC7FREPscdPb/UQuiSpk9NRO/VL3XVpbG\nSSvr1GjknwHp1w6s1tIN82/1kRXPt/LUrPI+9blcxxccc4kmRalwJIoxkfGFcuXnkrXProjFy7Yi\n49KEgZ6sWZPOyUV3n5juIB8/J4wjNJX0ucDsF52547pdbqzK4vCE9vzrNFUml+X3Vc06o41FYjTF\n/A280ttP7p4pxyaTMGx0KaNWN8GuW8hGSs1bBcUc31p5nYfvWhgrqL6+oEPesqonkTq8s04yl/uB\nj4nIn+MyX/iG/6rGtYaO4be4D6xNoicCTzxa79pY9MY3XScym2y3x2fhpuT8x2u2cXMkMccpOPn1\n+0Zg4jj4O1zcv6X7jrSOGnkXLoRURnY86mT0u4DnRK5ZnHUiodOS6JLfxEXjnAQOXa568DoReRU8\neAd8a60Tw/wRTnRxeXgfgbc8CeQihz4G/M+n4cB/wFtOhrPFtQHur9cDXAmcjJPZAzxf4MvTcOgR\n+J1E3r4l3zfgVtyYP4ZLhKIjLhroG3L1bo70fRT45ZGieGs8Undkj+qeS0Rm73ZRVi/DRSZ9PZHI\np3uA+0KdkBYS8iwAj+2BvXlRzXUA6X3K3qdWyL9rt4+0py9oD70SqQ466jD/NcDjOIFoiK6Zv4i8\nDPc1HQX8vqr+dsUlQ4Pk4/szuPW1joEAzL1WRHapapsfxQPAUefDDQkDn1t0x8BF7n51UNfLUb1c\n1uPC5DqSNsLQy/+Ok5HHGNbCsxxjADehsQOueqmbTJ4AnhuhdTcutNT9pPL1B0hCKj8HLl7rJo1P\nAPcQjM9ZIrJNVa9zjOvKD8GRs+BZwL8F91iSSZPNGDa3CEe+Cgunw9Qpjhn5PubDFZMcew0uq9iN\nALMwN52ObQwLOGXtHMCI0wF8NFLvSziF7tUJDe9Ljl8NPEU6IV0DnAXZ7F8lsvDNuDDYBdynuueS\n/EEt0Re1ltv/EtkxCmkJZfZLz7PMSOAC9960b+hQjW5CW68iNLgtOQr4CnAGzoHsfuB57W5dVnOJ\nmzTO7K4xtjVENmHcnYxtfEVsoKmdqVjjNk1tz1WLsuGZREwR3iejX9CsqCgvJlmnzi5+3ULWxt07\nT0XDK+zL6SkOZ9suiD0y+YUphJnwIpepiLw+9BaOjW2+P7PqRDxerr+uhKapZFxO0Hims8lH00Q4\nW4I2zk3GZGxX+XMsPIO2zH8j74Xitl6Vpprp9Xlb/th7URRF9fC7WvUmo3V4Z51GzgY+RRLfH3gB\n8PYeEPci4M7g91uBt7bbgdVcOmX+ydhVhAPwzL62J25Ovju1E9btd6EPbtPU1f/chJGdqaknqb9v\nNLKjprLefJYqL2P3TGFanUPUBk0djwrMP6i/brHY3nSEiWciV+4uyq190LeQQZ+g8TSGt2maqnFm\nN3BXyqjz4xFeGwvsFoZuvjPXfj5g2p1alr4x9xzzYZRL4/GUvJOx8B5Kqm+pbCvexlJo791FncZ4\n5bvaJu9Z9c5ivWL+/w
D8Z5Jgbjjzui90SlTQ7k8Dtwa/Xw18sN0OrOZCIZRwe44zQTtdvezp9RmF\n3HZ3bIu6CSBkjD6KZX7HEWP+PtSBDxFdVX9JyaxxD9owYmdscojF1V/aJSzGV+x+AsoHiFva3YQ0\nHGwv1lA4ceR3HLHw1jOaxCPKWVnFQzXUeL9KLWxizNz9HwtZMb2v/vtUvvIunpvXOlZAnX0Tw63w\nrdPIZ5K/nw2OdZ3JC6cNq2T+uDx6vmxqelAbeIjbqBm3vaKdLoKu+Q8+5gmrEaa9tBrcnhM5HC4y\n69M1bTsv/oiJPJby++53f0PT0jX70/qXapzmgpVOwpzDSJk+D8Az1O00YiEgpveTRksNVs8xD9lw\nZxGKN2a03BooNvGdlq+fTMKjh9OcxNfWYv6Olpg4MENrJMJnbEGyRZNoprWSs7RajFAwBa4fwXSY\nC7Apxyu18poajX4Sp1HyK/+fBj7ZA2I3khX7/Bfgmlydyg5YWYkXq0wOvl7T1XB0hR7I0T1jLMR/\n0eyqeoM6cdLUQpFRh6EN1u3KrQiTVIJexl8Wzjmk36dQnLk7FfXkxScnJswtYyZaMGFNxyouYkve\n5yBw3XPVmZ8er25C8JnPPL1luQLyx0YX4sy49Sq++Ex9e61W4BN7S/I2JPfMh2Mu6pLK6MkeDyfI\nqglq9a7eu/tm0co6NRo5Eyfzfwpnn/Zp4IweEHc0zgTjDJwroyl8+7RkV4l55hhz+PGr+FZxfTwD\nPlmdGGOJeS1mV5DjO7O286FoJcyh6x29xpO/G7TIFIsOTSldPoxyNKGMuh3AM7TMeS3tYz7o3FL4\n55xCNL+CP/bRrP4jb/u/TuMK7pgYa/pwdgzHdrkVtFc0+1j6ZakY87L3OxNaysbnfD+eu7PPN+6g\n1/pdqyf2abV7sNIj5h80Ng4c12MC/w+cTdtXgP/SSQesdDTuba+YWJIPR5W2+9LkJucGDCbvuRvq\nDvJWQRPqREDr1DH78bz1USD+Gt1VLoYKdwhbkranFfhyhInc5Rjl9BE4ZiGruwj7V8grcCSWKD1d\n2XvP4wsTmnxCnVAklWdw04dTRzTPpEc1zRjmJ1hPx3xC16xmmbQPV+09svNjfUJAkz/vg9JNPprT\nV2g6GYU7qfzk7593uOuJ7Qa9c1grkVBLZXBO99C9OGi17h56tfKfxnm0vB8X7eqDwE390gErbY9p\nxysmSiNqTuXEL14OXvTadCUmOji/BVMphJZQxyjDNraoW5mfmfzvQzF7Rjm1kJtU7iruDC5UWJNn\n9JpaNIX0emboTUWndjrZez7u/ejTbtUdZgkLGaNXTMeYdJ6OZ6gLM32c5sRQGtFlaCpiy4vaTtJ0\nN5L3lM5HPfUT7gZNE9gv5UTYF4zntqwJZzQ6aUhf1Iu37vvZDfPP7io7N3vt59Ir5v9PwA2k7pGv\nBS7rlw5YiY5b6Wqm2xVT5OOMhBvw9y6szhdTZV5sdZ1vxzPJ2IQTikCujTDKCU1l6EUTyPgEdKam\ntvNnqlOwrokwMk+b/9/vdmLWRXkmnQ/74K+NpYqM+Q/Mq5sECs+w5PpwEvTMfYPSMj9x2LclS6hc\nG2OHg11ZxMZ/Yj+Z7GfRsYnqTuqsxjtdxGSvK9d5DHqpwzvrePiOqurWGvUMfYDldl3XQpjohVk4\n74J47U8Dv002pMLNo84Y4bKg3tW4NcXtuBAEm6nGM0nDJ99MMYzCzUl7/5ynYS1cdR1R7/bDOE/m\nW9WFUX4f8Ju4EA55eq/CGaw9BnwNFxk0760MLqxEeO01wOHvwaE1cPMx8PWEzrMi1xbo+yrc+mx4\nfhBqw3srS6T+LlwIirxn8nbgraOwdSs8NxK2w0c9Db29b8i1cdUDqvuTdyoWjvlND8HBc9zzBlgI\nvMM9NozAl5c8a1PP4RmqPHuL72HdcOUhrZ2EolhFqDGDXI3z2T4Z91RmgJl+mb2s5Mes
9cqegjVF\nURFHG3JQyk0CI6aE8+rEH150kHdqCldjBbFPIFrKW8bE5PSXBu3EVpwXanE1O5GsaEcPuxV5uFq9\nM1kxe/PMvDI2tssIvZ/9vcfVWTP5fp+aHC/sUBYdLeG4epm6rxuKimLB6K4N2oyNzdS+bBIaf9+8\nvqXqnYop8p1pcvAe5Z6ht6Ka2R2cX3YFbpHWeATSLvlW43qEOryzTiNvBvYCj+CClj8MfLVfOmAl\nP2Z1mH8hpHJbqRgjz6nEg3RsV7njlI8Gmqd1veZkyYHCNx/SwLc1pkXG501Cz82dC8VUz9ZUju3r\nT+1LxVXeAihGfyzRuwbM7yR1k0iYhctHBA3pPE+zzCi8dizxFPYOXaFC1XsE5yfX6SPZUBh5B7yZ\npF9TC04f4fs2o0430b4cPj1fCJ2Rr9fiGZabdPaYp0TCXcTNUXvTfjN6hF4x/4eB41ea+LodsFIY\ns4oPtZ1VXPZ8nRVN+ccVxpP37a5byDLH49VPRrF7ZSeVdfud3H7sUfd7dMEpSf2uIJwE5jVdYY5X\nWKT4cNB+he2tb06K1D1T00nD39d7II8mIZ4n9qay99O0uNPJ5xWeVPixpI3QNNOHTA6Za8z0018z\nlYTZ9jqFS9Up1cc1O94hLc4fofy5lj97So0BylJ05utWy997taJezpV5r6yQesAHtKpOHZn/LpyN\nv2EAoCWy0CASY4l8vjXq6xIKERNHnXxcDhdbPW4E9gFXKsiT8PQuOLDNnYvfS0ReCJMXwweSNuZO\ngX23AW+GhXk4PAt/8Bw4+7g08qUPgfx4El1y7g5grYv2WYiG+Qn40mvhPcCJwPfhMokdUgqC9RNw\njukHcdLQDxOkfhQ4/iyX2vH3gDHSyJyXkdL2+AJccZSTP+/BGdftBF6Hywx2DXDFWti+1YVM/ibw\nDuBJ4Plks4pdjQtfvZkkDeJh9/8f4fQeNwO/Q1Z2/27SzGNPUwatyAjnns3sfTilTWU7SaTXoG4+\namw2MmkvdVlVfRka1JhBPoqbAG7BTD1XatZOIh+mW/7u2yt1MGqR6CWU4ddb0cTrTSUJQPJWJxcG\nq9VWu4/Myn1/RFSjkT4cjIghYruKvFw6Jr5Q54RV5m08nazUt2hxZ6AaDw+xMbmHF33F9AUXaeqZ\nPB2Io0K6RnelfgShHmRjci40t43tFEIxVD1HrHrvmRexxUOSlLxn29K+jA9k8vay76cBOrSyTo1G\nXhspl/VLB1ZbYUkm3753ZHmbrZhpPVO7+syfzUXF3ry6jznUNUwFzLMV879WUxFImSdxPpSEt7mP\niWpaiTZaBUlbt9/pFlopqSc1G1vnIi0XaXiRkfdgbRXUbVLhmKfj5pI+RAWbXZ8nEhrP1cQcMzK5\nLbW7WOb13MX7G8j1l0RWdaLGtvTaHRTmX/b9NECDVtZpeqC67cAgllYvBxUxVzq7X3vMv5zmeisa\nx+jzq3P/gefzAaRy/uK98iGdY4HevH/AVM5hZ0ZTS5rMOO4u9ss/C69QjTH/ExOGmk8HmZebeyVs\n6HGcj1efv2ZsV9bvoCxW0kUlx5d0AovF6KoTe4tjWx2ErtW7Wvb+siT3z0+OdZ2vqvRNza+oB6X0\nauX/cKSYtU/nfaqhkO01868v9qlup3pFU91HPwkUt/e5e9VQCp6rbuU8tqt47gSNxdkJ7pFbpXom\nXRZ+IhTJ+MQp+dAKs8kEsG6/sxya2JuI7pIV+LpdaY7ieXX3Gz2cWOAEDD3Wzws1a5YZip5CRXV4\n3dLkUDLGdXM9h6KZaNKW3K4iQ1vXzL+d989K75j/8UFZj9PevatfOjBopb4pZu/EPmm7MSuLdj7O\ndu3/u/tQ4zuWvH396ZrqR2Kr9UmNmbWWi6dGE/PKib1pUpZYEpYZTX0FYjqCcGdTGqZiAY5+NGtO\n6k0xT8i1N66pd3AswUsZ81/yko4+4+pdaEzMdJtG
dBQRU83yiaecFlvd96Ism9gHuK9fOjBopY7s\nkh4rfNu9f8mzaOvDzDGVbZ1MBJF7LgCfyQY/82kEp/axFCPHM901yerb+w1UKRBP11zbC2lEzHzd\n8zUNvRxTpF6qEQYcC1NxOHvM92le3Wo/tNkPTVczaRAXYPJRt6PI+xFkV9/tTeCtmH9eNBXdre5u\n99211X2vvnO0qk6lqaczrUOTn0cBL0z+GjpCmMQaYsm2tcIUrXUC7e7vH7/P+CzcVEh6LSLkaYmY\n5V3sQkOdRysTvVi/8marREMN3PoLcGNiIngVLhzD956Go46BDWPu+OfOKfbwAVyIBnBWnN8lTQZ/\nDXDFCHz6LPjc0zAn8IA4s9Ev45KrX4VLxn5VbPjq4iAuh3UAOQDbn4LFY+HG47J9vSW558FvwVWn\nuL6uGYEbTnHn36hwpbhP9HJc8vm5Q7Dv+shzeamI/LqqXhcnLfqu3ABzb4fnr83WjZpqvqq9d9Ow\noqgxg+wA7k3KPcCtwNn9MnsNYqGL1Q092BrXuX/kPklylTA6ZD55Ryuz0GywsE77VWy7bMUZMy3N\ny/tjqRrDds4NVrijueQxodK2EHM+WK0vJVc5iMtsptl6bGcpAU0dr9eNSVt+N1KmHA5NPpdCKZdE\nZK29gwtEZvkk7MUQ3p19Fyb26UWpwzvrNLIGeBXwNuA3kvKOfunAsBX3YeVD9C6HG3zUXj/HuNZF\nlKzeVb5d5t+WKemB1gyyVLexu/x+sUlkSp3p5m0KU0+37lOYgDycJH2o5al98Uxm3gQ2jLHj7zGv\nxfAQ40nGso0BDTHmn+lHSZhlfyw0k23XAqx3IppBMufs91KHd9bx8P0Y8ATO7fBgjfqGZcWhWecd\n6r1Fr06OdYe8yAVmIrXyESq3TkcqnQ+P/3pOXABcQTZSZP6ei7X6oAVR0BOPwtwvUPAMnboOyLW5\neKyIbFbVu4r3u5Cst+81wC8Cfw18DlgT+VYew43/hxfhmOc4UdAxj8BPzrpxugvnAfw+gONg7nx3\n3V8m198OjJ7uPHc/Dkzl2j8PWPwe3DwJpyT3+usR+NLpLqrpW3CewHM4712AzylwGG5PomnOKUyP\nwLuAUVxqjrCPrwYemu3Ee1Zresp2J6Y0LBtqzCCf7+fZa9hKqxyxXYxzhUmfF/sUIlTmtv5p+kYq\nFL7Fe+Zt4esGlMsnjPEer6O7sqtmrwz2oqlYusU1C0X/hOnFND5OPurmsY9SCPZ2zNPpsehuYiHb\n5+l9rt75uXt4H4fpIyV5CdRZG02os3paOh44dk3tdLb/mT4m9UMnrGp7/x6/Wz2N0W8lOpZaWadG\nI7cAL+jXDgxbWY6tcVmbZOzxR3fFmHPq0HWhpuEIxisno/g9i+n62m8jDAm9RUvMIu+Oi88m9qam\nmT5A25og+Uxejh4TOfmMXZdqPEGLF4vls0iNaTpBxfIPx0JEzOx24qT4+1A+Pusy6RSXU9zSbtv0\nWJQ0rKUO76wj9vlh4HIReRi3r/UNv6CdHYahV6hvrdMbLD4PblwLzMIbD8FV98HIniBgHPC5j8HY\naCqKmjsnFa84uK3/+HVOzLH4SDyI2Oge1T2XBNdsg5kkkdDjN2ipVUqIU0hFUx8HzgbeAIUEMY9f\nD7e+2AVN+zTwd4vw5F/BoVfD7x8dBI4bhftxFj034ixt5hQWcMlI8jiEE9e8j1Ts4zH3FOzb5sZt\n9m4n7rksOH8rzofySYrJacIAbku4L5F2tQymVoTsy40zZe9U9yKbeuI8D7WgayuHGjPIGbHS5az0\nM8AXcF/QBd3MXsNY6LmirdtgbjFRVGrj7doaO5xzhDqcSxZ+IFt/dFdx9ZsNElakOx82wStc87H/\n1yWioWMfdb+XnMCejlv9+LhC4xrEzVlIdxahaCmfW2BsMRYvvrVSPCYu2qBx65qo1U0oWsuJfSY1\nH/IhqbuNpXy9
bpzL3ov23qveOyxaqTX2WlmnIcI24HLe3WvMvz9KbEKpz/zLxAt+EpnaWSL/Loh5\nUoaTT1TiJpQizd4Zbt2u7GTixTc+jtBF6uLyhPLxCS3K/WOZv05TeL4WQz6MJ22cq2mwui0Kp6jL\ns+sczUrGeFu5eejYwdT809M1djBl9hN7YXq/M/f0Xr9LYRZyTH18p6PvTHWiOe88VpgkOp78y98p\nf30oLqsWCTb93q+G0rfMPyDQmH8fl7orv2K9rGcppaEHiorqlGGcFqk/va+Ctm1J4Lggps7UzlSR\nmqchNsGcrEUlrg+pUKZfyPQ3d+3YoqM1Rq832fRpLTeqX3k7pj29LzUR9W3kV9InaKsYOsl9c5Oi\n9xpupRvwDLEXzL+z6/vl/R7EYszfSi+eURvB3OIRHRPRhBYjTo4VRACpItbnyg1XxqO7svXyjCXq\ndLYtVcxeqFlrnpgS9dSEMZ6p2b6Uxc0J//cTQljH29HH6J3Y7yYVb00UtbLKhTXuJH5PTCm/IWD+\nUbHd3b0R+/Qvc+33yanL71ar6tRR+HYEEbkH53uexzZV/UQb7bwz+LlDVXd0SdqqRy/tqrWmAk6d\n8vdV8OAd8K21gU3/DpjaArrowhC8HdgP/CqwfjSx1w/af3wH3HqxU3h+AtgKTOB0mvc+XE7BA8Cx\n58HZx7jXbjO4EBSbYPF+eOACF5bhvUn9VwMHcLbyHnM4hfF5OMXry307JH4AylI2L581y+MxnA39\n66uGKqD36DF4v7+3wr5rYWZTLhPa2uIY5fEYXkkbf/ajexz9PqOX7+u+Ha7+2DlZhbILB6HVWeFo\n9X6VXV9zgAxtQEQ2AZvauqjh2clW/r0f00ZXWxTl2geyq/fTCzuD7PWl+oPFsB+0TAaez907tTMe\nnM2LVJYUsl6BuhiJ1Jm3nw9EKVPJSnpUswrgfE7icCyiiVlailqIK1AXs2KhmOyezSWe0LtTG/96\ncvnler+oucNcTd/KMvdNK+s0TOC9wAu76YCV/JitzFa2zscapyWMUln82OLXrFeY2Ju9d35SyTug\nnZubEPLMrzyhjWt/3S6nzPVJXGIJZ/xEMJ6UdbuSCWV/PJKov2b6cIndfktRC0vK84m97j5TAdP3\nupV8KkfHyOOinY0ad95rZYffe7v9JplwE5POCvVLK+s0RNgrgW/gEsN/C/hkpx2wkh+z5Wf+dT/W\n8lX8EtMt8/yNmCeGQdnK2g1/R0MlJxNAIaHN4XqTUGlikVI5fXz8YjqQyUAxXEx003pVn99RZGIw\nLcTrhUrqqaipaLfvV3fvyeqQvTdV+pb597IDVgpjtmyrqOIKU4OPNWa5U5Ywxa9YS1e425yYZr0m\nWbEyZowlO4ocA4uFLPAexNP7i+fW7crSXiV+qQ4u13ocfTA33z+2tR6TVhY5eU/l/ESYieUfUcoX\nTW47fb+yk2K9sBHG/JeFD2hlnaaJ7LYDVqLj1vOtbPGjzztUxUMDU0iV6OXgrRkDSzb8Ywv5TFwl\nDCizi2hdZ3qxeO/pw0W6405mRYYbF+G0+4zKrG7i9/ST6MTeYk7h0yvGdXlCgkfariVSalLss1qL\nMX8rPXwWreLopAHd4tdGE7pXppOMB19z3ql1Jrhcne3pLmSDRtp9OsuIY8zWM+I8XZOamyg6ZKjl\nO4j4ZDR6OG76uU5b61WWw0Pc7wirdmS1nlXfMf5+py9Cr1bWaZrIbjtgZaWeRZkiNsPQS5h/VEwT\nTQSTuy7CTNZrdhXvFa6xVeiSZc6ubLL1a9Upc5dCNRSYZZZmn5zdhayIr9BP1zIFcv3xLUQOXdpN\nlU++ZfH8O6clTl+d1X4rL+6MVVXfM89sn/NB+Pp/Z2LM30ovn0V+a36wLDZPjWsDZWX5airOZC9K\n/nrnqA3qlJxbcu3mRVRe5HRtnsGqc37KrvDTNgomnyX6hI2lk19xLGJM1DP3yz
QNDXH07tZj4WX3\nYX+mkzGpH0ah+jnU1UXcGRmrchPUpt/p+u97NFtcX+skjPlb6fXzyDCJdrbCnWybkw8wmGDCUAZe\nzh6GX/CMO7ZK9h67+dg9Pm5+OFF480ivIC2IVXbF9Rj5FXHWaqc1E2UzjD6dmq36WEHcxZKCuCww\nnu/DuoTxb1QXSK9OWGzfdlavkq3TSvmdP1c0oR1EhW6W5tjuqt/pR6vqLJuHr2H1QSPevmkC95l5\nEfF1/LnEE/TQbJKlak87HsfqPERfAVf9OYwclyYkvxoXqvmypObHcZ6yN5e0BHACLrTzHM671uPT\nuFDNlwXHrsp5Sechz04Txs8Bh74Ch97s+5UkSv8Y3Djq6lw9C/oxkC/ATVHvXdfXGXGZuTKeuJfA\nwvHwO6POc/kWnEfvwgNwYFvqPXvgHJg8Bd7q+3AMLHwIeE6LQcGF2V476sYG4OpRkOvS5wqtM8Xl\nQ4zfugoTt/8SkdDcyxhGfYXQ9AzV7exlpdHnU7GSLRObtKsMjSlfY3F1prxN+zZKk63fllvpzwar\nOr6cjhMAABAdSURBVC/bn9qXXVXnFa2tLVjK5fOtldzON6EsIX2VcjyqbNWqsS65bl87Ij66EBv1\naynSPHZwkHQWdXhn40R22wErTT6fOuKA7rfMEeahgYJWnQhoSoHt2Ulno2YVvf7+U5rqDLYE7eUt\ngPyE4cVKPiBajDlXxerfqFVKbkd/LJ9AHeV4mU6gyuQ0dt10JDtYfV+A8mc4ONYyg0pzQLtW1mma\nyG47YKXJ57MyzD95FwIZus/TO5bL0xtTxpat3G9TtzsY38lSIpM8neHuwjtO3Zm0sSQj95PPAtkk\nKLkwyqGPQstV8l0xs9Gaq+tgLFqb3+auC1f1lT4YVvq/GPO3stzPp0LsM3bQMUafm7bzLX/8XmUZ\nxPLHJvY6Rn2mlolsWvsxhM5Kdyb9iYqEFrL9L4ZpiPcrFuIiDB1Rz1SSgkNdvbHO0zCIYhorhWeq\nlXWaJrLbDljp+Zi3tdUtqx9ZUS62YoLV94nuMmLy6pKY/pMHWpnsRRjewcB/YFvc7K+7XU181R2z\noY/qTbaVj3v3oopBFnlYMeZvpf3x7tmKr9fmffH2oslboiISd6w8321aJ87wgnO7K5h/beeq+M5l\nKghg5z2jz8zdJwxSV5io+oJR2+TR+PhrZZ2miey2A1Z6Od69Y9i9Z/6ljmKJvD5NPF7dTleKy8CS\nqKU+oUYog3zkUTd5BP1dKO42NPI7I6JqXERjYqPmizF/K22Ody+Zf3lgtC7eh5wsfGpnmqC9msn0\ngPHnLImmFoDt6Y6g3SBm6yOTh0tV2dpzNpYI5tKW9x3U98hKp88ArapjTl6GAHmHnW6cWWY2Oaes\njye/rwC2bwKu65Q6TZzMEieqO+DGhM5rcGkjbypNeZhec4Pv24tF5JVa4YyUdfaamnX3vCw5e/sI\nbH2m6p5LRGbvhvMuru7FzHyapnE78ANkx+jD+4rXbMbVfzvwg7jUlmHaxXxKSYOhGsb8DUvQnudc\nPQ94X/L/7d2SFyBkoB7vxk02ta+pzI1bnDCuWixvv5OJc5biGI3sibd36yJcMZLW/S6wdQ8sPgKH\nz4FvjQZ5kxv2Pu3lIsKwbGh6e9Lt1sVKfxaWUe5bIlbQWFyaimsqbOBjcWs6UxjHx6WgMyhpb2qn\n822I37vOfYs0LK8ythf3WAk6V2upwzsbJ7LbDljp37JcHy8ZZWjo0FTMJpa7pq3JqNzCqP0+UdBX\nRP+vCI0wn+gZprowmR0MZSwd+ixYWRo/razTNJHddsDKcJaSBDFtZ8+qrj95sFvG2w3DXRmT2f5S\nxpZP7v1FZz+XOryzMZm/iLwX+AngMPBvwOWqurcpeoYZWaVm/aibzWL/NnjwDnjDWhfps1qurJGo\npHnkonnugCPAe3D3uHEEuADm7ihTFsfHsn
19w3BjZh5uGMnqdFpFbDV0hAZnpouBkeT/9wDv6WT2\nstL1cxgIMUBKaxjqoP3sULRY/VOIBurDOtTz5C0by25W271+Psv5vFuNbXvtxMYrniPaSumz0Mo6\nTROZEPpK4I866YCVbse+HmPq1YfdxTtSK0R0js6MPL0V43PnYjl0y1IlxsYoPpbdMtxej/1yPMvI\nxNmFd3hhvJaC5lmpPYZaWadpIhNCPwH8fCcdsNLt2Fcz/37YHaR0ljPiLJ0xS5ryaJWu/WgO2oW6\nOQlajWXTk+fyPpvWyec7b3N1jtcKPROtqrOsMn8RuQeXfiiPbar6iaTO24DDqvqRkjbeGfzcoao7\nek3ncKOOTfagyKxDOrcAN5Gj+fTW11+IcxjzmFuEfb/unNMOzcJVODv8Mv+H8rHUGvqGwcXMPDx3\nJHLiAuf81r4eaXWPV+8hIpuATW1d1PDs9FpcHr01nc5eVnryHCpixTdvIUINsU+WztgOoTwpSrb9\npdANbYsaqsZyNRbX13nNJpKvF+fIynI9E7SqjiQVVxwi8jLgeuAiVd1dUkdVVVaWMkMeqafrTeGK\ntjI0wvLQ4XMCH4NbhaeryiydDwC34lb/Kc3u/7hl02BaPTWPdNyvWOvWcl/C5SIOPZe33qO655LG\niBwy1OGdTTL/XcCxwOPJoX9S1Tfm6hjz7xMMCmPM0bnDxRiCfqZ5NSA77ouzcOMFQQwkjPmvLPqa\n+deBMX+DYfDQLzvFYYYxf4PB0AgGZae4WmHM39DXiDEIYxoGQ/cw5m9oHGXMvEQ0cC1Mvr2OuKDb\nScImGcNqRi3e2bRJUrfmSlb6t9DSo7ZuQvb6YRR6QZeVXjzz4TJ17cdSh3daMhfDMmK5nMM6TcxS\nlpHrgbWw/Y9FZu+zXUDn6DRbmqEZGPM3FLAyIpGoN+wNMPd2epwBqsiU3rTookR+HLgAZ4p4wyxw\nsTGsbjAonuAGMOZvyKG3q7fW4Q5iKSNFZGd1Gsl20wSGTOkuYHQE3pCcuwr4RYxhGYYOTcumupVb\nWen1mPc6echyZvOqmw2rKuzDxkZDV6yWgulS+qbU4Z228jf0FCUio56von279XYq4U7hsUhrDy3C\n7UlgsuaTjQ+qJZKW7OaapcpQiqZnqG5nLys9H/MuUg6u/Mqv/XwE4zvT1IxLNLbcOQzK+Fux4ksd\n3mkrf0MG2tXqrX8VfhrsQNzKutC/65qjLkT/jqFhdcGYv6EAXSZRzfKgXeXvoPXPYFgemIevoWdo\nKqDXoMrIY7CgaIZewMI7GFYcq4kRNwUbQ0O3MOZvMBgMQ4g6vDOWd9NgMBgMqxzG/A0Gg2EIYczf\nYDAYhhCNMH8ReZeI/KuI3C8inxKRU5ugw2AwGIYVjSh8ReQ4Vf1e8v+vAN+nqq+P1DOFr8FgMLSJ\nvlX4esafYALY3QQdBoOhHCKyWWT2bldkc9P0GHqLxkw9ReS3gNcAB4CNqvpEpI6t/A2GBmDOZoON\nRu38ReQe4KTIqW2q+omg3luBs1X18kgbxvwNhgYgMns33HBxGmPodmDrPap7LmmSLkM91OGdyxbb\nR1Uvrln1I8Dflp0UkXcGP3eo6o4uyDIYDIZVBxHZBGxq65qGFL7PUdVdyf+/Avygqr4mUs9W/gZD\nAzCxz2Cjb8M7iMhfAGcDC8C/Ab+sqt+J1DPmbzA0BIsxNLjoW+ZfF8b8DQaDoX30ramnwWAwGJqF\nMX+DwWAYQhjzNxgMhiGEMX+DIYB5tRqGBabwNRgSmHmjYbWgUScvg2HwMDMPN6xNvVpZC1vnsWTv\nhlUIE/sYDAbDEMJW/gbDEh6/HuZeDIRin+sbJclgWCaYzN9gCGBerYbVAPPwNRgMhiGEefgaDAaD\nIQpj/gaDwTCEMOZvMBgMQwhj/gaDwTCEMOZvMBgMQwhj/gaDwTCEMOZvMBgMQwhj/gaDwTCEMOZv\nMBgMQ4
hGmb+IzIvIoojMNEmHwWAwDBsaY/4icipwMfBIUzT0CiKyqWka6sDo7C0Ggc5BoBGMzibQ\n5Mr/BuDXGrx/L7GpaQJqYlPTBNTEpqYJqIlNTRNQA5uaJqAmNjVNQE1sapqAXqER5i8irwC+qaqf\na+L+BoPBMOxYtnj+InIPcFLk1NuA/wJcElZfLjoMBoPBUMSKh3QWkXOBTwEHkkPrgUeBH1TV7+Tq\n9m+8aYPBYOhj9H08fxF5GHihqj7eKCEGg8EwROgHO39b3RsMBsMKo/GVv8FgMBhWHv2w8q+FfncI\nE5F3ici/isj9IvKpxI+h7yAi7xWRLya0/pWIrGuapjxE5GdE5AsisiAiFzRNTx4i8jIReUhEdonI\nNU3TE4OI/IGIfFtEHmiallYQkVNF5N7keX9eROaapikGEVkjIv+SfN8Pisi7m6apDCJylIh8VkQ+\n0areQDD/AXEI+2+q+n2qej7wUeA3miaoBHcD56jq9wFfxlle9RseAF4J/EPThOQhIkcBHwJeBjwf\n+DkReV6zVEWxHUdjv+Np4C2qeg6wEXhTP46nqh4EfiT5vl8A/IiIvLhhsspwJfAgFSL1gWD+DIBD\nmKp+L/g5AexuipZWUNV7VHUx+fkvOGurvoKqPqSqX26ajhL8IPAVVf2aqj4N/CnwioZpKkBV/xH4\nbtN0VEFVv6Wq9yf/Pwl8ETilWariUFVvoXgscBTQd0YqIrIe+HHg96kwoe975j9IDmEi8lsi8nXg\nMuA9TdNTA68D/rZpIgYMzwS+Efz+ZnLM0CVE5Azg+3GLkr6DiIyIyP3At4F7VfXBpmmK4P3ArwKL\nVRWXzcmrHQyKQ1gLOrep6idU9W3A20TkrbiHcPmKEpigis6kztuAw6r6kRUlLkEdGvsUZiGxDBCR\nCeAvgCuTHUDfIdkxn5/oye4SkU2quqNhspYgIj8BfEdVP1snBlFfMH9VvTh2PHEIexbwryICTkSx\nU0QKDmErgTI6I/gIDa6oq+gUkdfitoYvXRGCImhjLPsNjwKhMv9U3Orf0CFE5BjgL4E/UtWPNk1P\nFVR1r4j8DfADwI6GyQnxQ8DLReTHgTXApIj8oar+QqxyX4t9VPXzqnqiqj5LVZ+F+8guaILxV0FE\nnhP8fAXw2aZoaQUReRluW/iKRInV7+i30B+fAZ4jImeIyLHAzwIfb5imgYW4Vd2HgQdV9cam6SmD\niBwvIlPJ/2txBih99Y2r6jZVPTXhlf838HdljB/6nPlH0M9b7neLyAOJTHATMN8wPWX4IE4hfU9i\nDva7TROUh4i8UkS+gbP++BsR+WTTNHmo6hHgzcBdOIuKP1PVLzZLVREi8ifA/wc8V0S+ISKNiCBr\n4ELg1Tjrmc8mpR+tlE4G/i75vv8F+ISqfqphmqrQkl+ak5fBYDAMIQZt5W8wGAyGHsCYv8FgMAwh\njPkbDAbDEMKYv8FgMAwhjPkbDAbDEMKYv8FgMAwhjPkbDMsIEfnJqrDPgxJ+2bC6YHb+BsMyQUSO\nUtWFGvV+GHgS+ENVPW/5KTMYbOVvWOVIwjB8UURuSZKF3JUk5tghIi9M6hyf5JJGRF4rIh8VkbtF\n5GERebOIXC0i94nIP4nIdFLvTBH5pIh8RkT+QUTOTo7fJiI3i8g/A/9NRC4TkQ8m504UkTuShCD3\ni8iLYHDCLxtWF4z5G4YBZwEfUtVzgSeALTjX97Jt7zm4ZDL/CfgtYJ+qXgD8E+BjpdwC/Iqq/gAu\nVlIYJuMU4EWqmg/xcRMuFPD5wAXAF7rtmMHQKfoiqqfBsMx4OMgHsRM4o6L+vaq6H9gvIk8APsT0\nA8ALRGQcF0Hxz5Nos+ASfICbUP5c4/LUH8HFsfHhgfd10BeDoScw5m8YBhwK/l8A1gJHcNmYwIW/\nLau/GPxexH0zI8B3VfX7S+53oOQ49F+UUsOQwsQ+hmHF14AXJv//dM1r
BJZSdj4sIj8NLiyxiLyg\n1TUJPgX8cnLNUSIy2S7RBkOvYMzfMAzIi2AUeB/wyyJyHzAb1MnrAvL/+9+vAn4xCfH7eeDlNa65\nEhe6+HO4vADPg4EKv2xYRTBTT4PBYBhC2MrfYDAYhhDG/A0Gg2EIYczfYDAYhhDG/A0Gg2EIYczf\nYDAYhhDG/A0Gg2EIYczfYDAYhhDG/A0Gg2EI8b8ByeLEg2CK5lMAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x33c9588>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "scratch_df.plot.scatter(x='numeric1', y='numeric2',\n",
    "                        title='Numeric1 vs. Numeric2')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 5. Subsetting Pandas dataframes\n",
    "### By columns\n",
    "#### Subsetting by index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0   -1.846671\n",
       "1   -0.707404\n",
       "2   -0.094836\n",
       "3   -0.125780\n",
       "4    0.382183\n",
       "Name: numeric1, dtype: float64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# one column returns a Pandas series\n",
    "# a Pandas series is like a single column vector\n",
    "scratch_df.iloc[:, 0].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(scratch_df.iloc[:, 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2\n",
       "0 -1.846671  1.848302\n",
       "1 -0.707404 -1.004123\n",
       "2 -0.094836 -0.251163\n",
       "3 -0.125780 -1.237378\n",
       "4  0.382183 -1.711572"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# more than one columns makes a dataframe\n",
    "# iloc enables location by index\n",
    "scratch_df.iloc[:, 0:2].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(scratch_df.iloc[:, 0:2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Subsetting by variable name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0   -1.846671\n",
       "1   -0.707404\n",
       "2   -0.094836\n",
       "3   -0.125780\n",
       "4    0.382183\n",
       "Name: numeric1, dtype: float64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df['numeric1'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0   -1.846671\n",
       "1   -0.707404\n",
       "2   -0.094836\n",
       "3   -0.125780\n",
       "4    0.382183\n",
       "Name: numeric1, dtype: float64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.numeric1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0   -1.846671\n",
       "1   -0.707404\n",
       "2   -0.094836\n",
       "3   -0.125780\n",
       "4    0.382183\n",
       "Name: numeric1, dtype: float64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# loc[] allows for location by column or row label \n",
    "scratch_df.loc[:, 'numeric1'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2\n",
       "0 -1.846671  1.848302\n",
       "1 -0.707404 -1.004123\n",
       "2 -0.094836 -0.251163\n",
       "3 -0.125780 -1.237378\n",
       "4  0.382183 -1.711572"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# loc can accept lists as an input\n",
    "scratch_df.loc[:, ['numeric1', 'numeric2']].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### By rows\n",
    "#### Subsetting by index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2     char1     char2\n",
       "0 -1.846671  1.848302  EEEEEEEE  AAAAAAAA\n",
       "1 -0.707404 -1.004123  DDDDDDDD  DDDDDDDD\n",
       "2 -0.094836 -0.251163  AAAAAAAA  DDDDDDDD"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df[0:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2     char1     char2\n",
       "0 -1.846671  1.848302  EEEEEEEE  AAAAAAAA\n",
       "1 -0.707404 -1.004123  DDDDDDDD  DDDDDDDD\n",
       "2 -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "3 -0.125780 -1.237378  DDDDDDDD  FFFFFFFF\n",
       "4  0.382183 -1.711572  CCCCCCCC  FFFFFFFF"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# select rows by integer position with iloc; the end of the slice is excluded, so rows 0 through 4 are returned\n",
    "scratch_df.iloc[0:5, :] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.661172</td>\n",
       "      <td>-0.124998</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2     char1     char2\n",
       "0 -1.846671  1.848302  EEEEEEEE  AAAAAAAA\n",
       "1 -0.707404 -1.004123  DDDDDDDD  DDDDDDDD\n",
       "2 -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "3 -0.125780 -1.237378  DDDDDDDD  FFFFFFFF\n",
       "4  0.382183 -1.711572  CCCCCCCC  FFFFFFFF\n",
       "5 -0.661172 -0.124998  FFFFFFFF  GGGGGGGG"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# select by row label\n",
    "# label-based slices include both endpoints, so index labels 0 through 5 are returned\n",
    "scratch_df.loc[0:5, :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Boolean subsetting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-0.135225</td>\n",
       "      <td>0.982267</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.534110</td>\n",
       "      <td>0.430327</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>-0.957339</td>\n",
       "      <td>0.435087</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>0.390665</td>\n",
       "      <td>0.408823</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    numeric1  numeric2     char1     char2\n",
       "0  -1.846671  1.848302  EEEEEEEE  AAAAAAAA\n",
       "7  -0.135225  0.982267  CCCCCCCC  EEEEEEEE\n",
       "9   0.534110  0.430327  AAAAAAAA  FFFFFFFF\n",
       "11 -0.957339  0.435087  EEEEEEEE  BBBBBBBB\n",
       "12  0.390665  0.408823  GGGGGGGG  FFFFFFFF"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df[scratch_df.numeric2 > 0].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.565589</td>\n",
       "      <td>-1.405314</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.534110</td>\n",
       "      <td>0.430327</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>-0.121501</td>\n",
       "      <td>-0.906697</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>2.096871</td>\n",
       "      <td>1.511935</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    numeric1  numeric2     char1     char2\n",
       "2  -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "8   0.565589 -1.405314  AAAAAAAA  EEEEEEEE\n",
       "9   0.534110  0.430327  AAAAAAAA  FFFFFFFF\n",
       "21 -0.121501 -0.906697  AAAAAAAA  GGGGGGGG\n",
       "26  2.096871  1.511935  AAAAAAAA  CCCCCCCC"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df[scratch_df.char1 == 'AAAAAAAA'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.565589</td>\n",
       "      <td>-1.405314</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.534110</td>\n",
       "      <td>0.430327</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>-0.679535</td>\n",
       "      <td>-1.710162</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>-0.121501</td>\n",
       "      <td>-0.906697</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    numeric1  numeric2     char1     char2\n",
       "2  -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "8   0.565589 -1.405314  AAAAAAAA  EEEEEEEE\n",
       "9   0.534110  0.430327  AAAAAAAA  FFFFFFFF\n",
       "13 -0.679535 -1.710162  BBBBBBBB  EEEEEEEE\n",
       "21 -0.121501 -0.906697  AAAAAAAA  GGGGGGGG"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df[scratch_df.char1.isin(['AAAAAAAA', 'BBBBBBBB'])].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7    EEEEEEEE\n",
       "9    FFFFFFFF\n",
       "Name: char2, dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df[scratch_df.numeric2 > 0].loc[5:10, 'char2']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 6. Updating the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>2.537433</td>\n",
       "      <td>1.944461</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>1.550140</td>\n",
       "      <td>-0.074913</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>2.759164</td>\n",
       "      <td>-0.211622</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>2.095885</td>\n",
       "      <td>-0.700426</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1.866174</td>\n",
       "      <td>0.539002</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     numeric1  numeric2     char1     char2\n",
       "995  2.537433  1.944461  ZZZZZZZZ  DDDDDDDD\n",
       "996  1.550140 -0.074913  ZZZZZZZZ  FFFFFFFF\n",
       "997  2.759164 -0.211622  ZZZZZZZZ  CCCCCCCC\n",
       "998  2.095885 -0.700426  ZZZZZZZZ  BBBBBBBB\n",
       "999  1.866174  0.539002  ZZZZZZZZ  AAAAAAAA"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# must use .copy(); plain assignment would only bind a second name to the same dataframe\n",
    "scratch_df2 = scratch_df.copy()\n",
    "\n",
    "# pandas supports in-place overwrites of data\n",
    "# overwrite char1 with ZZZZZZZZ for row labels 500 and up (the last 500 rows)\n",
    "scratch_df2.loc[500:, 'char1'] = 'ZZZZZZZZ'\n",
    "scratch_df2.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      numeric1  numeric2     char1     char2\n",
       "0  1000.000000  1.848302  EEEEEEEE  AAAAAAAA\n",
       "1    -0.707404 -1.004123  DDDDDDDD  DDDDDDDD\n",
       "2    -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "3    -0.125780 -1.237378  DDDDDDDD  FFFFFFFF\n",
       "4     0.382183 -1.711572  CCCCCCCC  FFFFFFFF"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# iat[] gives fast access to a single value by integer row and column position\n",
    "scratch_df2.iat[0, 0] = 1000\n",
    "scratch_df2.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 7. Sorting the dataframe\n",
    "#### Sort by values of one variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>0.199489</td>\n",
       "      <td>-1.008574</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>225</th>\n",
       "      <td>0.531949</td>\n",
       "      <td>-1.267235</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>347</th>\n",
       "      <td>-0.643377</td>\n",
       "      <td>-0.165108</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>0.666005</td>\n",
       "      <td>0.631195</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>-0.924382</td>\n",
       "      <td>1.784032</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     numeric1  numeric2     char1     char2\n",
       "499  0.199489 -1.008574  AAAAAAAA  DDDDDDDD\n",
       "225  0.531949 -1.267235  AAAAAAAA  DDDDDDDD\n",
       "347 -0.643377 -0.165108  AAAAAAAA  AAAAAAAA\n",
       "103  0.666005  0.631195  AAAAAAAA  BBBBBBBB\n",
       "102 -0.924382  1.784032  AAAAAAAA  CCCCCCCC"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df2.sort_values(by='char1').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Sort by values of multiple variables and specify sort order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>533</th>\n",
       "      <td>-3.490143</td>\n",
       "      <td>1.699398</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>605</th>\n",
       "      <td>-2.669272</td>\n",
       "      <td>-0.577942</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759</th>\n",
       "      <td>-2.455348</td>\n",
       "      <td>-0.499223</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>630</th>\n",
       "      <td>-2.426346</td>\n",
       "      <td>0.238347</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>884</th>\n",
       "      <td>-2.409502</td>\n",
       "      <td>0.063437</td>\n",
       "      <td>ZZZZZZZZ</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     numeric1  numeric2     char1     char2\n",
       "533 -3.490143  1.699398  ZZZZZZZZ  EEEEEEEE\n",
       "605 -2.669272 -0.577942  ZZZZZZZZ  GGGGGGGG\n",
       "759 -2.455348 -0.499223  ZZZZZZZZ  GGGGGGGG\n",
       "630 -2.426346  0.238347  ZZZZZZZZ  EEEEEEEE\n",
       "884 -2.409502  0.063437  ZZZZZZZZ  BBBBBBBB"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df3 = scratch_df2.sort_values(by=['char1', 'numeric1'],\n",
    "                            ascending=[False, True]).copy()\n",
    "scratch_df3.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Sort by the value of the dataframe index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      numeric1  numeric2     char1     char2\n",
       "0  1000.000000  1.848302  EEEEEEEE  AAAAAAAA\n",
       "1    -0.707404 -1.004123  DDDDDDDD  DDDDDDDD\n",
       "2    -0.094836 -0.251163  AAAAAAAA  DDDDDDDD\n",
       "3    -0.125780 -1.237378  DDDDDDDD  FFFFFFFF\n",
       "4     0.382183 -1.711572  CCCCCCCC  FFFFFFFF"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df2.sort_index().head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 8. Adding data to the dataframe\n",
    "* Pandas `concat()` supports numerous types of joins and merges\n",
    "* Pandas `merge()` supports joins and merges using more SQL-like syntax \n",
    "  * i.e. `merge(left, right, on=)`\n",
    "* Pandas `append()` supports stacking dataframes top-to-bottom (deprecated in pandas 1.4 and removed in 2.0; use `concat()` in newer versions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>char3</th>\n",
       "      <th>char4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>286</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        char3     char4\n",
       "286  AAAAAAAA  AAAAAAAA\n",
       "172  AAAAAAAA  AAAAAAAA\n",
       "26   AAAAAAAA  CCCCCCCC\n",
       "228  AAAAAAAA  GGGGGGGG\n",
       "117  AAAAAAAA  GGGGGGGG"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a toy dataframe to join/merge onto scratch_df\n",
    "scratch_df3 = scratch_df3.drop(['numeric1', 'numeric2'] , axis=1)\n",
    "scratch_df3.columns = ['char3', 'char4']\n",
    "scratch_df3.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "      <th>char3</th>\n",
       "      <th>char4</th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.661172</td>\n",
       "      <td>-0.124998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.828998</td>\n",
       "      <td>-0.967002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.135225</td>\n",
       "      <td>0.982267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.565589</td>\n",
       "      <td>-1.405314</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.534110</td>\n",
       "      <td>0.430327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.627748</td>\n",
       "      <td>-0.870272</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.957339</td>\n",
       "      <td>0.435087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.390665</td>\n",
       "      <td>0.408823</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.679535</td>\n",
       "      <td>-1.710162</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.413431</td>\n",
       "      <td>2.178699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.860728</td>\n",
       "      <td>-1.677506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-2.262108</td>\n",
       "      <td>3.574336</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.559153</td>\n",
       "      <td>0.458943</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.027194</td>\n",
       "      <td>-0.293085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.142378</td>\n",
       "      <td>0.649251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.152942</td>\n",
       "      <td>-0.996234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.121501</td>\n",
       "      <td>-0.906697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.634791</td>\n",
       "      <td>0.741408</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.944472</td>\n",
       "      <td>-0.210692</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-2.409763</td>\n",
       "      <td>0.079427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.186654</td>\n",
       "      <td>2.085687</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.096871</td>\n",
       "      <td>1.511935</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.809646</td>\n",
       "      <td>-1.582986</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.581517</td>\n",
       "      <td>-1.024026</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.915936</td>\n",
       "      <td>-0.378046</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>218</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>396</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>225</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>255</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>479</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>289</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>243</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>257</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>130</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>376</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>247</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>195</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>366</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>452</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>408</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>260</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>286</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>GGGGGGGG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        char1     char2     char3     char4  numeric1  numeric2\n",
       "0    EEEEEEEE  AAAAAAAA       NaN       NaN -1.846671  1.848302\n",
       "1    DDDDDDDD  DDDDDDDD       NaN       NaN -0.707404 -1.004123\n",
       "2    AAAAAAAA  DDDDDDDD       NaN       NaN -0.094836 -0.251163\n",
       "3    DDDDDDDD  FFFFFFFF       NaN       NaN -0.125780 -1.237378\n",
       "4    CCCCCCCC  FFFFFFFF       NaN       NaN  0.382183 -1.711572\n",
       "5    FFFFFFFF  GGGGGGGG       NaN       NaN -0.661172 -0.124998\n",
       "6    GGGGGGGG  GGGGGGGG       NaN       NaN  0.828998 -0.967002\n",
       "7    CCCCCCCC  EEEEEEEE       NaN       NaN -0.135225  0.982267\n",
       "8    AAAAAAAA  EEEEEEEE       NaN       NaN  0.565589 -1.405314\n",
       "9    AAAAAAAA  FFFFFFFF       NaN       NaN  0.534110  0.430327\n",
       "10   DDDDDDDD  CCCCCCCC       NaN       NaN  0.627748 -0.870272\n",
       "11   EEEEEEEE  BBBBBBBB       NaN       NaN -0.957339  0.435087\n",
       "12   GGGGGGGG  FFFFFFFF       NaN       NaN  0.390665  0.408823\n",
       "13   BBBBBBBB  EEEEEEEE       NaN       NaN -0.679535 -1.710162\n",
       "14   DDDDDDDD  BBBBBBBB       NaN       NaN  2.413431  2.178699\n",
       "15   EEEEEEEE  FFFFFFFF       NaN       NaN  0.860728 -1.677506\n",
       "16   EEEEEEEE  GGGGGGGG       NaN       NaN -2.262108  3.574336\n",
       "17   CCCCCCCC  GGGGGGGG       NaN       NaN  1.559153  0.458943\n",
       "18   EEEEEEEE  GGGGGGGG       NaN       NaN -0.027194 -0.293085\n",
       "19   EEEEEEEE  GGGGGGGG       NaN       NaN  1.142378  0.649251\n",
       "20   FFFFFFFF  CCCCCCCC       NaN       NaN  0.152942 -0.996234\n",
       "21   AAAAAAAA  GGGGGGGG       NaN       NaN -0.121501 -0.906697\n",
       "22   BBBBBBBB  CCCCCCCC       NaN       NaN  1.634791  0.741408\n",
       "23   DDDDDDDD  FFFFFFFF       NaN       NaN -0.944472 -0.210692\n",
       "24   BBBBBBBB  AAAAAAAA       NaN       NaN -2.409763  0.079427\n",
       "25   DDDDDDDD  EEEEEEEE       NaN       NaN  2.186654  2.085687\n",
       "26   AAAAAAAA  CCCCCCCC       NaN       NaN  2.096871  1.511935\n",
       "27   CCCCCCCC  AAAAAAAA       NaN       NaN -0.809646 -1.582986\n",
       "28   AAAAAAAA  GGGGGGGG       NaN       NaN -0.581517 -1.024026\n",
       "29   DDDDDDDD  DDDDDDDD       NaN       NaN  0.915936 -0.378046\n",
       "..        ...       ...       ...       ...       ...       ...\n",
       "63        NaN       NaN  AAAAAAAA  DDDDDDDD       NaN       NaN\n",
       "218       NaN       NaN  AAAAAAAA  BBBBBBBB       NaN       NaN\n",
       "396       NaN       NaN  AAAAAAAA  BBBBBBBB       NaN       NaN\n",
       "225       NaN       NaN  AAAAAAAA  DDDDDDDD       NaN       NaN\n",
       "149       NaN       NaN  AAAAAAAA  CCCCCCCC       NaN       NaN\n",
       "9         NaN       NaN  AAAAAAAA  FFFFFFFF       NaN       NaN\n",
       "118       NaN       NaN  AAAAAAAA  GGGGGGGG       NaN       NaN\n",
       "8         NaN       NaN  AAAAAAAA  EEEEEEEE       NaN       NaN\n",
       "255       NaN       NaN  AAAAAAAA  CCCCCCCC       NaN       NaN\n",
       "479       NaN       NaN  AAAAAAAA  DDDDDDDD       NaN       NaN\n",
       "103       NaN       NaN  AAAAAAAA  BBBBBBBB       NaN       NaN\n",
       "289       NaN       NaN  AAAAAAAA  DDDDDDDD       NaN       NaN\n",
       "243       NaN       NaN  AAAAAAAA  BBBBBBBB       NaN       NaN\n",
       "32        NaN       NaN  AAAAAAAA  GGGGGGGG       NaN       NaN\n",
       "57        NaN       NaN  AAAAAAAA  CCCCCCCC       NaN       NaN\n",
       "257       NaN       NaN  AAAAAAAA  DDDDDDDD       NaN       NaN\n",
       "130       NaN       NaN  AAAAAAAA  CCCCCCCC       NaN       NaN\n",
       "376       NaN       NaN  AAAAAAAA  GGGGGGGG       NaN       NaN\n",
       "247       NaN       NaN  AAAAAAAA  DDDDDDDD       NaN       NaN\n",
       "195       NaN       NaN  AAAAAAAA  AAAAAAAA       NaN       NaN\n",
       "366       NaN       NaN  AAAAAAAA  AAAAAAAA       NaN       NaN\n",
       "452       NaN       NaN  AAAAAAAA  AAAAAAAA       NaN       NaN\n",
       "408       NaN       NaN  AAAAAAAA  BBBBBBBB       NaN       NaN\n",
       "260       NaN       NaN  AAAAAAAA  GGGGGGGG       NaN       NaN\n",
       "91        NaN       NaN  AAAAAAAA  FFFFFFFF       NaN       NaN\n",
       "286       NaN       NaN  AAAAAAAA  AAAAAAAA       NaN       NaN\n",
       "172       NaN       NaN  AAAAAAAA  AAAAAAAA       NaN       NaN\n",
       "26        NaN       NaN  AAAAAAAA  CCCCCCCC       NaN       NaN\n",
       "228       NaN       NaN  AAAAAAAA  GGGGGGGG       NaN       NaN\n",
       "117       NaN       NaN  AAAAAAAA  GGGGGGGG       NaN       NaN\n",
       "\n",
       "[2000 rows x 6 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# default (axis=0): stack rows, with an outer join on column names\n",
    "# this will create a 2000 row × 6 column dataset because the two dataframes have no column names in common\n",
    "scratch_df4 = pd.concat([scratch_df, scratch_df3])\n",
    "scratch_df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "      <th>char3</th>\n",
       "      <th>char4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.846671</td>\n",
       "      <td>1.848302</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>EEEEEEEE</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.707404</td>\n",
       "      <td>-1.004123</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.094836</td>\n",
       "      <td>-0.251163</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>AAAAAAAA</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.125780</td>\n",
       "      <td>-1.237378</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.382183</td>\n",
       "      <td>-1.711572</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "      <td>CCCCCCCC</td>\n",
       "      <td>FFFFFFFF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2     char1     char2     char3     char4\n",
       "0 -1.846671  1.848302  EEEEEEEE  AAAAAAAA  EEEEEEEE  AAAAAAAA\n",
       "1 -0.707404 -1.004123  DDDDDDDD  DDDDDDDD  DDDDDDDD  DDDDDDDD\n",
       "2 -0.094836 -0.251163  AAAAAAAA  DDDDDDDD  AAAAAAAA  DDDDDDDD\n",
       "3 -0.125780 -1.237378  DDDDDDDD  FFFFFFFF  DDDDDDDD  FFFFFFFF\n",
       "4  0.382183 -1.711572  CCCCCCCC  FFFFFFFF  CCCCCCCC  FFFFFFFF"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# outer join on matching row indices\n",
    "# axis=1 specifies placing the dataframes side by side, aligning rows on their index labels\n",
    "scratch_df5 = pd.concat([scratch_df, scratch_df3], axis=1)\n",
    "scratch_df5.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 6)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df5.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2000, 4)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# append: stack the rows of one dataframe below another (here scratch_df below itself)\n",
    "scratch_df6 = scratch_df.append(scratch_df)\n",
    "scratch_df6.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 9. Comparing dataframes\n",
    "* Use Pandas `equals()` to compare dataframes\n",
    "* Row order matters: the same rows in a different order compare as not equal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.equals(scratch_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.equals(scratch_df.sort_values(by='char1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.equals(scratch_df2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 10. Summarizing dataframes\n",
    "Pandas offers several straightforward summarization functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numeric1    0.067432\n",
       "numeric2   -0.002979\n",
       "dtype: float64"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DDDDDDDD</td>\n",
       "      <td>BBBBBBBB</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   numeric1  numeric2     char1     char2\n",
       "0       NaN       NaN  DDDDDDDD  BBBBBBBB"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.mode()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1000.000000</td>\n",
       "      <td>1000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.067432</td>\n",
       "      <td>-0.002979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.014362</td>\n",
       "      <td>1.008141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-3.490143</td>\n",
       "      <td>-3.344991</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>-0.610958</td>\n",
       "      <td>-0.692698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.050545</td>\n",
       "      <td>0.002861</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.718816</td>\n",
       "      <td>0.676773</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>3.259914</td>\n",
       "      <td>3.623428</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          numeric1     numeric2\n",
       "count  1000.000000  1000.000000\n",
       "mean      0.067432    -0.002979\n",
       "std       1.014362     1.008141\n",
       "min      -3.490143    -3.344991\n",
       "25%      -0.610958    -0.692698\n",
       "50%       0.050545     0.002861\n",
       "75%       0.718816     0.676773\n",
       "max       3.259914     3.623428"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scratch_df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***\n",
    "## 11. By group processing\n",
    "Use Pandas `groupby()` to create groups for subsequent processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xd715b38>"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAFMCAYAAADWVDfUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm8XFWZ7//PF0KYJSAYIMwgV0BQZtAWIgiNqMCFZmoV\ncVYE1FYxaAtpRRTFoRWwr8p4ldgBkUHk14ThABcFRGYCMkiAMIQZEUQCPL8/1iqyT+VUnTp1hl3r\n5Pt+vfLKHqr2fqpqn6dWrb0GRQRmZlauxeoOwMzMhseJ3MyscE7kZmaFcyI3MyucE7mZWeGcyM3M\nCudEvoiS9L8k3STpr5IOHcLzpkp6cDRjGy2SXpW03hifcxVJd0hacgjP+Z2kD45mXL1O0tsl3S3p\nOUl7DOF5B0u6ajRjazrf8ZI+NVbna2VcJnJJcyS9kC+C53KyWrXuuHrMEcClEfG6iDihzkAk/VTS\nnZJekfShOmMZBdOAUyPiHwCS+iR9tPqA5i/HiNg9Iv7vYAeu44tpDH0d+FFELB8R59cVhKQlJJ0t\n6b78fu/Y9JDjga9IWqKO+BrGZSIHAnhvvgiWz8nq0eoDJE2oKbZesTYwu84AKp/BTcAhwA2kz25c\nyKXwg4BfVDYHI/saNYLHWnBQafHROO4QrEX912fjPbgS+ADwKE2fXc4rdwId/2oYDeM1kQ8of6Me\nIulu4M9523tzFcPTkq6WtGnl8ZtLuiGX6H+V/30j71voJ1y1hCRpyfyz635Jj0r6iaSl8r6pkuZK\n+jdJ8yQ9LOngynGWlvS9/MviGUlXSlpK0oXN1SCSbpG0Z4vXu4ek2/Nru1zSm/L2y4CpwAn5tW0w\nwHNXknSqpIckPSXpN037W8X+Hkk3SnpW0gOSjq7sWye/Rx+RdD9wCUBEnBQRlwEvtvrs8vO3lfSI\nJFW2/W9JN+flbST9Ib/ehyX9uFVJqblk3Px5SnqTpFmSnsy/Fvat7Ns9v69/zZ/jF1qEvC3wTEQ8\n3O51tYtN0gaSrsjXweOSZuTtV+aH35x/de6bt39cqUriSUnnSVqtctxdJf05H+vEfNzGeQ7O1//3\nJT0BHC1pPUmXSXoin/sXklaoHG+OpC/ma/A5SSdLmizpovz5z5I0qc3rHDBWSfcC6wEX5Pd4oc9Q\n0pqSzpH0WI7vx037v5uv279I2q2y/cOSZufj3ivpE5V9jb/LIyQ9ApwSEfMj4kcRcTXwSouX0ge8\np9XrHBMRMe7+AfcBOw+w/VXgf4BJwJLA5sA8YGtSyeag/NwlgInA/cBngcWBfYCXgK/nYx0MXDXA\n8dfLyz8Azs3nWg44Hzg275sKzAem52O/G3geWCHvPxG4DFiN9GW7XY5nX+CayvneAjwBTBjgtW4I\n/A3YOZ/jS8DdjccClwMfafMeXgjMAFYAJgDv6DD2HYFN8vKmpFLMnnl9nfwenQYsDSzZdM6rgIMG\n+WzvAd5VWT8LOCIvbwFsk9+zxi+Oz7b4fPq9/urnCSwLPAh8KB/rrcDjwJvy/keAt+flFYDNW8T6\nGeC3TdsuBz7atG0q8GDTYz6Sl2cAR+blicDbBno9eX2nHOdb82N/BFyR960MPAvslV/T4aTr+SOV\n1z8/x7wYsBSwfr5+lsjPvwL4QdPf2e+BVYDVSX9LN5CuyyWBS4GjWrw3LWOtHHunFs9dHLgZ+F7j\nOmq8L/l1vAR8lPQ3/SngocpzdwfWzcs7kK7dzZuu7W/l17xU03kfBHYYIJ69gT/VmvPqPPmovSiY\nAzwHPJ3/nVO58KdWHvcTcmKubLszf8A7VC+AvO9qOkjk+QL6W9Mf2fbAXyoXzAvAYpX981iQhF4A\nNh3gdS0FPAWsn9ePB05o8R58DfhVZV3A3MaFyAAJpfLY1UiljxUG2Ncy9hbH+iHw/by8Tn6P1mnx\n2E4S+TeAk/Py8vl9XrPFYz/X+Oyrn0
/l9bdK5PsDVzYd6/+QkxLpC/4TwOsGifWrwIymbX2k5PF0\n5d9zwAOVx1QT+en53FMGOH5zIj8Z+HZlfVlSUlubVEi5uun5D9A/kd8/yOvZC7ihsn4fcGBl/Wzg\nxMr6ocBvWhyrVaxrVY7dKpFvDzxWvQabPse7K+vL5PfpDS2O9Rvg8Mq1/Q9gYovHtkrkuwD3tnvv\nRvvfeK1aCVIpcMX8b+/KvmqLi7WBL+Sf4k9LehpYg5TIVgceajru/R2efxXSBfSnynEvIpVqGp6M\niFcr6y+QSu4rkxL2vQu9qIgXgZnAB3P1wgFAq5tiq5H+UBvPDdJrn1I9ZIvnrgk8FRHPttjfKvZG\n9cfl+SfvM8Angdc3PX84rV7OBPaWNJEFJaEH87k3lPTbXP3yLPDNAc7dibWBbZuui38FJuf9+5BK\ndnNyNch2LY7zFOnLpiqAwyrX5orAe2ld131E3nedpNskfbhN3KtRuUYj4nngSdJnvhrpi7yqeb3f\n55KrSX6VqxueJV1rze/nvMry35vWXyRfF0OMdTBrkr50Xm2x/7X7YRHxQl5sXJ/vlnRNrs55mvQ5\nVl/T4xHxUgcxVC0PPDPE54yo8ZrI26kmrweAb1b/qCJiuYj4b9LP5+aLau3K8vOkZA2A+reKeYJ0\nUW9cOe6kiHhdB/E9QfoDWKjeOjsdeD/wLuCFiLi2xeMersabE/+aLPzlNJAHgZWq9aFDcCapSmmN\niJgE/BcLX2dd3+yLiDtICeDdpOR6ZmX3T0jVKRtExAqkEnGra/x5Uimwofr5PUD6mV+9LpaPiM/k\nGK6PiL1IX9jnkr5cB3ILqYprMC1vWEbEvIj4RERMIX0pnqTWLVUeJv3qSQeVliUlqbmk63mNyj5V\n1xuna1o/lvTL7M35/fwgg+eMTm++toq10+tzLQ3xhqzSzedfA98hldBXBH7XFHM31+ZGpBv2tVkU\nE3nVz4BPKd0kk6RllW7WLUeq+3tZ0uFKTZD2JtWlN9wMbCLpLUo3Mac3duSSws+AH0paBUDSFEm7\nDhZQfu4pwPclrSZpcUnb5xIoEfEH0sV2PHBGm0PNBN4jaad8s+gLpC+I31ceM+AfXUQ8QvoFcZKk\nSfn17zBY7NlywNMR8ZKkbUjJtu0fRz7+UqTrcaLSjd12CeFMUrXJO0h15NVzPwe8oHRj99NtjnET\nqWS/tNLN3mqTwAuBDSV9IMe2hKStlW6ALiHp/ZJWiIhX8vla3QT7IzBJ0urNL7lNXP0fKO0rqZFw\nnyG9l42S6DxSPXbDDODD+ZpckpSIr4mIB0gJa1NJeyq1FvoM/b+8BrIc6Qvvr5KmkO6zjJR2sQ7m\nWtIX07clLZOvl7d18LyJ+d8TwKuS3g0M+jep1HBhqbxaXW7YkfT3UptFLZH3SygR8Sfg48AJpJ/B\nd5PqEomI+aSf7geTfvLtB5xD/iOMiLtIbV0vIbWAuarp+F8m3Zi7Jv8snUX/0lm75PZF4FZSIniS\ndPOl+lmdQbqR+IuFn/raa7uL1GTqx6SbSu8B3hcRL3cYwwdJN37uJCWMwzt83iHA1yX9lVRP/9/N\noQ3wnFmk6pntgJ/m5Xe0OccM0j2MSyPiqcr2L5K+OP6aj/OrpvNVl39AqpOdB5xKei8DICKeI/2B\nH0AqIT5C+gwm5ud+ALgvf66fIP1CWkj+iX5afny/XQM9vMVr3Yp0DT0HnEeqz52T900HTs/VP/8S\nEZeS3vNfk0q86+bXQEQ8QbpZ/h1SItsIuJ5UJ9w4f3MM/0G6gfwscEE+7mAl1ub3e8DHt4t1MLmw\n8z7Sr9YHSCX0/dqcs/q5Hk4q5DwFHEh6T1vF3/Bn0jW5OqmxxPOS1gJQammzEemXWW2UK+sH3imd\nQkoAj0XEpnnbNqTEtwTwMnBIRPwx7zsS+AiphHJ4RFw8uuGPLUmnAnMj4ms1x/FB4OMR0Wkp2Woi\naW
XSl/xbI3cK6gWSFiMlwH+NiCvqjqdUko4H7omI/6ozjsFK5KcCuzVt+w7wtYjYHDgqryNpY9Ld\n/o3zc07KF8t4MiqdL4YUgLQM6WfxT+uOxQYXEU9ExEa9kMSV2pFPylUZX8mbr6kzptJFxBfrTuIw\nSCKPiKtIzaOqHiG1nYXURrpxc2JPUlOr+fmn3z2k5nTjScufimNB0j+Tml09Qv+bfGad2J70d9mo\naturF75gbPjaVq1A6o0HXFCpWlkb+H+khLYYsH1EPKjUs+qaiPhlftzPgYsi4tejF76ZmXVT9XEy\nqf57LeDzpBYWrdRWejUzW1R0M3DUNhHxrrx8NvDzvPwQqZ1ywxoM0CZUkpO7mVkXImLg+3SDdf0k\nNdq/tbJ+A7BjXt4Z+GNe3pjUNnciqSnRveSqm6bjxUh1S20R7/TRPP5o/3P8jt/x1x9HL8beLne2\nLZErjbS2I7Cy0njJR5HazZ6Y73z/Pa8TEbMlzST1rGs0S3Tp28xslLVN5BFxYItd27Z4/LGkHlpm\nZjZGxls7b0ijy5Wsr+4Ahqmv7gCGqa/uAIapr+4Ahqmv7gCGoa+uEw/a/HDETyhFtKqwNzOzAbXL\nneOxRG5mtkhxIjczK5wTuZlZ4ZzIzcwK50RuZlY4J3Izs8I5kZuZFc6J3MyscE7kZmaFcyI3Myuc\nE7mZWeGcyM3MCudEbmZWuG6mejMrxlhMLejRPK1ubUvkkk6RNE/SrU3bD5N0h6TbJB1X2X6kpLsl\n3Slp19EK2mxoYhT/mdVvsBL5qcCPgTMaGyS9E9gD2Cwi5ktaJW/fGNifNHfnFOASSRtGxKujErmZ\nmQGDlMgj4irg6abNnwa+FRHz82Mez9v3BGZExPyImAPcA2wzsuGamVmzbm52vhHYQdI1kvokbZW3\nrw7MrTxuLqlkbmZmo6ibm50TgBUjYjtJWwMzgfVaPNaViGZmo6ybRD4XOAcgIv4o6VVJKwMPAWtW\nHrdG3rYQSdMrq30R0ddFHGZm45akqcDUjh472OTLktYBLoiITfP6J4HVI+JoSRsCl0TEWvlm55mk\nevEpwCXABtF0Ak++bGMpNT8czR+GcvNDGxPtcmfbErmkGcCOwOslPQgcBZwCnJKbJL4EHAQQEbMl\nzQRmAy8DhzQncTMzG3mDlshH/IQukdsYconcxot2udNd9M3MCudEbmZWOCdyM7PCOZGbmRXOidzM\nrHBO5GZmhXMiNzMrnCeWMDMbQEmTkjiRm5m1NLqdyUaKq1bMzArnRG5mVjgncjOzwrmO3KyHlXTD\nzerjRG7W88q44Wb1cdWKmVnhnMjNzArXNpFLOkXSvDwbUPO+L+T5OleqbDtS0t2S7pS060gEKClG\n+99IxGlmVpfBSuSnArs1b5S0JrALcH9l28bA/sDG+TknSRqhEn+M4j8zs7K1TbQRcRXw9AC7vg8c\n0bRtT2BGRMyPiDnAPaSJmM3MbBQNudWKpD2BuRFxi9TvjvfqwDWV9bnAlOGFZ2YlG+2qSzedTIaU\nyCUtA3yFVK3y2uY2T3Hdhdkib7TSgHN4w1BL5OsD6wA359L4GsCfJG0LPASsWXnsGnnbQiRNr6z2\nRUTfEOMwMxvXJE0Fpnb02Ij235aS1gEuiIhNB9h3H7BlRDyVb3aeSaoXnwJcAmwQTSeQFEP5OZR+\nmo1uhwj/PBu/Sr9+HH/boxccOww1/na5c7DmhzOA3wMbSnpQ0oebHvLaq4yI2cBMYDZwEXBIcxI3\nM7ORN2iJfMRP6BK5jaHSrx/H3/boBccOY1YiNzOz3udEbmZWOCdyM7PCOZGbmRXOidzMrHBO5GZm\nhXMiNzMrnBO5mVnhnMjNzArnRG5mVjgncjOzwjmRm5kVzonczKxwQ57qzYZmtKe6gtGd7qr0+M0W\nBU7kY2J0h8IcfaXHbza+uWrFzKxwg80QdIqkeZJurWz7rqQ7JN0s
6RxJK1T2HSnpbkl3Stp1NAM3\nM7NksBL5qcBuTdsuBjaJiLcAdwFHAuQ5O/cHNs7POUmSS/xmZqOsbaKNiKuAp5u2zYqIV/PqtcAa\neXlPYEZEzI+IOcA9pImYzcxsFA23xPwR4Hd5eXVgbmXfXGDKMI9vZmaD6LrViqSvAi9FxJltHjZg\ncwdJ0yurfRHR120cZmbjkaSpwNROHttVIpd0MLA7sHNl80PAmpX1NfK2hUTE9G7Oa2a2qMgF3L7G\nuqSjWz12yFUrknYDvgTsGREvVnadDxwgaaKkdYE3AtcN9fhmZjY0bUvkkmYAOwIrS3oQOJrUSmUi\nMEsSwB8i4pCImC1pJjAbeBk4JCJGvVegmdmiTmOdayXFULpkpy7io9uzcPS7uDv+Nmdw/O2O7vjb\nHb3g2GGo8bfLnW7nbWZWOCdyM7PCOZGbmRXOidzMrHBO5GZmhXMiNzMrnBO5mVnhnMjNzArnRG5m\nVjgncjOzwjmRm5kVzonczKxwTuRmZoVzIjczK5wTuZlZ4domckmnSJon6dbKtpUkzZJ0l6SLJU2q\n7DtS0t2S7pS062gGbmZmyWAl8lOB3Zq2TQNmRcSGwKV5HUkbA/sDG+fnnCTJJX4zs1HWNtFGxFXA\n002b9wBOz8unA3vl5T2BGRExPyLmAPcA24xcqGZmNpBuSsyTI2JeXp4HTM7LqwNzK4+bC0wZRmxm\nZtaBYVV95MmV201q58mXzcxG2YQunjNP0qoR8aik1YDH8vaHgDUrj1sjb1uIpOmV1b6I6OsiDjOz\ncUvSVGBqR49Nheq2B1sHuCAiNs3r3wGejIjjJE0DJkXEtHyz80xSvfgU4BJgg2g6QbuZoFucv6dm\nsh7y0R3/YGdw/O2O7vjbHb3g2GGo8bfLnW1L5JJmADsCK0t6EDgK+DYwU9JHgTnAfgARMVvSTGA2\n8DJwSHMSNzOzkTdoiXzET+gS+UifwfG3O7rjH+wMBcdfcuwwkiVyt/M2MyucE7mZWeGcyM3MCudE\nbmZWOCdyM7PCOZGbmRXOidzMrHBO5GZmhXMiNzMrnBO5mVnhnMjNzArnRG5mVjgncjOzwjmRm5kV\nzonczKxwTuRmZoXrOpFLOlLS7ZJulXSmpCUlrSRplqS7JF0sadJIBmtmZgvrKpHneTw/DmyR5/Jc\nHDgAmAbMiogNgUvzupmZjaJuS+R/BeYDy0iaACwDPAzsAZyeH3M6sNewIzQzs7a6SuQR8RTwPeAB\nUgJ/JiJmAZMjYl5+2Dxg8ohEaWZmLXVbtbI+8DlgHWB1YDlJH6g+JtKszmM7s7OZ2SJoQpfP2wr4\nfUQ8CSDpHGB74FFJq0bEo5JWAx4b6MmSpldW+yKir8s4zMzGJUlTgakdPTYVnId8grcAvwS2Bl4E\nTgOuA9YGnoyI4yRNAyZFxLSm50ZEaAjnitEt2IuhxDPkozv+wc7g+Nsd3fG3O3rBscNQ42+XO7sq\nkUfEzZLOAK4HXgVuAH4KLA/MlPRRYA6wXzfHNzOzznVVIh/WCV0iH+kzOP52R3f8g52h4PhLjh1G\nskTunp1mZoVzIjczK5wTuZlZ4ZzIzcwK50RuZlY4J3Izs8I5kZuZFc6J3MyscE7kZmaFcyI3Myuc\nE7mZWeGcyM3MCudEbmZWOCdyM7PCOZGbmRXOidzMrHBdJ3JJkySdLekOSbMlbStpJUmzJN0l6WJJ\nk0YyWDMzW9hwSuT/CfwuIjYCNgPuBKYBsyJiQ+DSvG5mZqOo28mXVwBujIj1mrbfCewYEfMkrQr0\nRcSbmh7jqd5G9gyOv93RHf9gZyg4/pJjh16Y6m1d4HFJp0q6QdLPJC0LTI6Iefkx84DJXR7fzMw6\n1G0inwBsAZwUEVsAz9NUjRKpqD+2MzubmS2CJnT5vLnA3Ij4Y14/GzgSeFTSqhHxqKTVgMcGerKk\n6ZXVvojo6zIOM7NxSdJUYGpH
j+2mjjyf5ErgYxFxV07My+RdT0bEcZKmAZMiYlrT81xHPrJncPzt\nju74BztDwfGXHDuMZB35cBL5W4CfAxOBe4EPA4sDM4G1gDnAfhHxTKfBtAq+l97MIR/d8Q92Bsff\n7uiOv93RC44deiKRd8uJfMTP4PjbHd3xD3aGguMvOXbohVYrZmbWI5zIzcwK50RuZlY4J3Izs8I5\nkZuZFc6J3MyscE7kZmaFcyI3MyucE7mZWeGcyM3MCudEbmZWOCdyM7PCOZGbmRXOidzMrHBO5GZm\nhXMiNzMr3LASuaTFJd0o6YK8vpKkWZLuknSxpEkjE6aZmbUy3BL5Z4HZLJhGYxowKyI2BC7N62Zm\nNoq6TuSS1gB2J83b2Zh+aA/g9Lx8OrDXsKIzM7NBDadE/gPgS8CrlW2TI2JeXp4HTB7G8c3MrANd\nJXJJ7wUei4gbWVAa7yfSrM5jO7OzmdkiaEKXz3sbsIek3YGlgNdJ+r/APEmrRsSjklYDHhvoyZKm\nV1b7IqKvyzjMzMYlSVOBqR09NhWch3WyHYEvRsT7JH0HeDIijpM0DZgUEdOaHh8RMWApvsXxY3QL\n9mIo8Qz56I5/sDM4/nZHd/ztjl5w7DDU+NvlzpFqR954td8GdpF0F7BTXjczs1E07BL5kE/oEvlI\nn8Hxtzu64x/sDAXHX3Ls0IslcjMzq4kTuZlZ4ZzIzcwK50RuZlY4J3Izs8I5kZuZFc6J3MyscE7k\nZmaFcyI3MyucE7mZWeGcyM3MCudEbmZWOCdyM7PCOZGbmRXOidzMrHBO5GZmhet28uU1JV0u6XZJ\nt0k6PG9fSdIsSXdJuljSpJEN18zMmnVbIp8PfD4iNgG2Az4jaSNgGjArIjYELs3rZmY2irpK5BHx\naETclJf/BtwBTAH2AE7PDzsd2GskgjQzs9aGXUcuaR1gc+BaYHJEzMu75gGTh3t8MzNrb8Jwnixp\nOeDXwGcj4jlpwbygERFp8tIBnze9stoXEX3DicPMbLyRNBWY2tFjI7qbJVrSEsBvgYsi4od5253A\n1Ih4VNJqwOUR8aam57WcCbrFeXpqJushH93xD3YGx9/u6I6/3dELjh2GGn+73NltqxUBJwOzG0k8\nOx/4UF7+EHBuN8c3M7POdVUil/RPwJXALSz4yjoSuA6YCawFzAH2i4hnmp7rEvnInsHxtzu64x/s\nDAXHX3LsMJIl8q6rVrrlRD7iZ3D87Y7u+Ac7Q8Hxlxw71F61YmZmvcOJ3MyscE7kZmaFcyI3Myuc\nE7mZWeGcyM3MCudEbmZWOCdyM7PCOZGbmRXOidzMrHBO5GZmhXMiNzMrnBO5mVnhnMjNzArnRG5m\nVrgRT+SSdpN0p6S7JX15pI9vZmb9jWgil7Q4cAKwG7AxcKCkjUbyHIPrG9vTjbi+ugMYpr66Axim\nvroDGKa+ugMYpr66AxiGvtrOPNIl8m2AeyJiTkTMB34F7DnC5xhE39iebsT11R3AMPXVHcAw9dUd\nwDD11R3AMPXVHcAw9NV25pFO5FOAByvrc/M2MzMbJSOdyMd2AlAzMxvZyZclbQdMj4jd8vqRwKsR\ncVzlMU72ZmZdaDX58kgn8gnAn4GdgYeB64ADI+KOETuJmZn1M2EkDxYRL0s6FPgfYHHgZCdxM7PR\nNaIlcjMzG3vu2WlmVjgn8h4haVlJW0lape5YOiFpG0m7D7B9d0lb1hFTNyRtJmk/SftKenPd8XRK\n0k6V5XWb9u099hENTTVGSSvVGctwSVpD0lr53xJ1xDDuEnl+M79UdxyDkbSHpDmSbsgJ8TZSr9jb\nJB1cb3QdOQ6YPcD22cDxYxzLkElaQVIfcC5wIPB+4DxJl0t6Xa3BdeZ7leVzmvZ9bSwD6VI1xktr\ni6ILkr4i6ejKpt8DFwKzgC/WEdOI3uysi6Q3APuS/iBXB35Tb0QdOQbYFViB1CVs04j4S34tlw
Gn\n1RdaR5aPiDnNGyNijqSVa4hnqI4Brgd2iohX4bUhJr4FfBM4rMbYrLftC7yjsv5kRGyer58rSdfQ\nmCo2kedS096k5L0BqWS1bkSU0pP0lYi4C0DSXyLiLwAR8Zik+fWG1pFJbfYtPWZRdO9dwGaNJA4Q\nEa9I+ipwa31hLTKWlrQFoKblAIiIG+oMbjAR8bfK6n/mba9IquXaLzaRA/NIP2WOjohroIy6wYrF\nc92ggKjUE4rUdLPXXSrpm8C/R276JGkx4D9Ivyh63Ut5PKB+ImK+pH/UEdAQrSfpfNL1sq6kCyr7\n1m3xnF7yKAuqh6rLDe8c23CGZFlJEyPiJYCIOA1A0pLA8nUEVGzzQ0mfI5XGlwBmAmcBl0RECRcx\nkuawYEgD0TS8Qa+/DknLAT8nDZR2U978FlJ1xcci4rm6YuuEpDuBf2Xh917ALyPiTbUE1iFJU9vs\njoi4YqxiWdRIOhZYFTgsIp7P25Yj3eN6JCKOHPOYSk3kDZLWBw7I/94IHA38plFtYaMrv/+bkJLh\n7Ii4t+aQOpJvdLa8+COil0uESFohIp5tsW/tiLh/rGMaCknHRsRX8vIuETGr7pg6lXuwHwN8DHgg\nb14LOBn4akS8POYxlZ7IqyRtSiql7x8R69cdz2DyBfFKRISkNYFtgXsj4saaQxuUpN1INzzPatr+\nL8CzJf1hlkjSjRGxeV6+NCJ2Hmhfr2qKv+fjHYikZUj354I0fPff64ql5DryhUTErZLuYcFP/Z4l\n6eOkJnx/k/QN4EvADcDmkk6NiG/XGuDgjgL2GmD7FcAFpPsXPU3S60nNDt9E+mO8A5gREU/WGtjQ\nFd0OuzSSVgAm51/9t0jaj/R3C/A/ETFvrGMaF4k8l2z/mVQa3wW4ilRv3ss+D6wPvI6UQNaKiCfy\nt/z1QK8n8iUj4rHmjRHxuKRl6whoKPLMVZcBF5O+QBcj1fd/VdI7I+LOOuNbBKwi6d9I9ySqy5Dq\n+L9fX2iDOp7UdrxRfXsscBGptdbbgE+NdUDFJnKlr78dScl7d+BaUtvOdSPihTpj69A/IuJp4GlJ\nd0fEEwAR8UIhrSaWl7REc8uP3LNtqZpiGopjgM9GRL8vfEn7kNqR71NLVJ1rlwhL6B38cxa08Kgu\nl2Br4JOV9eci4jAASVfXEVCxiZw0E9Fs4BTg3yLieUn3FZLEoX/b2SXzMnm9hHbY5wA/lXRYo02t\npOVJbWqbexr2ok0jYqFkHRG/ljTmHTq60C4R/mzswxmaiJhedwzDMKHa/wA4qLLcrn/FqCk5kZ8N\n7AHsD9Dun0t9AAARX0lEQVTUjrYE7drRPjL24QzZ14BvAHMkNd+5//faourc813u6wmFJ0IkXRwR\nu+blIyOihC/PhlckrRYRj0C6NwcgaQrwSh0BFd1qJXdAmUqqXnk36dvwo8CFTT2vbJRU7txDunNf\nxC8iSXOB77OgOqLq8xGxxhiHNCSSZkbEfnn5uIj4cmXfa0myV5XcakXSB4DPAV8g3V8B2JJUd/6j\niDhjrGMquURO/nlzGXCZpIksuOF5ItDz431IWht4Pt/k3B74J1Iy7PmxYiR9kFQQOAO4pWn7KxFx\nZm3BdaZdvWzPV02Q+kw07Ap8ubJeQh15sSLiF5KeIN1n2Thvvh34WkRcVEdMRZfIW5G0Q0RcWXcc\n7Ug6CvhQXp1BGvujj9SW/JaI+GxNoXVE0nXAzs09OHMPtysjYouBn2kjoV2JtoQSrqRnSU1VRWqk\ncFVld0TEHrUEVqiiS+RVkjYhlcYPAJ4Btqo3okEdSPo2X4bUO2zVfMN2AnBzrZF1ZomBuuFHxN/q\nGpN5qJTG9D6U1I4c0s3zEyPi8vqi6thAg07RWK8vrI7tWVluHmelp0uXSuPWrx8R5+X1H5JGMQ3g\nhDoG/Co6kSsNqH8AKSm+BKwDbDXQ8Ko96MWI+AfwD0n3NM
ZsyPOevlRzbJ1YStJyzfcicsuVnk/k\nkt5DGhvj6/mfgM2Bk3NLnAvrjK8Dpd8sv6/XhxFo49v0H6p2V9LN/2Vp3VFuVBWbyCX9AZhIGixr\nrzyW932FJHGAFfJojaos01ivL6yOnQycJenTjfc8f7GemPf1uiNI103118+Nkq4nJfieTuQRMbXu\nGIbpXNIXJ5J+PVBT0B62WkRU24s/FxG/BpD0yRbPGVUlzxA0j5TwJgNvqDmWblwJvA94b2W5sd7z\nI9dFxPHAecAVkp6S9BS5e35EfLfe6DoyuSmJAxARt1DA9STpiMryvk37jh37iIZlvboDGKJ+N8kj\nYtvKai3XTtE3OyVNIk0ucQCpCdxKwD9HxLW1BraIUZ4aLSL+WncsnZJ0Q6sbsu329YpxcLOz5OaH\nfcC0yPMgVLZvD3yrjl9LxVatAETEM6SenadImgzsB/xA0poRsWa90bWXhxjYAXg6Im6RtH9evwc4\nKdef9yxJXyCNcvjzagKX9FHSqIg/rC+6jqzfphNZaSXEEm0mqXGzfOnKMqRWK708b+oRwH9LOo3U\njlzAFsDB5A6KY63oEnkrkrZr/rbsNZJOAjYljUvyZ2A54P8jtSVXRLy/xvAGJekGYLvIs6RUtk8E\n/hQRm9YTWWdU+MQMpZfIS5cLjofSvx35iXWMfAjjKJFLWpE00NGBwEYRsXrNIbUl6Q7SRbAU8BDw\nhtxiRcCtEfHmWgMchKRbImKzFvtu6/X4qyStAmnkxrpj6ZSkV4BGL9plKssAS0dET//alrR3RJyT\nl1fMA8hZl0q+2YmkZSQdqDR34c2kLrLfAHq6WiV7MZK/A/dHnlUk0jdrCZMvS9KqA2ycTI+3A4YU\nvKTpuYfeXcBdkp6QdHTdsXUiIhaPiOXzv+ry8r2exLOvVZZLmOP1NZL2knRoZf06Sfflf/u2e+5o\nKTaRS5oB3EYayvaHpAlnn46IvoioZeCaIVpF0r/luubXlhvrdQfXge8CF0qaKmn5/O+dpGZ7zR08\netHngbcDW0fEihGxImk88rcrDQnb03Jnpsbyuk37SpqEvERHAOdX1ieSOiDuCHy6joBK+OZuZSPg\nMdKkDHdExCupVqIYrYYhFQWM9RERZ0h6nNSZZpO8udbxJoboIGCXanVK7ovwftLsRr08sQGkL8tG\nPfg5lWVIpd1eH0p4oJ6pr02EXUfvyCGYGBEPVNb/X6RZpZ5UTZOqFF1HrjTLy4Gk1iqPk5L7myPi\n0VoDs57Xrh6/hDr+0m92qv/k16KpOi56ePJrSfdGizmBJf0lIsa81VPJJXIi4g5Sl9ijJG1FSurX\nSZobEW+rN7r2JC1Naqr0FPBb0pydjeaH34g8Y1CvknQ8aaTG/2ra/knSLE3T6omsY+3uQ5Rwj6Jo\nhfdMvVbSJyLip9WNkj5FmqlszBVdIh9IbvXxjgJGPzyLND7MssCKpPr+35KaH74lIt5bY3iDys0P\nt4r+M6U0xoi/NSI2GfiZvaGp1UezElp9tBs98B0RUctMNZ2SdGxEfCUv7xIRPT9Zd0O+oX8u8A8W\njEe+BakF2l511AiMm0SeE/jOpFL5+yKip7tZN36+K412ODciVq3suzki3lJjeIOSdHurZC1pdkRs\nPNA+GxnjuR18CXK+2Yl0fyiA2yOittY3PV3q6ETuFnsgacSxlUiN9L9Ua1CdmQ+vjXbYPFrdqwM8\nvte8IGnDiLirulHSG2ld0u05kjZlwTC2d0TEbXXGMwQ3RsSzA+1QmrDERlFuJnwpcKmkNYDFJK0F\nPBJNE5KPhWITudIEufsAfwFmAtNJPQpPqzGsoVhD0o9IP42nVJYBptQXVseOAn4n6RjgT3nbVsBX\nSNNg9TRJK5AG/VqL1AdBwKZK84/uWcC4MX0sGD3w0ojYubLvtZEFe9gquZmnmpYh5cmebTUk6Suk\n8fj/I2/6PfAsqRniaf
Qf4nZMFJvIgY+REshPgIsi4qXCmh9+ifSTTCxIhA3Xj304QxMRF0nai9Sm\n9rC8+XZg78iT0fa4Y0jv806Nen5Ji5P+CL/JgtdUgpXqDqALrZrflmBf0n2JhicjYvN8/VyJE/mQ\nrAbsQhr58ITcnGlpSUvU8dNmqAr65TCg3OrmsYg4qGn7GyQtnXus9rJ3AZtVb9bmvghfBUr4Iipa\nREyvO4bhiP4Tqvxn3vZK/rsYc8X27IyIlyPiooj4EGki2vOAq4G5knp94l8krZK7iB+ee0X+RNLt\nks6TtMHgR6jdj+hfKmn4J3q/Mw3ASwN94edtPT3yZFZ0z2BJF1eWj6wzli4sqzQ4HLCgUCZpSWr6\nZVFsIq+KiBcj4uxIs4y8kTSKYK87k1SntiGp7el9wL+QmiD+vMa4OrVl5FlRqvJASDvWEM9QLSlp\nC0lb5v8b/7YElqw7uA40qiOWa1pejgJ6BtP/y2a/2qLoztnAf1V7cSpNOv5/8r4xN26aHwLkkuyB\nwIG93vyt0cQwN2O6PyLWquy7KSLeWmN4g5J0Z0S8aaj7ekVTz8KF9HLPwvGg5OaHucnwMaT7dI2u\n+muRpjj8amMAvLFUch05AJKmkHpIHkga3/vb1DS4+xC9Cun2vKQnm/aV8O36mKRto2k2JknbkMbA\n6WmF9yxE0syI2C8vHxcRX67suzgidq0vuo6spzRqqYB11X+Sj4iIPWqKa1A5UU+T9HXSzGRB6uVc\n232hYkvkuSv4gaQ58s4mTcJ8fkSs2/aJPWIc9MzbhtTs8zRSqxsBWwIfAg6IHp/YA0DS64H3k9qR\nB2kAthl5AKSeNg7GWpnaZndPd2jKTVcnN/pQSNqP1KsT4H+ihsklSk7k80l14f8eeRJdSfcVlMin\nttnd0xdyQ+6q/Bn6j354QkT0fIlcacC1y4CLSd2sFyO1vd4FeGdE3FljeIMaB4l87Yi4v+44uiHp\nZ8DvI+LUvH4PcBGwNPByRHxqrGMquWplNVJ7zh9JapTKl6g3pM5FRF/dMQxHfs9XjoijmrZvIimi\n92fbOQb4bETMrG6UtA+pHfk+tUTVuYGGgaWxXl9YHXut05KkX+eGCqXYGvhkZf25iDgMQNLVdQRU\nbKuViHgiIn4SETsCu5J6Vs2TdKekY2sOb1CSNpR0mqTvS1pD0kWSnpd0s6St646vAz8GVh5g++vJ\n7Wp73KbNSRwgt8Tp6flGs0dJY5IfX1lurDcP+dDrSpvsekLTYHHVvhS1VImWXCJ/TUQ8SLqAj5e0\nIamTUK87FTgdWAG4jtStfW9SO+wTgG3rC60jGwxU/RMRV0r6SR0BDdHzXe7rCaXfrC3cK5JWi4hH\nABo9mXPDi1pmJxsXibwht+vcFtiu7lg6sGzk8YwlfbJSOpwl6bs1xtWpdh0fSqjiah7fo9++sQ5m\nqCQdERHfycv7RsRZlX2vDRHbwzaT9FxeXrqyDOke0evqCKpD3wUuyJ2vGsPYbkkuTNYRULFVKw2S\nlpS0t9L43o+QhrItoURYvcv8XJt9veoeSe9p3ihpd+DeGuIZquZONNV/JXSoObCy3Jy03z2WgXQj\n+k8YPSH6Tx7dy0mciPgF8O+k+yxz8r+vk6Y5PKOOmEputfLPpIt5J9JIcGcBP46IdWoMq2OS/k6a\nDQhgffonv/UjYpmxj6pzuQrrt6SR36rND98GvDci/lxjeONe6a1WbGSVXLVyESmRbBcRDwMoDQVb\nio3qDmCYAvgoaUiERvPDK0il2THv2dYNpZnoD2XBeOSzgRMj4vL6orJeJ+nNpMLWeXn9h6R7XUFq\nfjvmE0eXXCJ/K6lEvjepNHsWcHS1q3tpJK1MGhKz5z8USRcC05qHrJW0GfDNiHhfPZF1JlcLnUD6\nSXwj6RfF5qSfzIdFxIU1hjco9Z+qbmmg2quw56eqK5mk3wLfioir8/ps4GukaRv3joi9
xjymAnJG\nW3mskreRkvo+wE3Ab6JpYtReozSz0bdIky8fA5xBas63OHBQRFxUY3iDknR9RGzVYl8Js9BfARze\n6ExW2b4ZqVS1Qz2RWa+T9KeI2LKyfm1EbJuXr46It491TMXf7Izk6og4lDSzzu8po9XKCcCxwAxS\nD8OPRZq38x3UMDB9F9q1l12qzb5eMbk5iQNExC2kYR/MWunXYquRxLNarp3iEzlAHn70u6Rp33Ym\n/VTudYtHxMW52dgjjbFJctfwEn4mXS/pE80bJX2chWc86kXt5hUtZs5Rq8XDkhYqLOZf2Q/VEE+5\nNzsl/S9Sdcr+wOOkOvLFCuooUU3WL9YWRfc+B/xG0vtZkLgbY3n/79qi6tz6TSPuVZXW09DG1hHA\nf0s6jdSOXMAWwMHUNPJqsXXkkl4ltVo5NCIeyNtKGjSr+JtV+f7EO4E3k76Ybo+Iy+qNqjPjYdAy\nq08eMO5QoDHvwW3ASXWMfAhlJ/K9SCXybUmjIJ4FnFxKO3LrHZJWAShgoC/rATn3rBERJ+T160i9\ngQP4crWX7Vgpto48Is6NiP1JpcGrgM+Tul3/RFKvD6pvNVMyXdITwF3AXZKekHR03bFZzzsCOL+y\nPhHYCpgKfLqOgIpN5A0R8beI+GVEvBdYk3Sjc1rNYVnv+zzwdmDriFgxIlYEtgHensdgMWtlYqM6\nN7s6Ip7M25Zt9aTRVGzVitlwSLoJ2KW5OiVXs8yKHp8z1eoj6d6IWL/Fvr9ExJjfLC++RG7WpQkD\n1YnnbT1/o9lqdW2LprefAq4d4PGjzhesLarmd7nP7PPAuZL+lQXD2G5B6gg35t3zwVUrtohqav7Z\nrIjmn1af3PR2J9KAcbU3vXUiNzMrnOvIzcwK50RuZlY4J3Izs8I5kdsiQ9JpkvYZ5jG+KemBpsmC\nzWrlRG6Lkq7v7Ocu/QLOI/UANesZTuQ2bkk6SNLNkm6S1JjdfAdJV0u6t1E6l7ScpEsk/UnSLZL2\nyNvXkfRnSacDt5IGSrouIh6t6SWZDcjND21ckrQJcA6wfUQ8JWlF4PvAMhGxv6SNgPMj4o2SFs/b\nn8vzpv4hb1+HNB/s9hFxXdPxn4uIfjPFmNXFnR5svNoJmBkRTwFExNOpZoRz8/odeUxpSL9MvyXp\nHcCrwOqSGlN23d+cxM16jRO5jVdBmrml2UuV5cb+95Mmvt4iIl6RdB8L5h19fvRCNBsZriO38eoy\nYF9JKwE0/m/hdcBjOYm/E1h7LAI0GylO5DYuRcRs4JvAFXnI2u+RSunVm0KN5V8CW0m6BfggcMcA\njwFA0nckPQgsLelBSUeN1msw65RvdpqZFc4lcjOzwjmRm5kVzonczKxwTuRmZoVzIjczK5wTuZlZ\n4ZzIzcwK50RuZla4/x/8SCtRCP3UTAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xd79d0f0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# use summary function size() on groups created by groupby()\n",
    "counts = scratch_df.groupby('char1').size()\n",
    "plt.figure()\n",
    "counts.plot.bar(title='Frequency of char1 values (Histogram of char1)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>numeric1</th>\n",
       "      <th>numeric2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>char1</th>\n",
       "      <th>char2</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"7\" valign=\"top\">AAAAAAAA</th>\n"
Download .txt
gitextract_uh8insif/

├── .gitattributes
├── .gitignore
├── 00_intro_and_history/
│   ├── 00_intro_and_history.md
│   ├── notes/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   └── spring_2017_announcements/
│       └── spring_2017_announcements.md
├── 01_basic_data_prep/
│   ├── 01_basic_data_prep.md
│   ├── assignment/
│   │   └── .gitignore
│   ├── notes/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   └── src/
│       ├── notebooks/
│       │   ├── py/
│       │   │   ├── .gitignore
│       │   │   └── Py_Part_0_pandas_numpy.ipynb
│       │   ├── r/
│       │   │   ├── .gitignore
│       │   │   ├── R_Part_0_Basics_dplyr_and_ggplot2.ipynb
│       │   │   └── R_Part_1_data.table.ipynb
│       │   └── sas/
│       │       ├── SAS_Part_0_Base_SAS_PROC_SGPLOT.ipynb
│       │       └── SAS_Part_1_PROC_SQL.ipynb
│       └── raw/
│           ├── py/
│           │   ├── Py_Part_0_pandas_numpy.py
│           │   ├── pyspark_example.py
│           │   ├── scratch.csv
│           │   ├── scratch2.csv
│           │   └── scratch3.csv
│           ├── r/
│           │   ├── .gitignore
│           │   ├── R_Part_0_Basics_dplyr_and_ggplot2.r
│           │   └── R_Part_1_data.table.r
│           └── sas/
│               ├── .gitignore
│               ├── SAS_Part_0_Base_SAS_PROC_SGPLOT.sas
│               └── SAS_Part_1_PROC_SQL.sas
├── 02_analytical_data_prep/
│   ├── 02_analytical_data_prep.md
│   ├── data/
│   │   ├── loan.csv
│   │   └── loans.sas7bdat
│   ├── notes/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   ├── .gitignore
│   │   ├── DataPreperation.py
│   │   ├── data_sets/
│   │   │   └── kaggle_house/
│   │   │       ├── test.csv
│   │   │       └── train.csv
│   │   ├── housing.html
│   │   ├── housing.ipynb
│   │   ├── py_part_2_discretization.ipynb
│   │   ├── py_part_2_encoding.ipynb
│   │   ├── py_part_2_feature_extraction.ipynb
│   │   ├── py_part_2_feature_selection.ipynb
│   │   ├── py_part_2_impute.ipynb
│   │   ├── py_part_2_over_sample.ipynb
│   │   ├── py_part_2_standardize.ipynb
│   │   ├── py_part_2_target_encode_categorical.ipynb
│   │   ├── py_part_2_target_encode_numeric.ipynb
│   │   └── py_part_2_winsorize.ipynb
│   └── xml/
│       └── 02_analytical_data_prep.xml
├── 03_regression/
│   ├── .gitignore
│   ├── 03_regression.md
│   ├── assignment/
│   │   └── .gitignore
│   ├── data/
│   │   ├── .gitignore
│   │   ├── loan_clean.csv
│   │   ├── test.csv
│   │   └── train.csv
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   ├── .gitignore
│   │   ├── py_part_3_kaggle_starter.ipynb
│   │   ├── py_part_3_linear_regression_gradient_descent.ipynb
│   │   ├── py_part_3_penalized_linear_regression.ipynb
│   │   ├── py_part_3_penalized_logistic_regression.ipynb
│   │   ├── spark_kaggle_starter/
│   │   │   ├── README.md
│   │   │   ├── feature_combiner.py
│   │   │   ├── get_type_lists.py
│   │   │   ├── logging_lib/
│   │   │   │   ├── LICENSE.md
│   │   │   │   ├── LoggingController.py
│   │   │   │   ├── MarkdownBuilder.py
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── example.py
│   │   │   │   └── markdown_preview_github.css
│   │   │   ├── main.py
│   │   │   ├── spark_controler/
│   │   │   │   ├── LICENSE.md
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ec2_instance_data_dict.py
│   │   │   │   ├── emr_controller.py
│   │   │   │   ├── files/
│   │   │   │   │   ├── setup.sh
│   │   │   │   │   └── terminate_idle_cluster.sh
│   │   │   │   ├── resource_calculator/
│   │   │   │   │   └── C2FO-Spark-Config-Cheatsheet.xlsx
│   │   │   │   └── scripts/
│   │   │   │       ├── bootstrap_actions.sh
│   │   │   │       ├── deep_learning_install_complete.sh
│   │   │   │       ├── pyspark_quick_setup.sh
│   │   │   │       └── terminate_idle_cluster.sh
│   │   │   ├── spark_main.py
│   │   │   └── target_encoder.py
│   │   └── target_encoder.py
│   ├── xlsx/
│   │   └── assessment_workbook.xlsx
│   └── xml/
│       ├── 03_linear_regression.xml
│       └── 03_logistic_regression.xml
├── 04_decision_trees/
│   ├── 04_decision_trees.md
│   ├── data/
│   │   └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   ├── py_part_4_decision_tree_ensembles.ipynb
│   │   └── py_part_4_kaggle_xgboost.ipynb
│   └── xml/
│       └── 04_decision_trees.xml
├── 05_neural_networks/
│   ├── 05_neural_networks.md
│   ├── assignment/
│   │   └── .gitignore
│   ├── data/
│   │   └── .gitignore
│   ├── quiz/
│   │   ├── .gitignore
│   │   └── sample/
│   │       └── .gitignore
│   ├── src/
│   │   ├── .gitignore
│   │   ├── py_part_5_MNIST_DNN.ipynb
│   │   ├── py_part_5_MNIST_autoencoder.ipynb
│   │   ├── py_part_5_MNIST_data_augmentation.ipynb
│   │   ├── py_part_5_MNIST_keras_lenet.ipynb
│   │   ├── py_part_5_basic_mlp_example.ipynb
│   │   └── py_part_5_neural_networks.ipynb
│   └── xml/
│       └── 05_neural_networks.xml
├── 06_clustering/
│   ├── 06_clustering.md
│   ├── assignment/
│   │   └── key/
│   │       └── .gitignore
│   ├── quiz/
│   │   └── .gitignore
│   ├── src/
│   │   └── py_part_6_clustering.ipynb
│   └── xml/
│       └── 06_clustering.xml
├── 07_association_rules/
│   ├── 07_association_rules.md
│   ├── assignment/
│   │   ├── .gitignore
│   │   └── assignment_7.docx
│   ├── quiz/
│   │   └── .gitignore
│   └── xml/
│       └── 07_association_rules.xml
├── 08_text_mining/
│   ├── 08_text_mining.md
│   ├── quiz/
│   │   ├── .gitignore
│   │   └── sample/
│   │       ├── .gitignore
│   │       └── Quiz_8.docx
│   └── xml/
│       └── 08_text_mining.xml
├── 09_matrix_factorization/
│   ├── 09_matrix_factorization.md
│   └── src/
│       ├── py_part_9_iris_pca.ipynb
│       └── py_part_9_kaggle_GLRM_example.ipynb
├── 10_model_interpretability/
│   ├── 10_model_interpretability.md
│   ├── quiz/
│   │   └── .gitignore
│   └── src/
│       ├── dt_surrogate.ipynb
│       ├── lime.ipynb
│       ├── loco.ipynb
│       ├── mono_xgboost.ipynb
│       ├── pdp_ice.ipynb
│       └── sensitivity_analysis.ipynb
├── README.md
├── anaconda_py35_h2o_xgboost_graphviz/
│   └── Dockerfile
├── cold_call.py
└── requirements.txt
Download .txt
SYMBOL INDEX (50 symbols across 10 files)

FILE: 02_analytical_data_prep/src/DataPreperation.py
  class DataPreperation (line 1) | class DataPreperation(object):
    method __init__ (line 2) | def __init__(self):
    method label_encoder (line 6) | def label_encoder(dataframe,columns=[],frame_type='spark'):
    method imputer (line 73) | def imputer(dataframe,columns=[], type='median',frame_type='spark'):
    method polynomial_expansion (line 145) | def polynomial_expansion(dataframe,columns=[], degree=3,frame_type='sp...
    method get_top_correlations (line 249) | def get_top_correlations(dataframe,columns,frame_type='spark'):
    method feature_combiner (line 287) | def feature_combiner(training_frame, valid_frame = None, test_frame=No...
    method shrunken_averages_encoder (line 432) | def shrunken_averages_encoder(training_frame, valid_frame = None,test_...
    method convert_boolean_to_int (line 778) | def convert_boolean_to_int(frame, rejects=[],frame_type='spark'):
    method get_type_lists (line 801) | def get_type_lists(frame, rejects=['Id', 'ID','id'],frame_type='spark'):
    method remove_outliers_by_percentile (line 839) | def remove_outliers_by_percentile(dataframe, columns, limits =.01, fra...
    method winsorize_columns (line 904) | def winsorize_columns(dataframe, columns, winzerize_type='percentile',...
    method remove_outliers_by_std (line 1004) | def remove_outliers_by_std(dataframe, columns, standard_deviation_limi...
    method create_spark_estimator_vector (line 1064) | def create_spark_estimator_vector(df, ignore = [], out_put_column='fea...
    method dimensionality_reduction (line 1081) | def dimensionality_reduction(train_frame,valid_frame=None,test_frame=N...
    method pca (line 1492) | def pca(frame,columns=[],k=320,frame_type='spark'):

FILE: 03_regression/src/spark_kaggle_starter/feature_combiner.py
  function feature_combiner (line 7) | def feature_combiner(training_frame, test_frame, nums, valid_frame = Non...

FILE: 03_regression/src/spark_kaggle_starter/get_type_lists.py
  function get_type_lists (line 1) | def get_type_lists(frame, rejects=['Id', 'ID','id'],frame_type='h2o'):

FILE: 03_regression/src/spark_kaggle_starter/logging_lib/LoggingController.py
  class LoggingController (line 8) | class LoggingController(object):
    method __init__ (line 13) | def __init__(self, profile_name = 'default', s3_bucket = 'emr-related-...
    method get_datetime_str (line 20) | def get_datetime_str(self):
    method get_path_for_new_log (line 25) | def get_path_for_new_log(self):
    method log_matplotlib_plot (line 30) | def log_matplotlib_plot(self,plot, format = 'png'):
    method log_string (line 53) | def log_string(self,string):

FILE: 03_regression/src/spark_kaggle_starter/logging_lib/MarkdownBuilder.py
  class MarkdownBuilder (line 10) | class MarkdownBuilder(object):
    method __init__ (line 15) | def __init__(self, profile_name = 'default', s3_bucket = 'emr-related-...
    method get_datetime_str (line 22) | def get_datetime_str(self):
    method log_string (line 28) | def log_string(self,string):
    method build_markdowns (line 34) | def build_markdowns(self):

FILE: 03_regression/src/spark_kaggle_starter/spark_controler/emr_controller.py
  class EMRController (line 22) | class EMRController(object):
    method __init__ (line 23) | def __init__(self, profile_name = 'default', aws_access_key = False, a...
    method boto_client (line 49) | def boto_client(self, service):
    method load_cluster (line 64) | def load_cluster(self, _spark_properties=False):
    method add_create_step (line 325) | def add_create_step(self, job_flow_id, master_dns):
    method add_spark_submit_step (line 361) | def add_spark_submit_step(self, job_flow_id,name_of_script_directory):
    method create_bucket_on_s3 (line 416) | def create_bucket_on_s3(self, bucket_name):
    method upload_to_s3 (line 431) | def upload_to_s3(self, path_to_file, bucket_name, path_on_s3):
    method get_maximum_resource_allocation_properties (line 452) | def get_maximum_resource_allocation_properties(self,_master_memory,_ma...
    method get_datetime_str (line 505) | def get_datetime_str(self):
    method generate_job_name (line 511) | def generate_job_name(self):
    method tar_python_script (line 520) | def tar_python_script(self):
    method remove_temp_files (line 536) | def remove_temp_files(self, s3):
    method run (line 549) | def run(self,execute_type='create'):
    method step_copy_data_between_s3_and_hdfs (line 642) | def step_copy_data_between_s3_and_hdfs(self, c, src, dest):

FILE: 03_regression/src/spark_kaggle_starter/spark_main.py
  function glm_grid (line 170) | def glm_grid(X, y, train, valid, should_submit = False):
  function neural_net_grid (line 225) | def neural_net_grid(X, y, train, valid):
  function gboosting_grid (line 258) | def gboosting_grid(X, y, train, valid):

FILE: 03_regression/src/spark_kaggle_starter/target_encoder.py
  function target_encoder (line 1) | def target_encoder(training_frame, test_frame, x, y, lambda_=0.15, thres...

FILE: 03_regression/src/target_encoder.py
  function target_encoder (line 5) | def target_encoder(training_frame, test_frame, x, y, lambda_=0.15, thres...

FILE: cold_call.py
  function main (line 9) | def main(argv):
Copy disabled (too large) Download .json
Condensed preview — 141 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (11,290K chars).
[
  {
    "path": ".gitattributes",
    "chars": 215,
    "preview": "*.sas7bdat filter=lfs diff=lfs merge=lfs -text\n*.jpg filter=lfs diff=lfs merge=lfs -text\n*.png filter=lfs diff=lfs merge"
  },
  {
    "path": ".gitignore",
    "chars": 85,
    "preview": "*.DS_Store\n.idea*\n*.ipynb_checkpoints\ninterpreting_ml\nFAQ\nRosterInformation.xlsx\nenv\n"
  },
  {
    "path": "00_intro_and_history/00_intro_and_history.md",
    "chars": 2479,
    "preview": "## Section 00: Intro and History\n\n#### Class Notes\n\n* *Introduction to Data Mining* - [chapter 1 notes](http://www-users"
  },
  {
    "path": "00_intro_and_history/notes/.gitignore",
    "chars": 6,
    "preview": "*.pptx"
  },
  {
    "path": "00_intro_and_history/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "00_intro_and_history/spring_2017_announcements/spring_2017_announcements.md",
    "chars": 2394,
    "preview": "## Section 00 Announcements\n\n1. Attend class **if possible**:\n  * In general, you only need to attend the class (e.g. Th"
  },
  {
    "path": "01_basic_data_prep/01_basic_data_prep.md",
    "chars": 52281,
    "preview": "## Section 01: Basic Data Prep\n\n#### Basic data operations\n\nA great deal of work in data mining projects is spent on dat"
  },
  {
    "path": "01_basic_data_prep/assignment/.gitignore",
    "chars": 26,
    "preview": "raw\nassignment_1.docx\nkey\n"
  },
  {
    "path": "01_basic_data_prep/notes/.gitignore",
    "chars": 7,
    "preview": "*.pptx\n"
  },
  {
    "path": "01_basic_data_prep/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/py/.gitignore",
    "chars": 30,
    "preview": ".ipynb_checkpoints\nscratch.csv"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/py/Py_Part_0_pandas_numpy.ipynb",
    "chars": 155577,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/r/.gitignore",
    "chars": 30,
    "preview": ".ipynb_checkpoints\nscratch.csv"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/r/R_Part_0_Basics_dplyr_and_ggplot2.ipynb",
    "chars": 82685,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/r/R_Part_1_data.table.ipynb",
    "chars": 104896,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/sas/SAS_Part_0_Base_SAS_PROC_SGPLOT.ipynb",
    "chars": 1633605,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# SAS: Part 0 - Base SAS, PROC SGPL"
  },
  {
    "path": "01_basic_data_prep/src/notebooks/sas/SAS_Part_1_PROC_SQL.ipynb",
    "chars": 534099,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "01_basic_data_prep/src/raw/py/Py_Part_0_pandas_numpy.py",
    "chars": 9790,
    "preview": "\"\"\"\nCopyright (C) 2017 J. Patrick Hall, jphall@gwu.edu\n\nPermission is hereby granted, free of charge, to any person obta"
  },
  {
    "path": "01_basic_data_prep/src/raw/py/pyspark_example.py",
    "chars": 5111,
    "preview": "# read in data\n\n>>> path = 'scratch.csv'\n>>> cust_df = spark.read.option('header', 'true').csv(path)\n>>> cust_df.printSc"
  },
  {
    "path": "01_basic_data_prep/src/raw/py/scratch.csv",
    "chars": 130,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:fc62fe607c91e763bd8ff3c6acf8ab2cc3161c3daae774333ff697fe79e6eb98\ns"
  },
  {
    "path": "01_basic_data_prep/src/raw/py/scratch2.csv",
    "chars": 131,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:38876ee8293ecb125ed00a087dd6243141b5bd9016d85db57ebba6508e2dda46\ns"
  },
  {
    "path": "01_basic_data_prep/src/raw/py/scratch3.csv",
    "chars": 128,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:799b05787882a2af637073f9294ebac7630d75d656e5fc26fe11aefb1337da25\ns"
  },
  {
    "path": "01_basic_data_prep/src/raw/r/.gitignore",
    "chars": 11,
    "preview": "scratch.csv"
  },
  {
    "path": "01_basic_data_prep/src/raw/r/R_Part_0_Basics_dplyr_and_ggplot2.r",
    "chars": 11416,
    "preview": "###############################################################################\n# Copyright (C) 2017 J. Patrick Hall, jp"
  },
  {
    "path": "01_basic_data_prep/src/raw/r/R_Part_1_data.table.r",
    "chars": 8136,
    "preview": "###############################################################################\n# Copyright (C) 2017 J. Patrick Hall, jp"
  },
  {
    "path": "01_basic_data_prep/src/raw/sas/.gitignore",
    "chars": 11,
    "preview": "scratch.csv"
  },
  {
    "path": "01_basic_data_prep/src/raw/sas/SAS_Part_0_Base_SAS_PROC_SGPLOT.sas",
    "chars": 14522,
    "preview": "******************************************************************************;\n* Copyright (c) 2015 by SAS Institute In"
  },
  {
    "path": "01_basic_data_prep/src/raw/sas/SAS_Part_1_PROC_SQL.sas",
    "chars": 5450,
    "preview": "******************************************************************************;\n* Copyright (C) 2017 by J. Patrick Hall,"
  },
  {
    "path": "02_analytical_data_prep/02_analytical_data_prep.md",
    "chars": 12349,
    "preview": "## Section 02: Analytical Data Prep\n\nA great deal of work in data mining projects is spent on data munging. Below some c"
  },
  {
    "path": "02_analytical_data_prep/data/loans.sas7bdat",
    "chars": 133,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:057a8ac066b21a7ecfca31980addfef2ab5f1cd8e91b6d701728d6b6da9c8617\ns"
  },
  {
    "path": "02_analytical_data_prep/notes/.gitignore",
    "chars": 7,
    "preview": "*.pptx\n"
  },
  {
    "path": "02_analytical_data_prep/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "02_analytical_data_prep/src/.gitignore",
    "chars": 20,
    "preview": ".ipynb_checkpoints\n\n"
  },
  {
    "path": "02_analytical_data_prep/src/DataPreperation.py",
    "chars": 78359,
    "preview": "class DataPreperation(object):\n    def __init__(self):\n        pass\n\n    @staticmethod\n    def label_encoder(dataframe,c"
  },
  {
    "path": "02_analytical_data_prep/src/data_sets/kaggle_house/test.csv",
    "chars": 131,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:8fdd3d829d4d986b58f845c9553b225e67dd8383624d90fb6ca1d4bed5798c1e\ns"
  },
  {
    "path": "02_analytical_data_prep/src/data_sets/kaggle_house/train.csv",
    "chars": 131,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:1e18addf81e5e4d347cc17ee6075bbe4a42b7fa26b9e5b063e8f692a5f929d41\ns"
  },
  {
    "path": "02_analytical_data_prep/src/housing.html",
    "chars": 440737,
    "preview": "<!DOCTYPE html>\n<html>\n<head><meta charset=\"utf-8\" />\n<title>housing</title><script src=\"https://cdnjs.cloudflare.com/aj"
  },
  {
    "path": "02_analytical_data_prep/src/housing.ipynb",
    "chars": 589759,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Imports And Setup\\n\",\n    \"\\n\",\n"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_discretization.ipynb",
    "chars": 10601,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_encoding.ipynb",
    "chars": 12203,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_feature_extraction.ipynb",
    "chars": 36141,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_feature_selection.ipynb",
    "chars": 13050,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_impute.ipynb",
    "chars": 7229,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_over_sample.ipynb",
    "chars": 9524,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_standardize.ipynb",
    "chars": 12843,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_target_encode_categorical.ipynb",
    "chars": 14373,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_target_encode_numeric.ipynb",
    "chars": 14695,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/src/py_part_2_winsorize.ipynb",
    "chars": 9903,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "02_analytical_data_prep/xml/02_analytical_data_prep.xml",
    "chars": 20301,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "03_regression/.gitignore",
    "chars": 17,
    "preview": "*.R\n*.sas\n*.docx\n"
  },
  {
    "path": "03_regression/03_regression.md",
    "chars": 2865,
    "preview": "## Section 03: Regression\n\nRegression is important because it is stable, interpretable and widely understood and accepte"
  },
  {
    "path": "03_regression/assignment/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "03_regression/data/.gitignore",
    "chars": 16,
    "preview": "submission*.csv\n"
  },
  {
    "path": "03_regression/data/test.csv",
    "chars": 131,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:8fdd3d829d4d986b58f845c9553b225e67dd8383624d90fb6ca1d4bed5798c1e\ns"
  },
  {
    "path": "03_regression/data/train.csv",
    "chars": 131,
    "preview": "version https://git-lfs.github.com/spec/v1\noid sha256:1e18addf81e5e4d347cc17ee6075bbe4a42b7fa26b9e5b063e8f692a5f929d41\ns"
  },
  {
    "path": "03_regression/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "03_regression/src/.gitignore",
    "chars": 104,
    "preview": "py_part_3_kaggle_starter-Copy1.ipynb\n.ipynb_checkpoints/py_part_3_kaggle_starter-Copy1-checkpoint.ipynb\n"
  },
  {
    "path": "03_regression/src/py_part_3_kaggle_starter.ipynb",
    "chars": 2128505,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "03_regression/src/py_part_3_linear_regression_gradient_descent.ipynb",
    "chars": 99847,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "03_regression/src/py_part_3_penalized_linear_regression.ipynb",
    "chars": 52373,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "03_regression/src/py_part_3_penalized_logistic_regression.ipynb",
    "chars": 21941,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/README.md",
    "chars": 2098,
    "preview": "<h1>Spark Kaggle Starter</h1> \n<b>Summary:</b> This code takes much of Patrick's code and upgrades it with Spark and Pys"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/feature_combiner.py",
    "chars": 4597,
    "preview": "# imports\nimport pandas as pd\nimport numpy as np\n\n\n\ndef feature_combiner(training_frame, test_frame, nums, valid_frame ="
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/get_type_lists.py",
    "chars": 1232,
    "preview": "def get_type_lists(frame, rejects=['Id', 'ID','id'],frame_type='h2o'):\n\n    \"\"\"Creates lists of numeric and categorical "
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/LICENSE.md",
    "chars": 1083,
    "preview": "\nThe MIT License (MIT)\n\nCopyright (c) 2017 Keston Crandall\n\nPermission is hereby granted, free of charge, to any person "
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/LoggingController.py",
    "chars": 2656,
    "preview": "import logging\nimport os\nimport io\nfrom datetime import datetime\nimport boto3\nimport botocore\n\nclass LoggingController(o"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/MarkdownBuilder.py",
    "chars": 5487,
    "preview": "import logging\nimport os\nimport io\nfrom datetime import datetime\nimport boto3\nfrom boto3.s3.transfer import S3Transfer\ni"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/README.md",
    "chars": 3438,
    "preview": "<h1> Logging Library </h1>\n\n<b>Summary</b>: This package is designed to make logging easy and clean from an environment "
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/__init__.py",
    "chars": 16,
    "preview": "from . import *\n"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/example.py",
    "chars": 1160,
    "preview": "# a stacked bar plot with errorbars\nimport numpy as np\nimport matplotlib.pyplot as plt\nN = 5\nmenMeans = (20, 35, 30, 35,"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/logging_lib/markdown_preview_github.css",
    "chars": 7471,
    "preview": ".markdown-preview.markdown-preview {\n\n  // Includes GitHub.com styles from `../assets/primer-markdown.less`.\n    // Sour"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/main.py",
    "chars": 976,
    "preview": "import os\nimport sys\n\nfrom spark_controler.emr_controller import EMRController\n\n\n\ndeployer = EMRController()\ndeployer.pr"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/LICENSE.md",
    "chars": 1083,
    "preview": "\nThe MIT License (MIT)\n\nCopyright (c) 2017 Keston Crandall\n\nPermission is hereby granted, free of charge, to any person "
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/README.md",
    "chars": 3782,
    "preview": "<h1> EMR Automation Controller </h1>\n\n<b>Summary</b>: This package uses boto3 to interact with AWS's EMR service. It has"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/__init__.py",
    "chars": 16,
    "preview": "from . import *\n"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/ec2_instance_data_dict.py",
    "chars": 2820,
    "preview": "ec2_data_dict = {\n   \"t2.nano\":{\n      \"cores\":\"1\",\n      \"memory\":\"0.5\"\n   },\n   \"t2.micro\":{\n      \"cores\":\"1\",\n      "
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/emr_controller.py",
    "chars": 34777,
    "preview": "import boto3\nimport botocore\nimport time\nimport logging\nimport os\nfrom datetime import datetime\nimport tarfile\n# https:/"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/files/setup.sh",
    "chars": 935,
    "preview": "#!/bin/bash\n\n# Parse arguments\ns3_bucket=\"$1\"\ns3_bucket_script=\"$s3_bucket/script.tar.gz\"\n\n# Download compressed script "
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/files/terminate_idle_cluster.sh",
    "chars": 3198,
    "preview": "#!/bin/sh\n\n# Copyright 2013 Lyft\n# Copyright 2014 Alex Konradi\n# Copyright 2015 Yelp and Contributors\n#\n# Licensed under"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/scripts/bootstrap_actions.sh",
    "chars": 2416,
    "preview": "#!/usr/bin/env bash\n\n# #Mounted directory we want to use\n# export MOUNT_TO_USE=/mnt\n#\n# change Home directory\n# mkdir $M"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/scripts/deep_learning_install_complete.sh",
    "chars": 10172,
    "preview": "#!/bin/bash\n\n\n# enable debugging & set strict error trap\nset -x -e\n\n\n# change Home directory\nexport HOME=/mnt/home\n\nmkdi"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/scripts/pyspark_quick_setup.sh",
    "chars": 445,
    "preview": "#!/usr/bin/env bash\n\n# bind conda to spark\necho -e \"\\nexport PYSPARK_PYTHON=/home/hadoop/conda/bin/python\" >> /etc/spark"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_controler/scripts/terminate_idle_cluster.sh",
    "chars": 3198,
    "preview": "#!/bin/sh\n\n# Copyright 2013 Lyft\n# Copyright 2014 Alex Konradi\n# Copyright 2015 Yelp and Contributors\n#\n# Licensed under"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/spark_main.py",
    "chars": 14223,
    "preview": "# imports\nimport pandas as pd\nimport numpy as np\nimport time\nimport os\nfrom tabulate import tabulate\n\nimport sys\nfrom op"
  },
  {
    "path": "03_regression/src/spark_kaggle_starter/target_encoder.py",
    "chars": 12362,
    "preview": "def target_encoder(training_frame, test_frame, x, y, lambda_=0.15, threshold=150, test=False, valid_frame = None,frame_t"
  },
  {
    "path": "03_regression/src/target_encoder.py",
    "chars": 2596,
    "preview": "import numpy as np \nimport pandas as pd \nimport h2o \n \ndef target_encoder(training_frame, test_frame, x, y, lambda_=0.15"
  },
  {
    "path": "03_regression/xml/03_linear_regression.xml",
    "chars": 23636,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "03_regression/xml/03_logistic_regression.xml",
    "chars": 26501,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "04_decision_trees/04_decision_trees.md",
    "chars": 3308,
    "preview": "## Section 04: Decision Trees\n\nDecision trees strike a nice balance between interpretability and accuracy. They pick up "
  },
  {
    "path": "04_decision_trees/data/.gitignore",
    "chars": 12,
    "preview": "submission*\n"
  },
  {
    "path": "04_decision_trees/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "04_decision_trees/src/py_part_4_decision_tree_ensembles.ipynb",
    "chars": 174257,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "04_decision_trees/src/py_part_4_kaggle_xgboost.ipynb",
    "chars": 707121,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "04_decision_trees/xml/04_decision_trees.xml",
    "chars": 33476,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "05_neural_networks/05_neural_networks.md",
    "chars": 4175,
    "preview": "## Section 05: Neural Networks\n\nNeural networks are important because of their ability to approximate **any** relationsh"
  },
  {
    "path": "05_neural_networks/assignment/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "05_neural_networks/data/.gitignore",
    "chars": 28,
    "preview": "*.csv\n*.png\n*.jpg\n*.tar.gz\n\n"
  },
  {
    "path": "05_neural_networks/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "05_neural_networks/quiz/sample/.gitignore",
    "chars": 7,
    "preview": "*.docx\n"
  },
  {
    "path": "05_neural_networks/src/.gitignore",
    "chars": 143,
    "preview": "Keras\\ MNIST\\ MLP\\ using\\ Images\\ Sample.ipynb\nKeras\\ MNIST\\ MLP.ipynb\nkeras-mnist-lenet.py\nkeras-mnist-mlp-image-sample"
  },
  {
    "path": "05_neural_networks/src/py_part_5_MNIST_DNN.ipynb",
    "chars": 427402,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "05_neural_networks/src/py_part_5_MNIST_autoencoder.ipynb",
    "chars": 331547,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "05_neural_networks/src/py_part_5_MNIST_data_augmentation.ipynb",
    "chars": 19855,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "05_neural_networks/src/py_part_5_MNIST_keras_lenet.ipynb",
    "chars": 8069,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "05_neural_networks/src/py_part_5_basic_mlp_example.ipynb",
    "chars": 62559,
    "preview": "{\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"view-in-github\",\n        \"colab_t"
  },
  {
    "path": "05_neural_networks/src/py_part_5_neural_networks.ipynb",
    "chars": 161235,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "05_neural_networks/xml/05_neural_networks.xml",
    "chars": 38448,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "06_clustering/06_clustering.md",
    "chars": 1593,
    "preview": "## Section 06: Clustering\n\nClustering enables us to group the rows of a data set together based on their similarities wi"
  },
  {
    "path": "06_clustering/assignment/key/.gitignore",
    "chars": 23,
    "preview": "assignment_4_key.ipynb\n"
  },
  {
    "path": "06_clustering/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "06_clustering/src/py_part_6_clustering.ipynb",
    "chars": 186555,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "06_clustering/xml/06_clustering.xml",
    "chars": 30417,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "07_association_rules/07_association_rules.md",
    "chars": 1384,
    "preview": "## Section 07: Association Rules\n\nAssociation rules, sometimes referred to as market basket analysis, tell us which item"
  },
  {
    "path": "07_association_rules/assignment/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "07_association_rules/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "07_association_rules/xml/07_association_rules.xml",
    "chars": 2588,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"14.1\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "08_text_mining/08_text_mining.md",
    "chars": 1977,
    "preview": "##\tSection 08: Text Mining\n\nText mining essentially means converting a group of documents into a meaningful numeric repr"
  },
  {
    "path": "08_text_mining/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "08_text_mining/quiz/sample/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "08_text_mining/xml/08_text_mining.xml",
    "chars": 36481,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?><_ROOT_ EMVERSION=\"13.2\" ORIENTATION=\"HORIZONTAL\">\n<Workspace>\n<NODES>\n<NODE CLASS"
  },
  {
    "path": "09_matrix_factorization/09_matrix_factorization.md",
    "chars": 2696,
    "preview": "## Section 09: Matrix factorization\n\nMatrix factorization enables us to represent sparse or high-dimensional data\nsets a"
  },
  {
    "path": "09_matrix_factorization/src/py_part_9_iris_pca.ipynb",
    "chars": 203939,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "09_matrix_factorization/src/py_part_9_kaggle_GLRM_example.ipynb",
    "chars": 326043,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "10_model_interpretability/10_model_interpretability.md",
    "chars": 4336,
    "preview": "## Section 10: Practical Model Interpretability\n\nMachine learning algorithms create potentially more accurate models tha"
  },
  {
    "path": "10_model_interpretability/quiz/.gitignore",
    "chars": 4,
    "preview": "key\n"
  },
  {
    "path": "10_model_interpretability/src/dt_surrogate.ipynb",
    "chars": 152493,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "10_model_interpretability/src/lime.ipynb",
    "chars": 250374,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "10_model_interpretability/src/loco.ipynb",
    "chars": 176586,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "10_model_interpretability/src/mono_xgboost.ipynb",
    "chars": 392540,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "10_model_interpretability/src/pdp_ice.ipynb",
    "chars": 496442,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "10_model_interpretability/src/sensitivity_analysis.ipynb",
    "chars": 79568,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# License \\n\",\n    \"***\\n\",\n    \"Co"
  },
  {
    "path": "README.md",
    "chars": 17008,
    "preview": "# Materials for GWU DNSC 6279 and 6290\n\n**DNSC 6279 (\"Data Mining\")** provides exposure to various data preprocessing, s"
  },
  {
    "path": "anaconda_py35_h2o_xgboost_graphviz/Dockerfile",
    "chars": 2109,
    "preview": "# Base debian system\nFROM debian:8.5\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\n# Update OS\nRUN apt-get update --fix-missing && ap"
  },
  {
    "path": "cold_call.py",
    "chars": 531,
    "preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n# specify blackboard roster as command line arg\n\nimport numpy as np\nimpor"
  },
  {
    "path": "requirements.txt",
    "chars": 292,
    "preview": "# Python 3.6.3 |Anaconda, Inc.| (default, Oct 13 2017, 12:02:49)\n# [GCC 7.2.0] on linux\n# also required: `apt-get -y ins"
  }
]

// ... and 6 more files (download for full content)

About this extraction

This page contains the full source code of the jphall663/GWU_data_mining GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 141 files (48.6 MB), approximately 2.7M tokens, and a symbol index with 50 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!