Full Code of DataTalksClub/machine-learning-zoomcamp for AI

master 2636ec556fac cached

428 files

6.6 MB

1.7M tokens

76 symbols

1 requests

Download .txt

Showing preview only (6,962K chars total). Download the full file or copy to clipboard to get everything.

Repository: DataTalksClub/machine-learning-zoomcamp
Branch: master
Commit: 2636ec556fac
Files: 428
Total size: 6.6 MB

Directory structure:
gitextract_3sp31muz/

├── .github/
│   └── FUNDING.yml
├── .gitignore
├── 01-intro/
│   ├── 01-what-is-ml.md
│   ├── 02-ml-vs-rules.md
│   ├── 03-supervised-ml.md
│   ├── 04-crisp-dm.md
│   ├── 05-model-selection.md
│   ├── 06-environment.md
│   ├── 07-numpy.md
│   ├── 08-linear-algebra.md
│   ├── 09-pandas.md
│   ├── 10-summary.md
│   ├── README.md
│   ├── homework.md
│   └── notebooks/
│       ├── 07-numpy.ipynb
│       ├── 08-linear-algebra.ipynb
│       └── 09-pandas.ipynb
├── 02-regression/
│   ├── 01-car-price-intro.md
│   ├── 02-data-preparation.md
│   ├── 03-eda.md
│   ├── 04-validation-framework.md
│   ├── 05-linear-regression-simple.md
│   ├── 06-linear-regression-vector.md
│   ├── 07-linear-regression-training.md
│   ├── 08-baseline-model.md
│   ├── 09-rmse.md
│   ├── 10-car-price-validation.md
│   ├── 11-feature-engineering.md
│   ├── 12-categorical-variables.md
│   ├── 13-regularization.md
│   ├── 14-tuning-model.md
│   ├── 15-using-model.md
│   ├── 16-summary.md
│   ├── 17-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.json
│   └── notebook.ipynb
├── 03-classification/
│   ├── 01-churn-project.md
│   ├── 02-data-preparation.md
│   ├── 03-validation.md
│   ├── 04-eda.md
│   ├── 05-risk.md
│   ├── 06-mutual-info.md
│   ├── 07-correlation.md
│   ├── 08-ohe.md
│   ├── 09-logistic-regression.md
│   ├── 10-training-log-reg.md
│   ├── 11-log-reg-interpretation.md
│   ├── 12-using-log-reg.md
│   ├── 13-summary.md
│   ├── 14-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   ├── notebook-scaling-ohe.ipynb
│   └── notebook.ipynb
├── 04-evaluation/
│   ├── 01-overview.md
│   ├── 02-accuracy.md
│   ├── 03-confusion-table.md
│   ├── 04-precision-recall.md
│   ├── 05-roc.md
│   ├── 06-auc.md
│   ├── 07-cross-validation.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── notebook.ipynb
├── 05-deployment/
│   ├── 01-intro.md
│   ├── 02-pickle.md
│   ├── 03-flask-intro.md
│   ├── 04-flask-deployment.md
│   ├── 05-pipenv.md
│   ├── 06-docker.md
│   ├── 07-aws-eb.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── 05-train-churn-model.ipynb
│   │   ├── Dockerfile
│   │   ├── Pipfile
│   │   ├── ping.py
│   │   ├── plan.md
│   │   ├── predict-test.py
│   │   ├── predict.py
│   │   └── train.py
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── workshop/
│       ├── .dockerignore
│       ├── .python-version
│       ├── Dockerfile
│       ├── README.md
│       ├── fly.toml
│       ├── ping.py
│       ├── predict.py
│       ├── predict_old.py
│       ├── pyproject.toml
│       ├── starter.ipynb
│       ├── test.py
│       ├── train.py
│       └── workshop-uv-fastapi.ipynb
├── 06-trees/
│   ├── 01-credit-risk.md
│   ├── 02-data-prep.md
│   ├── 03-decision-trees.md
│   ├── 04-decision-tree-learning.md
│   ├── 05-decision-tree-tuning.md
│   ├── 06-random-forest.md
│   ├── 07-boosting.md
│   ├── 08-xgb-tuning.md
│   ├── 09-final-model.md
│   ├── 10-summary.md
│   ├── 11-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── notebook.ipynb
├── 08-deep-learning/
│   ├── 01-fashion-classification.md
│   ├── 02-tensorflow-keras.md
│   ├── 03-pretrained-models.md
│   ├── 04-conv-neural-nets.md
│   ├── 05-transfer-learning.md
│   ├── 06-learning-rate.md
│   ├── 07-checkpointing.md
│   ├── 08-more-layers.md
│   ├── 09-dropout.md
│   ├── 10-augmentation.md
│   ├── 11-large-model.md
│   ├── 12-using-model.md
│   ├── 13-summary.md
│   ├── 14-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── install.md
│   ├── meta.csv
│   ├── meta.json
│   ├── notebook.ipynb
│   └── pytorch/
│       ├── README.md
│       └── install_pytorch.md
├── 09-serverless/
│   ├── 01-intro.md
│   ├── 02-aws-lambda.md
│   ├── 03-tensorflow-lite.md
│   ├── 04-preparing-code.md
│   ├── 05-docker-image.md
│   ├── 06-creating-lambda.md
│   ├── 07-api-gateway.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── Dockerfile
│   │   ├── convert-model.py
│   │   ├── lambda_function.py
│   │   ├── plan.md
│   │   ├── tensorflow-model.ipynb
│   │   └── test.py
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   ├── updates.md
│   └── workshop/
│       ├── README.md
│       ├── lambda-keras/
│       │   ├── .gitignore
│       │   ├── Dockerfile
│       │   ├── convert/
│       │   │   ├── .dockerignore
│       │   │   ├── Dockerfile
│       │   │   ├── README.md
│       │   │   └── convert-saved-model.py
│       │   ├── lambda_function.py
│       │   ├── test.ipynb
│       │   └── test.py
│       ├── lambda-onnx/
│       │   ├── .gitignore
│       │   ├── Dockerfile
│       │   ├── lambda_function.py
│       │   ├── test.ipynb
│       │   └── test.py
│       ├── lambda-sklearn/
│       │   ├── .dockerignore
│       │   ├── .python-version
│       │   ├── Dockerfile
│       │   ├── customer.json
│       │   ├── deploy.sh
│       │   ├── invoke.py
│       │   ├── lambda_function.py
│       │   ├── pyproject.toml
│       │   └── test.py
│       └── train/
│           ├── .python-version
│           ├── README.md
│           ├── pyproject.toml
│           └── train.py
├── 10-kubernetes/
│   ├── 01-overview.md
│   ├── 02-tensorflow-serving.md
│   ├── 03-preprocessing.md
│   ├── 04-docker-compose.md
│   ├── 05-kubernetes-intro.md
│   ├── 06-kubernetes-simple-service.md
│   ├── 07-kubernetes-tf-serving.md
│   ├── 08-eks.md
│   ├── 09-summary.md
│   ├── 10-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── Pipfile
│   │   ├── README.md
│   │   ├── docker-compose.yaml
│   │   ├── gateway.py
│   │   ├── image-gateway.dockerfile
│   │   ├── image-model.dockerfile
│   │   ├── kube-config/
│   │   │   ├── eks-config.yaml
│   │   │   ├── gateway-deployment.yaml
│   │   │   ├── gateway-service.yaml
│   │   │   ├── model-deployment.yaml
│   │   │   └── model-service.yaml
│   │   ├── ping/
│   │   │   ├── Dockerfile
│   │   │   ├── Pipfile
│   │   │   ├── deployment.yaml
│   │   │   ├── metallb-config.yaml
│   │   │   ├── ping.py
│   │   │   └── service.yaml
│   │   ├── plan.md
│   │   ├── proto.py
│   │   ├── test.py
│   │   └── tf-serving-connect.ipynb
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── workshop/
│       ├── README.md
│       ├── k8s/
│       │   ├── deployment.yaml
│       │   ├── hpa.yaml
│       │   └── service.yaml
│       ├── load_test.py
│       └── service/
│           ├── .gitignore
│           ├── .python-version
│           ├── Dockerfile
│           ├── README.md
│           ├── app.py
│           ├── pyproject.toml
│           └── test.py
├── 11-kserve/
│   ├── 01-overview.md
│   ├── 02-kserve-local.md
│   ├── 03-kserve-sklearn.md
│   ├── 04-kserve-custom-image.md
│   ├── 05-tensorflow-kserve.md
│   ├── 06-kserve-transformers.md
│   ├── 07-kserve-eks-upd.md
│   ├── 07-kserve-eks.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── churn/
│   │   │   ├── Pipfile
│   │   │   ├── churn-service.yaml
│   │   │   ├── churn-test.py
│   │   │   ├── churn-train.py
│   │   │   └── model.joblib
│   │   ├── clothes/
│   │   │   ├── clothes-service.yaml
│   │   │   ├── convert.py
│   │   │   ├── test-transformer.py
│   │   │   ├── test.ipynb
│   │   │   └── test.py
│   │   ├── eks/
│   │   │   ├── clothes-service.yaml
│   │   │   ├── cluster.yaml
│   │   │   └── test-transformer.py
│   │   ├── image_transfomer/
│   │   │   ├── Dockerfile
│   │   │   ├── Pipfile
│   │   │   └── image_transformer.py
│   │   ├── iris/
│   │   │   ├── iris-example.yaml
│   │   │   ├── iris-request.json
│   │   │   └── iris-test.py
│   │   └── plan.md
│   ├── meta.csv
│   └── meta.json
├── README.md
├── after-sign-up.md
├── article/
│   └── README.md
├── asking-questions.md
├── bento.md
├── certificates.md
├── cohorts/
│   ├── 2021/
│   │   ├── 01-intro/
│   │   │   ├── homework-1.ipynb
│   │   │   └── homework.md
│   │   ├── 02-regression/
│   │   │   ├── homework.ipynb
│   │   │   └── homework.md
│   │   ├── 03-classification/
│   │   │   ├── homework.ipynb
│   │   │   └── homework.md
│   │   ├── 04-evaluation/
│   │   │   ├── homework-4-solution.ipynb
│   │   │   ├── homework-4-starter.ipynb
│   │   │   └── homework.md
│   │   ├── 05-deployment/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── Pipfile
│   │   │   │   ├── homework.md
│   │   │   │   ├── q3_test.py
│   │   │   │   ├── q4_predict.py
│   │   │   │   ├── q4_test.py
│   │   │   │   ├── q6_predict.py
│   │   │   │   └── q6_test.py
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   ├── homework-6-solution.ipynb
│   │   │   ├── homework-6-starter.ipynb
│   │   │   └── homework.md
│   │   ├── 07-midterm-project/
│   │   │   ├── README.md
│   │   │   ├── week10-office-hours.ipynb
│   │   │   ├── week8-office-hours.ipynb
│   │   │   └── week9-office-hours.ipynb
│   │   ├── 08-deep-learning/
│   │   │   ├── CNN_solution.ipynb
│   │   │   ├── homework.md
│   │   │   └── week-11-office-hours.ipynb
│   │   ├── 09-serverless/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── homework.ipynb
│   │   │   │   ├── homework.py
│   │   │   │   └── test.py
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   ├── homework/
│   │   │   │   ├── deployment.yaml
│   │   │   │   └── service.yaml
│   │   │   └── homework.md
│   │   ├── 12-capstone/
│   │   │   └── README.md
│   │   ├── 13-article/
│   │   │   └── README.md
│   │   ├── 14-project/
│   │   │   └── README.md
│   │   ├── leaderboard.md
│   │   └── office-hours.md
│   ├── 2022/
│   │   ├── 01-intro/
│   │   │   ├── homework.md
│   │   │   └── homework_1.ipynb
│   │   ├── 02-regression/
│   │   │   ├── homework.md
│   │   │   └── homework_2.ipynb
│   │   ├── 03-classification/
│   │   │   ├── homework.md
│   │   │   └── homework_3.ipynb
│   │   ├── 04-evaluation/
│   │   │   ├── homework.md
│   │   │   └── homework_4.ipynb
│   │   ├── 05-deployment/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── Pipfile
│   │   │   │   ├── q3_test.py
│   │   │   │   ├── q4_predict.py
│   │   │   │   ├── q4_test.py
│   │   │   │   ├── q6_predict.py
│   │   │   │   └── q6_test.py
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   ├── homework.md
│   │   │   ├── homework_6.ipynb
│   │   │   └── homework_6_starter.ipynb
│   │   ├── 07-bento-production/
│   │   │   ├── homework.md
│   │   │   └── locustfile.py
│   │   ├── 08-deep-learning/
│   │   │   ├── homework.md
│   │   │   └── homework_8.ipynb
│   │   ├── 09-serverless/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── homework.ipynb
│   │   │   │   ├── homework.py
│   │   │   │   └── test.py
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   ├── homework/
│   │   │   │   ├── deployment.yaml
│   │   │   │   ├── hpa.yaml
│   │   │   │   ├── service.yaml
│   │   │   │   └── test.py
│   │   │   └── homework.md
│   │   ├── README.md
│   │   ├── article.md
│   │   ├── leaderboard.md
│   │   └── projects.md
│   ├── 2023/
│   │   ├── 01-intro/
│   │   │   ├── homework.md
│   │   │   └── homework_1.ipynb
│   │   ├── 02-regression/
│   │   │   └── homework.md
│   │   ├── 03-classification/
│   │   │   ├── homework.md
│   │   │   └── homework_3.ipynb
│   │   ├── 04-evaluation/
│   │   │   └── homework.md
│   │   ├── 05-deployment/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── Pipfile
│   │   │   │   ├── q3_test.py
│   │   │   │   ├── q4_predict.py
│   │   │   │   ├── q4_test.py
│   │   │   │   ├── q6_predict.py
│   │   │   │   └── q6_test.py
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   └── homework.md
│   │   ├── 08-deep-learning/
│   │   │   ├── homework.ipynb
│   │   │   └── homework.md
│   │   ├── 09-serverless/
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   └── homework.md
│   │   ├── README.md
│   │   ├── article.md
│   │   ├── leaderboard.md
│   │   └── projects.md
│   ├── 2024/
│   │   ├── 01-intro/
│   │   │   └── homework.md
│   │   ├── 02-regression/
│   │   │   └── homework.md
│   │   ├── 03-classification/
│   │   │   └── homework.md
│   │   ├── 04-evaluation/
│   │   │   └── homework.md
│   │   ├── 05-deployment/
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   └── homework.md
│   │   ├── 08-deep-learning/
│   │   │   └── homework.md
│   │   ├── 09-serverless/
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   └── homework.md
│   │   ├── README.md
│   │   ├── article.md
│   │   └── projects.md
│   └── 2025/
│       ├── 01-intro/
│       │   ├── homework.md
│       │   └── homework_1.ipynb
│       ├── 02-regression/
│       │   ├── homework.md
│       │   └── homework_2.ipynb
│       ├── 03-classification/
│       │   ├── homework.md
│       │   └── homework_3.ipynb
│       ├── 04-evaluation/
│       │   ├── homework.md
│       │   └── homework_4.ipynb
│       ├── 05-deployment/
│       │   ├── homework/
│       │   │   ├── .python-version
│       │   │   ├── Dockerfile_base
│       │   │   ├── Dockerfile_full
│       │   │   ├── Dockerfile_hw
│       │   │   ├── README.md
│       │   │   ├── main.py
│       │   │   ├── pyproject.toml
│       │   │   ├── q3_test.py
│       │   │   ├── q4_predict.py
│       │   │   ├── q4_test.py
│       │   │   ├── q6_predict.py
│       │   │   └── q6_test.py
│       │   └── homework.md
│       ├── 06-trees/
│       │   ├── homework.ipynb
│       │   └── homework.md
│       ├── 08-deep-learning/
│       │   └── homework.md
│       ├── 09-serverless/
│       │   └── homework.md
│       ├── 10-kubernetes/
│       │   └── homework.md
│       ├── README.md
│       ├── article.md
│       └── projects.md
├── generate-description.ipynb
├── generate-pages.ipynb
├── learning-in-public.md
└── projects/
    ├── README.md
    ├── how-to.md
    └── project-tips.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/FUNDING.yml
================================================
github: alexeygrigorev


================================================
FILE: .gitignore
================================================
# generated
.ipynb_checkpoints/
__pycache__/
**my_dir/
**logs/
**models/

# file types
*.h5
*.tflite
*.keras
*.zip
*.pdf

# data folders
**data/

#  content-specific
/08-deep-learning/clothing-dataset-small/
/08-deep-learning/clothing-dataset/
/08-deep-learning/ImageClassification/
/08-deep-learning/my_dir/

/09-serverless/clothing-model/
/09-serverless/clothing-model/

**midterms_evaluations/
**samples/


================================================
FILE: 01-intro/01-what-is-ml.md
================================================
## 1.1 Introduction to Machine Learning

<a href="https://www.youtube.com/watch?v=Crm_5n4mvmg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=2"><img src="images/thumbnail-1-01.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-11-introduction-to-machine-learning)


## Notes

The concept of ML is depicted with an example of predicting the price of a car. The ML model
learns from data, represented as some **features** such as year, mileage, among others, and the **target** variable, in this
case, the car's price, by extracting patterns from the data.

Then, the model is given new data (**without** the target) about cars and predicts their price (target). 

In summary, ML is a process of **extracting patterns from data**, where the data is of two types:

* features (information about the object) and 
* target (property to predict for unseen objects). 

Therefore, new feature values are presented to the model, and it makes **predictions** from the learned patterns.

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>


## Notes

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/09/ml-zoomcamp-2023-introduction-to-machine-learning-part-1/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Next: [ML vs Rule-Based Systems](02-ml-vs-rules.md)


================================================
FILE: 01-intro/02-ml-vs-rules.md
================================================
## 1.2 ML vs Rule-Based Systems

<a href="https://www.youtube.com/watch?v=CeukwyUdaz8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=3"><img src="images/thumbnail-1-02.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-12-ml-vs-rulebased-systems)


## Notes

The difference between ML and Rule-Based systems is explained with the example of a **spam filter**.

Traditional Rule-Based systems are based on a set of **characteristics** (keywords, email length, etc.) that identify an email as spam or not. As spam emails keep changing over time, the system needs to be upgraded, making the process intractable due to the complexity of code maintenance as the system grows.

ML can be used to solve this problem with the following steps:

### 1. Get data 
Emails from the user's spam folder and inbox give examples of spam and non-spam.

### 2. Define and calculate features
Rules/characteristics from rule-based systems can be used as a starting point to define features for the ML model. The value of the target variable for each email can be defined based on where the email was obtained from (spam folder or inbox).

Each email can be encoded (converted) to the values of its features and target.

### 3. Train and use the model
A machine learning algorithm can then be applied to the encoded emails to build a model that can predict whether a new email is spam or not spam. The **predictions are probabilities**, and to make a decision it is necessary to define a threshold to classify emails as spam or not spam. 


<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/10/ml-zoomcamp-2023-introduction-to-machine-learning-part-2/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Introduction to Machine Learning](01-what-is-ml.md)
* Next: [Supervised Machine Learning](03-supervised-ml.md)


================================================
FILE: 01-intro/03-supervised-ml.md
================================================
## 1.3 Supervised Machine Learning

<a href="https://www.youtube.com/watch?v=j9kcEuGcC2Y&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=4"><img src="images/thumbnail-1-03.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-13-supervised-machine-learning)


## Notes

In Supervised Machine Learning (SML) there are always labels associated with certain features.
The model is trained, and then it can make predictions on new features. In this way, the model
is taught by certain features and targets. 

* **Feature matrix (X):** made of observations or objects (rows) and features (columns).
* **Target variable (y):** a vector with the target information we want to predict. For each row of X there's a value in y.


The model can be represented as a function, **g**, that takes the feature matrix, **X**, as **input** and tries to predict values as close as possible to the targets, **y**. The process of **finding** this function **g** is called **training**.

### Types of SML problems 

* **Regression:** the output is a number (car's price).
* **Classification:** the output is a category (spam example). 
	* **Binary:** there are two categories. 
	* **Multiclass problems:** there are more than two categories. 
* **Ranking:** the output is the top scores associated with corresponding items. It is applied in recommender systems. 

In summary, SML is about teaching the model by showing it different examples, and the goal is to come up with a function, that takes the feature matrix as input, and makes predictions of values as close as possible to the **y** targets. 



<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/11/ml-zoomcamp-2023-introduction-to-machine-learning-part-3/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [ML vs Rule-Based Systems](02-ml-vs-rules.md)
* Next: [CRISP-DM](04-crisp-dm.md)


================================================
FILE: 01-intro/04-crisp-dm.md
================================================
## 1.4 CRISP-DM

<a href="https://www.youtube.com/watch?v=dCa3JvmJbr0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=5"><img src="images/thumbnail-1-04.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-14-crispdm)


## Notes

CRISP-DM, which stands for Cross-Industry Standard Process for Data Mining, is an open standard process model that describes common approaches used by data mining experts. It is the most widely-used analytics model. Conceived in 1996, it became a European Union project under the ESPRIT funding initiative in 1997. The project was led by five companies: Integral Solutions Ltd (ISL), Teradata, Daimler AG, NCR Corporation and OHRA, an insurance company. The process consists of six phases:

1. **Business understanding:** An important question is whether we need ML for the project. The goal of the project has to be measurable.
2. **Data understanding:** Analyze available data sources, and decide if more data is required. 
3. **Data preparation:** Clean data, remove noise applying pipelines, and convert the data to a tabular format, so we can put it into ML.
4. **Modeling:** Train different models and choose the best one. Considering the results of this step, it is proper to decide if it is required to add new features or fix data issues. 
5. **Evaluation:** Measure how well the model is performing and if it solves the business problem. 
6. **Deployment:** Roll out to production to all the users. The evaluation and deployment often happen together - **online evaluation**. 

It is important to consider how maintainable the project is.
  
In general, ML projects require many iterations.

**Iteration:** 
* Start simple
* Learn from the feedback
* Improve

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/12/ml-zoomcamp-2023-introduction-to-machine-learning-part-4/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Supervised Machine Learning](03-supervised-ml.md)
* Next: [Model Selection Process](05-model-selection.md)


================================================
FILE: 01-intro/05-model-selection.md
================================================
## 1.5 Model Selection Process

<a href="https://www.youtube.com/watch?v=OH_R0Sl9neM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=6"><img src="images/thumbnail-1-05.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-15-model-selection-process)


## Notes

### Which model to choose?

- Logistic regression
- Decision tree
- Neural Network
- Or many others

The validation dataset is not used in training. There are feature matrices and y vectors
for both training and validation datasets. 
The model is fitted with training data, and it is used to predict the y values of the validation
feature matrix. Then, the predicted y values (probabilities)
are compared with the actual y values. 

**Multiple comparisons problem (MCP):** just by chance one model can be lucky and obtain
good predictions because all of them are probabilistic. 

The test set can help to avoid the MCP. Obtaining the best model is done with the training and validation datasets, while the test dataset is used for assuring that the proposed best model is the best. 

1. Split the dataset into training, validation, and test sets. E.g. 60%, 20% and 20% respectively
2. Train the models
3. Evaluate the models
4. Select the best model 
5. Apply the best model to the test dataset 
6. Compare the performance metrics of validation and test

<u>NB:</u> Note that it is possible to reuse the validation data. After selecting the best model (step 4), the validation and training datasets can be combined to form a single training dataset for the chosen model before testing it on the test set.

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/13/ml-zoomcamp-2023-introduction-to-machine-learning-part-5/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [CRISP-DM](04-crisp-dm.md)
* Next: [Setting up the Environment](06-environment.md)


================================================
FILE: 01-intro/06-environment.md
================================================
##  Setting up the Environment

In this section, we'll prepare the environment


You need:

* Python 3.11 (note that videos use 3.8)
* NumPy, Pandas and Scikit-Learn (latest available versions) 
* Matplotlib and Seaborn
* Jupyter notebooks

## Github Codespaces

Video: https://www.youtube.com/watch?v=pqQFlV3f9Bo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR

This is the recommended approach for the course


## Ubuntu 22.04 on AWS

* [This video](https://www.youtube.com/watch?v=IXSiYkP23zo) shows a complete end-to-end environment configuration for an AWS EC2 instance
* This video was created for another course (MLOps Zoomcamp), so you'll need to adjust it slightly: clone this repo instead of the mlops one
* You can use these instructions for setting up your local Ubuntu

Note for WSL 

* Most of the instructions from the previous video apply to WSL too
* For setting up Docker, install Docker Desktop on Windows and it'll be automatically used in WSL. You don't need to install docker.io

## Anaconda and Conda

The easiest way to set up the environment is to use [Anaconda](https://www.anaconda.com/products/individual) or
[Miniconda](https://docs.conda.io/en/latest/miniconda.html).

Anaconda comes with everything we need (and much more). 
Miniconda is a smaller version of Anaconda that contains only Python. 

Follow the instructions on the page for installing the correct package for your system.
The site will automatically detect your operating system and suggest the correct package.

* [Anaconda](https://www.anaconda.com/products/individual)
* [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)

If you are using Windows, you can use WSL, but the plain Windows version should work too.

Anaconda is recommended.


### (Optional) Create environment for course

It is a good idea to set up a dedicated environment for the course 

In your terminal, run this command to create the environment

```bash
conda create -n ml-zoomcamp python=3.11
```

Activate it:

```bash
conda activate ml-zoomcamp
```

Installing libraries

```bash
conda install numpy pandas scikit-learn seaborn jupyter
```

Later in the course you will also need to install XGBoost and Tensorflow,
but we can skip this part for now.

## Cloud

Instead of running things locally, you can use online services or rent a server 

### AWS 

You can rent an instance on AWS:

* [Creating an AWS account](https://mlbookcamp.com/article/aws)
* [Renting an ec2 instance](https://mlbookcamp.com/article/aws-ec2)


### GCP

Google cloud platform offers $300 in free credits when you sign up.
You can use this for taking the course.


## Notebook services

There are services that allow you to host and run notebooks.
Note that notebooks alone are not sufficient for the course and for the deployment modules
you will need to have access to the command line interface with Docker, Python and other libraries installed.

### Kaggle

To use Kaggle to open and run the Jupyter notebooks provided as part of this course do the following:

*Pre-requisites - You need to have an account in Kaggle (it's free) and be logged into Kaggle*

1. Find the URL of the notebook. 
   
   ![See this example](images/sample-jupyter-notebook.png)
   
2. To open the notebook in Kaggle, paste the URL into your web browser as shown in the example below. (*note the additional https://kaggle.com/kernels/welcome?src= before the URL of the notebook*)

   https://kaggle.com/kernels/welcome?src=https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb
  
3. Check if the notebook uses any datafile to read data from it. If yes, note the datafile name from the code - *look for pd.read_csv("somefilename.csv")*.
   
   ![See this example](images/sample-code.png)
   
4. You need to download the file into Kaggle. For this:

   a. Find the URL of the datafile in github. 
   
   ![See this example](images/sample-data-file.png)
   
   b. Suppose the URL is https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/data.csv , you need to use the URL of the raw file, which will look something like https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
   
5. In the notebook opened in Kaggle, add a Code block with the command to download the file - !wget your-datafile-url 

   ![See this example](images/add-code-for-datafile-download.png)
   
This way you can start with the exercise using Kaggle


### Google Colab

To use Google Colab to open and run the Jupyter notebooks provided as part of this course do the following:

*Pre-requisites - You need to have a google account (any gmail account) and be logged into that account*

The steps for Google Colab are the same as those for Kaggle, except for some changes in Step 2, as explained below.

2. To open the notebook in Google Colab, paste the URL into your web browser as shown in the example below. (*note the https://github.com/ in the URL of the notebook is replaced by https://colab.research.google.com/github/*)

   https://colab.research.google.com/github/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb



## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous lesson: [The Modelling Step (Model Selection Process)](05-model-selection.md)
* Next lesson: [Introduction to NumPy](07-numpy.md)


================================================
FILE: 01-intro/07-numpy.md
================================================
## 1.7 Introduction to NumPy

<a href="https://www.youtube.com/watch?v=Qa0-jYtRdbY&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=7"><img src="images/thumbnail-1-07.jpg"></a>


## Notes

# Understanding Numpy: A Simple Introduction

Numpy, short for Numerical Python, is a powerful Python library that enables efficient and convenient array manipulation and mathematical operations. It forms the foundation for many scientific and data-related tasks. In this article, we'll provide a straightforward explanation of Numpy concepts and how to use them.

## Importing Numpy

Before diving into Numpy's capabilities, we need to import it. Conventionally, we import Numpy with the alias `np`, making it easier to reference its functions:

```python
import numpy as np
```

## Creating Arrays

Arrays are the building blocks of Numpy, and they can be thought of as lists but with enhanced features.

### Creating Arrays with Zeros, Ones, or Constants

You can create arrays filled with zeros, ones, or any constant using `np.zeros()`, `np.ones()`, and `np.full()`:

```python
zeros_array = np.zeros(10)
ones_array = np.ones(10)
constant_array = np.full(10, 3)
```

### Converting Lists to Arrays

To convert a Python list into a Numpy array, you can use `np.array()`:

```python
my_list = [2, 3, 4]
array_from_list = np.array(my_list)
```

### Generating Ranges of Numbers

Numpy provides functions for generating arrays of sequential numbers. For example:

```python
range_array = np.arange(10)  # Creates an array from 0 to 9
```

### Creating Arrays with Linear Spacing

`np.linspace()` creates arrays with evenly spaced numbers within a specified range:

```python
linspace_array = np.linspace(0, 1, 11)  # Creates 11 numbers from 0 to 1
```

### Multi-dimensional Arrays

Numpy can handle multi-dimensional arrays; two-dimensional arrays are often referred to as matrices. Here are some examples:

```python
zeros_matrix = np.zeros((5, 2))
ones_matrix = np.ones((5, 2))
constant_matrix = np.full((5, 2), 3)
```

## Indexing and Slicing Arrays

As with Python lists, you can access elements in Numpy arrays using indexing and slicing. For two-dimensional arrays:

```python
arr = np.array([[2, 3, 4], [4, 5, 6]])
first_row = arr[0]      # Gets the first row
first_col = arr[:, 0]  # Gets the first column
```

## Generating Random Arrays

Numpy can create arrays filled with random numbers. To ensure reproducibility, you can set a seed using `np.random.seed()`:

```python
np.random.seed(2)  # Set the seed
random_array = np.random.rand(5, 2)  # Generates random numbers between 0 and 1
```

For random numbers from a normal distribution or integers within a range:

```python
normal_distribution = np.random.randn(5, 2)
random_integers = np.random.randint(low=0, high=100, size=(5, 2))
```

## Array Operations

Numpy excels in performing mathematical operations on arrays efficiently.

### Element-wise Operations

You can perform operations on entire arrays element by element:

```python
arr = arr + 1   # Adds 1 to each element
arr = arr * 2   # Multiplies each element by 2
# Similar operations for division and exponentiation
```

### Element-wise Operations with Two Arrays

You can also perform operations between two arrays of the same shape:

```python
arr1 = np.ones(4)
arr2 = np.full(4, 3)
result = arr1 + arr2  # Element-wise addition
result = arr1 / arr2  # Element-wise division
```

### Comparison Operations

You can perform element-wise comparisons and create boolean arrays:

```python
arr = np.array([1, 2, 3, 4])
greater_than_2 = arr > 2  # Produces [False, False, True, True]
```

### Selecting Elements Based on Conditions

You can create subarrays based on certain conditions:

```python
selected_elements = arr[arr > 1]  # Gets elements greater than 1
```

## Summary Operations

Numpy provides functions for summarizing array data:

```python
min_value = arr.min()    # Minimum value
max_value = arr.max()    # Maximum value
sum_value = arr.sum()    # Sum of all elements
mean_value = arr.mean()  # Mean (average) value
std_deviation = arr.std()  # Standard deviation
```

In conclusion, Numpy is an essential library for anyone working with numerical data in Python. It simplifies array creation, manipulation, and mathematical operations, making it a powerful tool for scientific computing and data analysis. With the basics covered in this article, you're well on your way to harnessing Numpy's capabilities.


<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke - Part 1/3](https://knowmledge.com/2023/09/14/ml-zoomcamp-2023-introduction-to-machine-learning-part-6/)
* [Notes from Peter Ernicke - Part 2/3](https://knowmledge.com/2023/09/14/ml-zoomcamp-2023-introduction-to-machine-learning-part-7/)
* [Notes from Peter Ernicke - Part 3/3](https://knowmledge.com/2023/09/14/ml-zoomcamp-2023-introduction-to-machine-learning-part-8/)

## Links

* [Notebook from the video](notebooks/07-numpy.ipynb)
* [Notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/appendix-c-numpy.ipynb)
* [Introduction to NumPy](https://mlbookcamp.com/article/numpy)

## Additional links

* [Numpy Cheat sheet](https://www.datacamp.com/community/blog/python-numpy-cheat-sheet)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Setting up the Environment](06-environment.md)
* Next: [Linear Algebra Refresher](08-linear-algebra.md)


================================================
FILE: 01-intro/08-linear-algebra.md
================================================
## 1.8 Linear Algebra Refresher

<a href="https://www.youtube.com/watch?v=zZyKUeOR4Gg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=8"><img src="images/thumbnail-1-08.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-18-linear-algebra-refresher)


## Notes

### Linear Algebra Refresher
* Vector operations
* Multiplication
  * Vector-vector multiplication
  * Matrix-vector multiplication
  * Matrix-matrix multiplication
* Identity matrix
* Inverse

### Vector operations
~~~~python
u = np.array([2, 7, 5, 6])
v = np.array([3, 4, 8, 6])

# addition 
u + v

# subtraction 
u - v

# scalar multiplication 
2 * v
~~~~
### Multiplication

#####  Vector-vector multiplication

~~~~python
def vector_vector_multiplication(u, v):
    """Return the dot product of ``u`` and ``v`` using an explicit loop.

    Both arguments must expose ``.shape`` and have the same size along
    their first axis; otherwise an ``AssertionError`` is raised.
    The accumulator starts at ``0.0``, so an empty input yields ``0.0``.
    """
    length = u.shape[0]
    assert length == v.shape[0]

    total = 0.0

    # Add up the products of the elements that share the same position.
    for idx in range(length):
        total = total + u[idx] * v[idx]

    return total
~~~~

#####  Matrix-vector multiplication

~~~~python
def matrix_vector_multiplication(U, v):
    """Multiply matrix ``U`` by vector ``v`` one row at a time.

    The number of columns of ``U`` must equal the length of ``v``;
    otherwise an ``AssertionError`` is raised. The result is an array
    created with ``np.zeros`` holding one entry per row of ``U``.
    """
    assert U.shape[1] == v.shape[0]

    row_count = U.shape[0]
    product = np.zeros(row_count)

    # Each output entry is the dot product of one row of U with v.
    for row_idx in range(row_count):
        product[row_idx] = vector_vector_multiplication(U[row_idx], v)

    return product
~~~~

#####  Matrix-matrix multiplication

~~~~python
def matrix_matrix_multiplication(U, V):
    """Multiply matrix ``U`` by matrix ``V`` one column of ``V`` at a time.

    The number of columns of ``U`` must equal the number of rows of ``V``;
    otherwise an ``AssertionError`` is raised. The result is an array
    created with ``np.zeros`` whose shape is
    ``(U.shape[0], V.shape[1])``.
    """
    assert U.shape[1] == V.shape[0]

    out_rows = U.shape[0]
    out_cols = V.shape[1]
    product = np.zeros((out_rows, out_cols))

    # Column j of the result is U multiplied by column j of V.
    for col_idx in range(out_cols):
        column = V[:, col_idx]
        product[:, col_idx] = matrix_vector_multiplication(U, column)

    return product
~~~~
### Identity matrix

~~~~python
I = np.eye(3)
~~~~
### Inverse
~~~~python
V = np.array([
    [1, 1, 2],
    [0, 0.5, 1], 
    [0, 2, 1],
])
inv = np.linalg.inv(V)
~~~~


Add notes here (PRs are welcome).

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke - Part 1/3](https://knowmledge.com/2023/09/15/ml-zoomcamp-2023-introduction-to-machine-learning-part-9/)
* [Notes from Peter Ernicke - Part 2/3](https://knowmledge.com/2023/09/15/ml-zoomcamp-2023-introduction-to-machine-learning-part-10/)
* [Notes from Peter Ernicke - Part 3/3](https://knowmledge.com/2023/09/15/ml-zoomcamp-2023-introduction-to-machine-learning-part-11/)

## Links

* [Notebook from the video](notebooks/08-linear-algebra.ipynb)
* [Get a visual understanding of matrix multiplication](http://matrixmultiplication.xyz/)
* [Overview of matrix multiplication functions in python/numpy](https://github.com/MemoonaTahira/MLZoomcamp2022/blob/main/Notes/Week_1-intro_to_ML_linear_algebra/Notes_for_Chapter_1-Linear_Algebra.ipynb) 


## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Introduction to NumPy](07-numpy.md)
* Next: [Introduction to Pandas](09-pandas.md)


================================================
FILE: 01-intro/09-pandas.md
================================================
## 1.9 Introduction to Pandas

<a href="https://www.youtube.com/watch?v=0j3XK5PsnxA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=9"><img src="images/thumbnail-1-09.jpg"></a>


## Notes


Add notes here (PRs are welcome).

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke - Part 1/2](https://knowmledge.com/2023/09/16/ml-zoomcamp-2023-introduction-to-machine-learning-part-12/)
* [Notes from Peter Ernicke - Part 2/2](https://knowmledge.com/2023/09/17/ml-zoomcamp-2023-introduction-to-machine-learning-part-13/)

## Links

* [Notebook from the video](notebooks/09-pandas.ipynb)
* [Notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/appendix-d-pandas.ipynb)

## Additional links

* [Pandas Cheat sheet](https://www.datacamp.com/community/blog/python-pandas-cheat-sheet)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Linear Algebra Refresher](08-linear-algebra.md)
* Next: [Summary](10-summary.md)


================================================
FILE: 01-intro/10-summary.md
================================================
## 1.10 Summary

<a href="https://www.youtube.com/watch?v=VRrEEVeJ440&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=10"><img src="images/thumbnail-1-10.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-110-summary)


## Notes
---

### 📚 Summary of First Session - Machine Learning Zoomcamp

1. **🚗 Introduction to Machine Learning with Cars Data**  
   We start with data about cars, including characteristics (features) and prices (target). A Machine Learning (ML) model can be used to extract patterns from known information (data) about some cars in order to predict car prices based on their characteristics.

2. **🧠 Rules-Based Systems vs. Machine Learning**  
   - **Rules-Based Systems:** It is necessary to manually convert rules into code using a programming language and apply them to data. Extracting patterns manually can become complex and challenging.  
   - **Machine Learning:** Instead of manually coding rules, ML models automatically extract patterns from data using Mathematics and Statistics.

3. **🔍 Supervised Machine Learning**  
   In supervised learning, models learn from labeled data (with known outcomes) to make predictions on unseen data.

4. **🛠️ CRISP-DM (Cross Industry Standard Process for Data Mining)**  
   A structured methodology for organizing ML projects, consisting of the following steps:  
   - 💼 **Business Understanding**  
   - 🔎 **Data Understanding**  
   - 🧹 **Data Preparation**  
   - 🤖 **Modeling** (choosing and training models, then selecting the best one)  
   - 📊 **Evaluation**  
   - 🚀 **Deployment**

   This process is iterative, allowing for continuous improvement.

5. **🏆 Model Selection**  
   Split data into training, validation, and test sets. Train different models, validate them, select the best performing one, and then test it on the test set to ensure generalization.

6. **💻 Setting Up the Environment**  
   Install necessary tools like Python, Numpy, Pandas, Matplotlib, Scikit-learn. Anaconda is the easiest option. Eventually create an AWS account for cloud resources.

7. **🔢 Introduction to Numpy**  
   Numpy is crucial for manipulating numerical data, providing efficient operations on arrays and matrices.

8. **🔗 Linear Algebra**  
   Covers vector-vector, matrix-vector, and matrix-matrix multiplication, as well as identity matrices (created with functions like `np.eye()`) and matrix inverses.

9. **📊 Introduction to Pandas**  
   Pandas is a Python library used for processing and analyzing tabular data efficiently.

---

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/01_intro/Summary_Session_01.md)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Introduction to Pandas](09-pandas.md)
* Next: [Homework](homework.md)


================================================
FILE: 01-intro/README.md
================================================
## 1. Introduction to Machine Learning

- 1.1 [Introduction to Machine Learning](01-what-is-ml.md)
- 1.2 [ML vs Rule-Based Systems](02-ml-vs-rules.md)
- 1.3 [Supervised Machine Learning](03-supervised-ml.md)
- 1.4 [CRISP-DM](04-crisp-dm.md)
- 1.5 [The Modelling Step (Model Selection Process)](05-model-selection.md)
- 1.6 [Setting up the Environment](06-environment.md)
- 1.7 [Introduction to NumPy](07-numpy.md)
- 1.8 [Linear Algebra Refresher](08-linear-algebra.md)
- 1.9 [Introduction to Pandas](09-pandas.md)
- 1.10 [Summary](10-summary.md)
- 1.11 [Homework](homework.md)


## Community notes

Did you take notes? You can share them here (or in each unit separately)

* [Notes by Ayoub Berdeddouch](https://github.com/ayoub-berdeddouch/mlbookcamp-homeworks/blob/main/Intro/homework_intro_AyoubBerdeddouch.ipynb)
* [Notes from Sebastián Ayala Ruano](https://github.com/sayalaruano/100DaysOfMLCode/blob/main/Intro_ML/Notes/NotesDay1.md)
* [Notes from Alvaro Navas](https://github.com/ziritrion/ml-zoomcamp/blob/main/notes/01_intro.md)
* [Notes from Luis Evaristo Caraballo de la Cruz](https://github.com/varocaraballo/ml-zoomcamp2022/blob/main/01%20-%20Introduction%20to%20Machine%20Learning/notes.md)
* [Notes from Jon Areas](https://github.com/jxareas/Machine-Learning-Bookcamp-2022/blob/master/notes/01-introduction.md)
* [Notes from Hareesh Tummala](https://github.com/tummala-hareesh/ml_zoomcamp_ht/blob/main/notes/week-1-notes.md)
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/09/ml-zoomcamp-2023-introduction-to-machine-learning-part-1/)
* [Notes from Josiah Adesola](https://colab.research.google.com/drive/1mlwkAaRi7R8C6quUi0-cMfXk0MXD5-wc?usp=sharing)
* [Notes by Kemal](https://github.com/kemaldahha/machine-learning-course/blob/main/week_1_notes.md)
* [Notes by Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/01_intro/Summary_Session_01.md)
* [Notes by Mahrukh Tariq](https://github.com/mahrukh98/ml-zoomcamp-hw/blob/main/notes/session1.md)
* [Notes by Revathy Ramalingam](https://github.com/RevathyRamalingam/machineLearning/blob/main/01-Intro/01-Intro.md)
* Add your notes here


================================================
FILE: 01-intro/homework.md
================================================
## Homework

* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/01-intro/homework.md)
* For 2024 cohort homework, check [the 2024 cohort folder](../cohorts/2024/01-intro/homework.md)
* For 2023 cohort homework, check [the 2023 cohort folder](../cohorts/2023/01-intro/homework.md)
* For 2022 cohort homework, check [the 2022 cohort folder](../cohorts/2022/01-intro/homework.md)
* For 2021 cohort homework and solution, check [the 2021 cohort folder](../cohorts/2021/01-intro/)


## Navigation

* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Summary](10-summary.md)


================================================
FILE: 01-intro/notebooks/07-numpy.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "502da6a2",
   "metadata": {},
   "source": [
    "# Machine Learning Zoomcamp\n",
    "\n",
    "\n",
    "## 1.7 Introduction to NumPy\n",
    "\n",
    "\n",
    "Plan:\n",
    "\n",
    "* Creating arrays\n",
    "* Multi-dimensional arrays\n",
    "* Randomly generated arrays\n",
    "* Element-wise operations\n",
    "    * Comparison operations\n",
    "    * Logical operations\n",
    "* Summarizing operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "95aa5b76",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "aa693c84",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<module 'numpy' from '/home/alexey/.pyenv/versions/3.8.11/lib/python3.8/site-packages/numpy/__init__.py'>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e3ff2dc",
   "metadata": {},
   "source": [
    "## Creating arrays\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "783c3362",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.zeros(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2fc75d89",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.ones(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5183483b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.full(10, 2.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "fe81664d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1,  2,  3,  5,  7, 12])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.array([1, 2, 3, 5, 7, 12])\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "dee26035",
   "metadata": {},
   "outputs": [],
   "source": [
    "a[2] = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "0cf95a0f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1,  2, 10,  5,  7, 12])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9a579406",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 4, 5, 6, 7, 8, 9])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.arange(3, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "96260ddb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.linspace(0, 100, 11)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37f36946",
   "metadata": {},
   "source": [
    "## Multi-dimensional arrays\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b4a61c53",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0.],\n",
       "       [0., 0.],\n",
       "       [0., 0.],\n",
       "       [0., 0.],\n",
       "       [0., 0.]])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.zeros((5, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "4f75f854",
   "metadata": {},
   "outputs": [],
   "source": [
    "n = np.array([\n",
    "    [1, 2, 3],\n",
    "    [4, 5, 6],\n",
    "    [7, 8, 9]\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "619860f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "n[0, 1] = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "54333fc7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1, 20,  3],\n",
       "       [ 4,  5,  6],\n",
       "       [ 7,  8,  9]])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "57eef634",
   "metadata": {},
   "outputs": [],
   "source": [
    "n[2] = [1, 1, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "b3fa6ae7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1, 20,  3],\n",
       "       [ 4,  5,  6],\n",
       "       [ 1,  1,  1]])"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "42f1d1f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "n[:, 2] = [0, 1, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "13442277",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1, 20,  0],\n",
       "       [ 4,  5,  1],\n",
       "       [ 1,  1,  2]])"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62ba6337",
   "metadata": {},
   "source": [
    "## Randomly generated arrays\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "6781ff11",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[43.59949021,  2.59262318],\n",
       "       [54.96624779, 43.53223926],\n",
       "       [42.03678021, 33.0334821 ],\n",
       "       [20.4648634 , 61.92709664],\n",
       "       [29.96546737, 26.68272751]])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(2)\n",
    "100 * np.random.rand(5, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "4374e58b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.41675785, -0.05626683],\n",
       "       [-2.1361961 ,  1.64027081],\n",
       "       [-1.79343559, -0.84174737],\n",
       "       [ 0.50288142, -1.24528809],\n",
       "       [-1.05795222, -0.90900761]])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(2)\n",
    "np.random.randn(5, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "ebb39565",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[40, 15],\n",
       "       [72, 22],\n",
       "       [43, 82],\n",
       "       [75,  7],\n",
       "       [34, 49]])"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(2)\n",
    "np.random.randint(low=0, high=100, size=(5, 2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "364c6d7c",
   "metadata": {},
   "source": [
    "## Element-wise operations\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "51390a32",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2, 3, 4])"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.arange(5)\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "6e87e9b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "b = (10 + (a * 2)) ** 2 / 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "013a9e2a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1.  , 1.44, 1.96, 2.56, 3.24])"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "08592c4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([10.        , 10.69444444, 11.02040816, 11.171875  , 11.2345679 ])"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a / b + 10"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35fc84d3",
   "metadata": {},
   "source": [
    "## Comparison operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "e26eefdc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2, 3, 4])"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "8fd3fc96",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([False, False,  True,  True,  True])"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a >= 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "bca43c2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1.  , 1.44, 1.96, 2.56, 3.24])"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "f6e89611",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([False, False,  True,  True,  True])"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a > b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "15a5a80a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 3, 4])"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a[a > b]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0259499b",
   "metadata": {},
   "source": [
    "## Summarizing operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "c1b30281",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2, 3, 4])"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "d850c2aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.4142135623730951"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "2b2587f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0662686b",
   "metadata": {},
   "source": [
    "### Next\n",
    "\n",
    "Linear algebra refresher"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: 01-intro/notebooks/08-linear-algebra.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3aace4b5",
   "metadata": {},
   "source": [
    "# Machine Learning Zoomcamp\n",
    "\n",
    "## 1.8 Linear algebra refresher"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2cd0b8e5",
   "metadata": {},
   "source": [
    "Plan:\n",
    "\n",
    "* Vector operations\n",
    "* Multiplication\n",
    "    * Vector-vector multiplication\n",
    "    * Matrix-vector multiplication\n",
    "    * Matrix-matrix multiplication\n",
    "* Identity matrix\n",
    "* Inverse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1317a223",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4052050d",
   "metadata": {},
   "source": [
    "## Vector operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e87a01b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "u = np.array([2, 4, 5, 6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "913795a9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4,  8, 10, 12])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "2 * u"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "24625627",
   "metadata": {},
   "outputs": [],
   "source": [
    "v = np.array([1, 0, 0, 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "edc95be4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 4, 5, 8])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u + v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "62f471c3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 2,  0,  0, 12])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u * v"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6cb784ea",
   "metadata": {},
   "source": [
    "## Multiplication"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6a838e14",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "bef565ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "def vector_vector_multiplication(u, v):\n",
    "    assert u.shape[0] == v.shape[0]\n",
    "    \n",
    "    n = u.shape[0]\n",
    "    \n",
    "    result = 0.0\n",
    "\n",
    "    for i in range(n):\n",
    "        result = result + u[i] * v[i]\n",
    "    \n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5f212712",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vector_vector_multiplication(u, v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b57c4464",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "u.dot(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b7710217",
   "metadata": {},
   "outputs": [],
   "source": [
    "U = np.array([\n",
    "    [2, 4, 5, 6],\n",
    "    [1, 2, 1, 2],\n",
    "    [3, 1, 2, 1],\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3f1ee5f1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 4)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "U.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "8b4f7530",
   "metadata": {},
   "outputs": [],
   "source": [
    "def matrix_vector_multiplication(U, v):\n",
    "    assert U.shape[1] == v.shape[0]\n",
    "    \n",
    "    num_rows = U.shape[0]\n",
    "    \n",
    "    result = np.zeros(num_rows)\n",
    "    \n",
    "    for i in range(num_rows):\n",
    "        result[i] = vector_vector_multiplication(U[i], v)\n",
    "    \n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "930f42c3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([14.,  5.,  5.])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrix_vector_multiplication(U, v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "0937dafd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([14,  5,  5])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "U.dot(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "85280363",
   "metadata": {},
   "outputs": [],
   "source": [
    "V = np.array([\n",
    "    [1, 1, 2],\n",
    "    [0, 0.5, 1], \n",
    "    [0, 2, 1],\n",
    "    [2, 1, 0],\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "82039bcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def matrix_matrix_multiplication(U, V):\n",
    "    assert U.shape[1] == V.shape[0]\n",
    "    \n",
    "    num_rows = U.shape[0]\n",
    "    num_cols = V.shape[1]\n",
    "    \n",
    "    result = np.zeros((num_rows, num_cols))\n",
    "    \n",
    "    for i in range(num_cols):\n",
    "        vi = V[:, i]\n",
    "        Uvi = matrix_vector_multiplication(U, vi)\n",
    "        result[:, i] = Uvi\n",
    "    \n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "ab0e5aba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[14. , 20. , 13. ],\n",
       "       [ 5. ,  6. ,  5. ],\n",
       "       [ 5. ,  8.5,  9. ]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrix_matrix_multiplication(U, V)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "8d0e3b73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[14. , 20. , 13. ],\n",
       "       [ 5. ,  6. ,  5. ],\n",
       "       [ 5. ,  8.5,  9. ]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "U.dot(V)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb2cdbdd",
   "metadata": {},
   "source": [
    "## Identity matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ca913560",
   "metadata": {},
   "outputs": [],
   "source": [
    "I = np.eye(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "0614d05f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1. , 1. , 2. ],\n",
       "       [0. , 0.5, 1. ],\n",
       "       [0. , 2. , 1. ],\n",
       "       [2. , 1. , 0. ]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "aabbf2ad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1. , 1. , 2. ],\n",
       "       [0. , 0.5, 1. ],\n",
       "       [0. , 2. , 1. ],\n",
       "       [2. , 1. , 0. ]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "V.dot(I)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8f786ef",
   "metadata": {},
   "source": [
    "## Inverse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "3e6fc747",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1. , 1. , 2. ],\n",
       "       [0. , 0.5, 1. ],\n",
       "       [0. , 2. , 1. ]])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Vs = V[[0, 1, 2]]\n",
    "Vs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "5265b91e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.        , -2.        ,  0.        ],\n",
       "       [ 0.        , -0.66666667,  0.66666667],\n",
       "       [ 0.        ,  1.33333333, -0.33333333]])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Vs_inv = np.linalg.inv(Vs)\n",
    "Vs_inv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "3cd1d98b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 0., 1.]])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Vs_inv.dot(Vs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c54ed7d",
   "metadata": {},
   "source": [
    "### Next\n",
    "\n",
    "1.9 Introduction to Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64d8bdce",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: 01-intro/notebooks/09-pandas.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3473239e",
   "metadata": {},
   "source": [
    "# Machine Learning Zoomcamp\n",
    "\n",
    "## 1.9 Introduction to Pandas\n",
    "\n",
    "Plan:\n",
    "\n",
    "* DataFrames\n",
    "* Series\n",
    "* Index\n",
    "* Accessing elements\n",
    "* Element-wise operations\n",
    "* Filtering\n",
    "* String operations\n",
    "* Summarizing operations\n",
    "* Missing values\n",
    "* Grouping\n",
    "* Getting the NumPy arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b1a23fb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "06e3062c",
   "metadata": {},
   "source": [
    "## DataFrames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "114c8ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [\n",
    "    ['Nissan', 'Stanza', 1991, 138, 4, 'MANUAL', 'sedan', 2000],\n",
    "    ['Hyundai', 'Sonata', 2017, None, 4, 'AUTOMATIC', 'Sedan', 27150],\n",
    "    ['Lotus', 'Elise', 2010, 218, 4, 'MANUAL', 'convertible', 54990],\n",
    "    ['GMC', 'Acadia',  2017, 194, 4, 'AUTOMATIC', '4dr SUV', 34450],\n",
    "    ['Nissan', 'Frontier', 2017, 261, 6, 'MANUAL', 'Pickup', 32340],\n",
    "]\n",
    "\n",
    "columns = [\n",
    "    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',\n",
    "    'Transmission Type', 'Vehicle_Style', 'MSRP'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c25c6c9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(data, columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "abe4d2e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr SUV</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "2    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "3      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "4   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "1         Sedan  27150  \n",
       "2   convertible  54990  \n",
       "3       4dr SUV  34450  \n",
       "4        Pickup  32340  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f104d442",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [\n",
    "    {\n",
    "        \"Make\": \"Nissan\",\n",
    "        \"Model\": \"Stanza\",\n",
    "        \"Year\": 1991,\n",
    "        \"Engine HP\": 138.0,\n",
    "        \"Engine Cylinders\": 4,\n",
    "        \"Transmission Type\": \"MANUAL\",\n",
    "        \"Vehicle_Style\": \"sedan\",\n",
    "        \"MSRP\": 2000\n",
    "    },\n",
    "    {\n",
    "        \"Make\": \"Hyundai\",\n",
    "        \"Model\": \"Sonata\",\n",
    "        \"Year\": 2017,\n",
    "        \"Engine HP\": None,\n",
    "        \"Engine Cylinders\": 4,\n",
    "        \"Transmission Type\": \"AUTOMATIC\",\n",
    "        \"Vehicle_Style\": \"Sedan\",\n",
    "        \"MSRP\": 27150\n",
    "    },\n",
    "    {\n",
    "        \"Make\": \"Lotus\",\n",
    "        \"Model\": \"Elise\",\n",
    "        \"Year\": 2010,\n",
    "        \"Engine HP\": 218.0,\n",
    "        \"Engine Cylinders\": 4,\n",
    "        \"Transmission Type\": \"MANUAL\",\n",
    "        \"Vehicle_Style\": \"convertible\",\n",
    "        \"MSRP\": 54990\n",
    "    },\n",
    "    {\n",
    "        \"Make\": \"GMC\",\n",
    "        \"Model\": \"Acadia\",\n",
    "        \"Year\": 2017,\n",
    "        \"Engine HP\": 194.0,\n",
    "        \"Engine Cylinders\": 4,\n",
    "        \"Transmission Type\": \"AUTOMATIC\",\n",
    "        \"Vehicle_Style\": \"4dr SUV\",\n",
    "        \"MSRP\": 34450\n",
    "    },\n",
    "    {\n",
    "        \"Make\": \"Nissan\",\n",
    "        \"Model\": \"Frontier\",\n",
    "        \"Year\": 2017,\n",
    "        \"Engine HP\": 261.0,\n",
    "        \"Engine Cylinders\": 6,\n",
    "        \"Transmission Type\": \"MANUAL\",\n",
    "        \"Vehicle_Style\": \"Pickup\",\n",
    "        \"MSRP\": 32340\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2d89579e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr SUV</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "2    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "3      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "4   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "1         Sedan  27150  \n",
       "2   convertible  54990  \n",
       "3       4dr SUV  34450  \n",
       "4        Pickup  32340  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "097f69d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make   Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan  Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai  Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "1         Sedan  27150  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(n=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0961a097",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "94b432db",
   "metadata": {},
   "source": [
    "## Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "7299a212",
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (1897567212.py, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  File \u001b[0;32m\"/tmp/ipykernel_580/1897567212.py\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m    df.Engine HP\u001b[0m\n\u001b[0m              ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "df.Engine HP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "54898f9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    138.0\n",
       "1      NaN\n",
       "2    218.0\n",
       "3    194.0\n",
       "4    261.0\n",
       "Name: Engine HP, dtype: float64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Engine HP']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "acc40580",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model   MSRP\n",
       "0   Nissan    Stanza   2000\n",
       "1  Hyundai    Sonata  27150\n",
       "2    Lotus     Elise  54990\n",
       "3      GMC    Acadia  34450\n",
       "4   Nissan  Frontier  32340"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[['Make', 'Model', 'MSRP']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9c699894",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['id'] = [1, 2, 3, 4, 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "70ec4449",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['id'] = [10, 20, 30, 40, 50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ff30947e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr SUV</td>\n",
       "      <td>34450</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "2    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "3      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "4   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  id  \n",
       "0         sedan   2000  10  \n",
       "1         Sedan  27150  20  \n",
       "2   convertible  54990  30  \n",
       "3       4dr SUV  34450  40  \n",
       "4        Pickup  32340  50  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "064e3e7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "del df['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "5206c3ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr SUV</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "2    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "3      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "4   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "1         Sedan  27150  \n",
       "2   convertible  54990  \n",
       "3       4dr SUV  34450  \n",
       "4        Pickup  32340  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "530a4af3",
   "metadata": {},
   "source": [
    "## Index\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "69e9bfbd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=5, step=1)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "d7e06c93",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=5, step=1)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Make.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "14213eb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.index = ['a', 'b', 'c', 'd', 'e']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "d9074134",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr SUV</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "a   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "b  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "c    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "d      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "e   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "a         sedan   2000  \n",
       "b         Sedan  27150  \n",
       "c   convertible  54990  \n",
       "d       4dr SUV  34450  \n",
       "e        Pickup  32340  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "a1c57024",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "b  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "c    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "e   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "b         Sedan  27150  \n",
       "c   convertible  54990  \n",
       "e        Pickup  32340  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[[1, 2, 4]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "764c2aad",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "f338e70b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>Sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr SUV</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "2    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "3      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "4   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "1         Sedan  27150  \n",
       "2   convertible  54990  \n",
       "3       4dr SUV  34450  \n",
       "4        Pickup  32340  "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53457e10",
   "metadata": {},
   "source": [
    "## Accessing elements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "758f6b8e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "ecfc3f22",
   "metadata": {},
   "source": [
    "## Element-wise operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "318183a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    276.0\n",
       "1      NaN\n",
       "2    436.0\n",
       "3    388.0\n",
       "4    522.0\n",
       "Name: Engine HP, dtype: float64"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Engine HP'] * 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "ae5d726d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1     True\n",
       "2    False\n",
       "3     True\n",
       "4     True\n",
       "Name: Year, dtype: bool"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Year'] >= 2015"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5813ae3a",
   "metadata": {},
   "source": [
    "## Filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "73699361",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0  Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "4  Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "4        Pickup  32340  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\n",
    "    df['Make'] == 'Nissan'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "0f29ed0f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>Pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "4  Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "4        Pickup  32340  "
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\n",
    "    (df['Make'] == 'Nissan') & (df['Year'] >= 2015)\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a8a3d49",
   "metadata": {},
   "source": [
    "## String operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "7deaf57a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'machine_learning_zoomcamp'"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'machine learning zoomcamp'.replace(' ', '_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "9ffa16ea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          sedan\n",
       "1          sedan\n",
       "2    convertible\n",
       "3        4dr suv\n",
       "4         pickup\n",
       "Name: Vehicle_Style, dtype: object"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Vehicle_Style'].str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "835c3a40",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['Vehicle_Style'] = df['Vehicle_Style'].str.replace(' ', '_').str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "5ee197dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Make</th>\n",
       "      <th>Model</th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>Transmission Type</th>\n",
       "      <th>Vehicle_Style</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Stanza</td>\n",
       "      <td>1991</td>\n",
       "      <td>138.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>sedan</td>\n",
       "      <td>2000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hyundai</td>\n",
       "      <td>Sonata</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>sedan</td>\n",
       "      <td>27150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Lotus</td>\n",
       "      <td>Elise</td>\n",
       "      <td>2010</td>\n",
       "      <td>218.0</td>\n",
       "      <td>4</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>convertible</td>\n",
       "      <td>54990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>GMC</td>\n",
       "      <td>Acadia</td>\n",
       "      <td>2017</td>\n",
       "      <td>194.0</td>\n",
       "      <td>4</td>\n",
       "      <td>AUTOMATIC</td>\n",
       "      <td>4dr_suv</td>\n",
       "      <td>34450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Nissan</td>\n",
       "      <td>Frontier</td>\n",
       "      <td>2017</td>\n",
       "      <td>261.0</td>\n",
       "      <td>6</td>\n",
       "      <td>MANUAL</td>\n",
       "      <td>pickup</td>\n",
       "      <td>32340</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Make     Model  Year  Engine HP  Engine Cylinders Transmission Type  \\\n",
       "0   Nissan    Stanza  1991      138.0                 4            MANUAL   \n",
       "1  Hyundai    Sonata  2017        NaN                 4         AUTOMATIC   \n",
       "2    Lotus     Elise  2010      218.0                 4            MANUAL   \n",
       "3      GMC    Acadia  2017      194.0                 4         AUTOMATIC   \n",
       "4   Nissan  Frontier  2017      261.0                 6            MANUAL   \n",
       "\n",
       "  Vehicle_Style   MSRP  \n",
       "0         sedan   2000  \n",
       "1         sedan  27150  \n",
       "2   convertible  54990  \n",
       "3       4dr_suv  34450  \n",
       "4        pickup  32340  "
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f0d1bc6",
   "metadata": {},
   "source": [
    "## Summarizing operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "0e6bb68a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Year</th>\n",
       "      <th>Engine HP</th>\n",
       "      <th>Engine Cylinders</th>\n",
       "      <th>MSRP</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>5.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>5.00</td>\n",
       "      <td>5.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2010.40</td>\n",
       "      <td>202.75</td>\n",
       "      <td>4.40</td>\n",
       "      <td>30186.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>11.26</td>\n",
       "      <td>51.30</td>\n",
       "      <td>0.89</td>\n",
       "      <td>18985.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1991.00</td>\n",
       "      <td>138.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>2000.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>2010.00</td>\n",
       "      <td>180.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>27150.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2017.00</td>\n",
       "      <td>206.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>32340.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>2017.00</td>\n",
       "      <td>228.75</td>\n",
       "      <td>4.00</td>\n",
       "      <td>34450.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2017.00</td>\n",
       "      <td>261.00</td>\n",
       "      <td>6.00</td>\n",
       "      <td>54990.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Year  Engine HP  Engine Cylinders      MSRP\n",
       "count     5.00       4.00              5.00      5.00\n",
       "mean   2010.40     202.75              4.40  30186.00\n",
       "std      11.26      51.30              0.89  18985.04\n",
       "min    1991.00     138.00              4.00   2000.00\n",
       "25%    2010.00     180.00              4.00  27150.00\n",
       "50%    2017.00     206.00              4.00  32340.00\n",
       "75%    2017.00     228.75              4.00  34450.00\n",
       "max    2017.00     261.00              6.00  54990.00"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe().round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "ca689f7b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Make                 4\n",
       "Model                5\n",
       "Year                 3\n",
       "Engine HP            4\n",
       "Engine Cylinders     2\n",
       "Transmission Type    2\n",
       "Vehicle_Style        4\n",
       "MSRP                 5\n",
       "dtype: int64"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0318652d",
   "metadata": {},
   "source": [
    "## Missing values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "05000331",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Make                 0\n",
       "Model                0\n",
       "Year                 0\n",
       "Engine HP            1\n",
       "Engine Cylinders     0\n",
       "Transmission Type    0\n",
       "Vehicle_Style        0\n",
       "MSRP                 0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "963eded9",
   "metadata": {},
   "source": [
    "## Grouping\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bf5fad5",
   "metadata": {},
   "source": [
    "```\n",
    "SELECT \n",
    "    transmission_type,\n",
    "    MAX(MSRP)\n",
    "FROM\n",
    "    cars\n",
    "GROUP BY\n",
    "    transmission_type\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "6310552b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Transmission Type\n",
       "AUTOMATIC    34450\n",
       "MANUAL       54990\n",
       "Name: MSRP, dtype: int64"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby('Transmission Type').MSRP.max()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3de63a4b",
   "metadata": {},
   "source": [
    "## Getting the NumPy arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "749f764c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 2000, 27150, 54990, 34450, 32340])"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.MSRP.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "1ff56c15",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'Make': 'Nissan',\n",
       "  'Model': 'Stanza',\n",
       "  'Year': 1991,\n",
       "  'Engine HP': 138.0,\n",
       "  'Engine Cylinders': 4,\n",
       "  'Transmission Type': 'MANUAL',\n",
       "  'Vehicle_Style': 'sedan',\n",
       "  'MSRP': 2000},\n",
       " {'Make': 'Hyundai',\n",
       "  'Model': 'Sonata',\n",
       "  'Year': 2017,\n",
       "  'Engine HP': nan,\n",
       "  'Engine Cylinders': 4,\n",
       "  'Transmission Type': 'AUTOMATIC',\n",
       "  'Vehicle_Style': 'sedan',\n",
       "  'MSRP': 27150},\n",
       " {'Make': 'Lotus',\n",
       "  'Model': 'Elise',\n",
       "  'Year': 2010,\n",
       "  'Engine HP': 218.0,\n",
       "  'Engine Cylinders': 4,\n",
       "  'Transmission Type': 'MANUAL',\n",
       "  'Vehicle_Style': 'convertible',\n",
       "  'MSRP': 54990},\n",
       " {'Make': 'GMC',\n",
       "  'Model': 'Acadia',\n",
       "  'Year': 2017,\n",
       "  'Engine HP': 194.0,\n",
       "  'Engine Cylinders': 4,\n",
       "  'Transmission Type': 'AUTOMATIC',\n",
       "  'Vehicle_Style': '4dr_suv',\n",
       "  'MSRP': 34450},\n",
       " {'Make': 'Nissan',\n",
       "  'Model': 'Frontier',\n",
       "  'Year': 2017,\n",
       "  'Engine HP': 261.0,\n",
       "  'Engine Cylinders': 6,\n",
       "  'Transmission Type': 'MANUAL',\n",
       "  'Vehicle_Style': 'pickup',\n",
       "  'MSRP': 32340}]"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e6fae3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: 02-regression/01-car-price-intro.md
================================================

## 2.1 Car price prediction project

<a href="https://www.youtube.com/watch?v=vM3SqPNlStE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=12"><img src="images/thumbnail-2-01.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-21-car-price-prediction-project)


## Notes

This project is about the creation of a model for helping users to predict car prices. The dataset was obtained from [this
Kaggle dataset](https://www.kaggle.com/CooperUnion/cardataset).

**Project plan:**

* Prepare data and Exploratory data analysis (EDA)
* Use linear regression for predicting price
* Understanding the internals of linear regression 
* Evaluating the model with RMSE
* Feature engineering  
* Regularization 
* Using the model 

The code and dataset are available at this [link](https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/chapter-02-car-price). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/18/ml-zoomcamp-2023-machine-learning-for-regression-part-1/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Next: [Data preparation](02-data-preparation.md)


================================================
FILE: 02-regression/02-data-preparation.md
================================================

## 2.2 Data preparation

<a href="https://www.youtube.com/watch?v=Kd74oR4QWGM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=13"><img src="images/thumbnail-2-02.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

**Pandas attributes and methods:** 

* `pd.read_csv(<file_path_string>)` -> read csv files 
* `df.head()` -> take a look at the first rows of the dataframe 
* `df.columns` -> retrieve column names of a dataframe 
* `df.columns.str.lower()` -> lowercase all the letters 
* `df.columns.str.replace(' ', '_')` -> replace the space separator 
* `df.dtypes` -> retrieve data types of all features 
* `df.index` -> retrieve indices of a dataframe

The entire code of this project is available in [this jupyter notebook](notebook.ipynb).

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/18/ml-zoomcamp-2023-machine-learning-for-regression-part-1/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Car price prediction project](01-car-price-intro.md)
* Next: [Exploratory data analysis](03-eda.md)


================================================
FILE: 02-regression/03-eda.md
================================================

## 2.3 Exploratory data analysis

<a href="https://www.youtube.com/watch?v=k6k8sQ0GhPM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=14"><img src="images/thumbnail-2-03.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

**Pandas attributes and methods:** 

* `df[col].unique()` -> return a list of unique values in the series 
* `df[col].nunique()` -> return the number of unique values in the series 
* `df.isnull().sum()` -> return the number of null values in each column of the dataframe 

**Matplotlib and seaborn methods:**

* `%matplotlib inline` -> ensure that plots are displayed in jupyter notebook's cells
* `sns.histplot()` -> show the histogram of a series 
   
**Numpy methods:**
* `np.log1p()` -> apply log transformation to a variable, after adding one to each input value.

Long-tail distributions usually confuse the ML models, so the recommendation is to transform the target variable distribution to a normal one whenever possible. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/19/ml-zoomcamp-2023-machine-learning-for-regression-part-2/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Data preparation](02-data-preparation.md)
* Next: [Setting up the validation framework](04-validation-framework.md)


================================================
FILE: 02-regression/04-validation-framework.md
================================================

## 2.4 Setting up the validation framework

<a href="https://www.youtube.com/watch?v=ck0IfiPaQi0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=15"><img src="images/thumbnail-2-04.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

In general, the dataset is split into three parts: training, validation, and test. For each partition, we need to obtain feature matrices (X) and vectors of targets (y). First, the size of the partitions is calculated. Next, the records are shuffled to ensure that the values in the three partitions contain non-sequential records from the dataset. Finally, the partitions are created using the shuffled indices.

**Pandas attributes and methods:** 

* `df.iloc[]` -> return subsets of records of a dataframe, being selected by numerical indices
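* `df.reset_index()` -> reset the index to the default integer index (0, 1, 2, ...); use `drop=True` to discard the old index instead of keeping it as a column 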
* `del df[col]` -> eliminate a column variable 

**Numpy methods:**

* `np.arange()` -> return an array of numbers 
* `np.random.shuffle()` -> shuffle an array in place (it modifies the array and returns `None`)
* `np.random.seed()` -> set a seed for reproducibility

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/19/ml-zoomcamp-2023-machine-learning-for-regression-part-3/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Exploratory data analysis](03-eda.md)
* Next: [Linear regression](05-linear-regression-simple.md)


================================================
FILE: 02-regression/05-linear-regression-simple.md
================================================

## 2.5 Linear regression

<a href="https://www.youtube.com/watch?v=Dn1eTQLsOdA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=16"><img src="images/thumbnail-2-05.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

Linear regression is a model for solving regression tasks, in which the objective is to fit a line to the data and make predictions on new values. The input of this model is the **feature matrix** `X`, and the output is a `y` **vector of predictions** that tries to be as close as possible to the **actual** `y` values. The linear regression formula is the sum of the bias term $w_0$, which is the prediction when there is no information about the features, and each of the feature values times their corresponding weights: $x_{i1} \cdot w_1 + x_{i2} \cdot w_2 + ... + x_{in} \cdot w_n$.

So the simple linear regression formula looks like:

$g(x_i) = w_0 + x_{i1} \cdot w_1 + x_{i2} \cdot w_2 + ... + x_{in} \cdot w_n$.

And that can be further simplified as:

$g(x_i) = w_0 + \displaystyle\sum_{j=1}^{n} w_j \cdot x_{ij}$

Here is a simple implementation of Linear Regression in python:

~~~~python
# Bias term: the prediction when no feature information is available.
w0 = 7.1
def linear_regression(xi):
    """Return the prediction for one record `xi`: the bias `w0` plus the
    sum of each feature value multiplied by its weight."""
    weights = [0.01, 0.04, 0.002]  # one example weight per feature
    count = len(xi)

    total = w0
    j = 0
    # Accumulate the weighted features one by one, in order.
    while j < count:
        total = total + weights[j] * xi[j]
        j += 1
    return total
~~~~
        

If we look at the $\displaystyle\sum_{j=1}^{n} w_j \cdot x_{ij}$ part in the above equation, we know that this is nothing else but a vector-vector multiplication. Hence, we can rewrite the equation as $g(x_i) = w_0 + x_i^T \cdot w$

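We need to ensure that the result is shown on the untransformed scale by applying the inverse function: since the target was transformed with `np.log1p()`, the predictions are converted back with `np.expm1()`. 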

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/20/ml-zoomcamp-2023-machine-learning-for-regression-part-4/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Setting up the validation framework](04-validation-framework.md)
* Next: [Linear regression: vector form](06-linear-regression-vector.md)


================================================
FILE: 02-regression/06-linear-regression-vector.md
================================================

## 2.6 Linear regression: vector form

<a href="https://www.youtube.com/watch?v=YkyevnYyAww&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=17"><img src="images/thumbnail-2-06.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

The formula of linear regression can be synthesized with the dot product between features and weights. The feature vector includes the *bias* term with an *x* value of one, so that the bias is written as $w_0 \cdot x_{i0}$, where $x_{i0} = 1$.

When all the records are included, the linear regression can be calculated with the dot product between ***feature matrix*** and ***vector of weights***, obtaining the `y` vector of predictions. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.wordpress.com/2023/09/20/ml-zoomcamp-2023-machine-learning-for-regression-part-5/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Linear regression](05-linear-regression-simple.md)
* Next: [Training linear regression: Normal equation](07-linear-regression-training.md)


================================================
FILE: 02-regression/07-linear-regression-training.md
================================================

## 2.7 Training linear regression: Normal equation

<a href="https://www.youtube.com/watch?v=hx6nak-Y11g&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=18"><img src="images/thumbnail-2-07.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

Obtaining predictions as close as possible to the $y$ target values requires calculating the weights from the general
LR equation $Xw = y$. The feature matrix $X$ generally does not
have an inverse because it is not square, so an approximate solution is required. It can be
obtained using the **Gram matrix**
(the product of the transposed feature matrix $X^T$ and the feature matrix $X$), which is square. The vector of weights or coefficients $w$ obtained with this
formula is the closest possible solution to the LR system.

Normal Equation:

$w$ = $(X^TX)^{-1}X^Ty$

Where:

$X^TX$ is the Gram Matrix




The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/21/ml-zoomcamp-2023-machine-learning-for-regression-part-6/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Linear regression: vector form](06-linear-regression-vector.md)
* Next: [Baseline model for car price prediction project](08-baseline-model.md)


================================================
FILE: 02-regression/08-baseline-model.md
================================================

## 2.8 Baseline model for car price prediction project

<a href="https://www.youtube.com/watch?v=SvPpMMYtYbU&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=19"><img src="images/thumbnail-2-08.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

* In this lesson we build a baseline model and apply the `df_train` dataset to derive weights for the bias (w0) and the features (w). For this, we use the `train_linear_regression(X, y)` function from the previous lesson.
* Linear regression only applies to numerical features. Therefore, only the numerical features from `df_train` are used for the feature matrix. 
* We notice some of the features in `df_train` are `nan`. We set them to `0` for the sake of simplicity, so the model is solvable, but it would be more appropriate to use a non-zero value as the filler (e.g. the mean value of the feature).
* Once the weights are calculated, we apply them in $g(X) = w_0 + X \cdot w$ to derive the predicted y vector.
* Then we plot both predicted y and the actual y on the same histogram for a visual comparison.

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/21/ml-zoomcamp-2023-machine-learning-for-regression-part-7/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Training linear regression: Normal equation](07-linear-regression-training.md)
* Next: [Root mean squared error](09-rmse.md)


================================================
FILE: 02-regression/09-rmse.md
================================================

## 2.9 Root Mean Squared Error (RMSE)

<a href="https://www.youtube.com/watch?v=0LWoFtbzNUM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=20"><img src="images/thumbnail-2-09.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

* In the previous lesson we found out our predictions were a bit off from the actual target values in the training dataset. We need a way to quantify how good or bad the model is. This is where RMSE can be of help.
* Root Mean Squared Error (RMSE) is a way to evaluate regression models. It measures the error associated with the model being evaluated. This numerical figure can then be used to compare models, enabling us to choose the one that gives the best predictions.

$$RMSE = \sqrt{ \frac{1}{m} \sum_{i=1}^{m} {(g(x_i) - y_i)^2}}$$

- $g(x_i)$ is the prediction
- $y_i$ is the actual value
- $m$ is the number of observations in the dataset (i.e. cars)


The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/22/ml-zoomcamp-2023-machine-learning-for-regression-part-8/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Baseline model for car price prediction project](08-baseline-model.md)
* Next: [Using RMSE on validation data](10-car-price-validation.md)


================================================
FILE: 02-regression/10-car-price-validation.md
================================================

## 2.10 Computing RMSE on validation data

<a href="https://www.youtube.com/watch?v=rawGPXg2ofE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=21"><img src="images/thumbnail-2-10.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

In this lesson, we calculate the RMSE on the validation partition of the car price dataset. In this way, we have a metric to evaluate the model's 
performance on data it was not trained on. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/22/ml-zoomcamp-2023-machine-learning-for-regression-part-8/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Root mean squared error](09-rmse.md)
* Next: [Feature engineering](11-feature-engineering.md)


================================================
FILE: 02-regression/11-feature-engineering.md
================================================
## 2.11 Feature engineering

Feature engineering is the process of creating new features.

<a href="https://www.youtube.com/watch?v=-aEShw4ftB0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=22"><img src="images/thumbnail-2-11.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

A new feature, the age of the car, was added to the dataset. It is obtained by subtracting each car's year from the maximum year among all cars. 
This new feature improved the model performance, measured with the RMSE and by comparing the distributions of the y target variable and the predictions. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/22/ml-zoomcamp-2023-machine-learning-for-regression-part-9/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Using RMSE on validation data](10-car-price-validation.md)
* Next: [Categorical variables](12-categorical-variables.md)


================================================
FILE: 02-regression/12-categorical-variables.md
================================================

## 2.12 Categorical variables

<a href="https://www.youtube.com/watch?v=sGLAToAAMa4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=23"><img src="images/thumbnail-2-12.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

Categorical variables are typically represented as strings, and pandas identifies them as object types. However, some variables that appear to be numerical may actually be categorical (e.g., the number of doors a car has). All these categorical variables need to be converted to a numerical form because ML
models can interpret only numerical features. It is possible to incorporate certain categories from a feature, not necessarily all of them. 
This transformation from categorical to numerical variables is known as One-Hot encoding. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/23/ml-zoomcamp-2023-machine-learning-for-regression-part-10/)

## Comments

This way of encoding categorical features is called "one-hot encoding".
We'll learn more about it in Session 3. 


## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Feature engineering](11-feature-engineering.md)
* Next: [Regularization](13-regularization.md)


================================================
FILE: 02-regression/13-regularization.md
================================================
## 2.13 Regularization

<a href="https://www.youtube.com/watch?v=91ve3EJlHBc&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=24"><img src="images/thumbnail-2-13.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

If the feature matrix has duplicate columns (or columns that can be expressed as a linear combination of other columns), its Gram matrix $X^TX$ will not have an inverse. However, sometimes no error is raised, because certain values are slightly different
between the duplicated columns (for example, due to noise). 

So, if we apply the normal equation with this feature matrix, the weights associated with the duplicated columns are very large, which decreases
the model performance. To solve this issue, one alternative is adding a small number to the diagonal of the Gram matrix, which corresponds to regularization. 

This technique 
works because the addition of small values to the diagonal makes the columns of the Gram matrix less likely to be duplicates (or linear combinations) of each other, so the matrix can be inverted reliably. The regularization value is a hyperparameter of the model. After applying 
regularization the model performance improved. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/23/ml-zoomcamp-2023-machine-learning-for-regression-part-11/)

## Comments
### Linear combination

I mentioned the term *linear combination* in the video, but didn't explain what it means. 
So if you're interested in what it means, you can read about it here:

* One column is a linear combination of others when you can express it as a weighted sum of other columns of the matrix
* The simplest example is when a column is an exact duplicate of another column
* Another example. Let's say we have 3 columns: `a`, `b`, `c`. If `c = 0.2 * a + 0.5 * b`, then `c` is a linear combination of `a` and `b`
* More formal definition: https://en.wikipedia.org/wiki/Linear_combination

### Ridge Regression
The regularization technique used (adding a factor to the diagonals of Gram Matrix) in this lesson is Ridge Regression. Further explanations are available in this [DataTalks.Club article](https://datatalks.club/blog/regularization-in-regression.html).

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Categorical variables](12-categorical-variables.md)
* Next: [Tuning the model](14-tuning-model.md)


================================================
FILE: 02-regression/14-tuning-model.md
================================================

## 2.14 Tuning the model

<a href="https://www.youtube.com/watch?v=lW-YVxPgzQw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=25"><img src="images/thumbnail-2-14.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)


## Notes

Tuning the model consisted of finding the best regularization hyperparameter value, using the validation partition of the dataset. The model was then trained with this regularization value. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb). 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/24/ml-zoomcamp-2023-machine-learning-for-regression-part-12/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Regularization](13-regularization.md)
* Next: [Using the model](15-using-model.md)


================================================
FILE: 02-regression/15-using-model.md
================================================

## 2.15 Using the model

<a href="https://www.youtube.com/watch?v=KT--uIJozes&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=26"><img src="images/thumbnail-2-15.jpg"></a>

[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)

## Notes

After finding the best model and its parameters, it was trained with training and validation partitions and the final RMSE was calculated on the test partition. 

Finally, the final model was used to predict the price of new cars. 

The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).  

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/24/ml-zoomcamp-2023-machine-learning-for-regression-part-12/)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Tuning the model](14-tuning-model.md)
* Next: [Car price prediction project summary](16-summary.md)


================================================
FILE: 02-regression/16-summary.md
================================================

## 2.16 Car price prediction project summary

<a href="https://www.youtube.com/watch?v=_qI01YXbyro&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=27"><img src="images/thumbnail-2-16.jpg"></a>



## Notes

In summary, this session covered some topics, including data preparation, exploratory data analysis, the validation framework, linear regression model, LR vector and 
normal forms, the baseline model, root mean squared error, feature engineering, regularization, tuning the model, and using the best model with new data. All these concepts 
were explained using the problem to predict the price of cars. 

<table>
   <tr>
      <td>⚠️</td>
      <td>
         The notes are written by the community. <br>
         If you see an error here, please create a PR with a fix.
      </td>
   </tr>
</table>

* [Notes from Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/02_regression/Summary_Session_02.md)

## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Using the model](15-using-model.md)
* Next: [Explore more](17-explore-more.md)


================================================
FILE: 02-regression/17-explore-more.md
================================================

## 2.17 Explore more

### Questions

* In this project, we included only 5 top features. What happens if we include 10?

> That's not a graded homework, it's just for you if you want to try more things on this project


### Other projects

Here are other datasets that you can play with to learn more about the topic:

* [California housing dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) - predict the price of a house
* [Student Performance Data Set](https://archive.ics.uci.edu/ml/datasets/Student+Performance) - predict the performance of students
* UCI ML Repository contains a lot of other datasets suitable for practicing regression - https://archive.ics.uci.edu/ml/datasets.php?task=reg


## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Car price prediction project summary](16-summary.md)
* Next: [Homework](homework.md)


================================================
FILE: 02-regression/README.md
================================================
## 2. Machine Learning for Regression

- 2.1 [Car price prediction project](01-car-price-intro.md)
- 2.2 [Data preparation](02-data-preparation.md)
- 2.3 [Exploratory data analysis](03-eda.md)
- 2.4 [Setting up the validation framework](04-validation-framework.md)
- 2.5 [Linear regression](05-linear-regression-simple.md)
- 2.6 [Linear regression: vector form](06-linear-regression-vector.md)
- 2.7 [Training linear regression: Normal equation](07-linear-regression-training.md)
- 2.8 [Baseline model for car price prediction project](08-baseline-model.md)
- 2.9 [Root mean squared error](09-rmse.md)
- 2.10 [Using RMSE on validation data](10-car-price-validation.md)
- 2.11 [Feature engineering](11-feature-engineering.md)
- 2.12 [Categorical variables](12-categorical-variables.md)
- 2.13 [Regularization](13-regularization.md)
- 2.14 [Tuning the model](14-tuning-model.md)
- 2.15 [Using the model](15-using-model.md)
- 2.16 [Car price prediction project summary](16-summary.md)
- 2.17 [Explore more](17-explore-more.md)
- 2.18 [Homework](homework.md)



## Community notes

Did you take notes? You can share them here (or in each unit separately)

* [Notes from Kwang Yang](https://www.kaggle.com/kwangyangchia/notebook-for-lesson-2-mle)
* [Notes from Sebastián Ayala Ruano](https://github.com/sayalaruano/100DaysOfMLCode/blob/main/Regression/Notes/NotesDay5.md)
* [Notes from Ayoub Berdeddouch](https://github.com/ayoub-berdeddouch/mlbookcamp-homeworks/blob/main/Regression/homework_Regression_AyoubBerdeddouch.ipynb)
* [Notes from Alvaro Navas](https://github.com/ziritrion/ml-zoomcamp/blob/main/notes/02_linear_regression.md)
* [Notes from froukje](https://github.com/froukje/ml-zoomcamp/blob/main/week2/Lecture_2_car_price_prediction.ipynb)
* [Notes from Jon Areas](https://github.com/jxareas/Machine-Learning-Bookcamp-2022/blob/master/notes/02-regression.md)
* [Notes from Memoona Tahira](https://github.com/MemoonaTahira/MLZoomcamp2022/blob/main/Notes/Week_2-linear_regression/readme.md)
* [Notes from Wesley Barreto](https://github.com/wgb-10/ML-Zoomcamp-2022/blob/main/Session-Projects/02-Regression/my-notebook.ipynb)
* [Notes from Hareesh Tummala](https://github.com/tummala-hareesh/ml_zoomcamp_ht/blob/main/notes/week-2-notes.md)
* [Notes from Anneysha Sarkar](https://github.com/Anneysha7/ml-zoomcamp-2023/blob/main/course-notes/week-2.md)
* [Notes from Peter Ernicke](https://knowmledge.com/category/courses/ml-zoomcamp/regression/)
* [Notes from Marcos Benício](https://github.com/marcosbenicio/DataScience/blob/main/01Regression/car_price.ipynb)
* [Notes from Oscar Garcia](https://github.com/ozkary/machine-learning-engineering/tree/main/02-regression)
* [Notes from Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/02_regression/Summary_Session_02.md)
* [Notes from Kemal Dahha](https://github.com/kemaldahha/machine-learning-course/blob/main/week_2_notes.ipynb)
* [Notes from Nitin Gupta (Cohort 2025)](https://github.com/niting9881/ML-zoomcamp-local/blob/main/02-regression/Linear_Regression_FAQ.md)
* Add your notes here



================================================
FILE: 02-regression/homework.md
================================================
## Homework
* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/02-regression/homework.md)
* For 2024 cohort homework, check [the 2024 cohort folder](../cohorts/2024/02-regression/homework.md)
* For 2023 cohort homework, check [the 2023 cohort folder](../cohorts/2023/02-regression/homework.md)
* For 2022 cohort homework, check [the 2022 cohort folder](../cohorts/2022/02-regression/homework.md)
* For 2021 cohort homework and solution, check [the 2021 cohort folder](../cohorts/2021/02-regression/)


## Navigation

* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Explore more](17-explore-more.md)


================================================
FILE: 02-regression/meta.json
================================================
{
    "data": "meta.csv",
    "session": 2,
    "name": "Machine Learning for Regression"
}

================================================
FILE: 02-regression/notebook.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Machine Learning for Regression\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.2 Data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2021-09-18 22:31:04--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1475504 (1,4M) [text/plain]\n",
      "Saving to: ‘data.csv’\n",
      "\n",
      "data.csv            100%[===================>]   1,41M  9,27MB/s    in 0,2s    \n",
      "\n",
      "2021-09-18 22:31:04 (9,27 MB/s) - ‘data.csv’ saved [1475504/1475504]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget $data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = df.columns.str.lower().str.replace(' ', '_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0            bmw\n",
       "1            bmw\n",
       "2            bmw\n",
       "3            bmw\n",
       "4            bmw\n",
       "          ...   \n",
       "11909      acura\n",
       "11910      acura\n",
       "11911      acura\n",
       "11912      acura\n",
       "11913    lincoln\n",
       "Name: make, Length: 11914, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['make'].str.lower().str.replace(' ', '_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['make',\n",
       " 'model',\n",
       " 'engine_fuel_type',\n",
       " 'transmission_type',\n",
       " 'driven_wheels',\n",
       " 'market_category',\n",
       " 'vehicle_size',\n",
       " 'vehicle_style']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "strings = list(df.dtypes[df.dtypes == 'object'].index)\n",
    "strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in strings:\n",
    "    df[col] = df[col].str.lower().str.replace(' ', '_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "make                  object\n",
       "model                 object\n",
       "year                   int64\n",
       "engine_fuel_type      object\n",
       "engine_hp            float64\n",
       "engine_cylinders     float64\n",
       "transmission_type     object\n",
       "driven_wheels         object\n",
       "number_of_doors      float64\n",
       "market_category       object\n",
       "vehicle_size          object\n",
       "vehicle_style         object\n",
       "highway_mpg            int64\n",
       "city_mpg               int64\n",
       "popularity             int64\n",
       "msrp                   int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.3 Exploratory data analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "make\n",
      "['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler']\n",
      "48\n",
      "\n",
      "model\n",
      "['1_series_m' '1_series' '100' '124_spider' '190-class']\n",
      "914\n",
      "\n",
      "year\n",
      "[2011 2012 2013 1992 1993]\n",
      "28\n",
      "\n",
      "engine_fuel_type\n",
      "['premium_unleaded_(required)' 'regular_unleaded'\n",
      " 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel']\n",
      "10\n",
      "\n",
      "engine_hp\n",
      "[335. 300. 230. 320. 172.]\n",
      "356\n",
      "\n",
      "engine_cylinders\n",
      "[ 6.  4.  5.  8. 12.]\n",
      "9\n",
      "\n",
      "transmission_type\n",
      "['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown']\n",
      "5\n",
      "\n",
      "driven_wheels\n",
      "['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive'\n",
      " 'four_wheel_drive']\n",
      "4\n",
      "\n",
      "number_of_doors\n",
      "[ 2.  4.  3. nan]\n",
      "3\n",
      "\n",
      "market_category\n",
      "['factory_tuner,luxury,high-performance' 'luxury,performance'\n",
      " 'luxury,high-performance' 'luxury' 'performance']\n",
      "71\n",
      "\n",
      "vehicle_size\n",
      "['compact' 'midsize' 'large']\n",
      "3\n",
      "\n",
      "vehicle_style\n",
      "['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback']\n",
      "16\n",
      "\n",
      "highway_mpg\n",
      "[26 28 27 25 24]\n",
      "59\n",
      "\n",
      "city_mpg\n",
      "[19 20 18 17 16]\n",
      "69\n",
      "\n",
      "popularity\n",
      "[3916 3105  819  617 1013]\n",
      "48\n",
      "\n",
      "msrp\n",
      "[46135 40650 36350 29450 34500]\n",
      "6049\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for col in df.columns:\n",
    "    print(col)\n",
    "    print(df[col].unique()[:5])\n",
    "    print(df[col].nunique())\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>model</th>\n",
       "      <th>year</th>\n",
       "      <th>engine_fuel_type</th>\n",
       "      <th>engine_hp</th>\n",
       "      <th>engine_cylinders</th>\n",
       "      <th>transmission_type</th>\n",
       "      <th>driven_wheels</th>\n",
       "      <th>number_of_doors</th>\n",
       "      <th>market_category</th>\n",
       "      <th>vehicle_size</th>\n",
       "      <th>vehicle_style</th>\n",
       "      <th>highway_mpg</th>\n",
       "      <th>city_mpg</th>\n",
       "      <th>popularity</th>\n",
       "      <th>msrp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series_m</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>335.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>factory_tuner,luxury,high-performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>26</td>\n",
       "      <td>19</td>\n",
       "      <td>3916</td>\n",
       "      <td>46135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury,performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>convertible</td>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "      <td>3916</td>\n",
       "      <td>40650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury,high-performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>28</td>\n",
       "      <td>20</td>\n",
       "      <td>3916</td>\n",
       "      <td>36350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>230.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury,performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>28</td>\n",
       "      <td>18</td>\n",
       "      <td>3916</td>\n",
       "      <td>29450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>230.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury</td>\n",
       "      <td>compact</td>\n",
       "      <td>convertible</td>\n",
       "      <td>28</td>\n",
       "      <td>18</td>\n",
       "      <td>3916</td>\n",
       "      <td>34500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11909</th>\n",
       "      <td>acura</td>\n",
       "      <td>zdx</td>\n",
       "      <td>2012</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>automatic</td>\n",
       "      <td>all_wheel_drive</td>\n",
       "      <td>4.0</td>\n",
       "      <td>crossover,hatchback,luxury</td>\n",
       "      <td>midsize</td>\n",
       "      <td>4dr_hatchback</td>\n",
       "      <td>23</td>\n",
       "      <td>16</td>\n",
       "      <td>204</td>\n",
       "      <td>46120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11910</th>\n",
       "      <td>acura</td>\n",
       "      <td>zdx</td>\n",
       "      <td>2012</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>automatic</td>\n",
       "      <td>all_wheel_drive</td>\n",
       "      <td>4.0</td>\n",
       "      <td>crossover,hatchback,luxury</td>\n",
       "      <td>midsize</td>\n",
       "      <td>4dr_hatchback</td>\n",
       "      <td>23</td>\n",
       "      <td>16</td>\n",
       "      <td>204</td>\n",
       "      <td>56670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11911</th>\n",
       "      <td>acura</td>\n",
       "      <td>zdx</td>\n",
       "      <td>2012</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>automatic</td>\n",
       "      <td>all_wheel_drive</td>\n",
       "      <td>4.0</td>\n",
       "      <td>crossover,hatchback,luxury</td>\n",
       "      <td>midsize</td>\n",
       "      <td>4dr_hatchback</td>\n",
       "      <td>23</td>\n",
       "      <td>16</td>\n",
       "      <td>204</td>\n",
       "      <td>50620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11912</th>\n",
       "      <td>acura</td>\n",
       "      <td>zdx</td>\n",
       "      <td>2013</td>\n",
       "      <td>premium_unleaded_(recommended)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>automatic</td>\n",
       "      <td>all_wheel_drive</td>\n",
       "      <td>4.0</td>\n",
       "      <td>crossover,hatchback,luxury</td>\n",
       "      <td>midsize</td>\n",
       "      <td>4dr_hatchback</td>\n",
       "      <td>23</td>\n",
       "      <td>16</td>\n",
       "      <td>204</td>\n",
       "      <td>50920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11913</th>\n",
       "      <td>lincoln</td>\n",
       "      <td>zephyr</td>\n",
       "      <td>2006</td>\n",
       "      <td>regular_unleaded</td>\n",
       "      <td>221.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>automatic</td>\n",
       "      <td>front_wheel_drive</td>\n",
       "      <td>4.0</td>\n",
       "      <td>luxury</td>\n",
       "      <td>midsize</td>\n",
       "      <td>sedan</td>\n",
       "      <td>26</td>\n",
       "      <td>17</td>\n",
       "      <td>61</td>\n",
       "      <td>28995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11914 rows × 16 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          make       model  year                engine_fuel_type  engine_hp  \\\n",
       "0          bmw  1_series_m  2011     premium_unleaded_(required)      335.0   \n",
       "1          bmw    1_series  2011     premium_unleaded_(required)      300.0   \n",
       "2          bmw    1_series  2011     premium_unleaded_(required)      300.0   \n",
       "3          bmw    1_series  2011     premium_unleaded_(required)      230.0   \n",
       "4          bmw    1_series  2011     premium_unleaded_(required)      230.0   \n",
       "...        ...         ...   ...                             ...        ...   \n",
       "11909    acura         zdx  2012     premium_unleaded_(required)      300.0   \n",
       "11910    acura         zdx  2012     premium_unleaded_(required)      300.0   \n",
       "11911    acura         zdx  2012     premium_unleaded_(required)      300.0   \n",
       "11912    acura         zdx  2013  premium_unleaded_(recommended)      300.0   \n",
       "11913  lincoln      zephyr  2006                regular_unleaded      221.0   \n",
       "\n",
       "       engine_cylinders transmission_type      driven_wheels  number_of_doors  \\\n",
       "0                   6.0            manual   rear_wheel_drive              2.0   \n",
       "1                   6.0            manual   rear_wheel_drive              2.0   \n",
       "2                   6.0            manual   rear_wheel_drive              2.0   \n",
       "3                   6.0            manual   rear_wheel_drive              2.0   \n",
       "4                   6.0            manual   rear_wheel_drive              2.0   \n",
       "...                 ...               ...                ...              ...   \n",
       "11909               6.0         automatic    all_wheel_drive              4.0   \n",
       "11910               6.0         automatic    all_wheel_drive              4.0   \n",
       "11911               6.0         automatic    all_wheel_drive              4.0   \n",
       "11912               6.0         automatic    all_wheel_drive              4.0   \n",
       "11913               6.0         automatic  front_wheel_drive              4.0   \n",
       "\n",
       "                             market_category vehicle_size  vehicle_style  \\\n",
       "0      factory_tuner,luxury,high-performance      compact          coupe   \n",
       "1                         luxury,performance      compact    convertible   \n",
       "2                    luxury,high-performance      compact          coupe   \n",
       "3                         luxury,performance      compact          coupe   \n",
       "4                                     luxury      compact    convertible   \n",
       "...                                      ...          ...            ...   \n",
       "11909             crossover,hatchback,luxury      midsize  4dr_hatchback   \n",
       "11910             crossover,hatchback,luxury      midsize  4dr_hatchback   \n",
       "11911             crossover,hatchback,luxury      midsize  4dr_hatchback   \n",
       "11912             crossover,hatchback,luxury      midsize  4dr_hatchback   \n",
       "11913                                 luxury      midsize          sedan   \n",
       "\n",
       "       highway_mpg  city_mpg  popularity   msrp  \n",
       "0               26        19        3916  46135  \n",
       "1               28        19        3916  40650  \n",
       "2               28        20        3916  36350  \n",
       "3               28        18        3916  29450  \n",
       "4               28        18        3916  34500  \n",
       "...            ...       ...         ...    ...  \n",
       "11909           23        16         204  46120  \n",
       "11910           23        16         204  56670  \n",
       "11911           23        16         204  50620  \n",
       "11912           23        16         204  50920  \n",
       "11913           26        17          61  28995  \n",
       "\n",
       "[11914 rows x 16 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Distribution of price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='msrp', ylabel='Count'>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAATAklEQVR4nO3df/BddX3n8ecLIlRrC2iyLBtgQyVq0a4VUwTpdF3TVWS7ht1FirWauqHZtlR023VX3ZllauuMzjpScSpMJlBRWZFStsZdKsMCbS1T0YBWmkQlC1KSiRIIxFbXH9H3/nE/gdvk+83nAt/7vffL9/mYufM953M+59z3PVx4cX7cz0lVIUnSoRw26QIkSdPPsJAkdRkWkqQuw0KS1GVYSJK6lky6gHFYunRprVixYtJlSNKCcscddzxYVctmWvaUDIsVK1awefPmSZchSQtKkvtmW+ZpKElSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUtdT8hfcT9YvX/Br7Hpw70Htxy09io9tvHwCFUnSZBkWM9j14F6WnX3Rwe03XDqBaiRp8jwNJUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdY01LJL8xyRbkvxNko8n+ZEkJyW5Pcn2JJ9IckTre2Sb396Wrxjazjta+1eSvGqcNUuSDja2sEiyHLgIWFVVLwQOB84H3gtcUlUnAw8D69oq64CHW/slrR9JTmnrvQA4C/hQksPHVbck6WDjPg21BHh6kiXAM4BdwCuA69ryq4Bz2vSaNk9bvjpJWvs1VfXdqroX2A6cNua6JUlDxhYWVbUTeB/wtwxCYi9wB/BIVe1r3XYAy9v0cuD+tu6+1v/Zw+0zrPOoJOuTbE6yeffu3XP/gSRpERvnaahjGBwVnAT8E+BHGZxGGouq2lBVq6pq1bJly8b1NpK0KI3zNNTPA/dW1e6q+j5wPXAmcHQ7LQVwPLCzTe8ETgBoy48CHhpun2EdSdI8GGdY/C1wepJntGsPq4GtwK3Aua3PWuCTbXpTm6ctv6WqqrWf3+6WOglYCXxujHVLkg6wpN/liamq25NcB9wJ7AO+AGwA/jdwTZLfa21XtFWuAD6aZDuwh8EdUFTVliTXMgiafcCFVfWDcdUtSTrY2MICoKouBi4+oPkeZribqaq+A7x2lu28G3j3nBcoSRqJv+CWJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkrrGGhZJjk5yXZIvJ9mW5Iwkz0pyU5K7299jWt8kuTTJ9iRfSnLq0HbWtv53J1k7zpolSQcb95HFB4BPV9XzgRcB24C3AzdX1Urg5jYP8GpgZXutBy4DSPIs4GLgpcBpwMX7A0aSND/GFhZJjgJ+DrgCoKq+V1WPAGuAq1q3q4Bz2vQa4CM18Fng6CTHAa8CbqqqPVX1MHATcNa46pYkHWycRxYnAbuBP0zyhSQbk/wocGxV7Wp9vg4c26aXA/cPrb+jtc3W/g8kWZ9kc5LNu3fvnuOPIkmL2zjDYglwKnBZVb0Y+BaPnXICoKoKqLl4s6raUFWrqmrVsmXL5mKTkqRmnGGxA9hRVbe3+esYhMc32ukl2t8H2vKdwAlD6x/f2mZrlyTNk7GFRVV9Hbg/yfNa02pgK7AJ2H9H01rgk216E/DGdlfU6cDedrrqRuCVSY5pF7Zf2dokSfNkyZi3/2bg
6iRHAPcAb2IQUNcmWQfcB5zX+t4AnA1sB77d+lJVe5L8LvD51u9dVbVnzHVLkoaMNSyq6ovAqhkWrZ6hbwEXzrKdK4Er57Q4SdLI/AW3JKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHWNFBZJzhylTZL01DTqkcUHR2yTJD0FHXIgwSRnAC8DliX5raFFPw4cPs7CJEnTozfq7BHAM1u/Hxtq/yZw7riKkiRNl0OGRVX9OfDnST5cVffNU02SpCkz6vMsjkyyAVgxvE5VvWIcRUmSpsuoYfFHwOXARuAH4ytHkjSNRg2LfVV12VgrkSRNrVFvnf1Ukt9IclySZ+1/jbUySdLUGPXIYm37+7ahtgJ+Ym7LkSRNo5HCoqpOGnchkqTpNVJYJHnjTO1V9ZG5LUeSNI1GPQ31M0PTPwKsBu4EDAtJWgRGPQ315uH5JEcD14yjIEnS9HmiQ5R/C/A6hiQtEqNes/gUg7ufYDCA4E8C146rKEnSdBn1msX7hqb3AfdV1Y4x1CNJmkIjnYZqAwp+mcHIs8cA3xtnUZKk6TLqk/LOAz4HvBY4D7g9iUOUS9IiMeppqP8K/ExVPQCQZBnwf4DrxlWYJGl6jHo31GH7g6J56HGsK0la4EY9svh0khuBj7f5XwRuGE9JkqRp03sG98nAsVX1tiT/FvjZtuivgKvHXZwkaTr0jix+H3gHQFVdD1wPkOSn2rJ/PcbaJElTonfd4diquuvAxta2YiwVSZKmTi8sjj7EsqfPYR2SpCnWC4vNSX71wMYkFwB3jKckSdK06V2zeCvwP5O8nsfCYRVwBPBvRnmDJIcDm4GdVfULSU5iMGLts9s231BV30tyJIMhz1/C4NbcX6yqr7VtvANYB/wAuKiqbhz5E0qSnrRDHllU1Teq6mXA7wBfa6/fqaozqurrI77HW4BtQ/PvBS6pqpOBhxmEAO3vw639ktaPJKcA5wMvAM4CPtQCSJI0T0YdG+rWqvpge90y6saTHA/8K2Bjmw/wCh775fdVwDltek2bpy1f3fqvAa6pqu9W1b3AduC0UWuQJD154/4V9u8D/xn4YZt/NvBIVe1r8zuA5W16OXA/QFu+t/V/tH2GdR6VZH2SzUk27969e44/hiQtbmMLiyS/ADxQVfNyIbyqNlTVqqpatWzZsvl4S0laNEYd7uOJOBN4TZKzGTy3+8eBDwBHJ1nSjh6OB3a2/juBE4AdSZYARzG40L2/fb/hdSRJ82BsRxZV9Y6qOr6qVjC4QH1LVb0euBXYP7z5WuCTbXpTm6ctv6WqqrWfn+TIdifVSgbDpUuS5sk4jyxm81+Aa5L8HvAF4IrWfgXw0STbgT0MAoaq2pLkWmArg6f0XVhVP5j/siVp8ZqXsKiqPwP+rE3fwwx3M1XVdxg8XGmm9d8NvHt8FUqSDsVnUkiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXWMLiyQnJLk1ydYkW5K8pbU/K8lNSe5uf49p7UlyaZLtSb6U5NShba1t/e9OsnZcNUuSZjbOI4t9wG9X1SnA6cCFSU4B3g7cXFUrgZvbPMCrgZXttR64DAbhAlwMvBQ4Dbh4f8BIkubH2MKiqnZV1Z1t+u+AbcByYA1wVet2FXBOm14DfKQGPgscneQ44FXATVW1p6oeBm4CzhpX3ZKkgy2ZjzdJsgJ4MXA7cGxV7WqLvg4c26aXA/cPrbajtc3WfuB7rGdwRMKJJ544h9U/ZtvWLaw+53UHtR+39Cg+tvHysbynJE2DsYdFkmcCfwy8taq+meTR
ZVVVSWou3qeqNgAbAFatWjUn2zzQ9+swlp190UHtu264dBxvJ0lTY6x3QyV5GoOguLqqrm/N32inl2h/H2jtO4EThlY/vrXN1i5JmifjvBsqwBXAtqp6/9CiTcD+O5rWAp8can9juyvqdGBvO111I/DKJMe0C9uvbG2SpHkyztNQZwJvAO5K8sXW9k7gPcC1SdYB9wHntWU3AGcD24FvA28CqKo9SX4X+Hzr966q2jPGuiVJBxhbWFTVXwKZZfHqGfoXcOEs27oSuHLuqpMkPR7+gluS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktS1ZNIFPBVs27qF1ee87qD245Yexcc2Xj6BiiRpbhkWc+D7dRjLzr7ooPZdN1w6gWokae55GkqS1GVYSJK6FsxpqCRnAR8ADgc2VtV7JlxS12zXMr72f7/Kiuc896B2r3FImlYLIiySHA78AfAvgR3A55Nsqqqtk63s0Ga7lvGl//7rM7bf8r7/4IVySVNpQYQFcBqwvaruAUhyDbAGmOqweLxmC5fZQmS2I5THe+Tyyxf8Grse3DtjTQaVJIBU1aRr6EpyLnBWVV3Q5t8AvLSqfnOoz3pgfZt9HvCVJ/h2S4EHn0S5i4X7aTTupz730WjmYz/906paNtOChXJk0VVVG4ANT3Y7STZX1ao5KOkpzf00GvdTn/toNJPeTwvlbqidwAlD88e3NknSPFgoYfF5YGWSk5IcAZwPbJpwTZK0aCyI01BVtS/JbwI3Mrh19sqq2jKmt3vSp7IWCffTaNxPfe6j0Ux0Py2IC9ySpMlaKKehJEkTZFhIkroWbVgkOSvJV5JsT/L2GZYfmeQTbfntSVZMoMyJG2E//UqS3Um+2F4XTKLOSUpyZZIHkvzNLMuT5NK2D7+U5NT5rnEajLCfXp5k79B36b/Nd42TluSEJLcm2ZpkS5K3zNBnIt+nRRkWQ8OHvBo4BXhdklMO6LYOeLiqTgYuAd47v1VO3oj7CeATVfXT7bVxXoucDh8GzjrE8lcDK9trPXDZPNQ0jT7MofcTwGeGvkvvmoeaps0+4Ler6hTgdODCGf6dm8j3aVGGBUPDh1TV94D9w4cMWwNc1aavA1YnyTzWOA1G2U+LXlX9BbDnEF3WAB+pgc8CRyc5bn6qmx4j7KdFr6p2VdWdbfrvgG3A8gO6TeT7tFjDYjlw/9D8Dg7+B/Jon6raB+wFnj0v1U2PUfYTwL9rh8PXJTlhhuWL3aj7UXBGkr9O8qdJXjDpYiapnfp+MXD7AYsm8n1arGGhufMpYEVV/TPgJh47GpMerzsZjE30IuCDwJ9MtpzJSfJM4I+Bt1bVNyddDyzesBhl+JBH+yRZAhwFPDQv1U2P7n6qqoeq6rttdiPwknmqbSFxuJoRVNU3q+rv2/QNwNOSLJ1wWfMuydMYBMXVVXX9DF0m8n1arGExyvAhm4C1bfpc4JZafL9g7O6nA86VvobBOVb9Q5uAN7a7WE4H9lbVrkkXNW2S/OP91wWTnMbgv0+L6n/Q2ue/AthWVe+fpdtEvk8LYriPuTbb8CFJ3gVsrqpNDP6BfTTJdgYX5c6fXMWTMeJ+uijJaxjcxbEH+JWJFTwhST4OvBxYmmQHcDHwNICquhy4ATgb2A58G3jTZCqdrBH207nAryfZB/w/4PxF+D9oZwJvAO5K8sXW9k7gRJjs98nhPiRJXYv1NJQk6XEwLCRJXYaFJKnLsJAkdRkWkrTA9QZpnKH/eUODFf6PkdbxbihJWtiS/Bzw9wzGjHphp+9K4FrgFVX1cJJ/VFUP9N7DIwtpCrRRAqQnZKZBGpM8J8mnk9yR5DNJ
nt8W/SrwB1X1cFu3GxRgWEhPWpIVSb6c5MNJvprk6iQ/n+S2JHcnOS3JPx96TsMXkvxYe37DZ5JsArYObefqJNvawIzPmPTn04K1AXhzVb0E+E/Ah1r7c4Hntu/nZ5P0ho0HFukvuKUxOBl4LfDvGQyT8kvAzzIYAuWdDH4Bf2FV3dYGiftOW+9U4IVVdW8bZfR5wLrW70rgN4D3zesn0YLXvmMvA/5o6MkKR7a/Sxg8C+PlDMaV+oskP1VVjxxqmx5ZSHPj3qq6q6p+CGwBbm5DVdwFrABuA96f5CLg6DbsPcDnqureoe3cX1W3temPMQgc6fE6DHhk6EFSP11VP9mW7QA2VdX323fvqwzCo7tBSU/ed4emfzg0/0NgSVW9B7gAeDpw29D5428dsJ0D7zjxDhQ9bm1Y83uTvBYefRTri9riP2FwVEEb1fe5wD29bRoW0jxI8px25PFeBqepnj9L1xOTnNGmfwn4y3kpUAtaG6Txr4DnJdmRZB3wemBdkr9mcLS7/ymXNwIPJdkK3Aq8raq6o/t6zUKaH29N8i8YHGlsAf4UOGOGfl9h8NzlK4GtLN7ndetxqKrXzbLooIvX7fTob7XXyPydhTQl2gXu/9W7T16aBE9DSZK6PLKQJHV5ZCFJ6jIsJEldhoUkqcuwkCR1GRaSpK7/D5sufaGjZFscAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.histplot(df.msrp, bins=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='msrp', ylabel='Count'>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAEGCAYAAACdJRn3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAY+UlEQVR4nO3de7RedX3n8fenpOC1BOSUSQlMoqId61xkImJtOyoWkXGMM4syUK1RsRkVtdYuLdS1hk5b18LWVQvVQTOQCsqAlNqaKi2laHXqqkjwwv1yBJVkBRME4ywdL9Tv/LF/kceTnOTs5DyXc877tdazzt7f/Xv2/u3sk3zzu+y9U1VIktTHT4y7ApKkhcfkIUnqzeQhSerN5CFJ6s3kIUnqbdm4KzAMRxxxRK1atWrc1ZCkBeXGG298oKqm5lJ2USaPVatWsXnz5nFXQ5IWlCRfnWtZu60kSb2ZPCRJvQ0teSTZmGR7kltmxN+Y5I4ktyb5w4H4OUmmk9yZ5IUD8ZNbbDrJ2cOqryRp7oY55vEB4D3ApbsCSZ4HrAX+bVV9L8lPt/jTgNOBnwN+Bvj7JE9pX3sv8MvAFuCGJJuq6rYh1luStA9DSx5V9ekkq2aEXwecV1Xfa2W2t/ha4IoWvzfJNHB82zZdVfcAJLmilTV5SNIYjXrM4ynALya5PsmnkjyzxY8C7hsot6XFZovvJsn6JJuTbN6xY8cQqi5J2mXUyWMZcDhwAvBW4MokmY8dV9WGqlpTVWumpuY0TVmStJ9GfZ/HFuAj1T0H/nNJfggcAWwFjh4ot7LF2EtckjQmo255/BXwPIA2IH4w8ACwCTg9ySFJVgPHAp8DbgCOTbI6ycF0g+qbRlxnSdIMQ2t5JLkceC5wRJItwLnARmBjm777fWBda4XcmuRKuoHwh4Gzquqf237eAFwDHARsrKpbh1XnXV7+mtey7YGdu8VXHHEoH7rofcM+vCRNvGHOtjpjlk0vn6X8O4B37CF+NXD1PFZtn7Y9sJOpU960e/zqC0ZZDUmaWN5hLknqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSeptaMkjycYk29srZ2du+60kleSItp4kFySZTnJTkuMGyq5Lcnf7rBtWfSVJczfMlscHgJNnBpMcDZwEfG0g/CLg2PZZD1zYyh5O9+7zZwHHA+cmOWyIdZYkzcHQkkdVfRp4cA+b3g28DaiB2Frg0up8FlieZAXwQuDaqnqwqh4CrmUPCUmSNFojHfNIshbYWlVfmrHpKOC+gfUtLTZbfE/7Xp9kc5LNO3bsmMdaS5JmGlnySPIY4HeA/z6M/VfVhqpaU1VrpqamhnEISVIzypbHk4DVwJeSfAVYCXw+yb8AtgJHD5Rd2WKzxSVJYzSy5FFVN1fVT1fVqqpaRdcFdVxV3Q9sAl7RZl2dAOysqm3ANcBJSQ5rA+UntZgkaYyGOVX3cuCfgKcm2ZLkzL0Uvxq4B5gG/hfweoCqehD4feCG9vm9FpMkjdGyYe24qs7Yx/ZVA8sFnDVLuY3AxnmtnCTpgHiHuSSpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKm3Yb6GdmOS7UluGYj9UZI7ktyU5C+TLB/Ydk6S6SR3JnnhQPzkFptOcvaw6itJmrthtjw+AJw8I3Yt8PSq+jfAXcA5AEmeBpwO/Fz7zv9MclCSg4D3Ai8Cngac0cpKksZoaMmjqj4NPDgj9ndV9XBb/Sywsi2vBa6oqu9V1b3ANHB8+0xX1T1V9X3gilZWkjRG4xzzeDXwN235KOC+gW1bWmy2uCRpjMaSPJK8HXgYuGwe97k+yeYkm3fs
2DFfu5Uk7cHIk0eSVwIvBl5WVdXCW4GjB4qtbLHZ4rupqg1Vtaaq1kxNTc17vSVJjxhp8khyMvA24CVV9Z2BTZuA05MckmQ1cCzwOeAG4Ngkq5McTDeovmmUdZYk7W7ZsHac5HLgucARSbYA59LNrjoEuDYJwGer6rVVdWuSK4Hb6Lqzzqqqf277eQNwDXAQsLGqbh1WnSVJczO05FFVZ+whfPFeyr8DeMce4lcDV89j1SRJB8g7zCVJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb0NLXkk2Zhke5JbBmKHJ7k2yd3t52EtniQXJJlOclOS4wa+s66VvzvJumHVV5I0d8NseXwAOHlG7Gzguqo6FriurQO8CDi2fdYDF0KXbIBzgWcBxwPn7ko4kqTxGVryqKpPAw/OCK8FLmnLlwAvHYhfWp3PAsuTrABeCFxbVQ9W1UPAteyekCRJIzbqMY8jq2pbW74fOLItHwXcN1BuS4vNFt9NkvVJNifZvGPHjvmttSTpx4xtwLyqCqh53N+GqlpTVWumpqbma7eSpD0YdfL4euuOov3c3uJbgaMHyq1ssdnikqQxGnXy2ATsmjG1DvjoQPwVbdbVCcDO1r11DXBSksPaQPlJLSZJGqNlw9pxksuB5wJHJNlCN2vqPODKJGcCXwVOa8WvBk4BpoHvAK8CqKoHk/w+cEMr93tVNXMQXpI0YkNLHlV1xiybTtxD2QLOmmU/G4GN81g1SdIB8g5zSVJvJg9JUm8mD0lSbyYPSVJvJg9JUm9zSh5JnjOXmCRpaZhry+NP5xiTJC0Be73PI8mzgZ8HppK8ZWDTTwEHDbNikqTJta+bBA8GHtfKPX4g/i3g1GFVSpI02faaPKrqU8Cnknygqr46ojpJkibcXB9PckiSDcCqwe9U1fOHUSlJ0mSba/L4c+B9wEXAPw+vOpKkhWCuyePhqrpwqDWRJC0Yc52q+9dJXp9kRZLDd32GWjNJ0sSaa8tj1wuc3joQK+CJ81sdSdJCMKfkUVWrh10RSdLCMafkkeQVe4pX1aXzWx1J0kIw1zGPZw58fhH4XeAl+3vQJL+Z5NYktyS5PMmjkqxOcn2S6SQfTnJwK3tIW59u21ft73ElSfNjTsmjqt448Pl14Di6O897S3IU8CZgTVU9ne4xJ6cD7wTeXVVPBh4CzmxfORN4qMXf3cpJksZofx/J/m3gQMZBlgGPTrIMeAywDXg+cFXbfgnw0ra8tq3Ttp+YJAdwbEnSAZrrmMdf082ugq6l8K+AK/fngFW1Ncm7gK8B/w/4O+BG4JtV9XArtgU4qi0fBdzXvvtwkp3AE4AHZtRxPbAe4JhjjtmfqkmS5miuU3XfNbD8MPDVqtqyPwdMchhda2I18E26u9dP3p99DaqqDcAGgDVr1tQ+ikuSDsBcxzw+BdxB92Tdw4DvH8AxXwDcW1U7quoHwEeA5wDLWzcWwEpga1veChwN0LYfCnzjAI4vSTpAc32T4GnA54BfAU4Drk+yv49k/xpwQpLHtLGLE4HbgE/yyGPe1wEfbcubeOQmxVOBT1SVLQtJGqO5dlu9HXhmVW0HSDIF/D2PDHDPWVVdn+Qq4PN0XWBfoOtu+jhwRZI/aLGL21cuBj6YZBp4kG5mliRpjOaaPH5iV+JovsH+z9Siqs4Fzp0Rvgc4fg9lv0vX4pEkTYi5Jo+/TXINcHlb/6/A1cOpkiRp0u3rHeZPBo6sqrcm+S/AL7RN/wRcNuzKSZIm075aHn8CnANQVR+hmxlFkn/dtv2nIdZNkjSh9jVucWRV3Twz2GKrhlIjSdLE21fyWL6XbY+ex3pIkhaQfSWPzUl+fWYwyWvoHikiSVqC9jXm8WbgL5O8jEeSxRrgYOA/D7FemnAvf81r2fbAzt3iK444lA9d9L4x1EjS
KO01eVTV14GfT/I84Okt/PGq+sTQa6aJtu2BnUyd8qbd41dfMIbaSBq1ub6G9pN0jw+RJGn/7xKXJC1dc73DXEvUbGMbd9x1N1OnjKFCkiaCyUN7NdvYxk23vm4MtZE0Key2kiT1ZvKQJPVm8pAk9WbykCT15oC5RsI70qXFxeShkfCOdGlxGUvySLIcuIjukScFvBq4E/gw3aPevwKcVlUPJQlwPnAK8B3glVX1+dHXenGbr/s5br/tVk586RkHvB9Jk21cLY/zgb+tqlOTHAw8Bvgd4LqqOi/J2cDZwG8DLwKObZ9nARe2n5pH83U/xw/qJ7wvRFoCRj5gnuRQ4JeAiwGq6vtV9U1gLXBJK3YJ8NK2vBa4tDqfBZYnWTHSSkuSfsw4ZlutBnYAf5bkC0kuSvJYurcWbmtl7geObMtHAfcNfH9Li/2YJOuTbE6yeceOHUOsviRpHMljGXAccGFVPQP4Nl0X1Y9UVdGNhcxZVW2oqjVVtWZqamreKitJ2t04xjy2AFuq6vq2fhVd8vh6khVVta11S21v27cCRw98f2WLaRGYbYDdKbzSZBt58qiq+5Pcl+SpVXUncCJwW/usA85rPz/avrIJeEOSK+gGyncOdG9pgZttgN0pvNJkG9dsqzcCl7WZVvcAr6LrQrsyyZnAV4HTWtmr6abpTtNN1X3V6KsrSRo0luRRVV+kexf6TCfuoWwBZw27TpKkufPZVpKk3kwekqTeTB6SpN58MKImklN4pclm8tBEcgqvNNnstpIk9WbLYwmZ7bHr4CPTJfVj8lhCZnvsOvjIdEn92G0lSerN5CFJ6s3kIUnqzTEPLSje/yFNBpOHFhTv/5Amg91WkqTeTB6SpN5MHpKk3kwekqTexpY8khyU5AtJPtbWVye5Psl0kg+3V9SS5JC2Pt22rxpXnSVJnXG2PH4DuH1g/Z3Au6vqycBDwJktfibwUIu/u5WTJI3RWKbqJlkJ/EfgHcBbkgR4PvCrrcglwO8CFwJr2zLAVcB7kqS921wCvP9DGrVx3efxJ8DbgMe39ScA36yqh9v6FuCotnwUcB9AVT2cZGcr/8DgDpOsB9YDHHPMMcOsuyaQ939IozXybqskLwa2V9WN87nfqtpQVWuqas3U1NR87lqSNMM4Wh7PAV6S5BTgUcBPAecDy5Msa62PlcDWVn4rcDSwJcky4FDgG6OvtiRpl5G3PKrqnKpaWVWrgNOBT1TVy4BPAqe2YuuAj7blTW2dtv0TjndI0nhN0rOtfhu4IskfAF8ALm7xi4EPJpkGHqRLONqL2d4Y6NsCJc2XsSaPqvoH4B/a8j3A8Xso813gV0ZasQVutjcGLsW3BToLSxqOSWp5SPPOWVjScPh4EklSbyYPSVJvdltpSZptLAQcD5HmwuShJWm2sRBwPESaC7utJEm9mTwkSb3ZbbWAeTOgpHExeSxg3gwoaVzstpIk9WbykCT1ZreVNIPPw5L2zeQhzeDzsKR9s9tKktSbLQ9pjuzOkh5h8pDmyO4s6RF2W0mSeht5yyPJ0cClwJFAARuq6vwkhwMfBlYBXwFOq6qHkgQ4HzgF+A7wyqr6/KjrPU7eSS5p0oyj2+ph4Leq6vNJHg/cmORa4JXAdVV1XpKzgbPp3mv+IuDY9nkWcGH7uWR4J7mkSTPybquq2rar5VBV/xe4HTgKWAtc0opdAry0La8FLq3OZ4HlSVaMttaSpEFjHTBPsgp4BnA9cGRVbWub7qfr1oIusdw38LUtLbZtIEaS9cB6gGOOOWZ4lZZmcBaWlqKxJY8kjwP+AnhzVX2rG9roVFUlqT77q6oNwAaANWvW9PqudCCchaWlaCzJI8lP0iWOy6rqIy389SQrqmpb65ba3uJbgaMHvr6yxaSJZotEi9k4ZlsFuBi4var+eGDTJmAdcF77+dGB+BuSXEE3UL5zoHtLmli2SLSYjaPl8Rzg14Cbk3yxxX6HLmlcmeRM4KvAaW3b1XTTdKfppuq+
aqS1lSTtZuTJo6r+Ecgsm0/cQ/kCzhpqpSRJvXiHuSSpN59tJY2YA+laDEwe0og5kK7FwG4rSVJvJg9JUm92W0kTwrEQLSQmD2lCOBaihcTkIU04WySaRCYPacLZItEkMnlIS9xsb6q0ZaO9MXlIi8xsyeArX76LVU96ym7xO+66m19885/uFrdlo70xeUgL1GxjIbMlg5v+6HW+zljzxuQhLVCzjYXMVzKYLTmBXVoyeUiaxWzJCezSksmjl2FPmZytr/qOu+5m6pQD3r00b5w+LJNHD8OeMrntgZ32SWtBmO3vwife9d9MKkuEyUPSvJmvpNJ3+vCwy2t3Jg9JQ9c3qfSdPjxbq71v+f1pOS3VRLRgkkeSk4HzgYOAi6rqvDFXab85tiF1+s4Y29v05D393elbvm+S27WvPSW62b4z2/028xUfVdJaEMkjyUHAe4FfBrYANyTZVFW3jbdmnb6Dh45tSPunb7KZr+nMe5t51vvYs91vM0/xUc2EWxDJAzgemK6qewCSXAGsBSYieexPk9wWhqSFLFU17jrsU5JTgZOr6jVt/deAZ1XVGwbKrAfWt9WnAnfuY7dHAA8MoboLwVI9d897afG8+/uXVTU1l4ILpeWxT1W1Adgw1/JJNlfVmiFWaWIt1XP3vJcWz3u4FspraLcCRw+sr2wxSdIYLJTkcQNwbJLVSQ4GTgc2jblOkrRkLYhuq6p6OMkbgGvopupurKpbD3C3c+7iWoSW6rl73kuL5z1EC2LAXJI0WRZKt5UkaYKYPCRJvS3J5JHk5CR3JplOcva467M/khyd5JNJbktya5LfaPHDk1yb5O7287AWT5IL2jnflOS4gX2ta+XvTrJuIP7vk9zcvnNBkoz+TPcsyUFJvpDkY219dZLrW10/3CZWkOSQtj7dtq8a2Mc5LX5nkhcOxCfy9yPJ8iRXJbkjye1Jnr0UrneS32y/47ckuTzJoxbj9U6yMcn2JLcMxIZ+fWc7xj5V1ZL60A24fxl4InAw8CXgaeOu136cxwrguLb8eOAu4GnAHwJnt/jZwDvb8inA3wABTgCub/HDgXvaz8Pa8mFt2+da2bTvvmjc5z1w/m8B/jfwsbZ+JXB6W34f8Lq2/HrgfW35dODDbflp7dofAqxuvxMHTfLvB3AJ8Jq2fDCwfLFfb+Ao4F7g0QPX+ZWL8XoDvwQcB9wyEBv69Z3tGPus77h/OcZwgZ4NXDOwfg5wzrjrNQ/n9VG6Z3/dCaxosRXAnW35/cAZA+XvbNvPAN4/EH9/i60A7hiI/1i5MZ/rSuA64PnAx9pfhgeAZTOvMd0MvWe35WWtXGZe913lJvX3Azi0/SOaGfFFfb3pksd97R/DZe16v3CxXm9gFT+ePIZ+fWc7xr4+S7Hbatcv4y5bWmzBak3zZwDXA0dW1ba26X7gyLY823nvLb5lD/FJ8CfA24AftvUnAN+sqofb+mBdf3R+bfvOVr7vn8e4rQZ2AH/WuusuSvJYFvn1rqqtwLuArwHb6K7fjSz+673LKK7vbMfYq6WYPBaVJI8D/gJ4c1V9a3Bbdf+VWFRzsZO8GNheVTeOuy4jtoyuS+PCqnoG8G26LoYfWaTX+zC6h6CuBn4GeCxw8lgrNSajuL59jrEUk8eiedRJkp+kSxyXVdVHWvjrSVa07SuA7S0+23nvLb5yD/Fxew7wkiRfAa6g67o6H1ieZNdNr4N1/dH5te2HAt+g/5/HuG0BtlTV9W39Krpkstiv9wuAe6tqR1X9APgI3e/AYr/eu4zi+s52jL1aisljUTzqpM2UuBi4var+eGDTJmDXDIt1dGMhu+KvaLM0TgB2tqbqNcBJSQ5r/8s7ia4PeBvwrSQntGO9YmBfY1NV51TVyqpaRXftPlFVLwM+CZzais08711/Hqe28tXip7fZOauBY+kGFCfy96Oq7gfuS/LUFjqR7pUEi/p603VXnZDkMa1eu857UV/vAaO4vrMdY+/GNTA0zg/dTIW7
6GZZvH3c9dnPc/gFuublTcAX2+cUuv7d64C7gb8HDm/lQ/dCrS8DNwNrBvb1amC6fV41EF8D3NK+8x5mDNaO+wM8l0dmWz2R7h+DaeDPgUNa/FFtfbptf+LA99/ezu1OBmYWTervB/DvgM3tmv8V3WyaRX+9gf8B3NHq9kG6GVOL7noDl9ON6/yArqV55iiu72zH2NfHx5NIknpbit1WkqQDZPKQJPVm8pAk9WbykCT1ZvKQJPVm8pAk9WbykCbAwN3S0oJg8pAOUJJV6d6x8YEkdyW5LMkLknymvSPh+CT/IckX2+cLSR6f5LlJ/k+STcBtA/u5LN37Oq5K8phxn5+0J94kKB2g9lTjabonG99K98iLL9HdIfwS4FV07404r6o+0x5m+V26pwR8HHh6Vd3b9nMv8Aut3Ebgtqp614hPSdonWx7S/Li3qm6uqh/SJZDrqvuf2c1072j4DPDHSd4ELK9HHif+uaq6d2A/91XVZ9ryh+gSjDRxTB7S/PjewPIPB9Z/SPfSovOA1wCPBj6T5Gfb9m/P2M/MrgC7BjSRTB7SCCR5UmuZvJOuW+tnZyl6TJJnt+VfBf5xJBWUejJ5SKPx5iS3JLmJ7qmpfzNLuTuBs5LcTvfU3AtHVUGpDwfMpQnRBsw/VlVPH3ddpH2x5SFJ6s2WhySpN1sekqTeTB6SpN5MHpKk3kwekqTeTB6SpN7+P0fq7giiP7SsAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.histplot(df.msrp[df.msrp < 100000], bins=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.        ,  0.69314718,  2.39789527,  6.90875478, 11.51293546])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.log1p([0, 1, 10, 1000, 100000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.        ,  0.69314718,  2.39789527,  6.90875478, 11.51293546])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.log([0 + 1, 1 + 1, 10 + 1, 1000 + 1, 100000 + 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "price_logs = np.log1p(df.msrp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='msrp', ylabel='Count'>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAW+ElEQVR4nO3dfbRddX3n8fcHM6BoC2iuNEKYMJpqHacdbYr40A7LWMSMY5iOIo5oRGgGiqW2ThV0rbKsdS1cumrB6YKVAQYoDEgdHVKbqhSfWpcg4UEgBCHDgyQTIBGMs8pYiXznj7MzPeTek30T7jn73Hvfr7XuOnt/9z7nfNGb+zn799tn71QVkiTtyX5dNyBJGn+GhSSplWEhSWplWEiSWhkWkqRWC7puYBgWLlxYS5Ys6boNSZpVbr755u1VNTHVtjkZFkuWLGH9+vVdtyFJs0qSBwdtcxhKktRqaGGR5JIkjya5c4ptH0xSSRY260lyfpJNSW5P8qq+fVclubf5WTWsfiVJgw3zyOJS4Ljdi0kWA8cCP+grvxlY2vysBi5o9n0+cA7wauAo4JwkhwyxZ0nSFIYWFlX1LeCxKTZ9BvgQ0H+dkZXA5dVzA3BwkkXAm4DrquqxqnocuI4pAkiSNFwjnbNIshLYUlXf223TYcBDfeubm9qg+lSvvTrJ+iTrt23bNoNdS5JGFhZJDgQ+AvzRMF6/qtZU1bKqWjYxMeWZX5KkfTTKI4sXA0cC30vyAHA4cEuSXwC2AIv79j28qQ2qS5JGaGRhUVV3VNULq2pJVS2hN6T0qqp6GFgLvKc5K+poYEdVbQW+Ahyb5JBmYvvYpiZJGqFhnjp7FfAd4KVJNic5ZQ+7rwPuAzYB/xX4HYCqegz4OHBT8/PHTU2SNEKZizc/WrZsWfkNbgGcdOppbN2+Y1J90cKDuOKiCzvoSBpfSW6uqmVTbZuTl/uQdtm6fQcTK86cXF93fgfdSLOXl/uQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrbxEueaEQfetuPuee5lY0UFD0hxjWGhOGHTfits3nN5BN9Lc4zCUJKmVYSFJauUwlOaljXdtYPnx75xym/fnliYzLDQvPVn7TTnHAd6fW5qKw1CSpFZDC4sklyR5NMmdfbVPJbk7ye1Jvpjk4L5tZyfZlOT7Sd7UVz+uqW1Kctaw+pUkDTbMYahLgf8CXN5Xuw44u6p2JvkkcDbw4SQvB04E/iXwIuBvk/xi85w/B34T2AzclGRtVd01xL4HnrPvWLak+WpoYVFV30qyZLfaV/tWbwDe1iyvBK6uqn8E7k+yCTiq2bapqu4DSHJ1s+9Qw2LQOfuOZUuar7qcs3gf8DfN8mHAQ33bNje1QfVJkqxOsj7J+m3btg2hXUmavzoJiyQfBXYCV87Ua1bVmqpaVlXLJiYmZuplJUl0cOpskvcCbwGWV1U15S3A4r7dDm9q7KEuSRqRkR5ZJDkO+BDw1qp6om/TWuDEJAckORJYCnwXuAlYmuTIJPvTmwRfO8qeJUlDPLJIchVwDLAwyWbgHHpnPx0AXJcE4IaqOq2qNiS5ht7E9U7gjKr6WfM67we+AjwLuKSqNgyrZ0nS1IZ5NtRU11K4eA/7fwL4xBT1dcC6GWxNkrSX/Aa3JKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKnVyG9+JI27jXdtYPnxky+avGjhQVxx0YUddCR1z7CQdvNk7cfEijMn1beuO7+DbqTx4DCUJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWo1tLBIckmSR5Pc2Vd7fpLrktzbPB7S1JPk/CSbktye5FV9z1nV7H9vklXD6leSNNgwjywuBY7brXYWcH1VLQWub9YB3gws
bX5WAxdAL1yAc4BXA0cB5+wKGEnS6AwtLKrqW8Bju5VXApc1y5cBx/fVL6+eG4CDkywC3gRcV1WPVdXjwHVMDiBJ0pCN+hvch1bV1mb5YeDQZvkw4KG+/TY3tUH1SZKspndUwhFHHDGDLWucnHTqaWzdvmNS/e577mViRQcNSfNEZ5f7qKpKUjP4emuANQDLli2bsdfVeNm6fceUl+K4fcPpHXQjzR+jPhvqkWZ4iebx0aa+BVjct9/hTW1QXZI0QqMOi7XArjOaVgHX9tXf05wVdTSwoxmu+gpwbJJDmontY5uaJGmEhjYMleQq4BhgYZLN9M5qOhe4JskpwIPACc3u64AVwCbgCeBkgKp6LMnHgZua/f64qnafNJckDdnQwqKqJt8QoGf5FPsWcMaA17kEuGQGW5Mk7SW/wS1JamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqtaCLN03y+8CpQAF3ACcDi4CrgRcANwPvrqqfJjkAuBz4VeCHwDuq6oEu+tb8tvGuDSw//p2T6osWHsQVF13YQUfS6Iw8LJIcBpwJvLyq/m+Sa4ATgRXAZ6rq6iQXAqcAFzSPj1fVS5KcCHwSeMeo+5aerP2YWHHmpPrWded30I00Wl0NQy0AnpNkAXAgsBV4A/D5ZvtlwPHN8spmnWb78iQZXauSpGmFRZLXTac2HVW1Bfg08AN6IbGD3rDTj6pqZ7PbZuCwZvkw4KHmuTub/V8wRT+rk6xPsn7btm370pokaYDpHll8dpq1VkkOoXe0cCTwIuC5wHH78lr9qmpNVS2rqmUTExPP9OUkSX32OGeR5DXAa4GJJH/Qt+nngWft43u+Ebi/qrY17/EF4HXAwUkWNEcPhwNbmv23AIuBzc2w1UH0JrolSSPSdmSxP/A8eqHyc30/Pwbeto/v+QPg6CQHNnMPy4G7gK/3veYq4NpmeW2zTrP9a1VV+/jekqR9sMcji6r6JvDNJJdW1YMz8YZVdWOSzwO3ADuBW4E1wF8DVyf5k6Z2cfOUi4G/SLIJeIzemVOSpBGa7qmzByRZAyzpf05VvWFf3rSqzgHO2a18H3DUFPv+BHj7vryPJGlmTDcs/hK4ELgI+Nnw2pEkjaPphsXOqrpgqJ1IksbWdE+d/askv5NkUZLn7/oZameSpLEx3SOLXWcj/WFfrYB/MbPtSJLG0bTCoqqOHHYjkqTxNa2wSPKeqepVdfnMtiNJGkfTHYb6tb7lZ9P7It0t9C4dLkma46Y7DPW7/etJDqZ37wlJ0jywr5co/wd6FwKUJM0D052z+Ct6Zz9B7wKCvwRcM6ymJEnjZbpzFp/uW94JPFhVm4fQjyRpDE1rGKq5oODd9K44ewjw02E2JUkaL9O9U94JwHfpXdDvBODGJPt6iXJJ0iwz3WGojwK/VlWPAiSZAP6Wf7pntiRpDpvu2VD77QqKxg/34rmSpFluukcWX07yFeCqZv0dwLrhtCRJGjdt9+B+CXBoVf1hkt8CXt9s+g5w5bCbkySNh7Yjiz8Dzgaoqi8AXwBI8q+abf9uiL1JksZEW1gcWlV37F6sqjuSLBlOSxKcdOppbN2+Y1L97nvuZWJFBw1J81xbWBy8h23PmcE+pKfZun0HEyvOnFS/fcPpHXQjqe2MpvVJfnv3YpJTgZuH05Ikady0HVl8APhiknfxT+GwDNgf+Pf7+qbNVWsvAl5B75pT7wO+D3wOWAI8AJxQVY8nCXAesAJ4AnhvVd2yr+8tSdp7ezyyqKpHquq1wMfo/QF/APhYVb2mqh5+Bu97HvDlqnoZ8CvARuAs4PqqWgpc36wDvBlY2vysBi54Bu8rSdoH072fxdeBr8/EGyY5CPgN4L3Na/8U+GmSlcAxzW6XAd8APgysBC6vqgJuSHJwkkVVtXUm
+pEkteviW9hHAtuA/5bk1iQXJXkuvTOvdgXAw8ChzfJhwEN9z9/c1J4myeok65Os37Zt2xDbl6T5p4uwWAC8Crigql5J70ZKZ/Xv0BxF1BTPHaiq1lTVsqpaNjExMWPNSpK6CYvNwOaqurFZ/zy98HgkySKA5nHXtai2AIv7nn94U5MkjcjIw6KZGH8oyUub0nLgLmAtsKqprQKubZbXAu9Jz9HADucrJGm0pnshwZn2u8CVSfYH7gNOphdc1yQ5BXiQ3n0zoHfBwhXAJnqnzp48+nYlaX7rJCyq6jZ639fY3fIp9i3gjGH3JEkarKsjC2nO2HjXBpYf/85J9UULD+KKiy7soCNp5hkW0jP0ZO035XWstq47v4NupOHwbneSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlq1VlYJHlWkluTfKlZPzLJjUk2Jflckv2b+gHN+qZm+5Kuepak+arLI4vfAzb2rX8S+ExVvQR4HDilqZ8CPN7UP9PsJ0kaoU7CIsnhwL8FLmrWA7wB+Hyzy2XA8c3yymadZvvyZn9J0oh0dWTxZ8CHgKea9RcAP6qqnc36ZuCwZvkw4CGAZvuOZv+nSbI6yfok67dt2zbE1iVp/hl5WCR5C/BoVd08k69bVWuqallVLZuYmJjJl5akeW9BB+/5OuCtSVYAzwZ+HjgPODjJgubo4XBgS7P/FmAxsDnJAuAg4Iejb1uS5q+Rh0VVnQ2cDZDkGOA/V9W7kvwl8DbgamAVcG3zlLXN+nea7V+rqhpx2xqSk049ja3bd0yq333PvUys6KAhSVPq4shikA8DVyf5E+BW4OKmfjHwF0k2AY8BJ3bUn4Zg6/YdTKw4c1L99g2nd9CNpEE6DYuq+gbwjWb5PuCoKfb5CfD2kTYmSXoav8EtSWplWEiSWo3TnIU0p2y8awPLj3/npPqihQdxxUUXdtCRtO8MC2lInqz9ppy837ru/A66kZ4Zh6EkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MprQ2kkvCOeNLsZFhoJ74gnzW4OQ0mSWhkWkqRWhoUkqZVzFtKIeQc9zUYjD4ski4HLgUOBAtZU1XlJng98DlgCPACcUFWPJwlwHrACeAJ4b1XdMuq+pZniHfQ0G3UxDLUT+GBVvRw4GjgjycuBs4Drq2opcH2zDvBmYGnzsxq4YPQtS9L8NvKwqKqtu44Mqur/ABuBw4CVwGXNbpcBxzfLK4HLq+cG4OAki0bbtSTNb51OcCdZArwSuBE4tKq2NpsepjdMBb0geajvaZubmiRpRDoLiyTPA/4H8IGq+nH/tqoqevMZe/N6q5OsT7J+27ZtM9ipJKmTsEjyz+gFxZVV9YWm/Miu4aXm8dGmvgVY3Pf0w5va01TVmqpaVlXLJiYmhte8JM1DXZwNFeBiYGNV/WnfprXAKuDc5vHavvr7k1wNvBrY0TdcJc0ZnlKrcdbF9yxeB7wbuCPJbU3tI/RC4pokpwAPAic029bRO212E71TZ08eabfSiHhKrcbZyMOiqv4eyIDNy6fYv4AzhtqUJGmPvNyHJKmVYSFJauW1oaQx58S3xoFhIY05J741DhyGkiS1MiwkSa0chtoLjh1Lmq8Mi73g2LGk+cphKElSK8NCktTKYagZ4FyGuuDvnUbJsJgBg+Yyvvbp/+Q/Zg1Nl3NoJ516Glu375hU93d77jIshsgJcc1VW7fv8Hd7njEspDnG4SkNg2EhzTEOi2oYDAtpnnBYVM+EYSFpSoMmsQHuvudeJlZMrjsENncZFppRg/7ADPrjou4N+gN/9z338usf+OyUz7l9w+lT1h0Cm7sMiw7M5U9fg86SGfTHRd0b9Ad+Jv8/cwhs9jMsOuA/HKlnLn9wmmsM
C0md8YPT7GFYjJHZ9CnLuQlpfpk1YZHkOOA84FnARVV1bsctzbjZNDno3IS6MOhDygP/6x6WvPgXJ9XH8YPWbDUrwiLJs4A/B34T2AzclGRtVd3VbWej0WWIeAShLuztGVq3f+p0h7OGbFaEBXAUsKmq7gNIcjWwEpgXYTHI3obIoE9fe9o28B+nRxAaopk6Q2suDO2OS6+pqq57aJXkbcBxVXVqs/5u4NVV9f6+fVYDq5vVlwLfH3mjT7cQ2N5xD3vDfofLfodnNvUK493vP6+qiak2zJYji1ZVtQZY03UfuyRZX1XLuu5juux3uOx3eGZTrzD7+t1lttwpbwuwuG/98KYmSRqB2RIWNwFLkxyZZH/gRGBtxz1J0rwxK4ahqmpnkvcDX6F36uwlVbWh47bajM2Q2DTZ73DZ7/DMpl5h9vULzJIJbklSt2bLMJQkqUOGhSSplWExBEl+P8mGJHcmuSrJs7vuaZAkv9f0uSHJB7ruZypJLknyaJI7+2rPT3Jdknubx0O67HGXAb2+vfnf96kkY3XK5IB+P5Xk7iS3J/likoM7bPFpBvT78abX25J8NcmLuuyx31T99m37YJJKsrCL3vaWYTHDkhwGnAksq6pX0JuQP7HbrqaW5BXAb9P7hvyvAG9J8pJuu5rSpcBxu9XOAq6vqqXA9c36OLiUyb3eCfwW8K2Rd9PuUib3ex3wiqr6ZeAe4OxRN7UHlzK5309V1S9X1b8GvgT80aib2oNLmdwvSRYDxwI/GHVD+8qwGI4FwHOSLAAOBP53x/0M8kvAjVX1RFXtBL5J74/aWKmqbwGP7VZeCVzWLF8GHD/KngaZqteq2lhVXV9RYEoD+v1q8/sAcAO97zWNhQH9/rhv9bnA2Jy1M+B3F+AzwIcYo17bGBYzrKq2AJ+m94lhK7Cjqr7abVcD3Qn8epIXJDkQWMHTv/w4zg6tqq3N8sPAoV02M4e9D/ibrptok+QTSR4C3sV4HVlMkmQlsKWqvtd1L3vDsJhhzdj5SuBI4EXAc5Oc1G1XU6uqjcAnga8CXwZuA37WZU/7onrnf8+aT2izRZKPAjuBK7vupU1VfbSqFtPr9f1t+3el+VD2EcY80KZiWMy8NwL3V9W2qnoS+ALw2o57GqiqLq6qX62q3wAepzdGPRs8kmQRQPP4aMf9zClJ3gu8BXhXza4vY10J/Ieum9iDF9P7IPm9JA/QG+K7JckvdNrVNBgWM+8HwNFJDkwSYDmwseOeBkrywubxCHrzFf+9246mbS2wqlleBVzbYS9zSnOjsQ8Bb62qJ7rup02SpX2rK4G7u+qlTVXdUVUvrKolVbWE3v15XlVVD3fcWiu/wT0EST4GvIPeIfytwKlV9Y/ddjW1JH8HvAB4EviDqrq+45YmSXIVcAy9Szs/ApwD/E/gGuAI4EHghKqaaiJxpAb0+hjwWWAC+BFwW1W9qaMWn2ZAv2cDBwA/bHa7oapO66TB3QzodwW92xI8Re934bRm7rBzU/VbVRf3bX+A3pmT43rJ8v/PsJAktXIYSpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiykMdBcdFIaW4aF9AwlWdLc/+HSJPckuTLJG5N8u7nfxlFJ/k1zv4Xbktya5OeSHJPk75KsBe7qe50rk2xM8vnmWkJS5/xSnvQMJVkCbAJeCWwAbgK+B5wCvBU4md59Tc6tqm8neR7wE+D1wF/Tu3fE/c3r3A+8vtnvEuCuqvr0iP+TpEk8spBmxv3NdX+eohcY1zcX4LsDWAJ8G/jTJGcCB/fdL+K7VXV/3+s8VFXfbpavoBcoUucMC2lm9F/766m+9aeABVV1LnAq8Bzg20le1mz/h91eZ/dDfQ/9NRYMC2kEkry4OfL4JL1hqpcN2PWIJK9plv8j8PcjaVBqYVhIo/GBJHcmuZ3eFX4H3X3u+8AZSTYChwAXjKpBaU+c4JbGRDPB/aWqekXXvUi788hCktTKIwtJUiuPLCRJ
rQwLSVIrw0KS1MqwkCS1MiwkSa3+H9MtYMrZEHyyAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "\n",
    "sns.histplot(price_logs, bins=50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "make                    0\n",
       "model                   0\n",
       "year                    0\n",
       "engine_fuel_type        3\n",
       "engine_hp              69\n",
       "engine_cylinders       30\n",
       "transmission_type       0\n",
       "driven_wheels           0\n",
       "number_of_doors         6\n",
       "market_category      3742\n",
       "vehicle_size            0\n",
       "vehicle_style           0\n",
       "highway_mpg             0\n",
       "city_mpg                0\n",
       "popularity              0\n",
       "msrp                    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.4 Setting up the validation framework"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's draw it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = len(df)\n",
    "\n",
    "n_val = int(n * 0.2)\n",
    "n_test = int(n * 0.2)\n",
    "n_train = n - n_val - n_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11914"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2382, 2382, 7150)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n_val, n_test, n_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>model</th>\n",
       "      <th>year</th>\n",
       "      <th>engine_fuel_type</th>\n",
       "      <th>engine_hp</th>\n",
       "      <th>engine_cylinders</th>\n",
       "      <th>transmission_type</th>\n",
       "      <th>driven_wheels</th>\n",
       "      <th>number_of_doors</th>\n",
       "      <th>market_category</th>\n",
       "      <th>vehicle_size</th>\n",
       "      <th>vehicle_style</th>\n",
       "      <th>highway_mpg</th>\n",
       "      <th>city_mpg</th>\n",
       "      <th>popularity</th>\n",
       "      <th>msrp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2013</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>300.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury,high-performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>28</td>\n",
       "      <td>20</td>\n",
       "      <td>3916</td>\n",
       "      <td>39600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series_m</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>335.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>factory_tuner,luxury,high-performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>26</td>\n",
       "      <td>19</td>\n",
       "      <td>3916</td>\n",
       "      <td>46135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2011</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>230.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury,performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>28</td>\n",
       "      <td>18</td>\n",
       "      <td>3916</td>\n",
       "      <td>29450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>bmw</td>\n",
       "      <td>1_series</td>\n",
       "      <td>2012</td>\n",
       "      <td>premium_unleaded_(required)</td>\n",
       "      <td>230.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>manual</td>\n",
       "      <td>rear_wheel_drive</td>\n",
       "      <td>2.0</td>\n",
       "      <td>luxury,performance</td>\n",
       "      <td>compact</td>\n",
       "      <td>coupe</td>\n",
       "      <td>28</td>\n",
       "      <td>18</td>\n",
       "      <td>3916</td>\n",
       "      <td>31200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   make       model  year             engine_fuel_type  engine_hp  \\\n",
       "10  bmw    1_series  2013  premium_unleaded_(required)      300.0   \n",
       "0   bmw  1_series_m  2011  premium_unleaded_(required)      335.0   \n",
       "3   bmw    1_series  2011  premium_unleaded_(required)      230.0   \n",
       "5   bmw    1_series  2012  premium_unleaded_(required)      230.0   \n",
       "\n",
       "    engine_cylinders transmission_type     driven_wheels  number_of_doors  \\\n",
       "10               6.0            manual  rear_wheel_drive              2.0   \n",
       "0                6.0            manual  rear_wheel_drive              2.0   \n",

Download .txt

gitextract_3sp31muz/

├── .github/
│   └── FUNDING.yml
├── .gitignore
├── 01-intro/
│   ├── 01-what-is-ml.md
│   ├── 02-ml-vs-rules.md
│   ├── 03-supervised-ml.md
│   ├── 04-crisp-dm.md
│   ├── 05-model-selection.md
│   ├── 06-environment.md
│   ├── 07-numpy.md
│   ├── 08-linear-algebra.md
│   ├── 09-pandas.md
│   ├── 10-summary.md
│   ├── README.md
│   ├── homework.md
│   └── notebooks/
│       ├── 07-numpy.ipynb
│       ├── 08-linear-algebra.ipynb
│       └── 09-pandas.ipynb
├── 02-regression/
│   ├── 01-car-price-intro.md
│   ├── 02-data-preparation.md
│   ├── 03-eda.md
│   ├── 04-validation-framework.md
│   ├── 05-linear-regression-simple.md
│   ├── 06-linear-regression-vector.md
│   ├── 07-linear-regression-training.md
│   ├── 08-baseline-model.md
│   ├── 09-rmse.md
│   ├── 10-car-price-validation.md
│   ├── 11-feature-engineering.md
│   ├── 12-categorical-variables.md
│   ├── 13-regularization.md
│   ├── 14-tuning-model.md
│   ├── 15-using-model.md
│   ├── 16-summary.md
│   ├── 17-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.json
│   └── notebook.ipynb
├── 03-classification/
│   ├── 01-churn-project.md
│   ├── 02-data-preparation.md
│   ├── 03-validation.md
│   ├── 04-eda.md
│   ├── 05-risk.md
│   ├── 06-mutual-info.md
│   ├── 07-correlation.md
│   ├── 08-ohe.md
│   ├── 09-logistic-regression.md
│   ├── 10-training-log-reg.md
│   ├── 11-log-reg-interpretation.md
│   ├── 12-using-log-reg.md
│   ├── 13-summary.md
│   ├── 14-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   ├── notebook-scaling-ohe.ipynb
│   └── notebook.ipynb
├── 04-evaluation/
│   ├── 01-overview.md
│   ├── 02-accuracy.md
│   ├── 03-confusion-table.md
│   ├── 04-precision-recall.md
│   ├── 05-roc.md
│   ├── 06-auc.md
│   ├── 07-cross-validation.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── notebook.ipynb
├── 05-deployment/
│   ├── 01-intro.md
│   ├── 02-pickle.md
│   ├── 03-flask-intro.md
│   ├── 04-flask-deployment.md
│   ├── 05-pipenv.md
│   ├── 06-docker.md
│   ├── 07-aws-eb.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── 05-train-churn-model.ipynb
│   │   ├── Dockerfile
│   │   ├── Pipfile
│   │   ├── ping.py
│   │   ├── plan.md
│   │   ├── predict-test.py
│   │   ├── predict.py
│   │   └── train.py
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── workshop/
│       ├── .dockerignore
│       ├── .python-version
│       ├── Dockerfile
│       ├── README.md
│       ├── fly.toml
│       ├── ping.py
│       ├── predict.py
│       ├── predict_old.py
│       ├── pyproject.toml
│       ├── starter.ipynb
│       ├── test.py
│       ├── train.py
│       └── workshop-uv-fastapi.ipynb
├── 06-trees/
│   ├── 01-credit-risk.md
│   ├── 02-data-prep.md
│   ├── 03-decision-trees.md
│   ├── 04-decision-tree-learning.md
│   ├── 05-decision-tree-tuning.md
│   ├── 06-random-forest.md
│   ├── 07-boosting.md
│   ├── 08-xgb-tuning.md
│   ├── 09-final-model.md
│   ├── 10-summary.md
│   ├── 11-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── notebook.ipynb
├── 08-deep-learning/
│   ├── 01-fashion-classification.md
│   ├── 02-tensorflow-keras.md
│   ├── 03-pretrained-models.md
│   ├── 04-conv-neural-nets.md
│   ├── 05-transfer-learning.md
│   ├── 06-learning-rate.md
│   ├── 07-checkpointing.md
│   ├── 08-more-layers.md
│   ├── 09-dropout.md
│   ├── 10-augmentation.md
│   ├── 11-large-model.md
│   ├── 12-using-model.md
│   ├── 13-summary.md
│   ├── 14-explore-more.md
│   ├── README.md
│   ├── homework.md
│   ├── install.md
│   ├── meta.csv
│   ├── meta.json
│   ├── notebook.ipynb
│   └── pytorch/
│       ├── README.md
│       └── install_pytorch.md
├── 09-serverless/
│   ├── 01-intro.md
│   ├── 02-aws-lambda.md
│   ├── 03-tensorflow-lite.md
│   ├── 04-preparing-code.md
│   ├── 05-docker-image.md
│   ├── 06-creating-lambda.md
│   ├── 07-api-gateway.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── Dockerfile
│   │   ├── convert-model.py
│   │   ├── lambda_function.py
│   │   ├── plan.md
│   │   ├── tensorflow-model.ipynb
│   │   └── test.py
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   ├── updates.md
│   └── workshop/
│       ├── README.md
│       ├── lambda-keras/
│       │   ├── .gitignore
│       │   ├── Dockerfile
│       │   ├── convert/
│       │   │   ├── .dockerignore
│       │   │   ├── Dockerfile
│       │   │   ├── README.md
│       │   │   └── convert-saved-model.py
│       │   ├── lambda_function.py
│       │   ├── test.ipynb
│       │   └── test.py
│       ├── lambda-onnx/
│       │   ├── .gitignore
│       │   ├── Dockerfile
│       │   ├── lambda_function.py
│       │   ├── test.ipynb
│       │   └── test.py
│       ├── lambda-sklearn/
│       │   ├── .dockerignore
│       │   ├── .python-version
│       │   ├── Dockerfile
│       │   ├── customer.json
│       │   ├── deploy.sh
│       │   ├── invoke.py
│       │   ├── lambda_function.py
│       │   ├── pyproject.toml
│       │   └── test.py
│       └── train/
│           ├── .python-version
│           ├── README.md
│           ├── pyproject.toml
│           └── train.py
├── 10-kubernetes/
│   ├── 01-overview.md
│   ├── 02-tensorflow-serving.md
│   ├── 03-preprocessing.md
│   ├── 04-docker-compose.md
│   ├── 05-kubernetes-intro.md
│   ├── 06-kubernetes-simple-service.md
│   ├── 07-kubernetes-tf-serving.md
│   ├── 08-eks.md
│   ├── 09-summary.md
│   ├── 10-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── Pipfile
│   │   ├── README.md
│   │   ├── docker-compose.yaml
│   │   ├── gateway.py
│   │   ├── image-gateway.dockerfile
│   │   ├── image-model.dockerfile
│   │   ├── kube-config/
│   │   │   ├── eks-config.yaml
│   │   │   ├── gateway-deployment.yaml
│   │   │   ├── gateway-service.yaml
│   │   │   ├── model-deployment.yaml
│   │   │   └── model-service.yaml
│   │   ├── ping/
│   │   │   ├── Dockerfile
│   │   │   ├── Pipfile
│   │   │   ├── deployment.yaml
│   │   │   ├── metallb-config.yaml
│   │   │   ├── ping.py
│   │   │   └── service.yaml
│   │   ├── plan.md
│   │   ├── proto.py
│   │   ├── test.py
│   │   └── tf-serving-connect.ipynb
│   ├── homework.md
│   ├── meta.csv
│   ├── meta.json
│   └── workshop/
│       ├── README.md
│       ├── k8s/
│       │   ├── deployment.yaml
│       │   ├── hpa.yaml
│       │   └── service.yaml
│       ├── load_test.py
│       └── service/
│           ├── .gitignore
│           ├── .python-version
│           ├── Dockerfile
│           ├── README.md
│           ├── app.py
│           ├── pyproject.toml
│           └── test.py
├── 11-kserve/
│   ├── 01-overview.md
│   ├── 02-kserve-local.md
│   ├── 03-kserve-sklearn.md
│   ├── 04-kserve-custom-image.md
│   ├── 05-tensorflow-kserve.md
│   ├── 06-kserve-transformers.md
│   ├── 07-kserve-eks-upd.md
│   ├── 07-kserve-eks.md
│   ├── 08-summary.md
│   ├── 09-explore-more.md
│   ├── README.md
│   ├── code/
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── churn/
│   │   │   ├── Pipfile
│   │   │   ├── churn-service.yaml
│   │   │   ├── churn-test.py
│   │   │   ├── churn-train.py
│   │   │   └── model.joblib
│   │   ├── clothes/
│   │   │   ├── clothes-service.yaml
│   │   │   ├── convert.py
│   │   │   ├── test-transformer.py
│   │   │   ├── test.ipynb
│   │   │   └── test.py
│   │   ├── eks/
│   │   │   ├── clothes-service.yaml
│   │   │   ├── cluster.yaml
│   │   │   └── test-transformer.py
│   │   ├── image_transfomer/
│   │   │   ├── Dockerfile
│   │   │   ├── Pipfile
│   │   │   └── image_transformer.py
│   │   ├── iris/
│   │   │   ├── iris-example.yaml
│   │   │   ├── iris-request.json
│   │   │   └── iris-test.py
│   │   └── plan.md
│   ├── meta.csv
│   └── meta.json
├── README.md
├── after-sign-up.md
├── article/
│   └── README.md
├── asking-questions.md
├── bento.md
├── certificates.md
├── cohorts/
│   ├── 2021/
│   │   ├── 01-intro/
│   │   │   ├── homework-1.ipynb
│   │   │   └── homework.md
│   │   ├── 02-regression/
│   │   │   ├── homework.ipynb
│   │   │   └── homework.md
│   │   ├── 03-classification/
│   │   │   ├── homework.ipynb
│   │   │   └── homework.md
│   │   ├── 04-evaluation/
│   │   │   ├── homework-4-solution.ipynb
│   │   │   ├── homework-4-starter.ipynb
│   │   │   └── homework.md
│   │   ├── 05-deployment/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── Pipfile
│   │   │   │   ├── homework.md
│   │   │   │   ├── q3_test.py
│   │   │   │   ├── q4_predict.py
│   │   │   │   ├── q4_test.py
│   │   │   │   ├── q6_predict.py
│   │   │   │   └── q6_test.py
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   ├── homework-6-solution.ipynb
│   │   │   ├── homework-6-starter.ipynb
│   │   │   └── homework.md
│   │   ├── 07-midterm-project/
│   │   │   ├── README.md
│   │   │   ├── week10-office-hours.ipynb
│   │   │   ├── week8-office-hours.ipynb
│   │   │   └── week9-office-hours.ipynb
│   │   ├── 08-deep-learning/
│   │   │   ├── CNN_solution.ipynb
│   │   │   ├── homework.md
│   │   │   └── week-11-office-hours.ipynb
│   │   ├── 09-serverless/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── homework.ipynb
│   │   │   │   ├── homework.py
│   │   │   │   └── test.py
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   ├── homework/
│   │   │   │   ├── deployment.yaml
│   │   │   │   └── service.yaml
│   │   │   └── homework.md
│   │   ├── 12-capstone/
│   │   │   └── README.md
│   │   ├── 13-article/
│   │   │   └── README.md
│   │   ├── 14-project/
│   │   │   └── README.md
│   │   ├── leaderboard.md
│   │   └── office-hours.md
│   ├── 2022/
│   │   ├── 01-intro/
│   │   │   ├── homework.md
│   │   │   └── homework_1.ipynb
│   │   ├── 02-regression/
│   │   │   ├── homework.md
│   │   │   └── homework_2.ipynb
│   │   ├── 03-classification/
│   │   │   ├── homework.md
│   │   │   └── homework_3.ipynb
│   │   ├── 04-evaluation/
│   │   │   ├── homework.md
│   │   │   └── homework_4.ipynb
│   │   ├── 05-deployment/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── Pipfile
│   │   │   │   ├── q3_test.py
│   │   │   │   ├── q4_predict.py
│   │   │   │   ├── q4_test.py
│   │   │   │   ├── q6_predict.py
│   │   │   │   └── q6_test.py
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   ├── homework.md
│   │   │   ├── homework_6.ipynb
│   │   │   └── homework_6_starter.ipynb
│   │   ├── 07-bento-production/
│   │   │   ├── homework.md
│   │   │   └── locustfile.py
│   │   ├── 08-deep-learning/
│   │   │   ├── homework.md
│   │   │   └── homework_8.ipynb
│   │   ├── 09-serverless/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── homework.ipynb
│   │   │   │   ├── homework.py
│   │   │   │   └── test.py
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   ├── homework/
│   │   │   │   ├── deployment.yaml
│   │   │   │   ├── hpa.yaml
│   │   │   │   ├── service.yaml
│   │   │   │   └── test.py
│   │   │   └── homework.md
│   │   ├── README.md
│   │   ├── article.md
│   │   ├── leaderboard.md
│   │   └── projects.md
│   ├── 2023/
│   │   ├── 01-intro/
│   │   │   ├── homework.md
│   │   │   └── homework_1.ipynb
│   │   ├── 02-regression/
│   │   │   └── homework.md
│   │   ├── 03-classification/
│   │   │   ├── homework.md
│   │   │   └── homework_3.ipynb
│   │   ├── 04-evaluation/
│   │   │   └── homework.md
│   │   ├── 05-deployment/
│   │   │   ├── homework/
│   │   │   │   ├── Dockerfile
│   │   │   │   ├── Pipfile
│   │   │   │   ├── q3_test.py
│   │   │   │   ├── q4_predict.py
│   │   │   │   ├── q4_test.py
│   │   │   │   ├── q6_predict.py
│   │   │   │   └── q6_test.py
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   └── homework.md
│   │   ├── 08-deep-learning/
│   │   │   ├── homework.ipynb
│   │   │   └── homework.md
│   │   ├── 09-serverless/
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   └── homework.md
│   │   ├── README.md
│   │   ├── article.md
│   │   ├── leaderboard.md
│   │   └── projects.md
│   ├── 2024/
│   │   ├── 01-intro/
│   │   │   └── homework.md
│   │   ├── 02-regression/
│   │   │   └── homework.md
│   │   ├── 03-classification/
│   │   │   └── homework.md
│   │   ├── 04-evaluation/
│   │   │   └── homework.md
│   │   ├── 05-deployment/
│   │   │   └── homework.md
│   │   ├── 06-trees/
│   │   │   └── homework.md
│   │   ├── 08-deep-learning/
│   │   │   └── homework.md
│   │   ├── 09-serverless/
│   │   │   └── homework.md
│   │   ├── 10-kubernetes/
│   │   │   └── homework.md
│   │   ├── README.md
│   │   ├── article.md
│   │   └── projects.md
│   └── 2025/
│       ├── 01-intro/
│       │   ├── homework.md
│       │   └── homework_1.ipynb
│       ├── 02-regression/
│       │   ├── homework.md
│       │   └── homework_2.ipynb
│       ├── 03-classification/
│       │   ├── homework.md
│       │   └── homework_3.ipynb
│       ├── 04-evaluation/
│       │   ├── homework.md
│       │   └── homework_4.ipynb
│       ├── 05-deployment/
│       │   ├── homework/
│       │   │   ├── .python-version
│       │   │   ├── Dockerfile_base
│       │   │   ├── Dockerfile_full
│       │   │   ├── Dockerfile_hw
│       │   │   ├── README.md
│       │   │   ├── main.py
│       │   │   ├── pyproject.toml
│       │   │   ├── q3_test.py
│       │   │   ├── q4_predict.py
│       │   │   ├── q4_test.py
│       │   │   ├── q6_predict.py
│       │   │   └── q6_test.py
│       │   └── homework.md
│       ├── 06-trees/
│       │   ├── homework.ipynb
│       │   └── homework.md
│       ├── 08-deep-learning/
│       │   └── homework.md
│       ├── 09-serverless/
│       │   └── homework.md
│       ├── 10-kubernetes/
│       │   └── homework.md
│       ├── README.md
│       ├── article.md
│       └── projects.md
├── generate-description.ipynb
├── generate-pages.ipynb
├── learning-in-public.md
└── projects/
    ├── README.md
    ├── how-to.md
    └── project-tips.md

Download .txt

SYMBOL INDEX (76 symbols across 32 files)

FILE: 05-deployment/code/ping.py
  function ping (line 6) | def ping():

FILE: 05-deployment/code/predict.py
  function predict (line 16) | def predict():

FILE: 05-deployment/code/train.py
  function train (line 66) | def train(df_train, y_train, C=1.0):
  function predict (line 78) | def predict(df, dv, model):

FILE: 05-deployment/workshop/ping.py
  function ping (line 7) | def ping():

FILE: 05-deployment/workshop/predict.py
  class Customer (line 11) | class Customer(BaseModel):
  class PredictResponse (line 38) | class PredictResponse(BaseModel):
  function predict_single (line 49) | def predict_single(customer):
  function predict (line 55) | def predict(customer: Customer) -> PredictResponse:

FILE: 05-deployment/workshop/train.py
  function load_data (line 20) | def load_data():
  function train_model (line 41) | def train_model(df):
  function save_model (line 77) | def save_model(pipeline, output_file):

FILE: 09-serverless/code/lambda_function.py
  function predict (line 33) | def predict(url):
  function lambda_handler (line 45) | def lambda_handler(event, context):

FILE: 09-serverless/workshop/lambda-keras/lambda_function.py
  function preprocess_pytorch (line 6) | def preprocess_pytorch(X):
  function predict (line 46) | def predict(url):
  function lambda_handler (line 53) | def lambda_handler(event, context):

FILE: 09-serverless/workshop/lambda-onnx/lambda_function.py
  function preprocess_pytorch_style (line 11) | def preprocess_pytorch_style(X):
  function predict (line 55) | def predict(url):
  function lambda_handler (line 62) | def lambda_handler(event, context):

FILE: 09-serverless/workshop/lambda-sklearn/lambda_function.py
  function predict_single (line 6) | def predict_single(customer):
  function lambda_handler (line 10) | def lambda_handler(event, context):

FILE: 09-serverless/workshop/train/train.py
  function load_data (line 15) | def load_data():
  function train_model (line 34) | def train_model(df):
  function save_model (line 70) | def save_model(pipeline, output_file):

FILE: 10-kubernetes/code/gateway.py
  function prepare_request (line 28) | def prepare_request(X):
  function prepare_response (line 51) | def prepare_response(pb_response):
  function predict (line 56) | def predict(url):
  function predict_endpoint (line 68) | def predict_endpoint():

FILE: 10-kubernetes/code/ping/ping.py
  function ping (line 6) | def ping():

FILE: 10-kubernetes/code/proto.py
  function dtypes_as_dtype (line 4) | def dtypes_as_dtype(dtype):
  function make_tensor_proto (line 10) | def make_tensor_proto(data):
  function np_to_protobuf (line 23) | def np_to_protobuf(data):

FILE: 10-kubernetes/workshop/load_test.py
  function send_request (line 13) | def send_request(_):

FILE: 10-kubernetes/workshop/service/app.py
  function preprocess_pytorch_style (line 10) | def preprocess_pytorch_style(X):
  class PredictRequest (line 46) | class PredictRequest(BaseModel):
  class PredictResponse (line 50) | class PredictResponse(BaseModel):
  function predict (line 56) | def predict(url: str):
  function root (line 69) | def root():
  function health (line 74) | def health():
  function predict_endpoint (line 79) | def predict_endpoint(request: PredictRequest):

FILE: 11-kserve/code/image_transfomer/image_transformer.py
  class ImageTransformer (line 7) | class ImageTransformer(kserve.KFModel):
    method __init__ (line 8) | def __init__(self, name: str, predictor_host: str):
    method prepare_input (line 25) | def prepare_input(self, url: str) -> List:
    method preprocess (line 29) | def preprocess(self, request: Dict) -> Dict:
    method postprocess (line 38) | def postprocess(self, response: Dict) -> Dict:

FILE: cohorts/2021/05-deployment/homework/q3_test.py
  function load (line 3) | def load(filename):

FILE: cohorts/2021/05-deployment/homework/q4_predict.py
  function load (line 7) | def load(filename):
  function predict (line 18) | def predict():

FILE: cohorts/2021/05-deployment/homework/q6_predict.py
  function load (line 7) | def load(filename):
  function predict (line 18) | def predict():

FILE: cohorts/2021/09-serverless/homework/homework.py
  function download_image (line 19) | def download_image(url):
  function prepare_image (line 27) | def prepare_image(img, target_size):
  function prepare_input (line 34) | def prepare_input(x):
  function predict (line 47) | def predict(url):
  function lambda_handler (line 63) | def lambda_handler(event, context):

FILE: cohorts/2022/05-deployment/homework/q3_test.py
  function load (line 4) | def load(filename: str):

FILE: cohorts/2022/05-deployment/homework/q4_predict.py
  function load (line 8) | def load(filename: str):
  function predict (line 20) | def predict():

FILE: cohorts/2022/05-deployment/homework/q6_predict.py
  function load (line 8) | def load(filename: str):
  function predict (line 20) | def predict():

FILE: cohorts/2022/07-bento-production/locustfile.py
  class MLZoomUser (line 7) | class MLZoomUser(HttpUser):
    method classify (line 19) | def classify(self):

FILE: cohorts/2022/09-serverless/homework/homework.py
  function download_image (line 19) | def download_image(url):
  function prepare_image (line 27) | def prepare_image(img, target_size):
  function prepare_input (line 34) | def prepare_input(x):
  function predict (line 47) | def predict(url):
  function lambda_handler (line 63) | def lambda_handler(event, context):

FILE: cohorts/2023/05-deployment/homework/q3_test.py
  function load (line 4) | def load(filename: str):

FILE: cohorts/2023/05-deployment/homework/q4_predict.py
  function load (line 8) | def load(filename: str):
  function predict (line 20) | def predict():

FILE: cohorts/2023/05-deployment/homework/q6_predict.py
  function load (line 8) | def load(filename: str):
  function predict (line 20) | def predict():

FILE: cohorts/2025/05-deployment/homework/main.py
  function main (line 1) | def main():

FILE: cohorts/2025/05-deployment/homework/q4_predict.py
  function predict (line 13) | def predict(lead: dict) -> dict:

FILE: cohorts/2025/05-deployment/homework/q6_predict.py
  function predict (line 13) | def predict(lead: dict) -> dict:

Download .json

Condensed preview — 428 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (7,159K chars).

[
  {
    "path": ".github/FUNDING.yml",
    "chars": 23,
    "preview": "github: alexeygrigorev\n"
  },
  {
    "path": ".gitignore",
    "chars": 408,
    "preview": "# generated\n.ipynb_checkpoints/\n__pycache__/\n**my_dir/\n**logs/\n**models/\n\n# file types\n*.h5\n*.tflite\n*.keras\n*.zip\n*.pdf"
  },
  {
    "path": "01-intro/01-what-is-ml.md",
    "chars": 1482,
    "preview": "## 1.1 Introduction to Machine Learning\n\n<a href=\"https://www.youtube.com/watch?v=Crm_5n4mvmg&list=PL3MmuxUbc_hIhxl5Ji8t"
  },
  {
    "path": "01-intro/02-ml-vs-rules.md",
    "chars": 2090,
    "preview": "## 1.2 ML vs Rule-Based Systems\n\n<a href=\"https://www.youtube.com/watch?v=CeukwyUdaz8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
  },
  {
    "path": "01-intro/03-supervised-ml.md",
    "chars": 2121,
    "preview": "## 1.3 Supervised Machine Learning\n\n<a href=\"https://www.youtube.com/watch?v=j9kcEuGcC2Y&list=PL3MmuxUbc_hIhxl5Ji8t4O6lP"
  },
  {
    "path": "01-intro/04-crisp-dm.md",
    "chars": 2232,
    "preview": "## 1.4 CRISP-DM\n\n<a href=\"https://www.youtube.com/watch?v=dCa3JvmJbr0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=5\"><"
  },
  {
    "path": "01-intro/05-model-selection.md",
    "chars": 2099,
    "preview": "## 1.5 Model Selection Process\n\n<a href=\"https://www.youtube.com/watch?v=OH_R0Sl9neM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
  },
  {
    "path": "01-intro/06-environment.md",
    "chars": 5442,
    "preview": "##  Setting up the Environment\n\nIn this section, we'll prepare the environment\n\n\nYou need:\n\n* Python 3.11 (note that vid"
  },
  {
    "path": "01-intro/07-numpy.md",
    "chars": 5568,
    "preview": "## 1.7 Introduction to NumPy\n\n<a href=\"https://www.youtube.com/watch?v=Qa0-jYtRdbY&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaC"
  },
  {
    "path": "01-intro/08-linear-algebra.md",
    "chars": 3004,
    "preview": "## 1.8 Linear Algebra Refresher\n\n<a href=\"https://www.youtube.com/watch?v=zZyKUeOR4Gg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
  },
  {
    "path": "01-intro/09-pandas.md",
    "chars": 1168,
    "preview": "## 1.9 Introduction to Pandas\n\n<a href=\"https://www.youtube.com/watch?v=0j3XK5PsnxA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHa"
  },
  {
    "path": "01-intro/10-summary.md",
    "chars": 3044,
    "preview": "## 1.10 Summary\n\n<a href=\"https://www.youtube.com/watch?v=VRrEEVeJ440&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=10\">"
  },
  {
    "path": "01-intro/README.md",
    "chars": 2155,
    "preview": "## 1. Introduction to Machine Learning\n\n- 1.1 [Introduction to Machine Learning](01-what-is-ml.md)\n- 1.2 [ML vs Rule-Bas"
  },
  {
    "path": "01-intro/homework.md",
    "chars": 661,
    "preview": "## Homework\r\n\r\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/01-intro/homework.md)\r\n* For 2"
  },
  {
    "path": "01-intro/notebooks/07-numpy.ipynb",
    "chars": 13219,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"502da6a2\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Machine Lear"
  },
  {
    "path": "01-intro/notebooks/08-linear-algebra.ipynb",
    "chars": 10618,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"3aace4b5\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Machine Lear"
  },
  {
    "path": "01-intro/notebooks/09-pandas.ipynb",
    "chars": 56323,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"3473239e\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Machine Lear"
  },
  {
    "path": "02-regression/01-car-price-intro.md",
    "chars": 1361,
    "preview": "\n## 2.1 Car price prediction project\n\n<a href=\"https://www.youtube.com/watch?v=vM3SqPNlStE&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
  },
  {
    "path": "02-regression/02-data-preparation.md",
    "chars": 1309,
    "preview": "\n## 2.2 Data preparation\n\n<a href=\"https://www.youtube.com/watch?v=Kd74oR4QWGM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&i"
  },
  {
    "path": "02-regression/03-eda.md",
    "chars": 1718,
    "preview": "\n## 2.3 Exploratory data analysis\n\n<a href=\"https://www.youtube.com/watch?v=k6k8sQ0GhPM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPA"
  },
  {
    "path": "02-regression/04-validation-framework.md",
    "chars": 1815,
    "preview": "\n## 2.4 Setting up the validation framework\n\n<a href=\"https://www.youtube.com/watch?v=ck0IfiPaQi0&list=PL3MmuxUbc_hIhxl5"
  },
  {
    "path": "02-regression/05-linear-regression-simple.md",
    "chars": 2427,
    "preview": "\n## 2.5 Linear regression\n\n<a href=\"https://www.youtube.com/watch?v=Dn1eTQLsOdA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&"
  },
  {
    "path": "02-regression/06-linear-regression-vector.md",
    "chars": 1464,
    "preview": "\n## 2.6 Linear regression: vector form\n\n<a href=\"https://www.youtube.com/watch?v=YkyevnYyAww&list=PL3MmuxUbc_hIhxl5Ji8t4"
  },
  {
    "path": "02-regression/07-linear-regression-training.md",
    "chars": 1617,
    "preview": "\n## 2.7 Training linear regression: Normal equation\n\n<a href=\"https://www.youtube.com/watch?v=hx6nak-Y11g&list=PL3MmuxUb"
  },
  {
    "path": "02-regression/08-baseline-model.md",
    "chars": 1858,
    "preview": "\n## 2.8 Baseline model for car price prediction project\n\n<a href=\"https://www.youtube.com/watch?v=SvPpMMYtYbU&list=PL3Mm"
  },
  {
    "path": "02-regression/09-rmse.md",
    "chars": 1689,
    "preview": "\n## 2.9 Root Mean Squared Error (RMSE)\n\n<a href=\"https://www.youtube.com/watch?v=0LWoFtbzNUM&list=PL3MmuxUbc_hIhxl5Ji8t4"
  },
  {
    "path": "02-regression/10-car-price-validation.md",
    "chars": 1143,
    "preview": "\n## 2.10 Computing RMSE on validation data\n\n<a href=\"https://www.youtube.com/watch?v=rawGPXg2ofE&list=PL3MmuxUbc_hIhxl5J"
  },
  {
    "path": "02-regression/11-feature-engineering.md",
    "chars": 1348,
    "preview": "## 2.11 Feature engineering\n\nFeature engineering is the process of creating new features\n\n<a href=\"https://www.youtube.c"
  },
  {
    "path": "02-regression/12-categorical-variables.md",
    "chars": 1643,
    "preview": "\n## 2.12 Categorical variables\n\n<a href=\"https://www.youtube.com/watch?v=sGLAToAAMa4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
  },
  {
    "path": "02-regression/13-regularization.md",
    "chars": 2665,
    "preview": "## 2.13 Regularization\n\n<a href=\"https://www.youtube.com/watch?v=91ve3EJlHBc&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&ind"
  },
  {
    "path": "02-regression/14-tuning-model.md",
    "chars": 1150,
    "preview": "\n## 2.14 Tuning the model\n\n<a href=\"https://www.youtube.com/watch?v=lW-YVxPgzQw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&"
  },
  {
    "path": "02-regression/15-using-model.md",
    "chars": 1207,
    "preview": "\n## 2.15 Using the model\n\n<a href=\"https://www.youtube.com/watch?v=KT--uIJozes&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&i"
  },
  {
    "path": "02-regression/16-summary.md",
    "chars": 1152,
    "preview": "\n## 2.16 Car price prediction project summary\n\n<a href=\"https://www.youtube.com/watch?v=_qI01YXbyro&list=PL3MmuxUbc_hIhx"
  },
  {
    "path": "02-regression/17-explore-more.md",
    "chars": 963,
    "preview": "\n## 2.17 Explore more\n\n### Questions\n\n* In this project, we included only 5 top features. What happens if we include 10?"
  },
  {
    "path": "02-regression/README.md",
    "chars": 3088,
    "preview": "## 2. Machine Learning for Regression\n\n- 2.1 [Car price prediction project](01-car-price-intro.md)\n- 2.2 [Data preparati"
  },
  {
    "path": "02-regression/homework.md",
    "chars": 681,
    "preview": "## Homework\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/02-regression/homework.md)\n* For "
  },
  {
    "path": "02-regression/meta.json",
    "chars": 91,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 2,\n    \"name\": \"Machine Learning for Regression\"\n}"
  },
  {
    "path": "02-regression/notebook.ipynb",
    "chars": 116429,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 2. Machine Learning for Regressi"
  },
  {
    "path": "03-classification/01-churn-project.md",
    "chars": 1612,
    "preview": "# 3.1 Churn prediction project\n\n<a href=\"https://www.youtube.com/watch?v=0Zw04wdeTQo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
  },
  {
    "path": "03-classification/02-data-preparation.md",
    "chars": 1979,
    "preview": "\n## 3.2 Data preparation\n\n<a href=\"https://www.youtube.com/watch?v=VSGGU9gYvdg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
  },
  {
    "path": "03-classification/03-validation.md",
    "chars": 1502,
    "preview": "\n## 3.3 Setting up the validation framework\n\n<a href=\"https://www.youtube.com/watch?v=_lwz34sOnSE&list=PL3MmuxUbc_hIhxl5"
  },
  {
    "path": "03-classification/04-eda.md",
    "chars": 1689,
    "preview": "\n## 3.4 EDA\n\n<a href=\"https://www.youtube.com/watch?v=BNF1wjBwTQA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\"ima"
  },
  {
    "path": "03-classification/05-risk.md",
    "chars": 1933,
    "preview": "\n## 3.5 Feature importance: Churn rate and risk ratio\n\n<a href=\"https://www.youtube.com/watch?v=fzdzPLlvs40&list=PL3Mmux"
  },
  {
    "path": "03-classification/06-mutual-info.md",
    "chars": 1760,
    "preview": "\n## 3.6 Feature importance: Mutual information\n\n<a href=\"https://www.youtube.com/watch?v=_u2YaGT6RN0&list=PL3MmuxUbc_hIh"
  },
  {
    "path": "03-classification/07-correlation.md",
    "chars": 1967,
    "preview": "\n## 3.7 Feature importance: Correlation\n\n<a href=\"https://www.youtube.com/watch?v=mz1707QVxiY&list=PL3MmuxUbc_hIhxl5Ji8t"
  },
  {
    "path": "03-classification/08-ohe.md",
    "chars": 1635,
    "preview": "\n## 3.8 One-hot encoding\n\n<a href=\"https://www.youtube.com/watch?v=L-mjQFN5aR0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
  },
  {
    "path": "03-classification/09-logistic-regression.md",
    "chars": 2123,
    "preview": "\n## 3.9 Logistic regression\n\n<a href=\"https://www.youtube.com/watch?v=7KFE2ltnBAg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
  },
  {
    "path": "03-classification/10-training-log-reg.md",
    "chars": 1747,
    "preview": "\n## 3.10 Training logistic regression with Scikit-Learn\n\n<a href=\"https://www.youtube.com/watch?v=hae_jXe2fN0&list=PL3Mm"
  },
  {
    "path": "03-classification/11-log-reg-interpretation.md",
    "chars": 1491,
    "preview": "\n## 3.11 Model interpretation\n\n<a href=\"https://www.youtube.com/watch?v=OUrlxnUAAEA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHa"
  },
  {
    "path": "03-classification/12-using-log-reg.md",
    "chars": 1172,
    "preview": "\n## 3.12 Using the model\n\n<a href=\"https://www.youtube.com/watch?v=Y-NGmnFpNuM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
  },
  {
    "path": "03-classification/13-summary.md",
    "chars": 999,
    "preview": "\n## 3.13 Summary\n\n<a href=\"https://www.youtube.com/watch?v=Zz6oRGsJkW4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src"
  },
  {
    "path": "03-classification/14-explore-more.md",
    "chars": 1058,
    "preview": "\n## 3.14 Explore more\n\nMore things\n\n* Try to exclude least useful features\n\nUse scikit-learn in project of last week\n\n* "
  },
  {
    "path": "03-classification/README.md",
    "chars": 2888,
    "preview": "## 3. Machine Learning for Classification\n\n- 3.1 [Churn prediction project](01-churn-project.md)\n- 3.2 [Data preparatio"
  },
  {
    "path": "03-classification/homework.md",
    "chars": 705,
    "preview": "## Homework\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/03-classification/homework.md)\n* "
  },
  {
    "path": "03-classification/meta.csv",
    "chars": 2893,
    "preview": "lesson,name,page_name,video,slides,notebook\r\n1,Churn prediction project,01-churn-project.md,https://www.youtube.com/watc"
  },
  {
    "path": "03-classification/meta.json",
    "chars": 95,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 3,\n    \"name\": \"Machine Learning for Classification\"\n}"
  },
  {
    "path": "03-classification/notebook-scaling-ohe.ipynb",
    "chars": 10770,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n "
  },
  {
    "path": "03-classification/notebook.ipynb",
    "chars": 115411,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3. Machine Learning for Classific"
  },
  {
    "path": "04-evaluation/01-overview.md",
    "chars": 1218,
    "preview": "\n## 4.1 Evaluation metrics: session overview\n\n<a href=\"https://www.youtube.com/watch?v=gmg5jw1bM8A&list=PL3MmuxUbc_hIhxl"
  },
  {
    "path": "04-evaluation/02-accuracy.md",
    "chars": 2230,
    "preview": "\n## 4.2 Accuracy and dummy model\n\n<a href=\"https://www.youtube.com/watch?v=FW_l7lB0HUI&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAO"
  },
  {
    "path": "04-evaluation/03-confusion-table.md",
    "chars": 2529,
    "preview": "## 4.3 Confusion table\n\n<a href=\"https://www.youtube.com/watch?v=Jt2dDLSlBng&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><i"
  },
  {
    "path": "04-evaluation/04-precision-recall.md",
    "chars": 1913,
    "preview": "## 4.4 Precision and Recall\n\n<a href=\"https://www.youtube.com/watch?v=gRLP_mlglMM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
  },
  {
    "path": "04-evaluation/05-roc.md",
    "chars": 2920,
    "preview": "\n## 4.5 ROC Curves\n\n<a href=\"https://www.youtube.com/watch?v=dnBZLk53sQI&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img s"
  },
  {
    "path": "04-evaluation/06-auc.md",
    "chars": 1773,
    "preview": "## 4.6 ROC AUC\n\n<a href=\"https://www.youtube.com/watch?v=hvIQPAwkVZo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
  },
  {
    "path": "04-evaluation/07-cross-validation.md",
    "chars": 2414,
    "preview": "## 4.7 Cross-Validation\n\n<a href=\"https://www.youtube.com/watch?v=BIIZaVtUbf4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><"
  },
  {
    "path": "04-evaluation/08-summary.md",
    "chars": 1794,
    "preview": "## 4.8 Summary\n\n<a href=\"https://www.youtube.com/watch?v=-v8XEQ2AHvQ&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
  },
  {
    "path": "04-evaluation/09-explore-more.md",
    "chars": 606,
    "preview": "## 4.9 Explore more\n\n* Check the precision and recall of the dummy classifier that always predict \"FALSE\"\n* F1 score = 2"
  },
  {
    "path": "04-evaluation/README.md",
    "chars": 1760,
    "preview": "## 4. Evaluation Metrics for Classification\n\n- 4.1 [Evaluation metrics: session overview](01-overview.md)\n- 4.2 [Accurac"
  },
  {
    "path": "04-evaluation/homework.md",
    "chars": 687,
    "preview": "## Homework\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/04-evaluation/homework.md)\n* For "
  },
  {
    "path": "04-evaluation/meta.csv",
    "chars": 1660,
    "preview": "lesson,name,page_name,video,slides,notebook\r\n1,Evaluation metrics: session overview,01-overview.md,https://www.youtube.c"
  },
  {
    "path": "04-evaluation/meta.json",
    "chars": 97,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 4,\n    \"name\": \"Evaluation Metrics for Classification\"\n}"
  },
  {
    "path": "04-evaluation/notebook.ipynb",
    "chars": 153777,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"id\": \"3fb05700\",\n   \"metadata\": {},\n   \"outputs\":"
  },
  {
    "path": "05-deployment/01-intro.md",
    "chars": 2795,
    "preview": "\n## 5.1 Intro / Session overview\n\n<a href=\"https://www.youtube.com/watch?v=agIFak9A3m8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAO"
  },
  {
    "path": "05-deployment/02-pickle.md",
    "chars": 2196,
    "preview": "\n## 5.2 Saving and loading the model\n\n<a href=\"https://www.youtube.com/watch?v=EJpqZ7OlwFU&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
  },
  {
    "path": "05-deployment/03-flask-intro.md",
    "chars": 3476,
    "preview": "\n## 5.3 Web services: introduction to Flask\n\n<a href=\"https://www.youtube.com/watch?v=W7ubna1Rfv8&list=PL3MmuxUbc_hIhxl5"
  },
  {
    "path": "05-deployment/04-flask-deployment.md",
    "chars": 5624,
    "preview": "\n## 5.4 Serving the churn model with Flask\n\n<a href=\"https://www.youtube.com/watch?v=Q7ZWPgPnRz8&list=PL3MmuxUbc_hIhxl5J"
  },
  {
    "path": "05-deployment/05-pipenv.md",
    "chars": 4181,
    "preview": "## 5.5 Python virtual environment: Pipenv\n\n<a href=\"https://www.youtube.com/watch?v=BMXh8JGROHM&list=PL3MmuxUbc_hIhxl5Ji"
  },
  {
    "path": "05-deployment/06-docker.md",
    "chars": 4429,
    "preview": "## 5.6 Environment management: Docker\n\n<a href=\"https://www.youtube.com/watch?v=wAtyYZ6zvAs&list=PL3MmuxUbc_hIhxl5Ji8t4O"
  },
  {
    "path": "05-deployment/07-aws-eb.md",
    "chars": 5271,
    "preview": "## 5.7 Deployment to the cloud: AWS Elastic Beanstalk (optional)\n\n<a href=\"https://www.youtube.com/watch?v=HGPJ4ekhcLg&l"
  },
  {
    "path": "05-deployment/08-summary.md",
    "chars": 1285,
    "preview": "## 5.8 Summary\n\n<a href=\"https://www.youtube.com/watch?v=sSAqYSk7Br4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
  },
  {
    "path": "05-deployment/09-explore-more.md",
    "chars": 715,
    "preview": "\n## 5.9 Explore more\n\n* Flask is not the only framework for creating web services. Try others, e.g. FastAPI.\n* Experimen"
  },
  {
    "path": "05-deployment/README.md",
    "chars": 3188,
    "preview": "## 5. Deploying Machine Learning Models\n\nNote: these materials are partly outdated, which\nis why we recorded a workshop "
  },
  {
    "path": "05-deployment/code/05-train-churn-model.ipynb",
    "chars": 12744,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"464b9b4a\",\n   \"metadata\": {},\n   \"source\": [\n    \"In the previou"
  },
  {
    "path": "05-deployment/code/Dockerfile",
    "chars": 275,
    "preview": "FROM python:3.8.12-slim\r\n\r\nRUN pip install pipenv\r\n\r\nWORKDIR /app\r\n\r\nCOPY [\"Pipfile\", \"Pipfile.lock\", \"./\"]\r\n\r\nRUN pipen"
  },
  {
    "path": "05-deployment/code/Pipfile",
    "chars": 203,
    "preview": "[[source]]\nurl = \"https://pypi.org/simple\"\nverify_ssl = true\nname = \"pypi\"\n\n[packages]\nnumpy = \"*\"\nscikit-learn = \"==0.2"
  },
  {
    "path": "05-deployment/code/ping.py",
    "chars": 200,
    "preview": "from flask import Flask\r\n\r\napp = Flask('ping')\r\n\r\n@app.route('/ping', methods=['GET'])\r\ndef ping():\r\n    return \"PONG\"\r\n"
  },
  {
    "path": "05-deployment/code/plan.md",
    "chars": 1714,
    "preview": "# 5. Deploying Machine Learning models \r\n\r\nWe'll use the same model we trained and evaluated\r\npreviously - the churn pre"
  },
  {
    "path": "05-deployment/code/predict-test.py",
    "chars": 892,
    "preview": "#!/usr/bin/env python\n# coding: utf-8\n\nimport requests\n\n\nurl = 'http://localhost:9696/predict'\n\ncustomer_id = 'xyz-123'\n"
  },
  {
    "path": "05-deployment/code/predict.py",
    "chars": 644,
    "preview": "import pickle\r\n\r\nfrom flask import Flask\r\nfrom flask import request\r\nfrom flask import jsonify\r\n\r\n\r\nmodel_file = 'model_"
  },
  {
    "path": "05-deployment/code/train.py",
    "chars": 2864,
    "preview": "#!/usr/bin/env python\n# coding: utf-8\n\nimport pickle\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.model_selecti"
  },
  {
    "path": "05-deployment/homework.md",
    "chars": 610,
    "preview": "## Homework\n\n\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/05-deployment/homework.md)\n* Fo"
  },
  {
    "path": "05-deployment/meta.csv",
    "chars": 1434,
    "preview": "lesson,name,page_name,video,slides\r\n1,Intro / Session overview,01-intro.md,https://www.youtube.com/watch?v=agIFak9A3m8,h"
  },
  {
    "path": "05-deployment/meta.json",
    "chars": 93,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 5,\n    \"name\": \"Deploying Machine Learning Models\"\n}"
  },
  {
    "path": "05-deployment/workshop/.dockerignore",
    "chars": 64,
    "preview": "# flyctl launch added from .venv\\.gitignore\n.venv\\**\\*\nfly.toml\n"
  },
  {
    "path": "05-deployment/workshop/.python-version",
    "chars": 5,
    "preview": "3.13\n"
  },
  {
    "path": "05-deployment/workshop/Dockerfile",
    "chars": 337,
    "preview": "FROM python:3.13.5-slim-bookworm\n\nCOPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/\nWORKDIR /code\n\nENV PATH=\"/code/"
  },
  {
    "path": "05-deployment/workshop/README.md",
    "chars": 15969,
    "preview": "# Deploying ML Models with FastAPI and uv\n\n* Video: https://www.youtube.com/watch?v=jzGzw98Eikk\n\nIn this workshop we wil"
  },
  {
    "path": "05-deployment/workshop/fly.toml",
    "chars": 483,
    "preview": "# fly.toml app configuration file generated for mlzoomcamp-flask-uv on 2025-08-11T15:21:50+02:00\n#\n# See https://fly.io/"
  },
  {
    "path": "05-deployment/workshop/ping.py",
    "chars": 197,
    "preview": "from fastapi import FastAPI\nimport uvicorn\n\napp = FastAPI(title=\"ping\")\n\n@app.get(\"/ping\")\ndef ping():\n    return \"PONG\""
  },
  {
    "path": "05-deployment/workshop/predict.py",
    "chars": 1835,
    "preview": "import pickle\nfrom typing import Literal\nfrom pydantic import BaseModel, Field\n\n\nfrom fastapi import FastAPI\nimport uvic"
  },
  {
    "path": "05-deployment/workshop/predict_old.py",
    "chars": 730,
    "preview": "import pickle\n\nwith open('model.bin', 'rb') as f_in:\n    pipeline = pickle.load(f_in)\n\ndatapoint = {\n    'gender': 'fema"
  },
  {
    "path": "05-deployment/workshop/pyproject.toml",
    "chars": 295,
    "preview": "[project]\nname = \"mlzoomcamp-flask-uv\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\n"
  },
  {
    "path": "05-deployment/workshop/starter.ipynb",
    "chars": 34623,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"4458df13-d0f7-462e-bc80-42169bb1a62b\",\n   \"metadata\": {},\n   \"so"
  },
  {
    "path": "05-deployment/workshop/test.py",
    "chars": 864,
    "preview": "import requests\n\nurl = 'http://localhost:9696/predict'\n# url = 'https://mlzoomcamp-flask-uv.fly.dev/predict'\n\ncustomer ="
  },
  {
    "path": "05-deployment/workshop/train.py",
    "chars": 1966,
    "preview": "#!/usr/bin/env python\n\nimport pickle\n\nimport pandas as pd\nimport numpy as np\nimport sklearn\n\n\nfrom sklearn.feature_extra"
  },
  {
    "path": "05-deployment/workshop/workshop-uv-fastapi.ipynb",
    "chars": 82015,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"4458df13-d0f7-462e-bc80-42169bb1a62b\",\n   \"metadata\": {\n    \"_sp"
  },
  {
    "path": "06-trees/01-credit-risk.md",
    "chars": 1654,
    "preview": "\n## 6.1 Credit risk scoring project\n\n<a href=\"https://www.youtube.com/watch?v=GJGmlfZoCoU&list=PL3MmuxUbc_hIhxl5Ji8t4O6l"
  },
  {
    "path": "06-trees/02-data-prep.md",
    "chars": 1953,
    "preview": "## 6.2 Data cleaning and preparation\n\n<a href=\"https://www.youtube.com/watch?v=tfuQdI3YO2c&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
  },
  {
    "path": "06-trees/03-decision-trees.md",
    "chars": 2126,
    "preview": "## 6.3 Decision trees\n\n<a href=\"https://www.youtube.com/watch?v=YGiQvFbSIg8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><im"
  },
  {
    "path": "06-trees/04-decision-tree-learning.md",
    "chars": 4380,
    "preview": "## 6.4 Decision tree learning algorithm\n\n<a href=\"https://www.youtube.com/watch?v=XODz6LwKY7g&list=PL3MmuxUbc_hIhxl5Ji8t"
  },
  {
    "path": "06-trees/05-decision-tree-tuning.md",
    "chars": 3564,
    "preview": "## 6.5 Decision trees parameter tuning\n\n<a href=\"https://www.youtube.com/watch?v=XJaxwH50Qok&list=PL3MmuxUbc_hIhxl5Ji8t4"
  },
  {
    "path": "06-trees/06-random-forest.md",
    "chars": 3043,
    "preview": "## 6.6 Ensemble learning and random forest\n\n<a href=\"https://www.youtube.com/watch?v=FZhcmOfNNZE&list=PL3MmuxUbc_hIhxl5J"
  },
  {
    "path": "06-trees/07-boosting.md",
    "chars": 5979,
    "preview": "## 6.7 Gradient boosting and XGBoost\n\n<a href=\"https://www.youtube.com/watch?v=xFarGClszEM&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
  },
  {
    "path": "06-trees/08-xgb-tuning.md",
    "chars": 3655,
    "preview": "## 6.8 XGBoost parameter tuning\n\n<a href=\"https://www.youtube.com/watch?v=VX6ftRzYROM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
  },
  {
    "path": "06-trees/09-final-model.md",
    "chars": 1492,
    "preview": "## 6.9 Selecting the best model\n\n<a href=\"https://www.youtube.com/watch?v=lqdnyIVQq-M&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
  },
  {
    "path": "06-trees/10-summary.md",
    "chars": 1222,
    "preview": "## 6.10 Summary\n\n<a href=\"https://www.youtube.com/watch?v=JZ6sRZ_5j_c&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src="
  },
  {
    "path": "06-trees/11-explore-more.md",
    "chars": 1665,
    "preview": "\n## 6.11 Explore more\n\n* For this dataset we didn't do EDA or feature engineering. You can do it to get more insights in"
  },
  {
    "path": "06-trees/README.md",
    "chars": 1573,
    "preview": "## 6. Decision Trees and Ensemble Learning\n\n- 6.1 [Credit risk scoring project](01-credit-risk.md)\n- 6.2 [Data cleaning "
  },
  {
    "path": "06-trees/homework.md",
    "chars": 622,
    "preview": "## Homework\n\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/06-trees/homework.md)\n* For 2024"
  },
  {
    "path": "06-trees/meta.csv",
    "chars": 2184,
    "preview": "lesson,name,page_name,video,slides,notebook\r\n1,Credit risk scoring project,01-credit-risk.md,https://www.youtube.com/wat"
  },
  {
    "path": "06-trees/meta.json",
    "chars": 96,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 6,\n    \"name\": \"Decision Trees and Ensemble Learning\"\n}"
  },
  {
    "path": "06-trees/notebook.ipynb",
    "chars": 321921,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"52472024\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6. Decision "
  },
  {
    "path": "08-deep-learning/01-fashion-classification.md",
    "chars": 2200,
    "preview": "## 8.1 Fashion classification\n\n<a href=\"https://www.youtube.com/watch?v=it1Lu7NmMpw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHa"
  },
  {
    "path": "08-deep-learning/02-tensorflow-keras.md",
    "chars": 3279,
    "preview": "## 8.2 TensorFlow and Keras\n\n<a href=\"https://www.youtube.com/watch?v=R6o_CUmoN9Q&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
  },
  {
    "path": "08-deep-learning/03-pretrained-models.md",
    "chars": 3404,
    "preview": "## 8.3 Pre-trained convolutional neural networks\r\n\r\n<a href=\"https://www.youtube.com/watch?v=qGDXEz-cr6M&list=PL3MmuxUbc"
  },
  {
    "path": "08-deep-learning/04-conv-neural-nets.md",
    "chars": 5941,
    "preview": "## 8.4 Convolutional neural networks\n\n<a href=\"https://www.youtube.com/watch?v=BN-fnYzbdc8&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
  },
  {
    "path": "08-deep-learning/05-transfer-learning.md",
    "chars": 5814,
    "preview": "## 8.5 Transfer learning\n\n<a href=\"https://www.youtube.com/watch?v=WKHylqfNmq4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
  },
  {
    "path": "08-deep-learning/06-learning-rate.md",
    "chars": 4599,
    "preview": "## 8.6 Adjusting the learning rate\n\n<a href=\"https://www.youtube.com/watch?v=2gPmRRGz0Hc&list=PL3MmuxUbc_hIhxl5Ji8t4O6lP"
  },
  {
    "path": "08-deep-learning/07-checkpointing.md",
    "chars": 1679,
    "preview": "## 8.7 Checkpointing\n\n<a href=\"https://www.youtube.com/watch?v=NRpGUx0o3Ps&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img"
  },
  {
    "path": "08-deep-learning/08-more-layers.md",
    "chars": 3472,
    "preview": "## 8.8 Adding more layers\n\n<a href=\"https://www.youtube.com/watch?v=bSRRrorvAZs&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\""
  },
  {
    "path": "08-deep-learning/09-dropout.md",
    "chars": 4249,
    "preview": "## 8.9 Regularization and dropout\n\n<a href=\"https://www.youtube.com/watch?v=74YmhVM6FTM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPA"
  },
  {
    "path": "08-deep-learning/10-augmentation.md",
    "chars": 2893,
    "preview": "## 8.10 Data augmentation\r\n\r\n<a href=\"https://www.youtube.com/watch?v=aoPfVsS3BDE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
  },
  {
    "path": "08-deep-learning/11-large-model.md",
    "chars": 1089,
    "preview": "## 8.11 Training a larger model\r\n\r\n<a href=\"https://www.youtube.com/watch?v=_QpDGJwFjYA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPA"
  },
  {
    "path": "08-deep-learning/12-using-model.md",
    "chars": 2695,
    "preview": "## 8.12 Using the model\r\n\r\n<a href=\"https://www.youtube.com/watch?v=cM1WHKae1wo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\""
  },
  {
    "path": "08-deep-learning/13-summary.md",
    "chars": 1213,
    "preview": "## 8.13 Summary\r\n\r\n<a href=\"https://www.youtube.com/watch?v=mn0BcXJlRFM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img sr"
  },
  {
    "path": "08-deep-learning/14-explore-more.md",
    "chars": 525,
    "preview": "## 8.14 Explore more\r\n\r\n**TODO**\r\n\r\n- Add more data, e.g, Zalando etc\r\n- Albumentations - another way of generating augm"
  },
  {
    "path": "08-deep-learning/README.md",
    "chars": 3123,
    "preview": "## 8. Neural Networks and Deep Learning\n\nNote: in the module we use TensorFlow+Keras. These videos \nwere recorded a whil"
  },
  {
    "path": "08-deep-learning/homework.md",
    "chars": 661,
    "preview": "## Homework\r\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/08-deep-learning/homework.md)\r\n*"
  },
  {
    "path": "08-deep-learning/install.md",
    "chars": 7109,
    "preview": "# Installation of tensorflow\n\ndate: 2023 Nov 12\n\nThis installation guide is specific to the use case as in the pre-requi"
  },
  {
    "path": "08-deep-learning/meta.csv",
    "chars": 3908,
    "preview": "lesson,name,page_name,video,slides,notebook\r\n1,Fashion classification,01-fashion-classification.md,https://www.youtube.c"
  },
  {
    "path": "08-deep-learning/meta.json",
    "chars": 93,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 8,\n    \"name\": \"Neural Networks and Deep Learning\"\n}"
  },
  {
    "path": "08-deep-learning/notebook.ipynb",
    "chars": 528405,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"id\": \"a8f9405b\",\n   \"metadata\": {},\n   \"outputs\":"
  },
  {
    "path": "08-deep-learning/pytorch/README.md",
    "chars": 27125,
    "preview": "# Deep Learning with PyTorch Workshop\n\n\n* [Video](https://www.youtube.com/watch?v=Ne25VujHRLA)\n* [Notebook](https://cola"
  },
  {
    "path": "08-deep-learning/pytorch/install_pytorch.md",
    "chars": 7171,
    "preview": "# **Complete PyTorch Installation Guide (GPU & CPU Support)**\n\nHi 👋\nThis guide walks you through installing PyTorch with"
  },
  {
    "path": "09-serverless/01-intro.md",
    "chars": 1298,
    "preview": "\n## 9.1 Introduction to Serverless\n\n<a href=\"https://www.youtube.com/watch?v=JLIVwIsU6RA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lP"
  },
  {
    "path": "09-serverless/02-aws-lambda.md",
    "chars": 2863,
    "preview": "\n## 9.2 AWS Lambda\n\n<a href=\"https://www.youtube.com/watch?v=_UX8-2WhHZo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img s"
  },
  {
    "path": "09-serverless/03-tensorflow-lite.md",
    "chars": 2519,
    "preview": "\n## 9.3 TensorFlow Lite\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop](workshop/) f"
  },
  {
    "path": "09-serverless/04-preparing-code.md",
    "chars": 868,
    "preview": "\n## 9.4 Preparing the code for Lambda\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop"
  },
  {
    "path": "09-serverless/05-docker-image.md",
    "chars": 2301,
    "preview": "## 9.5 Preparing a Docker image\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop](work"
  },
  {
    "path": "09-serverless/06-creating-lambda.md",
    "chars": 889,
    "preview": "\n## 9.6 Creating the lambda function\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop]"
  },
  {
    "path": "09-serverless/07-api-gateway.md",
    "chars": 745,
    "preview": "\n## 9.7 API Gateway: exposing the lambda function\n\n<a href=\"https://www.youtube.com/watch?v=wyZ9aqQOXvs&list=PL3MmuxUbc_"
  },
  {
    "path": "09-serverless/08-summary.md",
    "chars": 624,
    "preview": "## 9.8 Summary\n\n<a href=\"https://www.youtube.com/watch?v=bu3nPiHCNLU&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
  },
  {
    "path": "09-serverless/09-explore-more.md",
    "chars": 457,
    "preview": "## 9.9 Explore more\n\n* Try similar serverless services from Google Cloud and Microsoft Azure\n* Deploy cats vs dogs and o"
  },
  {
    "path": "09-serverless/README.md",
    "chars": 1656,
    "preview": "## 9. Serverless Deep Learning\n\n\nUse the [workshop](workshop/) for most of the content.\n\nThe content in the module still"
  },
  {
    "path": "09-serverless/code/Dockerfile",
    "chars": 307,
    "preview": "FROM public.ecr.aws/lambda/python:3.10\n\nRUN pip install keras-image-helper\nRUN pip install https://github.com/alexeygrig"
  },
  {
    "path": "09-serverless/code/convert-model.py",
    "chars": 285,
    "preview": "import tensorflow as tf\nfrom tensorflow import keras\n\nmodel = keras.models.load_model('clothing-model.h5')\n\nconverter = "
  },
  {
    "path": "09-serverless/code/lambda_function.py",
    "chars": 995,
    "preview": "#!/usr/bin/env python\n# coding: utf-8\n\nimport tflite_runtime.interpreter as tflite\nfrom keras_image_helper import create"
  },
  {
    "path": "09-serverless/code/plan.md",
    "chars": 1407,
    "preview": "# 9. Serverless Deep Learning\n\nWe'll deploy the clothes classification model we trained previously. \n\n## 9.1 Introductio"
  },
  {
    "path": "09-serverless/code/tensorflow-model.ipynb",
    "chars": 19570,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"id\": \"37a8e270\",\n   \"metadata\": {},\n   \"outputs\":"
  },
  {
    "path": "09-serverless/code/test.py",
    "chars": 200,
    "preview": "import requests\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\ndata = {'url': 'http://bit.ly/"
  },
  {
    "path": "09-serverless/homework.md",
    "chars": 522,
    "preview": "## Homework\n\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/09-serverless/homework.md)\n* For"
  },
  {
    "path": "09-serverless/meta.csv",
    "chars": 851,
    "preview": "lesson,name,page_name,video,slides,notebook\n1,Introduction to Serverless,01-intro.md,https://www.youtube.com/watch?v=JLI"
  },
  {
    "path": "09-serverless/meta.json",
    "chars": 84,
    "preview": "{\n    \"data\": \"meta.csv\",\n    \"session\": 9,\n    \"name\": \"Serverless Deep Learning\"\n}"
  },
  {
    "path": "09-serverless/updates.md",
    "chars": 1026,
    "preview": "## Python 3.12 vs TF Lite 2.17\n\nThe latest versions of TF Lite don't support Python 3.12 yet. \n\nAs a workaround, we can "
  },
  {
    "path": "09-serverless/workshop/README.md",
    "chars": 16318,
    "preview": "# Machine Learning and Deep Learning Model Deployment with Serverless\n\n* Video: https://www.youtube.com/watch?v=sHQaeVm5"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/.gitignore",
    "chars": 47,
    "preview": "*.keras\n*saved_model\n*.onnx\n*.h5\nconvert/models"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/Dockerfile",
    "chars": 207,
    "preview": "FROM public.ecr.aws/lambda/python:3.13\n\nRUN pip install onnxruntime keras-image-helper\n\nCOPY clothing-model-new.onnx clo"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/convert/.dockerignore",
    "chars": 6,
    "preview": "models"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/convert/Dockerfile",
    "chars": 439,
    "preview": "FROM python:3.13.5-slim\n\nARG COMMIT_ID\n\nENV PYTHONDONTWRITEBYTECODE=1 \\\n    PYTHONUNBUFFERED=1 \\\n    PIP_NO_CACHE_DIR=1\n"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/convert/README.md",
    "chars": 495,
    "preview": "\n```bash\n# you can update it to the latest commit\nCOMMIT_ID=c34ac1d751427cf5d98023a21cce4c82b0cf96a1\nTAG=${COMMIT_ID:0:7"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/convert/convert-saved-model.py",
    "chars": 136,
    "preview": "from tensorflow import keras\n\nmodel = keras.models.load_model('clothing-model-new.keras')\nmodel.export(\"clothing-model-n"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/lambda_function.py",
    "chars": 1283,
    "preview": "import numpy as np\nimport onnxruntime as ort\nfrom keras_image_helper import create_preprocessor\n\n\ndef preprocess_pytorch"
  },
  {
    "path": "09-serverless/workshop/lambda-keras/test.ipynb",
    "chars": 3487,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"id\": \"ced7eb3e-93b2-47b5-9c5a-ef2cf91baf4e\",\n   \""
  },
  {
    "path": "09-serverless/workshop/lambda-keras/test.py",
    "chars": 211,
    "preview": "import requests\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\nrequest = {\n    \"url\": \"http:/"
  },
  {
    "path": "09-serverless/workshop/lambda-onnx/.gitignore",
    "chars": 6,
    "preview": "*.onnx"
  },
  {
    "path": "09-serverless/workshop/lambda-onnx/Dockerfile",
    "chars": 285,
    "preview": "FROM public.ecr.aws/lambda/python:3.13\n\nRUN pip install onnxruntime keras-image-helper==0.0.2\n\nARG MODEL_NAME=clothing_c"
  },
  {
    "path": "09-serverless/workshop/lambda-onnx/lambda_function.py",
    "chars": 1370,
    "preview": "import os\n\nimport numpy as np\nimport onnxruntime as ort\nfrom keras_image_helper import create_preprocessor\n\n\nmodel_name "
  },
  {
    "path": "09-serverless/workshop/lambda-onnx/test.ipynb",
    "chars": 5232,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"id\": \"b2b3e76f-09ac-42c1-bb0a-c93fe73e7167\",\n   \""
  },
  {
    "path": "09-serverless/workshop/lambda-onnx/test.py",
    "chars": 211,
    "preview": "import requests\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\nrequest = {\n    \"url\": \"http:/"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/.dockerignore",
    "chars": 406,
    "preview": "\n# Project specific\ninvoke.py\ncustomer.json\nREADME.md\ntest.py\n\n# Git\n.git\n.gitignore\n\n# Python\n__pycache__\n*.pyc\n*.pyo\n*"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/.python-version",
    "chars": 5,
    "preview": "3.13\n"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/Dockerfile",
    "chars": 268,
    "preview": "FROM public.ecr.aws/lambda/python:3.13\nCOPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/\n\nCOPY pyproject.toml uv.lock ./"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/customer.json",
    "chars": 557,
    "preview": "{\n  \"customer\": {\n    \"gender\": \"female\",\n    \"seniorcitizen\": 0,\n    \"partner\": \"yes\",\n    \"dependents\": \"no\",\n    \"pho"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/deploy.sh",
    "chars": 910,
    "preview": "#!/bin/bash\n\nIMAGE_NAME=\"churn-prediction-lambda\"\nAWS_REGION=\"eu-west-1\"\n\nAWS_ACCOUNT_ID=$(aws sts get-caller-identity |"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/invoke.py",
    "chars": 959,
    "preview": "import boto3\nimport json\n\nlambda_client = boto3.client('lambda')\n\ncustomer_data = {\n    \"customer\": {\n        \"gender\": "
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/lambda_function.py",
    "chars": 425,
    "preview": "import pickle\n\nwith open('model.bin', 'rb') as f_in:\n    pipeline = pickle.load(f_in)\n\ndef predict_single(customer):\n   "
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/pyproject.toml",
    "chars": 189,
    "preview": "[project]\nname = \"lambda-1-simple\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\nrequ"
  },
  {
    "path": "09-serverless/workshop/lambda-sklearn/test.py",
    "chars": 242,
    "preview": "import requests\nimport json\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\n\nwith open('custom"
  },
  {
    "path": "09-serverless/workshop/train/.python-version",
    "chars": 5,
    "preview": "3.13\n"
  },
  {
    "path": "09-serverless/workshop/train/README.md",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "09-serverless/workshop/train/pyproject.toml",
    "chars": 200,
    "preview": "[project]\nname = \"train\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\nrequires-pytho"
  },
  {
    "path": "09-serverless/workshop/train/train.py",
    "chars": 1887,
    "preview": "import pickle\n\nimport pandas as pd\nimport sklearn\n\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.li"
  },
  {
    "path": "10-kubernetes/01-overview.md",
    "chars": 1410,
    "preview": "\n## 10.1 Overview\n\n<a href=\"https://www.youtube.com/watch?v=mvPER7YfTkw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img sr"
  },
  {
    "path": "10-kubernetes/02-tensorflow-serving.md",
    "chars": 3058,
    "preview": "\r\n## 10.2 TensorFlow Serving\r\n\r\n<a href=\"https://www.youtube.com/watch?v=deXR2fThYDw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
  },
  {
    "path": "10-kubernetes/03-preprocessing.md",
    "chars": 3906,
    "preview": "\n## 10.3 Creating a pre-processing service\n\n<a href=\"https://www.youtube.com/watch?v=OIlrS14Zi0o&list=PL3MmuxUbc_hIhxl5J"
  },
  {
    "path": "10-kubernetes/04-docker-compose.md",
    "chars": 4126,
    "preview": "\r\n## 10.4 Running everything locally with Docker-compose\r\n\r\n<a href=\"https://www.youtube.com/watch?v=ZhQQfpWfkKY&list=PL"
  },
  {
    "path": "10-kubernetes/05-kubernetes-intro.md",
    "chars": 3947,
    "preview": "\n## 10.5 Introduction to Kubernetes\n\n<a href=\"https://www.youtube.com/watch?v=UjVkpszDzgk&list=PL3MmuxUbc_hIhxl5Ji8t4O6l"
  },
  {
    "path": "10-kubernetes/06-kubernetes-simple-service.md",
    "chars": 7753,
    "preview": "\n## 10.6 Deploying a simple service to Kubernetes\n\n<a href=\"https://www.youtube.com/watch?v=PPUCVRIV9t8&list=PL3MmuxUbc_"
  },
  {
    "path": "10-kubernetes/07-kubernetes-tf-serving.md",
    "chars": 4728,
    "preview": "## 10.7 Deploying TensorFlow models to Kubernetes\n\n<a href=\"https://www.youtube.com/watch?v=6vHLMdnjO2w&list=PL3MmuxUbc_"
  },
  {
    "path": "10-kubernetes/08-eks.md",
    "chars": 3576,
    "preview": "## 10.8 Deploying to EKS\r\n\r\n<a href=\"https://www.youtube.com/watch?v=89jxeddZtC0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR"
  }
]

// ... and 228 more files (download for full content)

About this extraction

This page contains the full source code of the DataTalksClub/machine-learning-zoomcamp GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 428 files (6.6 MB), approximately 1.7M tokens, and a symbol index with 76 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo