Showing preview only (6,962K chars total). Download the full file or copy to clipboard to get everything.
Repository: DataTalksClub/machine-learning-zoomcamp
Branch: master
Commit: 2636ec556fac
Files: 428
Total size: 6.6 MB
Directory structure:
gitextract_3sp31muz/
├── .github/
│ └── FUNDING.yml
├── .gitignore
├── 01-intro/
│ ├── 01-what-is-ml.md
│ ├── 02-ml-vs-rules.md
│ ├── 03-supervised-ml.md
│ ├── 04-crisp-dm.md
│ ├── 05-model-selection.md
│ ├── 06-environment.md
│ ├── 07-numpy.md
│ ├── 08-linear-algebra.md
│ ├── 09-pandas.md
│ ├── 10-summary.md
│ ├── README.md
│ ├── homework.md
│ └── notebooks/
│ ├── 07-numpy.ipynb
│ ├── 08-linear-algebra.ipynb
│ └── 09-pandas.ipynb
├── 02-regression/
│ ├── 01-car-price-intro.md
│ ├── 02-data-preparation.md
│ ├── 03-eda.md
│ ├── 04-validation-framework.md
│ ├── 05-linear-regression-simple.md
│ ├── 06-linear-regression-vector.md
│ ├── 07-linear-regression-training.md
│ ├── 08-baseline-model.md
│ ├── 09-rmse.md
│ ├── 10-car-price-validation.md
│ ├── 11-feature-engineering.md
│ ├── 12-categorical-variables.md
│ ├── 13-regularization.md
│ ├── 14-tuning-model.md
│ ├── 15-using-model.md
│ ├── 16-summary.md
│ ├── 17-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.json
│ └── notebook.ipynb
├── 03-classification/
│ ├── 01-churn-project.md
│ ├── 02-data-preparation.md
│ ├── 03-validation.md
│ ├── 04-eda.md
│ ├── 05-risk.md
│ ├── 06-mutual-info.md
│ ├── 07-correlation.md
│ ├── 08-ohe.md
│ ├── 09-logistic-regression.md
│ ├── 10-training-log-reg.md
│ ├── 11-log-reg-interpretation.md
│ ├── 12-using-log-reg.md
│ ├── 13-summary.md
│ ├── 14-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ ├── notebook-scaling-ohe.ipynb
│ └── notebook.ipynb
├── 04-evaluation/
│ ├── 01-overview.md
│ ├── 02-accuracy.md
│ ├── 03-confusion-table.md
│ ├── 04-precision-recall.md
│ ├── 05-roc.md
│ ├── 06-auc.md
│ ├── 07-cross-validation.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── notebook.ipynb
├── 05-deployment/
│ ├── 01-intro.md
│ ├── 02-pickle.md
│ ├── 03-flask-intro.md
│ ├── 04-flask-deployment.md
│ ├── 05-pipenv.md
│ ├── 06-docker.md
│ ├── 07-aws-eb.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── 05-train-churn-model.ipynb
│ │ ├── Dockerfile
│ │ ├── Pipfile
│ │ ├── ping.py
│ │ ├── plan.md
│ │ ├── predict-test.py
│ │ ├── predict.py
│ │ └── train.py
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── workshop/
│ ├── .dockerignore
│ ├── .python-version
│ ├── Dockerfile
│ ├── README.md
│ ├── fly.toml
│ ├── ping.py
│ ├── predict.py
│ ├── predict_old.py
│ ├── pyproject.toml
│ ├── starter.ipynb
│ ├── test.py
│ ├── train.py
│ └── workshop-uv-fastapi.ipynb
├── 06-trees/
│ ├── 01-credit-risk.md
│ ├── 02-data-prep.md
│ ├── 03-decision-trees.md
│ ├── 04-decision-tree-learning.md
│ ├── 05-decision-tree-tuning.md
│ ├── 06-random-forest.md
│ ├── 07-boosting.md
│ ├── 08-xgb-tuning.md
│ ├── 09-final-model.md
│ ├── 10-summary.md
│ ├── 11-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── notebook.ipynb
├── 08-deep-learning/
│ ├── 01-fashion-classification.md
│ ├── 02-tensorflow-keras.md
│ ├── 03-pretrained-models.md
│ ├── 04-conv-neural-nets.md
│ ├── 05-transfer-learning.md
│ ├── 06-learning-rate.md
│ ├── 07-checkpointing.md
│ ├── 08-more-layers.md
│ ├── 09-dropout.md
│ ├── 10-augmentation.md
│ ├── 11-large-model.md
│ ├── 12-using-model.md
│ ├── 13-summary.md
│ ├── 14-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── install.md
│ ├── meta.csv
│ ├── meta.json
│ ├── notebook.ipynb
│ └── pytorch/
│ ├── README.md
│ └── install_pytorch.md
├── 09-serverless/
│ ├── 01-intro.md
│ ├── 02-aws-lambda.md
│ ├── 03-tensorflow-lite.md
│ ├── 04-preparing-code.md
│ ├── 05-docker-image.md
│ ├── 06-creating-lambda.md
│ ├── 07-api-gateway.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── Dockerfile
│ │ ├── convert-model.py
│ │ ├── lambda_function.py
│ │ ├── plan.md
│ │ ├── tensorflow-model.ipynb
│ │ └── test.py
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ ├── updates.md
│ └── workshop/
│ ├── README.md
│ ├── lambda-keras/
│ │ ├── .gitignore
│ │ ├── Dockerfile
│ │ ├── convert/
│ │ │ ├── .dockerignore
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── convert-saved-model.py
│ │ ├── lambda_function.py
│ │ ├── test.ipynb
│ │ └── test.py
│ ├── lambda-onnx/
│ │ ├── .gitignore
│ │ ├── Dockerfile
│ │ ├── lambda_function.py
│ │ ├── test.ipynb
│ │ └── test.py
│ ├── lambda-sklearn/
│ │ ├── .dockerignore
│ │ ├── .python-version
│ │ ├── Dockerfile
│ │ ├── customer.json
│ │ ├── deploy.sh
│ │ ├── invoke.py
│ │ ├── lambda_function.py
│ │ ├── pyproject.toml
│ │ └── test.py
│ └── train/
│ ├── .python-version
│ ├── README.md
│ ├── pyproject.toml
│ └── train.py
├── 10-kubernetes/
│ ├── 01-overview.md
│ ├── 02-tensorflow-serving.md
│ ├── 03-preprocessing.md
│ ├── 04-docker-compose.md
│ ├── 05-kubernetes-intro.md
│ ├── 06-kubernetes-simple-service.md
│ ├── 07-kubernetes-tf-serving.md
│ ├── 08-eks.md
│ ├── 09-summary.md
│ ├── 10-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── Pipfile
│ │ ├── README.md
│ │ ├── docker-compose.yaml
│ │ ├── gateway.py
│ │ ├── image-gateway.dockerfile
│ │ ├── image-model.dockerfile
│ │ ├── kube-config/
│ │ │ ├── eks-config.yaml
│ │ │ ├── gateway-deployment.yaml
│ │ │ ├── gateway-service.yaml
│ │ │ ├── model-deployment.yaml
│ │ │ └── model-service.yaml
│ │ ├── ping/
│ │ │ ├── Dockerfile
│ │ │ ├── Pipfile
│ │ │ ├── deployment.yaml
│ │ │ ├── metallb-config.yaml
│ │ │ ├── ping.py
│ │ │ └── service.yaml
│ │ ├── plan.md
│ │ ├── proto.py
│ │ ├── test.py
│ │ └── tf-serving-connect.ipynb
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── workshop/
│ ├── README.md
│ ├── k8s/
│ │ ├── deployment.yaml
│ │ ├── hpa.yaml
│ │ └── service.yaml
│ ├── load_test.py
│ └── service/
│ ├── .gitignore
│ ├── .python-version
│ ├── Dockerfile
│ ├── README.md
│ ├── app.py
│ ├── pyproject.toml
│ └── test.py
├── 11-kserve/
│ ├── 01-overview.md
│ ├── 02-kserve-local.md
│ ├── 03-kserve-sklearn.md
│ ├── 04-kserve-custom-image.md
│ ├── 05-tensorflow-kserve.md
│ ├── 06-kserve-transformers.md
│ ├── 07-kserve-eks-upd.md
│ ├── 07-kserve-eks.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── churn/
│ │ │ ├── Pipfile
│ │ │ ├── churn-service.yaml
│ │ │ ├── churn-test.py
│ │ │ ├── churn-train.py
│ │ │ └── model.joblib
│ │ ├── clothes/
│ │ │ ├── clothes-service.yaml
│ │ │ ├── convert.py
│ │ │ ├── test-transformer.py
│ │ │ ├── test.ipynb
│ │ │ └── test.py
│ │ ├── eks/
│ │ │ ├── clothes-service.yaml
│ │ │ ├── cluster.yaml
│ │ │ └── test-transformer.py
│ │ ├── image_transfomer/
│ │ │ ├── Dockerfile
│ │ │ ├── Pipfile
│ │ │ └── image_transformer.py
│ │ ├── iris/
│ │ │ ├── iris-example.yaml
│ │ │ ├── iris-request.json
│ │ │ └── iris-test.py
│ │ └── plan.md
│ ├── meta.csv
│ └── meta.json
├── README.md
├── after-sign-up.md
├── article/
│ └── README.md
├── asking-questions.md
├── bento.md
├── certificates.md
├── cohorts/
│ ├── 2021/
│ │ ├── 01-intro/
│ │ │ ├── homework-1.ipynb
│ │ │ └── homework.md
│ │ ├── 02-regression/
│ │ │ ├── homework.ipynb
│ │ │ └── homework.md
│ │ ├── 03-classification/
│ │ │ ├── homework.ipynb
│ │ │ └── homework.md
│ │ ├── 04-evaluation/
│ │ │ ├── homework-4-solution.ipynb
│ │ │ ├── homework-4-starter.ipynb
│ │ │ └── homework.md
│ │ ├── 05-deployment/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── Pipfile
│ │ │ │ ├── homework.md
│ │ │ │ ├── q3_test.py
│ │ │ │ ├── q4_predict.py
│ │ │ │ ├── q4_test.py
│ │ │ │ ├── q6_predict.py
│ │ │ │ └── q6_test.py
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ ├── homework-6-solution.ipynb
│ │ │ ├── homework-6-starter.ipynb
│ │ │ └── homework.md
│ │ ├── 07-midterm-project/
│ │ │ ├── README.md
│ │ │ ├── week10-office-hours.ipynb
│ │ │ ├── week8-office-hours.ipynb
│ │ │ └── week9-office-hours.ipynb
│ │ ├── 08-deep-learning/
│ │ │ ├── CNN_solution.ipynb
│ │ │ ├── homework.md
│ │ │ └── week-11-office-hours.ipynb
│ │ ├── 09-serverless/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── homework.ipynb
│ │ │ │ ├── homework.py
│ │ │ │ └── test.py
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ ├── homework/
│ │ │ │ ├── deployment.yaml
│ │ │ │ └── service.yaml
│ │ │ └── homework.md
│ │ ├── 12-capstone/
│ │ │ └── README.md
│ │ ├── 13-article/
│ │ │ └── README.md
│ │ ├── 14-project/
│ │ │ └── README.md
│ │ ├── leaderboard.md
│ │ └── office-hours.md
│ ├── 2022/
│ │ ├── 01-intro/
│ │ │ ├── homework.md
│ │ │ └── homework_1.ipynb
│ │ ├── 02-regression/
│ │ │ ├── homework.md
│ │ │ └── homework_2.ipynb
│ │ ├── 03-classification/
│ │ │ ├── homework.md
│ │ │ └── homework_3.ipynb
│ │ ├── 04-evaluation/
│ │ │ ├── homework.md
│ │ │ └── homework_4.ipynb
│ │ ├── 05-deployment/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── Pipfile
│ │ │ │ ├── q3_test.py
│ │ │ │ ├── q4_predict.py
│ │ │ │ ├── q4_test.py
│ │ │ │ ├── q6_predict.py
│ │ │ │ └── q6_test.py
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ ├── homework.md
│ │ │ ├── homework_6.ipynb
│ │ │ └── homework_6_starter.ipynb
│ │ ├── 07-bento-production/
│ │ │ ├── homework.md
│ │ │ └── locustfile.py
│ │ ├── 08-deep-learning/
│ │ │ ├── homework.md
│ │ │ └── homework_8.ipynb
│ │ ├── 09-serverless/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── homework.ipynb
│ │ │ │ ├── homework.py
│ │ │ │ └── test.py
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ ├── homework/
│ │ │ │ ├── deployment.yaml
│ │ │ │ ├── hpa.yaml
│ │ │ │ ├── service.yaml
│ │ │ │ └── test.py
│ │ │ └── homework.md
│ │ ├── README.md
│ │ ├── article.md
│ │ ├── leaderboard.md
│ │ └── projects.md
│ ├── 2023/
│ │ ├── 01-intro/
│ │ │ ├── homework.md
│ │ │ └── homework_1.ipynb
│ │ ├── 02-regression/
│ │ │ └── homework.md
│ │ ├── 03-classification/
│ │ │ ├── homework.md
│ │ │ └── homework_3.ipynb
│ │ ├── 04-evaluation/
│ │ │ └── homework.md
│ │ ├── 05-deployment/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── Pipfile
│ │ │ │ ├── q3_test.py
│ │ │ │ ├── q4_predict.py
│ │ │ │ ├── q4_test.py
│ │ │ │ ├── q6_predict.py
│ │ │ │ └── q6_test.py
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ └── homework.md
│ │ ├── 08-deep-learning/
│ │ │ ├── homework.ipynb
│ │ │ └── homework.md
│ │ ├── 09-serverless/
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ └── homework.md
│ │ ├── README.md
│ │ ├── article.md
│ │ ├── leaderboard.md
│ │ └── projects.md
│ ├── 2024/
│ │ ├── 01-intro/
│ │ │ └── homework.md
│ │ ├── 02-regression/
│ │ │ └── homework.md
│ │ ├── 03-classification/
│ │ │ └── homework.md
│ │ ├── 04-evaluation/
│ │ │ └── homework.md
│ │ ├── 05-deployment/
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ └── homework.md
│ │ ├── 08-deep-learning/
│ │ │ └── homework.md
│ │ ├── 09-serverless/
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ └── homework.md
│ │ ├── README.md
│ │ ├── article.md
│ │ └── projects.md
│ └── 2025/
│ ├── 01-intro/
│ │ ├── homework.md
│ │ └── homework_1.ipynb
│ ├── 02-regression/
│ │ ├── homework.md
│ │ └── homework_2.ipynb
│ ├── 03-classification/
│ │ ├── homework.md
│ │ └── homework_3.ipynb
│ ├── 04-evaluation/
│ │ ├── homework.md
│ │ └── homework_4.ipynb
│ ├── 05-deployment/
│ │ ├── homework/
│ │ │ ├── .python-version
│ │ │ ├── Dockerfile_base
│ │ │ ├── Dockerfile_full
│ │ │ ├── Dockerfile_hw
│ │ │ ├── README.md
│ │ │ ├── main.py
│ │ │ ├── pyproject.toml
│ │ │ ├── q3_test.py
│ │ │ ├── q4_predict.py
│ │ │ ├── q4_test.py
│ │ │ ├── q6_predict.py
│ │ │ └── q6_test.py
│ │ └── homework.md
│ ├── 06-trees/
│ │ ├── homework.ipynb
│ │ └── homework.md
│ ├── 08-deep-learning/
│ │ └── homework.md
│ ├── 09-serverless/
│ │ └── homework.md
│ ├── 10-kubernetes/
│ │ └── homework.md
│ ├── README.md
│ ├── article.md
│ └── projects.md
├── generate-description.ipynb
├── generate-pages.ipynb
├── learning-in-public.md
└── projects/
├── README.md
├── how-to.md
└── project-tips.md
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/FUNDING.yml
================================================
github: alexeygrigorev
================================================
FILE: .gitignore
================================================
# generated
.ipynb_checkpoints/
__pycache__/
**my_dir/
**logs/
**models/
# file types
*.h5
*.tflite
*.keras
*.zip
*.pdf
# data folders
**data/
# content-specific
/08-deep-learning/clothing-dataset-small/
/08-deep-learning/clothing-dataset/
/08-deep-learning/ImageClassification/
/08-deep-learning/my_dir/
/09-serverless/clothing-model/
/09-serverless/clothing-model/
**midterms_evaluations/
**samples/
================================================
FILE: 01-intro/01-what-is-ml.md
================================================
## 1.1 Introduction to Machine Learning
<a href="https://www.youtube.com/watch?v=Crm_5n4mvmg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=2"><img src="images/thumbnail-1-01.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-11-introduction-to-machine-learning)
## Notes
The concept of ML is depicted with an example of predicting the price of a car. The ML model
learns from data, represented as some **features** such as year, mileage, among others, and the **target** variable, in this
case, the car's price, by extracting patterns from the data.
Then, the model is given new data (**without** the target) about cars and predicts their price (target).
In summary, ML is a process of **extracting patterns from data**, which is of two types:
* features (information about the object) and
* target (property to predict for unseen objects).
Therefore, new feature values are presented to the model, and it makes **predictions** from the learned patterns.
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
## Notes
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/09/ml-zoomcamp-2023-introduction-to-machine-learning-part-1/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Next: [ML vs Rule-Based Systems](02-ml-vs-rules.md)
================================================
FILE: 01-intro/02-ml-vs-rules.md
================================================
## 1.2 ML vs Rule-Based Systems
<a href="https://www.youtube.com/watch?v=CeukwyUdaz8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=3"><img src="images/thumbnail-1-02.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-12-ml-vs-rulebased-systems)
## Notes
The difference between ML and Rule-Based systems is explained with the example of a **spam filter**.
Traditional Rule-Based systems are based on a set of **characteristics** (keywords, email length, etc.) that identify an email as spam or not. As spam emails keep changing over time, the system needs to be updated constantly, which makes the process intractable due to the complexity of code maintenance as the system grows.
ML can be used to solve this problem with the following steps:
### 1. Get data
Emails from the user's spam folder and inbox give examples of spam and non-spam.
### 2. Define and calculate features
Rules/characteristics from rule-based systems can be used as a starting point to define features for the ML model. The value of the target variable for each email can be defined based on where the email was obtained from (spam folder or inbox).
Each email can be encoded (converted) to the values of its features and target.
### 3. Train and use the model
A machine learning algorithm can then be applied to the encoded emails to build a model that can predict whether a new email is spam or not spam. The **predictions are probabilities**, and to make a decision it is necessary to define a threshold to classify emails as spam or not spam.
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/10/ml-zoomcamp-2023-introduction-to-machine-learning-part-2/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Introduction to Machine Learning](01-what-is-ml.md)
* Next: [Supervised Machine Learning](03-supervised-ml.md)
================================================
FILE: 01-intro/03-supervised-ml.md
================================================
## 1.3 Supervised Machine Learning
<a href="https://www.youtube.com/watch?v=j9kcEuGcC2Y&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=4"><img src="images/thumbnail-1-03.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-13-supervised-machine-learning)
## Notes
In Supervised Machine Learning (SML) there are always labels associated with certain features.
The model is trained, and then it can make predictions on new features. In this way, the model
is taught by certain features and targets.
* **Feature matrix (X):** made of observations or objects (rows) and features (columns).
* **Target variable (y):** a vector with the target information we want to predict. For each row of X there's a value in y.
The model can be represented as a function, **g**, that takes the feature matrix, **X**, as **input** and tries to predict values as close as possible to the targets, **y**. The process of **finding** this function **g** is called **training**.
### Types of SML problems
* **Regression:** the output is a number (car's price).
* **Classification:** the output is a category (spam example).
* **Binary:** there are two categories.
* **Multiclass problems:** there are more than two categories.
* **Ranking:** the output is the top scores associated with corresponding items. It is applied in recommender systems.
In summary, SML is about teaching the model by showing it different examples, and the goal is to come up with a function, that takes the feature matrix as input, and makes predictions of values as close as possible to the **y** targets.
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/11/ml-zoomcamp-2023-introduction-to-machine-learning-part-3/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [ML vs Rule-Based Systems](02-ml-vs-rules.md)
* Next: [CRISP-DM](04-crisp-dm.md)
================================================
FILE: 01-intro/04-crisp-dm.md
================================================
## 1.4 CRISP-DM
<a href="https://www.youtube.com/watch?v=dCa3JvmJbr0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=5"><img src="images/thumbnail-1-04.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-14-crispdm)
## Notes
CRISP-DM, which stands for Cross-Industry Standard Process for Data Mining, is an open standard process model that describes common approaches used by data mining experts. It is the most widely used analytics model. Conceived in 1996, it became a European Union project under the ESPRIT funding initiative in 1997. The project was led by five companies: Integral Solutions Ltd (ISL), Teradata, Daimler AG, NCR Corporation and OHRA, an insurance company. The process consists of six phases:
1. **Business understanding:** An important question is whether we need ML for the project at all. The goal of the project has to be measurable.
2. **Data understanding:** Analyze available data sources, and decide if more data is required.
3. **Data preparation:** Clean data, remove noise applying pipelines, and convert the data to a tabular format, so we can put it into ML.
4. **Modeling:** Train different models and choose the best one. Considering the results of this step, it is proper to decide if it is required to add new features or fix data issues.
5. **Evaluation:** Measure how well the model is performing and if it solves the business problem.
6. **Deployment:** Roll out to production to all the users. The evaluation and deployment often happen together - **online evaluation**.
It is important to consider how maintainable the project is.
In general, ML projects require many iterations.
**Iteration:**
* Start simple
* Learn from the feedback
* Improve
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/12/ml-zoomcamp-2023-introduction-to-machine-learning-part-4/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Supervised Machine Learning](03-supervised-ml.md)
* Next: [Model Selection Process](05-model-selection.md)
================================================
FILE: 01-intro/05-model-selection.md
================================================
## 1.5 Model Selection Process
<a href="https://www.youtube.com/watch?v=OH_R0Sl9neM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=6"><img src="images/thumbnail-1-05.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-15-model-selection-process)
## Notes
### Which model to choose?
- Logistic regression
- Decision tree
- Neural Network
- Or many others
The validation dataset is not used in training. There are feature matrices and y vectors
for both training and validation datasets.
The model is fitted with training data, and it is used to predict the y values of the validation
feature matrix. Then, the predicted y values (probabilities)
are compared with the actual y values.
**Multiple comparisons problem (MCP):** just by chance one model can be lucky and obtain
good predictions because all of them are probabilistic.
The test set can help to avoid the MCP. Obtaining the best model is done with the training and validation datasets, while the test dataset is used to confirm that the selected model really performs well on unseen data.
1. Split the dataset into training, validation, and test sets, e.g. 60%, 20% and 20% respectively
2. Train the models
3. Evaluate the models
4. Select the best model
5. Apply the best model to the test dataset
6. Compare the performance metrics of validation and test
<u>NB:</u> Note that it is possible to reuse the validation data. After selecting the best model (step 4), the validation and training datasets can be combined to form a single training dataset for the chosen model before testing it on the test set.
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/13/ml-zoomcamp-2023-introduction-to-machine-learning-part-5/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [CRISP-DM](04-crisp-dm.md)
* Next: [Setting up the Environment](06-environment.md)
================================================
FILE: 01-intro/06-environment.md
================================================
## Setting up the Environment
In this section, we'll prepare the environment
You need:
* Python 3.11 (note that videos use 3.8)
* NumPy, Pandas and Scikit-Learn (latest available versions)
* Matplotlib and Seaborn
* Jupyter notebooks
## Github Codespaces
Video: https://www.youtube.com/watch?v=pqQFlV3f9Bo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR
This is the recommended approach for the course
## Ubuntu 22.04 on AWS
* [This video](https://www.youtube.com/watch?v=IXSiYkP23zo) shows a complete end-to-end environment configuration for an AWS EC2 instance
* This video was created for another course (MLOps Zoomcamp), so you'll need to adjust it slightly: clone this repo instead of the mlops one
* You can use these instructions for setting up your local Ubuntu
Note for WSL
* Most of the instructions from the previous video apply to WSL too
* For setting up Docker, install Docker Desktop on Windows and it'll be automatically used in WSL. You don't need to install docker.io
## Anaconda and Conda
The easiest way to set up the environment is to use [Anaconda](https://www.anaconda.com/products/individual) or
[Miniconda](https://docs.conda.io/en/latest/miniconda.html).
Anaconda comes with everything we need (and much more).
Miniconda is a smaller version of Anaconda that contains only Python.
Follow the instructions on the page for installing the correct package for your system.
The site will automatically detect your operating system and suggest the correct package.
* [Anaconda](https://www.anaconda.com/products/individual)
* [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
If you are using Windows, you can use WSL, but the plain Windows version should work too.
Anaconda is recommended.
### (Optional) Create environment for course
It is a good idea to set up a dedicated environment for the course
In your terminal, run this command to create the environment
```bash
conda create -n ml-zoomcamp python=3.11
```
Activate it:
```bash
conda activate ml-zoomcamp
```
Installing libraries
```bash
conda install numpy pandas scikit-learn seaborn jupyter
```
Later in the course you will also need to install XGBoost and Tensorflow,
but we can skip this part for now.
## Cloud
Instead of running things locally, you can use online services or rent a server
### AWS
You can rent an instance on AWS:
* [Creating an AWS account](https://mlbookcamp.com/article/aws)
* [Renting an ec2 instance](https://mlbookcamp.com/article/aws-ec2)
### GCP
Google Cloud Platform offers $300 in free credits when you sign up.
You can use this for taking the course.
## Notebook services
There are services that allow you to host and run notebooks.
Note that notebooks alone are not sufficient for the course and for the deployment modules
you will need to have access to the command line interface with Docker, Python and other libraries installed.
### Kaggle
To use Kaggle to open and run the Jupyter notebooks provided as part of this course do the following:
*Pre-requisites - You need to have an account in Kaggle (it's free) and be logged into Kaggle*
1. Find the URL of the notebook.

2. To open the notebook in Kaggle, paste the URL into your web browser as shown in the example below. (*note the additional https://kaggle.com/kernels/welcome?src= before the URL of the notebook*)
https://kaggle.com/kernels/welcome?src=https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb
3. Check whether the notebook reads data from a file. If it does, note the file name from the code - *look for pd.read_csv("somefilename.csv")*.

4. You need to download the file into Kaggle. For this:
a. Find the URL of the datafile in github.

b. Suppose the URL is https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/data.csv , you need to use the URL of the raw file, which will look something like https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
5. In the notebook opened in Kaggle, add a Code block with the command to download the file - !wget your-datafile-url

This way you can start with the exercise using Kaggle
### Google Colab
To use Google Colab to open and run the Jupyter notebooks provided as part of this course do the following:
*Pre-requisites - You need to have a google account (any gmail account) and be logged into that account*
The steps for Google Colab are the same as those for Kaggle, except for some changes in Step 2, as explained below.
2. To open the notebook in Google Colab, paste the URL into your web browser as shown in the example below. (*note that https://github.com/ in the URL of the notebook is replaced by https://colab.research.google.com/github/*)
https://colab.research.google.com/github/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous lesson: [The Modelling Step (Model Selection Process)](05-model-selection.md)
* Next lesson: [Introduction to NumPy](07-numpy.md)
================================================
FILE: 01-intro/07-numpy.md
================================================
## 1.7 Introduction to NumPy
<a href="https://www.youtube.com/watch?v=Qa0-jYtRdbY&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=7"><img src="images/thumbnail-1-07.jpg"></a>
## Notes
# Understanding Numpy: A Simple Introduction
Numpy, short for Numerical Python, is a powerful Python library that enables efficient and convenient array manipulation and mathematical operations. It forms the foundation for many scientific and data-related tasks. In this article, we'll provide a straightforward explanation of Numpy concepts and how to use them.
## Importing Numpy
Before diving into Numpy's capabilities, we need to import it. Conventionally, we import Numpy with the alias `np`, making it easier to reference its functions:
```python
import numpy as np
```
## Creating Arrays
Arrays are the building blocks of Numpy, and they can be thought of as lists but with enhanced features.
### Creating Arrays with Zeros, Ones, or Constants
You can create arrays filled with zeros, ones, or any constant using `np.zeros()`, `np.ones()`, and `np.full()`:
```python
zeros_array = np.zeros(10)
ones_array = np.ones(10)
constant_array = np.full(10, 3)
```
### Converting Lists to Arrays
To convert a Python list into a Numpy array, you can use `np.array()`:
```python
my_list = [2, 3, 4]
array_from_list = np.array(my_list)
```
### Generating Ranges of Numbers
Numpy provides functions for generating arrays of sequential numbers. For example:
```python
range_array = np.arange(10) # Creates an array from 0 to 9
```
### Creating Arrays with Linear Spacing
`np.linspace()` creates arrays with evenly spaced numbers within a specified range:
```python
linspace_array = np.linspace(0, 1, 11) # Creates 11 numbers from 0 to 1
```
### Multi-dimensional Arrays
Numpy can handle multi-dimensional arrays, often referred to as matrices. Here are some examples:
```python
zeros_matrix = np.zeros((5, 2))
ones_matrix = np.ones((5, 2))
constant_matrix = np.full((5, 2), 3)
```
## Indexing and Slicing Arrays
Like Python lists, you can access elements in Numpy arrays using indexing and slicing. For two-dimensional arrays:
```python
arr = np.array([[2, 3, 4], [4, 5, 6]])
first_row = arr[0] # Gets the first row
first_col = arr[:, 0] # Gets the first column
```
## Generating Random Arrays
Numpy can create arrays filled with random numbers. To ensure reproducibility, you can set a seed using `np.random.seed()`:
```python
np.random.seed(2) # Set the seed
random_array = np.random.rand(5, 2) # Generates random numbers between 0 and 1
```
For random numbers from a normal distribution or integers within a range:
```python
normal_distribution = np.random.randn(5, 2)
random_integers = np.random.randint(low=0, high=100, size=(5, 2))
```
## Array Operations
Numpy excels in performing mathematical operations on arrays efficiently.
### Element-wise Operations
You can perform operations on entire arrays element by element:
```python
arr = arr + 1 # Adds 1 to each element
arr = arr * 2 # Multiplies each element by 2
# Similar operations for division and exponentiation
```
### Element-wise Operations with Two Arrays
You can also perform operations between two arrays of the same shape:
```python
arr1 = np.ones(4)
arr2 = np.full(4, 3)
result = arr1 + arr2 # Element-wise addition
result = arr1 / arr2 # Element-wise division
```
### Comparison Operations
You can perform element-wise comparisons and create boolean arrays:
```python
arr = np.array([1, 2, 3, 4])
greater_than_2 = arr > 2 # Produces [False, False, True, True]
```
### Selecting Elements Based on Conditions
You can create subarrays based on certain conditions:
```python
selected_elements = arr[arr > 1] # Gets elements greater than 1
```
## Summary Operations
Numpy provides functions for summarizing array data:
```python
min_value = arr.min() # Minimum value
max_value = arr.max() # Maximum value
sum_value = arr.sum() # Sum of all elements
mean_value = arr.mean() # Mean (average) value
std_deviation = arr.std() # Standard deviation
```
In conclusion, Numpy is an essential library for anyone working with numerical data in Python. It simplifies array creation, manipulation, and mathematical operations, making it a powerful tool for scientific computing and data analysis. With the basics covered in this article, you're well on your way to harnessing Numpy's capabilities.
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke - Part 1/3](https://knowmledge.com/2023/09/14/ml-zoomcamp-2023-introduction-to-machine-learning-part-6/)
* [Notes from Peter Ernicke - Part 2/3](https://knowmledge.com/2023/09/14/ml-zoomcamp-2023-introduction-to-machine-learning-part-7/)
* [Notes from Peter Ernicke - Part 3/3](https://knowmledge.com/2023/09/14/ml-zoomcamp-2023-introduction-to-machine-learning-part-8/)
## Links
* [Notebook from the video](notebooks/07-numpy.ipynb)
* [Notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/appendix-c-numpy.ipynb)
* [Introduction to NumPy](https://mlbookcamp.com/article/numpy)
## Additional links
* [Numpy Cheat sheet](https://www.datacamp.com/community/blog/python-numpy-cheat-sheet)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Setting up the Environment](06-environment.md)
* Next: [Linear Algebra Refresher](08-linear-algebra.md)
================================================
FILE: 01-intro/08-linear-algebra.md
================================================
## 1.8 Linear Algebra Refresher
<a href="https://www.youtube.com/watch?v=zZyKUeOR4Gg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=8"><img src="images/thumbnail-1-08.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-18-linear-algebra-refresher)
## Notes
### Linear Algebra Refresher
* Vector operations
* Multiplication
* Vector-vector multiplication
* Matrix-vector multiplication
* Matrix-matrix multiplication
* Identity matrix
* Inverse
### Vector operations
~~~~python
u = np.array([2, 7, 5, 6])
v = np.array([3, 4, 8, 6])
# addition
u + v
# subtraction
u - v
# scalar multiplication
2 * v
~~~~
### Multiplication
##### Vector-vector multiplication
~~~~python
def vector_vector_multiplication(u, v):
    """Return the dot product of ``u`` and ``v`` using an explicit loop.

    Args:
        u: array-like object exposing ``.shape``; first operand.
        v: array-like object exposing ``.shape``; second operand.

    Returns:
        The sum of ``u[i] * v[i]`` over the first dimension, accumulated
        starting from ``0.0`` (so an empty input yields ``0.0``).

    Raises:
        AssertionError: if the first dimensions of ``u`` and ``v`` differ.
    """
    # Both operands must have the same number of elements.
    assert u.shape[0] == v.shape[0]

    total = 0.0
    # Walk both vectors in lockstep and accumulate the pairwise products.
    for u_item, v_item in zip(u, v):
        total = total + u_item * v_item

    return total
~~~~
##### Matrix-vector multiplication
~~~~python
def matrix_vector_multiplication(U, v):
    """Multiply matrix ``U`` by vector ``v`` one row at a time.

    Each output element is the dot product of one row of ``U`` with ``v``,
    computed by ``vector_vector_multiplication``.

    Args:
        U: matrix whose number of columns (``U.shape[1]``) matches the
            length of ``v``.
        v: vector with ``v.shape[0]`` elements.

    Returns:
        A float NumPy array with ``U.shape[0]`` elements.

    Raises:
        AssertionError: if ``U.shape[1] != v.shape[0]``.
    """
    # The columns of U must line up with the elements of v.
    assert U.shape[1] == v.shape[0]

    row_count = U.shape[0]
    product = np.zeros(row_count)

    # Fill the output slot for each row with that row's dot product with v.
    for row_idx, row in enumerate(U):
        product[row_idx] = vector_vector_multiplication(row, v)

    return product
~~~~
##### Matrix-matrix multiplication
~~~~python
def matrix_matrix_multiplication(U, V):
    """Multiply matrix ``U`` by matrix ``V`` one column at a time.

    Column ``j`` of the result is ``U`` multiplied by column ``j`` of ``V``,
    computed by ``matrix_vector_multiplication``.

    Args:
        U: left matrix; ``U.shape[1]`` must equal ``V.shape[0]``.
        V: right matrix.

    Returns:
        A float NumPy array of shape ``(U.shape[0], V.shape[1])``.

    Raises:
        AssertionError: if the inner dimensions do not match.
    """
    # Inner dimensions must agree for the product to be defined.
    assert U.shape[1] == V.shape[0]

    out_shape = (U.shape[0], V.shape[1])
    product = np.zeros(out_shape)

    # Build the result column by column.
    for col_idx in range(out_shape[1]):
        product[:, col_idx] = matrix_vector_multiplication(U, V[:, col_idx])

    return product
~~~~
### Identity matrix
~~~~python
# 3x3 identity matrix: ones on the main diagonal, zeros everywhere else.
# Multiplying a matrix by I (with compatible shapes) leaves it unchanged.
I = np.eye(3)
~~~~
### Inverse
~~~~python
# A square (3x3) matrix; only square matrices can be inverted.
V = np.array([
    [1, 1, 2],
    [0, 0.5, 1],
    [0, 2, 1],
])

# Compute the inverse of V. Multiplying inv by V gives the identity
# matrix (up to floating-point rounding).
inv = np.linalg.inv(V)
~~~~
Add notes here (PRs are welcome).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke - Part 1/3](https://knowmledge.com/2023/09/15/ml-zoomcamp-2023-introduction-to-machine-learning-part-9/)
* [Notes from Peter Ernicke - Part 2/3](https://knowmledge.com/2023/09/15/ml-zoomcamp-2023-introduction-to-machine-learning-part-10/)
* [Notes from Peter Ernicke - Part 3/3](https://knowmledge.com/2023/09/15/ml-zoomcamp-2023-introduction-to-machine-learning-part-11/)
## Links
* [Notebook from the video](notebooks/08-linear-algebra.ipynb)
* [Get a visual understanding of matrix multiplication](http://matrixmultiplication.xyz/)
* [Overview of matrix multiplication functions in python/numpy](https://github.com/MemoonaTahira/MLZoomcamp2022/blob/main/Notes/Week_1-intro_to_ML_linear_algebra/Notes_for_Chapter_1-Linear_Algebra.ipynb)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Introduction to NumPy](07-numpy.md)
* Next: [Introduction to Pandas](09-pandas.md)
================================================
FILE: 01-intro/09-pandas.md
================================================
## 1.9 Introduction to Pandas
<a href="https://www.youtube.com/watch?v=0j3XK5PsnxA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=9"><img src="images/thumbnail-1-09.jpg"></a>
## Notes
Add notes here (PRs are welcome).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke - Part 1/2](https://knowmledge.com/2023/09/16/ml-zoomcamp-2023-introduction-to-machine-learning-part-12/)
* [Notes from Peter Ernicke - Part 2/2](https://knowmledge.com/2023/09/17/ml-zoomcamp-2023-introduction-to-machine-learning-part-13/)
## Links
* [Notebook from the video](notebooks/09-pandas.ipynb)
* [Notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/appendix-d-pandas.ipynb)
## Additional links
* [Pandas Cheat sheet](https://www.datacamp.com/community/blog/python-pandas-cheat-sheet)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Linear Algebra Refresher](08-linear-algebra.md)
* Next: [Summary](10-summary.md)
================================================
FILE: 01-intro/10-summary.md
================================================
## 1.10 Summary
<a href="https://www.youtube.com/watch?v=VRrEEVeJ440&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=10"><img src="images/thumbnail-1-10.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-110-summary)
## Notes
---
### 📚 Summary of First Session - Machine Learning Zoomcamp
1. **🚗 Introduction to Machine Learning with Cars Data**
We start with data about cars, including characteristics (features) and prices (target). A Machine Learning (ML) model can be used to extract patterns from known information (data) about some cars in order to predict car prices based on their characteristics.
2. **🧠 Rules-Based Systems vs. Machine Learning**
- **Rules-Based Systems:** It is necessary to manually convert rules into code using a programming language and apply them to data. Extracting patterns manually can become complex and challenging.
- **Machine Learning:** Instead of manually coding rules, ML models automatically extract patterns from data using Mathematics and Statistics.
3. **🔍 Supervised Machine Learning**
In supervised learning, models learn from labeled data (with known outcomes) to make predictions on unseen data.
4. **🛠️ CRISP-DM (Cross Industry Standard Process for Data Mining)**
A structured methodology for organizing ML projects, consisting of the following steps:
- 💼 **Business Understanding**
- 🔎 **Data Understanding**
- 🧹 **Data Preparation**
- 🤖 **Modeling** (choosing and training models, then selecting the best one)
- 📊 **Evaluation**
- 🚀 **Deployment**
This process is iterative, allowing for continuous improvement.
5. **🏆 Model Selection**
Split data into training, validation, and test sets. Train different models, validate them, select the best performing one, and then test it on the test set to ensure generalization.
6. **💻 Setting Up the Environment**
   Install the necessary tools: Python, NumPy, Pandas, Matplotlib, and Scikit-learn. Anaconda is the easiest option. Later on, you may also want to create an AWS account for cloud resources.
7. **🔢 Introduction to Numpy**
Numpy is crucial for manipulating numerical data, providing efficient operations on arrays and matrices.
8. **🔗 Linear Algebra**
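   Covers vector operations (addition, subtraction, scalar multiplication), vector-vector, matrix-vector, and matrix-matrix multiplication, the identity matrix (created with `np.eye()`), and the matrix inverse (computed with `np.linalg.inv()`).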
9. **📊 Introduction to Pandas**
Pandas is a Python library used for processing and analyzing tabular data efficiently.
---
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/01_intro/Summary_Session_01.md)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Introduction to Pandas](09-pandas.md)
* Next: [Homework](homework.md)
================================================
FILE: 01-intro/README.md
================================================
## 1. Introduction to Machine Learning
- 1.1 [Introduction to Machine Learning](01-what-is-ml.md)
- 1.2 [ML vs Rule-Based Systems](02-ml-vs-rules.md)
- 1.3 [Supervised Machine Learning](03-supervised-ml.md)
- 1.4 [CRISP-DM](04-crisp-dm.md)
- 1.5 [The Modelling Step (Model Selection Process)](05-model-selection.md)
- 1.6 [Setting up the Environment](06-environment.md)
- 1.7 [Introduction to NumPy](07-numpy.md)
- 1.8 [Linear Algebra Refresher](08-linear-algebra.md)
- 1.9 [Introduction to Pandas](09-pandas.md)
- 1.10 [Summary](10-summary.md)
- 1.11 [Homework](homework.md)
## Community notes
Did you take notes? You can share them here (or in each unit separately).
* [Notes by Ayoub Berdeddouch](https://github.com/ayoub-berdeddouch/mlbookcamp-homeworks/blob/main/Intro/homework_intro_AyoubBerdeddouch.ipynb)
* [Notes from Sebastián Ayala Ruano](https://github.com/sayalaruano/100DaysOfMLCode/blob/main/Intro_ML/Notes/NotesDay1.md)
* [Notes from Alvaro Navas](https://github.com/ziritrion/ml-zoomcamp/blob/main/notes/01_intro.md)
* [Notes from Luis Evaristo Caraballo de la Cruz](https://github.com/varocaraballo/ml-zoomcamp2022/blob/main/01%20-%20Introduction%20to%20Machine%20Learning/notes.md)
* [Notes from Jon Areas](https://github.com/jxareas/Machine-Learning-Bookcamp-2022/blob/master/notes/01-introduction.md)
* [Notes from Hareesh Tummala](https://github.com/tummala-hareesh/ml_zoomcamp_ht/blob/main/notes/week-1-notes.md)
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/09/ml-zoomcamp-2023-introduction-to-machine-learning-part-1/)
* [Notes from Josiah Adesola](https://colab.research.google.com/drive/1mlwkAaRi7R8C6quUi0-cMfXk0MXD5-wc?usp=sharing)
* [Notes by Kemal](https://github.com/kemaldahha/machine-learning-course/blob/main/week_1_notes.md)
* [Notes by Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/01_intro/Summary_Session_01.md)
* [Notes by Mahrukh Tariq](https://github.com/mahrukh98/ml-zoomcamp-hw/blob/main/notes/session1.md)
* [Notes by Revathy Ramalingam](https://github.com/RevathyRamalingam/machineLearning/blob/main/01-Intro/01-Intro.md)
* Add your notes here
================================================
FILE: 01-intro/homework.md
================================================
## Homework
* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/01-intro/homework.md)
* For 2024 cohort homework, check [the 2024 cohort folder](../cohorts/2024/01-intro/homework.md)
* For 2023 cohort homework, check [the 2023 cohort folder](../cohorts/2023/01-intro/homework.md)
* For 2022 cohort homework, check [the 2022 cohort folder](../cohorts/2022/01-intro/homework.md)
* For 2021 cohort homework and solution, check [the 2021 cohort folder](../cohorts/2021/01-intro/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Lesson 1: Introduction to Machine Learning](./)
* Previous: [Summary](10-summary.md)
================================================
FILE: 01-intro/notebooks/07-numpy.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"id": "502da6a2",
"metadata": {},
"source": [
"# Machine Learning Zoomcamp\n",
"\n",
"\n",
"## 1.7 Introduction to NumPy\n",
"\n",
"\n",
"Plan:\n",
"\n",
"* Creating arrays\n",
"* Multi-dimensional arrays\n",
"* Randomly generated arrays\n",
"* Element-wise operations\n",
" * Comparison operations\n",
" * Logical operations\n",
"* Summarizing operations"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "95aa5b76",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "aa693c84",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'numpy' from '/home/alexey/.pyenv/versions/3.8.11/lib/python3.8/site-packages/numpy/__init__.py'>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np"
]
},
{
"cell_type": "markdown",
"id": "1e3ff2dc",
"metadata": {},
"source": [
"## Creating arrays\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "783c3362",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.zeros(10)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2fc75d89",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ones(10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5183483b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.full(10, 2.5)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "fe81664d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 3, 5, 7, 12])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = np.array([1, 2, 3, 5, 7, 12])\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "dee26035",
"metadata": {},
"outputs": [],
"source": [
"a[2] = 10"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "0cf95a0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 10, 5, 7, 12])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "9a579406",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4, 5, 6, 7, 8, 9])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.arange(3, 10)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "96260ddb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100.])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.linspace(0, 100, 11)"
]
},
{
"cell_type": "markdown",
"id": "37f36946",
"metadata": {},
"source": [
"## Multi-dimensional arrays\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b4a61c53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 0.],\n",
" [0., 0.],\n",
" [0., 0.],\n",
" [0., 0.],\n",
" [0., 0.]])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.zeros((5, 2))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "4f75f854",
"metadata": {},
"outputs": [],
"source": [
"n = np.array([\n",
" [1, 2, 3],\n",
" [4, 5, 6],\n",
" [7, 8, 9]\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "619860f6",
"metadata": {},
"outputs": [],
"source": [
"n[0, 1] = 20"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "54333fc7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1, 20, 3],\n",
" [ 4, 5, 6],\n",
" [ 7, 8, 9]])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "57eef634",
"metadata": {},
"outputs": [],
"source": [
"n[2] = [1, 1, 1]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b3fa6ae7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1, 20, 3],\n",
" [ 4, 5, 6],\n",
" [ 1, 1, 1]])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "42f1d1f4",
"metadata": {},
"outputs": [],
"source": [
"n[:, 2] = [0, 1, 2]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "13442277",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1, 20, 0],\n",
" [ 4, 5, 1],\n",
" [ 1, 1, 2]])"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n"
]
},
{
"cell_type": "markdown",
"id": "62ba6337",
"metadata": {},
"source": [
"## Randomly generated arrays\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "6781ff11",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[43.59949021, 2.59262318],\n",
" [54.96624779, 43.53223926],\n",
" [42.03678021, 33.0334821 ],\n",
" [20.4648634 , 61.92709664],\n",
" [29.96546737, 26.68272751]])"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(2)\n",
"100 * np.random.rand(5, 2)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4374e58b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.41675785, -0.05626683],\n",
" [-2.1361961 , 1.64027081],\n",
" [-1.79343559, -0.84174737],\n",
" [ 0.50288142, -1.24528809],\n",
" [-1.05795222, -0.90900761]])"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(2)\n",
"np.random.randn(5, 2)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "ebb39565",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[40, 15],\n",
" [72, 22],\n",
" [43, 82],\n",
" [75, 7],\n",
" [34, 49]])"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(2)\n",
"np.random.randint(low=0, high=100, size=(5, 2))"
]
},
{
"cell_type": "markdown",
"id": "364c6d7c",
"metadata": {},
"source": [
"## Element-wise operations\n"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "51390a32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4])"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = np.arange(5)\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "6e87e9b0",
"metadata": {},
"outputs": [],
"source": [
"b = (10 + (a * 2)) ** 2 / 100"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "013a9e2a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1. , 1.44, 1.96, 2.56, 3.24])"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "08592c4a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([10. , 10.69444444, 11.02040816, 11.171875 , 11.2345679 ])"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a / b + 10"
]
},
{
"cell_type": "markdown",
"id": "35fc84d3",
"metadata": {},
"source": [
"## Comparison operations"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e26eefdc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4])"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "8fd3fc96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, True, True, True])"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a >= 2"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "bca43c2c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1. , 1.44, 1.96, 2.56, 3.24])"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "f6e89611",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, False, True, True, True])"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a > b"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "15a5a80a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2, 3, 4])"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a[a > b]"
]
},
{
"cell_type": "markdown",
"id": "0259499b",
"metadata": {},
"source": [
"## Summarizing operations"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "c1b30281",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4])"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "d850c2aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.4142135623730951"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.std()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "2b2587f2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n.min()"
]
},
{
"cell_type": "markdown",
"id": "0662686b",
"metadata": {},
"source": [
"### Next\n",
"\n",
"Linear algebra refresher"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: 01-intro/notebooks/08-linear-algebra.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"id": "3aace4b5",
"metadata": {},
"source": [
"# Machine Learning Zoomcamp\n",
"\n",
"## 1.8 Linear algebra refresher"
]
},
{
"cell_type": "markdown",
"id": "2cd0b8e5",
"metadata": {},
"source": [
"Plan:\n",
"\n",
"* Vector operations\n",
"* Multiplication\n",
" * Vector-vector multiplication\n",
" * Matrix-vector multiplication\n",
" * Matrix-matrix multiplication\n",
"* Identity matrix\n",
"* Inverse"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1317a223",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "4052050d",
"metadata": {},
"source": [
"## Vector operations"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e87a01b7",
"metadata": {},
"outputs": [],
"source": [
"u = np.array([2, 4, 5, 6])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "913795a9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4, 8, 10, 12])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"2 * u"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "24625627",
"metadata": {},
"outputs": [],
"source": [
"v = np.array([1, 0, 0, 2])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "edc95be4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4, 5, 8])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u + v"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "62f471c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 2, 0, 0, 12])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u * v"
]
},
{
"cell_type": "markdown",
"id": "6cb784ea",
"metadata": {},
"source": [
"## Multiplication"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6a838e14",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"v.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bef565ee",
"metadata": {},
"outputs": [],
"source": [
"def vector_vector_multiplication(u, v):\n",
" assert u.shape[0] == v.shape[0]\n",
" \n",
" n = u.shape[0]\n",
" \n",
" result = 0.0\n",
"\n",
" for i in range(n):\n",
" result = result + u[i] * v[i]\n",
" \n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5f212712",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"14.0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vector_vector_multiplication(u, v)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b57c4464",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"14"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"u.dot(v)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b7710217",
"metadata": {},
"outputs": [],
"source": [
"U = np.array([\n",
" [2, 4, 5, 6],\n",
" [1, 2, 1, 2],\n",
" [3, 1, 2, 1],\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3f1ee5f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3, 4)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"U.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8b4f7530",
"metadata": {},
"outputs": [],
"source": [
"def matrix_vector_multiplication(U, v):\n",
" assert U.shape[1] == v.shape[0]\n",
" \n",
" num_rows = U.shape[0]\n",
" \n",
" result = np.zeros(num_rows)\n",
" \n",
" for i in range(num_rows):\n",
" result[i] = vector_vector_multiplication(U[i], v)\n",
" \n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "930f42c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([14., 5., 5.])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matrix_vector_multiplication(U, v)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "0937dafd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([14, 5, 5])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"U.dot(v)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "85280363",
"metadata": {},
"outputs": [],
"source": [
"V = np.array([\n",
" [1, 1, 2],\n",
" [0, 0.5, 1], \n",
" [0, 2, 1],\n",
" [2, 1, 0],\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "82039bcf",
"metadata": {},
"outputs": [],
"source": [
"def matrix_matrix_multiplication(U, V):\n",
" assert U.shape[1] == V.shape[0]\n",
" \n",
" num_rows = U.shape[0]\n",
" num_cols = V.shape[1]\n",
" \n",
" result = np.zeros((num_rows, num_cols))\n",
" \n",
" for i in range(num_cols):\n",
" vi = V[:, i]\n",
" Uvi = matrix_vector_multiplication(U, vi)\n",
" result[:, i] = Uvi\n",
" \n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ab0e5aba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[14. , 20. , 13. ],\n",
" [ 5. , 6. , 5. ],\n",
" [ 5. , 8.5, 9. ]])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matrix_matrix_multiplication(U, V)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8d0e3b73",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[14. , 20. , 13. ],\n",
" [ 5. , 6. , 5. ],\n",
" [ 5. , 8.5, 9. ]])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"U.dot(V)"
]
},
{
"cell_type": "markdown",
"id": "fb2cdbdd",
"metadata": {},
"source": [
"## Identity matrix"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "ca913560",
"metadata": {},
"outputs": [],
"source": [
"I = np.eye(3)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "0614d05f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1. , 1. , 2. ],\n",
" [0. , 0.5, 1. ],\n",
" [0. , 2. , 1. ],\n",
" [2. , 1. , 0. ]])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"V"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "aabbf2ad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1. , 1. , 2. ],\n",
" [0. , 0.5, 1. ],\n",
" [0. , 2. , 1. ],\n",
" [2. , 1. , 0. ]])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"V.dot(I)"
]
},
{
"cell_type": "markdown",
"id": "e8f786ef",
"metadata": {},
"source": [
"## Inverse"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "3e6fc747",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1. , 1. , 2. ],\n",
" [0. , 0.5, 1. ],\n",
" [0. , 2. , 1. ]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Vs = V[[0, 1, 2]]\n",
"Vs"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "5265b91e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1. , -2. , 0. ],\n",
" [ 0. , -0.66666667, 0.66666667],\n",
" [ 0. , 1.33333333, -0.33333333]])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Vs_inv = np.linalg.inv(Vs)\n",
"Vs_inv"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3cd1d98b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 0., 0.],\n",
" [0., 1., 0.],\n",
" [0., 0., 1.]])"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Vs_inv.dot(Vs)"
]
},
{
"cell_type": "markdown",
"id": "1c54ed7d",
"metadata": {},
"source": [
"### Next \n",
"\n",
"Intro to Pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64d8bdce",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: 01-intro/notebooks/09-pandas.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"id": "3473239e",
"metadata": {},
"source": [
"# Machine Learning Zoomcamp\n",
"\n",
"## 1.9 Introduction to Pandas\n",
"\n",
"Plan:\n",
"\n",
    "* DataFrames\n",
"* Series\n",
"* Index\n",
"* Accessing elements\n",
"* Element-wise operations\n",
"* Filtering\n",
"* String operations\n",
"* Summarizing operations\n",
"* Missing values\n",
"* Grouping\n",
"* Getting the NumPy arrays"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b1a23fb2",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "06e3062c",
"metadata": {},
"source": [
"## DataFrames"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "114c8ddb",
"metadata": {},
"outputs": [],
"source": [
"data = [\n",
" ['Nissan', 'Stanza', 1991, 138, 4, 'MANUAL', 'sedan', 2000],\n",
" ['Hyundai', 'Sonata', 2017, None, 4, 'AUTOMATIC', 'Sedan', 27150],\n",
" ['Lotus', 'Elise', 2010, 218, 4, 'MANUAL', 'convertible', 54990],\n",
" ['GMC', 'Acadia', 2017, 194, 4, 'AUTOMATIC', '4dr SUV', 34450],\n",
" ['Nissan', 'Frontier', 2017, 261, 6, 'MANUAL', 'Pickup', 32340],\n",
"]\n",
"\n",
"columns = [\n",
" 'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',\n",
" 'Transmission Type', 'Vehicle_Style', 'MSRP'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c25c6c9d",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data, columns=columns)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "abe4d2e4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr SUV</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"2 Lotus Elise 2010 218.0 4 MANUAL \n",
"3 GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"1 Sedan 27150 \n",
"2 convertible 54990 \n",
"3 4dr SUV 34450 \n",
"4 Pickup 32340 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f104d442",
"metadata": {},
"outputs": [],
"source": [
"data = [\n",
" {\n",
" \"Make\": \"Nissan\",\n",
" \"Model\": \"Stanza\",\n",
" \"Year\": 1991,\n",
" \"Engine HP\": 138.0,\n",
" \"Engine Cylinders\": 4,\n",
" \"Transmission Type\": \"MANUAL\",\n",
" \"Vehicle_Style\": \"sedan\",\n",
" \"MSRP\": 2000\n",
" },\n",
" {\n",
" \"Make\": \"Hyundai\",\n",
" \"Model\": \"Sonata\",\n",
" \"Year\": 2017,\n",
" \"Engine HP\": None,\n",
" \"Engine Cylinders\": 4,\n",
" \"Transmission Type\": \"AUTOMATIC\",\n",
" \"Vehicle_Style\": \"Sedan\",\n",
" \"MSRP\": 27150\n",
" },\n",
" {\n",
" \"Make\": \"Lotus\",\n",
" \"Model\": \"Elise\",\n",
" \"Year\": 2010,\n",
" \"Engine HP\": 218.0,\n",
" \"Engine Cylinders\": 4,\n",
" \"Transmission Type\": \"MANUAL\",\n",
" \"Vehicle_Style\": \"convertible\",\n",
" \"MSRP\": 54990\n",
" },\n",
" {\n",
" \"Make\": \"GMC\",\n",
" \"Model\": \"Acadia\",\n",
" \"Year\": 2017,\n",
" \"Engine HP\": 194.0,\n",
" \"Engine Cylinders\": 4,\n",
" \"Transmission Type\": \"AUTOMATIC\",\n",
" \"Vehicle_Style\": \"4dr SUV\",\n",
" \"MSRP\": 34450\n",
" },\n",
" {\n",
" \"Make\": \"Nissan\",\n",
" \"Model\": \"Frontier\",\n",
" \"Year\": 2017,\n",
" \"Engine HP\": 261.0,\n",
" \"Engine Cylinders\": 6,\n",
" \"Transmission Type\": \"MANUAL\",\n",
" \"Vehicle_Style\": \"Pickup\",\n",
" \"MSRP\": 32340\n",
" }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2d89579e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr SUV</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"2 Lotus Elise 2010 218.0 4 MANUAL \n",
"3 GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"1 Sedan 27150 \n",
"2 convertible 54990 \n",
"3 4dr SUV 34450 \n",
"4 Pickup 32340 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "097f69d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"1 Sedan 27150 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0961a097",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "94b432db",
"metadata": {},
"source": [
"## Series"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7299a212",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (1897567212.py, line 1)",
"output_type": "error",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"/tmp/ipykernel_580/1897567212.py\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m df.Engine HP\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"df.Engine HP"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "54898f9d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 138.0\n",
"1 NaN\n",
"2 218.0\n",
"3 194.0\n",
"4 261.0\n",
"Name: Engine HP, dtype: float64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Engine HP']"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "acc40580",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model MSRP\n",
"0 Nissan Stanza 2000\n",
"1 Hyundai Sonata 27150\n",
"2 Lotus Elise 54990\n",
"3 GMC Acadia 34450\n",
"4 Nissan Frontier 32340"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[['Make', 'Model', 'MSRP']]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "9c699894",
"metadata": {},
"outputs": [],
"source": [
"df['id'] = [1, 2, 3, 4, 5]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "70ec4449",
"metadata": {},
"outputs": [],
"source": [
"df['id'] = [10, 20, 30, 40, 50]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "ff30947e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" <th>id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr SUV</td>\n",
" <td>34450</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" <td>50</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"2 Lotus Elise 2010 218.0 4 MANUAL \n",
"3 GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP id \n",
"0 sedan 2000 10 \n",
"1 Sedan 27150 20 \n",
"2 convertible 54990 30 \n",
"3 4dr SUV 34450 40 \n",
"4 Pickup 32340 50 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "064e3e7c",
"metadata": {},
"outputs": [],
"source": [
"del df['id']"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "5206c3ba",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr SUV</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"2 Lotus Elise 2010 218.0 4 MANUAL \n",
"3 GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"1 Sedan 27150 \n",
"2 convertible 54990 \n",
"3 4dr SUV 34450 \n",
"4 Pickup 32340 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"id": "530a4af3",
"metadata": {},
"source": [
"## Index\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "69e9bfbd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=5, step=1)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.index"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "d7e06c93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=5, step=1)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.Make.index"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "14213eb4",
"metadata": {},
"outputs": [],
"source": [
"df.index = ['a', 'b', 'c', 'd', 'e']"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "d9074134",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>a</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr SUV</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>e</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"a Nissan Stanza 1991 138.0 4 MANUAL \n",
"b Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"c Lotus Elise 2010 218.0 4 MANUAL \n",
"d GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"e Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"a sedan 2000 \n",
"b Sedan 27150 \n",
"c convertible 54990 \n",
"d 4dr SUV 34450 \n",
"e Pickup 32340 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "a1c57024",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>b</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>e</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"b Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"c Lotus Elise 2010 218.0 4 MANUAL \n",
"e Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"b Sedan 27150 \n",
"c convertible 54990 \n",
"e Pickup 32340 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[[1, 2, 4]]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "764c2aad",
"metadata": {},
"outputs": [],
"source": [
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "f338e70b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>Sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr SUV</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"2 Lotus Elise 2010 218.0 4 MANUAL \n",
"3 GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"1 Sedan 27150 \n",
"2 convertible 54990 \n",
"3 4dr SUV 34450 \n",
"4 Pickup 32340 "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"id": "53457e10",
"metadata": {},
"source": [
"## Accessing elements"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "758f6b8e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ecfc3f22",
"metadata": {},
"source": [
"## Element-wise operations"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "318183a8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 276.0\n",
"1 NaN\n",
"2 436.0\n",
"3 388.0\n",
"4 522.0\n",
"Name: Engine HP, dtype: float64"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Engine HP'] * 2"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "ae5d726d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 True\n",
"2 False\n",
"3 True\n",
"4 True\n",
"Name: Year, dtype: bool"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Year'] >= 2015"
]
},
{
"cell_type": "markdown",
"id": "5813ae3a",
"metadata": {},
"source": [
"## Filtering"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "73699361",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"4 Pickup 32340 "
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\n",
" df['Make'] == 'Nissan'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "0f29ed0f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>Pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"4 Pickup 32340 "
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\n",
" (df['Make'] == 'Nissan') & (df['Year'] >= 2015)\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "0a8a3d49",
"metadata": {},
"source": [
"## String operations"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "7deaf57a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'machine_learning_zoomcamp'"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'machine learning zoomcamp'.replace(' ', '_')"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "9ffa16ea",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 sedan\n",
"1 sedan\n",
"2 convertible\n",
"3 4dr suv\n",
"4 pickup\n",
"Name: Vehicle_Style, dtype: object"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Vehicle_Style'].str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "835c3a40",
"metadata": {},
"outputs": [],
"source": [
"df['Vehicle_Style'] = df['Vehicle_Style'].str.replace(' ', '_').str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "5ee197dc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Model</th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>Transmission Type</th>\n",
" <th>Vehicle_Style</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nissan</td>\n",
" <td>Stanza</td>\n",
" <td>1991</td>\n",
" <td>138.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>sedan</td>\n",
" <td>2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Hyundai</td>\n",
" <td>Sonata</td>\n",
" <td>2017</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>sedan</td>\n",
" <td>27150</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Lotus</td>\n",
" <td>Elise</td>\n",
" <td>2010</td>\n",
" <td>218.0</td>\n",
" <td>4</td>\n",
" <td>MANUAL</td>\n",
" <td>convertible</td>\n",
" <td>54990</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GMC</td>\n",
" <td>Acadia</td>\n",
" <td>2017</td>\n",
" <td>194.0</td>\n",
" <td>4</td>\n",
" <td>AUTOMATIC</td>\n",
" <td>4dr_suv</td>\n",
" <td>34450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Frontier</td>\n",
" <td>2017</td>\n",
" <td>261.0</td>\n",
" <td>6</td>\n",
" <td>MANUAL</td>\n",
" <td>pickup</td>\n",
" <td>32340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Model Year Engine HP Engine Cylinders Transmission Type \\\n",
"0 Nissan Stanza 1991 138.0 4 MANUAL \n",
"1 Hyundai Sonata 2017 NaN 4 AUTOMATIC \n",
"2 Lotus Elise 2010 218.0 4 MANUAL \n",
"3 GMC Acadia 2017 194.0 4 AUTOMATIC \n",
"4 Nissan Frontier 2017 261.0 6 MANUAL \n",
"\n",
" Vehicle_Style MSRP \n",
"0 sedan 2000 \n",
"1 sedan 27150 \n",
"2 convertible 54990 \n",
"3 4dr_suv 34450 \n",
"4 pickup 32340 "
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"id": "0f0d1bc6",
"metadata": {},
"source": [
"## Summarizing operations"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "0e6bb68a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Year</th>\n",
" <th>Engine HP</th>\n",
" <th>Engine Cylinders</th>\n",
" <th>MSRP</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>5.00</td>\n",
" <td>4.00</td>\n",
" <td>5.00</td>\n",
" <td>5.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>2010.40</td>\n",
" <td>202.75</td>\n",
" <td>4.40</td>\n",
" <td>30186.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>11.26</td>\n",
" <td>51.30</td>\n",
" <td>0.89</td>\n",
" <td>18985.04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1991.00</td>\n",
" <td>138.00</td>\n",
" <td>4.00</td>\n",
" <td>2000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2010.00</td>\n",
" <td>180.00</td>\n",
" <td>4.00</td>\n",
" <td>27150.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>2017.00</td>\n",
" <td>206.00</td>\n",
" <td>4.00</td>\n",
" <td>32340.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2017.00</td>\n",
" <td>228.75</td>\n",
" <td>4.00</td>\n",
" <td>34450.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>2017.00</td>\n",
" <td>261.00</td>\n",
" <td>6.00</td>\n",
" <td>54990.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Year Engine HP Engine Cylinders MSRP\n",
"count 5.00 4.00 5.00 5.00\n",
"mean 2010.40 202.75 4.40 30186.00\n",
"std 11.26 51.30 0.89 18985.04\n",
"min 1991.00 138.00 4.00 2000.00\n",
"25% 2010.00 180.00 4.00 27150.00\n",
"50% 2017.00 206.00 4.00 32340.00\n",
"75% 2017.00 228.75 4.00 34450.00\n",
"max 2017.00 261.00 6.00 54990.00"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe().round(2)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "ca689f7b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Make 4\n",
"Model 5\n",
"Year 3\n",
"Engine HP 4\n",
"Engine Cylinders 2\n",
"Transmission Type 2\n",
"Vehicle_Style 4\n",
"MSRP 5\n",
"dtype: int64"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
]
},
{
"cell_type": "markdown",
"id": "0318652d",
"metadata": {},
"source": [
"## Missing values\n"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "05000331",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Make 0\n",
"Model 0\n",
"Year 0\n",
"Engine HP 1\n",
"Engine Cylinders 0\n",
"Transmission Type 0\n",
"Vehicle_Style 0\n",
"MSRP 0\n",
"dtype: int64"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"id": "963eded9",
"metadata": {},
"source": [
"## Grouping\n"
]
},
{
"cell_type": "markdown",
"id": "7bf5fad5",
"metadata": {},
"source": [
"```\n",
"SELECT \n",
" transmission_type,\n",
" MAX(MSRP)\n",
"FROM\n",
" cars\n",
"GROUP BY\n",
" transmission_type\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "6310552b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Transmission Type\n",
"AUTOMATIC 34450\n",
"MANUAL 54990\n",
"Name: MSRP, dtype: int64"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('Transmission Type').MSRP.max()"
]
},
{
"cell_type": "markdown",
"id": "3de63a4b",
"metadata": {},
"source": [
"## Getting the NumPy arrays"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "749f764c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 2000, 27150, 54990, 34450, 32340])"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.MSRP.values"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "1ff56c15",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'Make': 'Nissan',\n",
" 'Model': 'Stanza',\n",
" 'Year': 1991,\n",
" 'Engine HP': 138.0,\n",
" 'Engine Cylinders': 4,\n",
" 'Transmission Type': 'MANUAL',\n",
" 'Vehicle_Style': 'sedan',\n",
" 'MSRP': 2000},\n",
" {'Make': 'Hyundai',\n",
" 'Model': 'Sonata',\n",
" 'Year': 2017,\n",
" 'Engine HP': nan,\n",
" 'Engine Cylinders': 4,\n",
" 'Transmission Type': 'AUTOMATIC',\n",
" 'Vehicle_Style': 'sedan',\n",
" 'MSRP': 27150},\n",
" {'Make': 'Lotus',\n",
" 'Model': 'Elise',\n",
" 'Year': 2010,\n",
" 'Engine HP': 218.0,\n",
" 'Engine Cylinders': 4,\n",
" 'Transmission Type': 'MANUAL',\n",
" 'Vehicle_Style': 'convertible',\n",
" 'MSRP': 54990},\n",
" {'Make': 'GMC',\n",
" 'Model': 'Acadia',\n",
" 'Year': 2017,\n",
" 'Engine HP': 194.0,\n",
" 'Engine Cylinders': 4,\n",
" 'Transmission Type': 'AUTOMATIC',\n",
" 'Vehicle_Style': '4dr_suv',\n",
" 'MSRP': 34450},\n",
" {'Make': 'Nissan',\n",
" 'Model': 'Frontier',\n",
" 'Year': 2017,\n",
" 'Engine HP': 261.0,\n",
" 'Engine Cylinders': 6,\n",
" 'Transmission Type': 'MANUAL',\n",
" 'Vehicle_Style': 'pickup',\n",
" 'MSRP': 32340}]"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.to_dict(orient='records')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1e6fae3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: 02-regression/01-car-price-intro.md
================================================
## 2.1 Car price prediction project
<a href="https://www.youtube.com/watch?v=vM3SqPNlStE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=12"><img src="images/thumbnail-2-01.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-21-car-price-prediction-project)
## Notes
This project is about creating a model that helps users predict car prices. The dataset was obtained from [this
Kaggle dataset](https://www.kaggle.com/CooperUnion/cardataset).
**Project plan:**
* Prepare data and Exploratory data analysis (EDA)
* Use linear regression for predicting price
* Understanding the internals of linear regression
* Evaluating the model with RMSE
* Feature engineering
* Regularization
* Using the model
The code and dataset are available at this [link](https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/chapter-02-car-price).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/18/ml-zoomcamp-2023-machine-learning-for-regression-part-1/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Next: [Data preparation](02-data-preparation.md)
================================================
FILE: 02-regression/02-data-preparation.md
================================================
## 2.2 Data preparation
<a href="https://www.youtube.com/watch?v=Kd74oR4QWGM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=13"><img src="images/thumbnail-2-02.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
**Pandas attributes and methods:**
* `pd.read_csv(<file_path_string>)` -> read csv files
* `df.head()` -> take a look at the first rows of the dataframe
* `df.columns` -> retrieve column names of a dataframe
* `df.columns.str.lower()` -> lowercase all the letters
* `df.columns.str.replace(' ', '_')` -> replace the space separator
* `df.dtypes` -> retrieve data types of all features
* `df.index` -> retrieve indices of a dataframe
The entire code of this project is available in [this jupyter notebook](notebook.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/18/ml-zoomcamp-2023-machine-learning-for-regression-part-1/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Car price prediction project](01-car-price-intro.md)
* Next: [Exploratory data analysis](03-eda.md)
================================================
FILE: 02-regression/03-eda.md
================================================
## 2.3 Exploratory data analysis
<a href="https://www.youtube.com/watch?v=k6k8sQ0GhPM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=14"><img src="images/thumbnail-2-03.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
**Pandas attributes and methods:**
* `df[col].unique()` -> return an array of unique values in the series
* `df[col].nunique()` -> return the number of unique values in the series
* `df.isnull().sum()` -> return the number of null values in each column of the dataframe
**Matplotlib and seaborn methods:**
* `%matplotlib inline` -> ensure that plots are displayed in jupyter notebook's cells
* `sns.histplot()` -> show the histogram of a series
**Numpy methods:**
* `np.log1p()` -> apply log transformation to a variable, after adding one to each input value.
Long-tail distributions usually confuse the ML models, so the recommendation is to transform the target variable distribution to a normal one whenever possible.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/19/ml-zoomcamp-2023-machine-learning-for-regression-part-2/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Data preparation](02-data-preparation.md)
* Next: [Setting up the validation framework](04-validation-framework.md)
================================================
FILE: 02-regression/04-validation-framework.md
================================================
## 2.4 Setting up the validation framework
<a href="https://www.youtube.com/watch?v=ck0IfiPaQi0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=15"><img src="images/thumbnail-2-04.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
In general, the dataset is split into three parts: training, validation, and test. For each partition, we need to obtain feature matrices (X) and vectors of targets (y). First, the size of the partitions is calculated. Next, the records are shuffled to ensure that the values in the three partitions contain non-sequential records from the dataset. Finally, the partitions are created using the shuffled indices.
**Pandas attributes and methods:**
* `df.iloc[]` -> return subsets of records of a dataframe, being selected by numerical indices
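* `df.reset_index()` -> replace the index with the default integer index (pass `drop=True` to discard the original indices instead of keeping them as a column)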
* `del df[col]` -> eliminate a column variable
**Numpy methods:**
* `np.arange()` -> return an array of numbers
* `np.random.shuffle()` -> shuffle an array in place (it returns `None`)
* `np.random.seed()` -> set a seed for reproducibility
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/19/ml-zoomcamp-2023-machine-learning-for-regression-part-3/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Exploratory data analysis](03-eda.md)
* Next: [Linear regression](05-linear-regression-simple.md)
================================================
FILE: 02-regression/05-linear-regression-simple.md
================================================
## 2.5 Linear regression
<a href="https://www.youtube.com/watch?v=Dn1eTQLsOdA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=16"><img src="images/thumbnail-2-05.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
Linear regression is a model for solving regression tasks, in which the objective is to fit a line to the data and make predictions on new values. The input of this model is the **feature matrix** `X`, and the output is a `y` **vector of predictions** that should be as close as possible to the **actual** `y` values. The linear regression formula is the sum of the bias term ($w_0$), which is the prediction when there is no information about the features, and each of the feature values multiplied by its corresponding weight ($x_{i1} \cdot w_1 + x_{i2} \cdot w_2 + ... + x_{in} \cdot w_n$).
So the simple linear regression formula looks like:
$g(x_i) = w_0 + x_{i1} \cdot w_1 + x_{i2} \cdot w_2 + ... + x_{in} \cdot w_n$.
And that can be further simplified as:
$g(x_i) = w_0 + \displaystyle\sum_{j=1}^{n} w_j \cdot x_{ij}$
Here is a simple implementation of Linear Regression in python:
~~~~python
w0 = 7.1  # bias term: the prediction when no feature information is used

def linear_regression(xi):
    """Predict the target for a single feature vector ``xi``.

    Computes ``w0 + sum(w[j] * xi[j])`` over all entries of ``xi``.
    ``xi`` must not have more entries than the weight list ``w``.
    """
    n = len(xi)

    pred = w0
    w = [0.01, 0.04, 0.002]  # one weight per feature
    for j in range(n):
        pred = pred + w[j] * xi[j]
    return pred
~~~~
If we look at the $\displaystyle\sum_{j=1}^{n} w_j \cdot x_{ij}$ part in the above equation, we know that this is nothing else but a vector-vector multiplication. Hence, we can rewrite the equation as $g(x_i) = w_0 + x_i^T \cdot w$
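We need to ensure that the result is shown on the untransformed scale by applying the inverse function: `np.expm1()` if the target was transformed with `np.log1p()` (or `np.exp()` if it was transformed with a plain `np.log()`).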
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/20/ml-zoomcamp-2023-machine-learning-for-regression-part-4/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Setting up the validation framework](04-validation-framework.md)
* Next: [Linear regression: vector form](06-linear-regression-vector.md)
================================================
FILE: 02-regression/06-linear-regression-vector.md
================================================
## 2.6 Linear regression: vector form
<a href="https://www.youtube.com/watch?v=YkyevnYyAww&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=17"><img src="images/thumbnail-2-06.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
The formula of linear regression can be written compactly as the dot product between features and weights. To include the *bias* term, the feature vector is extended with an extra feature $x_{i0}$ that is always equal to one, so the bias becomes $w_0 \cdot x_{i0}$ with $x_{i0} = 1$.
When all the records are included, the linear regression can be calculated with the dot product between ***feature matrix*** and ***vector of weights***, obtaining the `y` vector of predictions.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.wordpress.com/2023/09/20/ml-zoomcamp-2023-machine-learning-for-regression-part-5/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Linear regression](05-linear-regression-simple.md)
* Next: [Training linear regression: Normal equation](07-linear-regression-training.md)
================================================
FILE: 02-regression/07-linear-regression-training.md
================================================
## 2.7 Training linear regression: Normal equation
<a href="https://www.youtube.com/watch?v=hx6nak-Y11g&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=18"><img src="images/thumbnail-2-07.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
Obtaining predictions as close as possible to the $y$ target values requires calculating the weights from the general
LR equation. The feature matrix usually does not
have an inverse because it is not square, so an approximate solution is required, which can be
obtained using the **Gram matrix**
(the product of the transposed feature matrix ($X^T$) and the feature matrix ($X$)). The vector of weights or coefficients $w$ obtained with this
formula is the closest possible solution to the LR system.
Normal Equation:
$w$ = $(X^TX)^{-1}X^Ty$
Where:
$X^TX$ is the Gram Matrix
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/21/ml-zoomcamp-2023-machine-learning-for-regression-part-6/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Linear regression: vector form](06-linear-regression-vector.md)
* Next: [Baseline model for car price prediction project](08-baseline-model.md)
================================================
FILE: 02-regression/08-baseline-model.md
================================================
## 2.8 Baseline model for car price prediction project
<a href="https://www.youtube.com/watch?v=SvPpMMYtYbU&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=19"><img src="images/thumbnail-2-08.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
* In this lesson we build a baseline model and apply the `df_train` dataset to derive weights for the bias (w0) and the features (w). For this, we use the `train_linear_regression(X, y)` function from the previous lesson.
* Linear regression only applies to numerical features. Therefore, only the numerical features from `df_train` are used for the feature matrix.
* We notice some of the features in `df_train` are `nan`. We set them to `0` for the sake of simplicity, so the model is solvable, but it would be more appropriate to use a non-zero value as the filler (e.g. the mean value of the feature).
* Once the weights are calculated, we apply them in $g(X) = w_0 + X \cdot w$ to derive the predicted y vector.
* Then we plot both predicted y and the actual y on the same histogram for a visual comparison.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/21/ml-zoomcamp-2023-machine-learning-for-regression-part-7/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Training linear regression: Normal equation](07-linear-regression-training.md)
* Next: [Root mean squared error](09-rmse.md)
================================================
FILE: 02-regression/09-rmse.md
================================================
## 2.9 Root Mean Squared Error (RMSE)
<a href="https://www.youtube.com/watch?v=0LWoFtbzNUM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=20"><img src="images/thumbnail-2-09.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
* In the previous lesson we found out our predictions were a bit off from the actual target values in the training dataset. We need a way to quantify how good or bad the model is. This is where RMSE can be of help.
* Root Mean Squared Error (RMSE) is a way to evaluate regression models. It measures the error associated with the model being evaluated. This numerical figure can then be used to compare models, enabling us to choose the one that gives the best predictions.
$$RMSE = \sqrt{ \frac{1}{m} \sum_{i=1}^{m} {(g(x_i) - y_i)^2}}$$
- $g(x_i)$ is the prediction
- $y_i$ is the actual value
- $m$ is the number of observations in the dataset (i.e. cars)
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/22/ml-zoomcamp-2023-machine-learning-for-regression-part-8/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Baseline model for car price prediction project](08-baseline-model.md)
* Next: [Using RMSE on validation data](10-car-price-validation.md)
================================================
FILE: 02-regression/10-car-price-validation.md
================================================
## 2.10 Computing RMSE on validation data
<a href="https://www.youtube.com/watch?v=rawGPXg2ofE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=21"><img src="images/thumbnail-2-10.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
In this lesson, the RMSE is calculated on the validation partition of the car price dataset. In this way, we have a metric to evaluate the model's
performance.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/22/ml-zoomcamp-2023-machine-learning-for-regression-part-8/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Root mean squared error](09-rmse.md)
* Next: [Feature engineering](11-feature-engineering.md)
================================================
FILE: 02-regression/11-feature-engineering.md
================================================
## 2.11 Feature engineering
Feature engineering is the process of creating new features.
<a href="https://www.youtube.com/watch?v=-aEShw4ftB0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=22"><img src="images/thumbnail-2-11.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
A new feature, the age of the car, was added to the dataset. It is obtained by subtracting each car's year from the maximum year of the cars in the dataset.
This new feature improved the model performance, measured with the RMSE and comparing the distributions of y target variable and predictions.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/22/ml-zoomcamp-2023-machine-learning-for-regression-part-9/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Using RMSE on validation data](10-car-price-validation.md)
* Next: [Categorical variables](12-categorical-variables.md)
================================================
FILE: 02-regression/12-categorical-variables.md
================================================
## 2.12 Categorical variables
<a href="https://www.youtube.com/watch?v=sGLAToAAMa4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=23"><img src="images/thumbnail-2-12.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
Categorical variables are typically represented as strings, and pandas identifies them as object types. However, some variables that appear to be numerical may actually be categorical (e.g., the number of doors a car has). All these categorical variables need to be converted to a numerical form because ML
models can interpret only numerical features. It is possible to incorporate certain categories from a feature, not necessarily all of them.
This transformation from categorical to numerical variables is known as One-Hot encoding.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/23/ml-zoomcamp-2023-machine-learning-for-regression-part-10/)
## Comments
This way of encoding categorical features is called "one-hot encoding".
We'll learn more about it in Session 3.
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Feature engineering](11-feature-engineering.md)
* Next: [Regularization](13-regularization.md)
================================================
FILE: 02-regression/13-regularization.md
================================================
## 2.13 Regularization
<a href="https://www.youtube.com/watch?v=91ve3EJlHBc&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=24"><img src="images/thumbnail-2-13.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
If the feature matrix has duplicate columns (or columns that can be expressed as a linear combination of other columns), its Gram matrix ($X^TX$) will not have an inverse. However, sometimes the inversion does not raise an error because certain values are slightly different
between the duplicated columns.
So, if we apply the normal equation with this feature matrix, the weights associated with the duplicated columns are very large, which decreases
the model performance. To solve this issue, one alternative is adding a small number to the diagonal of the Gram matrix ($X^TX$), which corresponds to regularization.
This technique
works because the addition of small values to the diagonal makes it less likely to have duplicated columns. The regularization value is a hyperparameter of the model. After applying
regularization the model performance improved.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/23/ml-zoomcamp-2023-machine-learning-for-regression-part-11/)
## Comments
### Linear combination
I mentioned the term *linear combination* in the video, but didn't explain what it means.
So if you're interested in what it means, you can read about it here:
* One column is a linear combination of others when you can express it as a weighted sum of the other columns
* The simplest example is when a column is an exact duplicate of another column
* Another example. Let's say we have 3 columns: `a`, `b`, `c`. If `c = 0.2 * a + 0.5 * b`, then `c` is a linear combination of `a` and `b`
* More formal definition: https://en.wikipedia.org/wiki/Linear_combination
### Ridge Regression
The regularization technique used (adding a factor to the diagonals of Gram Matrix) in this lesson is Ridge Regression. Further explanations are available in this [DataTalks.Club article](https://datatalks.club/blog/regularization-in-regression.html).
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Categorical variables](12-categorical-variables.md)
* Next: [Tuning the model](14-tuning-model.md)
================================================
FILE: 02-regression/14-tuning-model.md
================================================
## 2.14 Tuning the model
<a href="https://www.youtube.com/watch?v=lW-YVxPgzQw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=25"><img src="images/thumbnail-2-14.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
Tuning the model consisted of finding the best regularization hyperparameter value, using the validation partition of the dataset. The model was then trained with this regularization value.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/24/ml-zoomcamp-2023-machine-learning-for-regression-part-12/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Regularization](13-regularization.md)
* Next: [Using the model](15-using-model.md)
================================================
FILE: 02-regression/15-using-model.md
================================================
## 2.15 Using the model
<a href="https://www.youtube.com/watch?v=KT--uIJozes&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=26"><img src="images/thumbnail-2-15.jpg"></a>
[Slides](https://www.slideshare.net/AlexeyGrigorev/ml-zoomcamp-2-slides)
## Notes
After finding the best model and its parameters, it was trained with training and validation partitions and the final RMSE was calculated on the test partition.
Finally, the final model was used to predict the price of new cars.
The entire code of this project is available in [this jupyter notebook](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb).
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Peter Ernicke](https://knowmledge.com/2023/09/24/ml-zoomcamp-2023-machine-learning-for-regression-part-12/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Tuning the model](14-tuning-model.md)
* Next: [Car price prediction project summary](16-summary.md)
================================================
FILE: 02-regression/16-summary.md
================================================
## 2.16 Car price prediction project summary
<a href="https://www.youtube.com/watch?v=_qI01YXbyro&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=27"><img src="images/thumbnail-2-16.jpg"></a>
## Notes
In summary, this session covered some topics, including data preparation, exploratory data analysis, the validation framework, linear regression model, LR vector and
normal forms, the baseline model, root mean squared error, feature engineering, regularization, tuning the model, and using the best model with new data. All these concepts
were explained using the problem to predict the price of cars.
<table>
<tr>
<td>⚠️</td>
<td>
The notes are written by the community. <br>
If you see an error here, please create a PR with a fix.
</td>
</tr>
</table>
* [Notes from Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/02_regression/Summary_Session_02.md)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Using the model](15-using-model.md)
* Next: [Explore more](17-explore-more.md)
================================================
FILE: 02-regression/17-explore-more.md
================================================
## 2.17 Explore more
### Questions
* In this project, we included only 5 top features. What happens if we include 10?
> That's not a graded homework, it's just for you if you want to try more things on this project
### Other projects
Here are other datasets that you can play with to learn more about the topic:
* [California housing dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) - predict the price of a house
* [Student Performance Data Set](https://archive.ics.uci.edu/ml/datasets/Student+Performance) - predict the performance of students
* UCI ML Repository contains a lot of other datasets suitable for practicing regression - https://archive.ics.uci.edu/ml/datasets.php?task=reg
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Car price prediction project summary](16-summary.md)
* Next: [Homework](homework.md)
================================================
FILE: 02-regression/README.md
================================================
## 2. Machine Learning for Regression
- 2.1 [Car price prediction project](01-car-price-intro.md)
- 2.2 [Data preparation](02-data-preparation.md)
- 2.3 [Exploratory data analysis](03-eda.md)
- 2.4 [Setting up the validation framework](04-validation-framework.md)
- 2.5 [Linear regression](05-linear-regression-simple.md)
- 2.6 [Linear regression: vector form](06-linear-regression-vector.md)
- 2.7 [Training linear regression: Normal equation](07-linear-regression-training.md)
- 2.8 [Baseline model for car price prediction project](08-baseline-model.md)
- 2.9 [Root mean squared error](09-rmse.md)
- 2.10 [Using RMSE on validation data](10-car-price-validation.md)
- 2.11 [Feature engineering](11-feature-engineering.md)
- 2.12 [Categorical variables](12-categorical-variables.md)
- 2.13 [Regularization](13-regularization.md)
- 2.14 [Tuning the model](14-tuning-model.md)
- 2.15 [Using the model](15-using-model.md)
- 2.16 [Car price prediction project summary](16-summary.md)
- 2.17 [Explore more](17-explore-more.md)
- 2.18 [Homework](homework.md)
## Community notes
Did you take notes? You can share them here (or in each unit separately)
* [Notes from Kwang Yang](https://www.kaggle.com/kwangyangchia/notebook-for-lesson-2-mle)
* [Notes from Sebastián Ayala Ruano](https://github.com/sayalaruano/100DaysOfMLCode/blob/main/Regression/Notes/NotesDay5.md)
* [Notes from Ayoub Berdeddouch](https://github.com/ayoub-berdeddouch/mlbookcamp-homeworks/blob/main/Regression/homework_Regression_AyoubBerdeddouch.ipynb)
* [Notes from Alvaro Navas](https://github.com/ziritrion/ml-zoomcamp/blob/main/notes/02_linear_regression.md)
* [Notes from froukje](https://github.com/froukje/ml-zoomcamp/blob/main/week2/Lecture_2_car_price_prediction.ipynb)
* [Notes from Jon Areas](https://github.com/jxareas/Machine-Learning-Bookcamp-2022/blob/master/notes/02-regression.md)
* [Notes from Memoona Tahira](https://github.com/MemoonaTahira/MLZoomcamp2022/blob/main/Notes/Week_2-linear_regression/readme.md)
* [Notes from Wesley Barreto](https://github.com/wgb-10/ML-Zoomcamp-2022/blob/main/Session-Projects/02-Regression/my-notebook.ipynb)
* [Notes from Hareesh Tummala](https://github.com/tummala-hareesh/ml_zoomcamp_ht/blob/main/notes/week-2-notes.md)
* [Notes from Anneysha Sarkar](https://github.com/Anneysha7/ml-zoomcamp-2023/blob/main/course-notes/week-2.md)
* [Notes from Peter Ernicke](https://knowmledge.com/category/courses/ml-zoomcamp/regression/)
* [Notes from Marcos Benício](https://github.com/marcosbenicio/DataScience/blob/main/01Regression/car_price.ipynb)
* [Notes from Oscar Garcia](https://github.com/ozkary/machine-learning-engineering/tree/main/02-regression)
* [Notes from Maximilien Eyengue](https://github.com/maxim-eyengue/Python-Codes/blob/main/ML_Zoomcamp_2024/02_regression/Summary_Session_02.md)
* [Notes from Kemal Dahha](https://github.com/kemaldahha/machine-learning-course/blob/main/week_2_notes.ipynb)
* [Notes from Nitin Gupta (2025 cohort)](https://github.com/niting9881/ML-zoomcamp-local/blob/main/02-regression/Linear_Regression_FAQ.md)
* Add your notes here
================================================
FILE: 02-regression/homework.md
================================================
## Homework
* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/02-regression/homework.md)
* For 2024 cohort homework, check [the 2024 cohort folder](../cohorts/2024/02-regression/homework.md)
* For 2023 cohort homework, check [the 2023 cohort folder](../cohorts/2023/02-regression/homework.md)
* For 2022 cohort homework, check [the 2022 cohort folder](../cohorts/2022/02-regression/homework.md)
* For 2021 cohort homework and solution, check [the 2021 cohort folder](../cohorts/2021/02-regression/)
## Navigation
* [Machine Learning Zoomcamp course](../)
* [Session 2: Machine Learning for Regression](./)
* Previous: [Explore more](17-explore-more.md)
================================================
FILE: 02-regression/meta.json
================================================
{
"data": "meta.csv",
"session": 2,
"name": "Machine Learning for Regression"
}
================================================
FILE: 02-regression/notebook.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Machine Learning for Regression\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.2 Data preparation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2021-09-18 22:31:04-- https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1475504 (1,4M) [text/plain]\n",
"Saving to: ‘data.csv’\n",
"\n",
"data.csv 100%[===================>] 1,41M 9,27MB/s in 0,2s \n",
"\n",
"2021-09-18 22:31:04 (9,27 MB/s) - ‘data.csv’ saved [1475504/1475504]\n",
"\n"
]
}
],
"source": [
"!wget $data "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df.columns = df.columns.str.lower().str.replace(' ', '_')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 bmw\n",
"1 bmw\n",
"2 bmw\n",
"3 bmw\n",
"4 bmw\n",
" ... \n",
"11909 acura\n",
"11910 acura\n",
"11911 acura\n",
"11912 acura\n",
"11913 lincoln\n",
"Name: make, Length: 11914, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['make'].str.lower().str.replace(' ', '_')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['make',\n",
" 'model',\n",
" 'engine_fuel_type',\n",
" 'transmission_type',\n",
" 'driven_wheels',\n",
" 'market_category',\n",
" 'vehicle_size',\n",
" 'vehicle_style']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"strings = list(df.dtypes[df.dtypes == 'object'].index)\n",
"strings"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"for col in strings:\n",
" df[col] = df[col].str.lower().str.replace(' ', '_')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make object\n",
"model object\n",
"year int64\n",
"engine_fuel_type object\n",
"engine_hp float64\n",
"engine_cylinders float64\n",
"transmission_type object\n",
"driven_wheels object\n",
"number_of_doors float64\n",
"market_category object\n",
"vehicle_size object\n",
"vehicle_style object\n",
"highway_mpg int64\n",
"city_mpg int64\n",
"popularity int64\n",
"msrp int64\n",
"dtype: object"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.3 Exploratory data analysis"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"make\n",
"['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler']\n",
"48\n",
"\n",
"model\n",
"['1_series_m' '1_series' '100' '124_spider' '190-class']\n",
"914\n",
"\n",
"year\n",
"[2011 2012 2013 1992 1993]\n",
"28\n",
"\n",
"engine_fuel_type\n",
"['premium_unleaded_(required)' 'regular_unleaded'\n",
" 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel']\n",
"10\n",
"\n",
"engine_hp\n",
"[335. 300. 230. 320. 172.]\n",
"356\n",
"\n",
"engine_cylinders\n",
"[ 6. 4. 5. 8. 12.]\n",
"9\n",
"\n",
"transmission_type\n",
"['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown']\n",
"5\n",
"\n",
"driven_wheels\n",
"['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive'\n",
" 'four_wheel_drive']\n",
"4\n",
"\n",
"number_of_doors\n",
"[ 2. 4. 3. nan]\n",
"3\n",
"\n",
"market_category\n",
"['factory_tuner,luxury,high-performance' 'luxury,performance'\n",
" 'luxury,high-performance' 'luxury' 'performance']\n",
"71\n",
"\n",
"vehicle_size\n",
"['compact' 'midsize' 'large']\n",
"3\n",
"\n",
"vehicle_style\n",
"['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback']\n",
"16\n",
"\n",
"highway_mpg\n",
"[26 28 27 25 24]\n",
"59\n",
"\n",
"city_mpg\n",
"[19 20 18 17 16]\n",
"69\n",
"\n",
"popularity\n",
"[3916 3105 819 617 1013]\n",
"48\n",
"\n",
"msrp\n",
"[46135 40650 36350 29450 34500]\n",
"6049\n",
"\n"
]
}
],
"source": [
"for col in df.columns:\n",
" print(col)\n",
" print(df[col].unique()[:5])\n",
" print(df[col].nunique())\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>make</th>\n",
" <th>model</th>\n",
" <th>year</th>\n",
" <th>engine_fuel_type</th>\n",
" <th>engine_hp</th>\n",
" <th>engine_cylinders</th>\n",
" <th>transmission_type</th>\n",
" <th>driven_wheels</th>\n",
" <th>number_of_doors</th>\n",
" <th>market_category</th>\n",
" <th>vehicle_size</th>\n",
" <th>vehicle_style</th>\n",
" <th>highway_mpg</th>\n",
" <th>city_mpg</th>\n",
" <th>popularity</th>\n",
" <th>msrp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>bmw</td>\n",
" <td>1_series_m</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>335.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>factory_tuner,luxury,high-performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>26</td>\n",
" <td>19</td>\n",
" <td>3916</td>\n",
" <td>46135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury,performance</td>\n",
" <td>compact</td>\n",
" <td>convertible</td>\n",
" <td>28</td>\n",
" <td>19</td>\n",
" <td>3916</td>\n",
" <td>40650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury,high-performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>28</td>\n",
" <td>20</td>\n",
" <td>3916</td>\n",
" <td>36350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>230.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury,performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>28</td>\n",
" <td>18</td>\n",
" <td>3916</td>\n",
" <td>29450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>230.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury</td>\n",
" <td>compact</td>\n",
" <td>convertible</td>\n",
" <td>28</td>\n",
" <td>18</td>\n",
" <td>3916</td>\n",
" <td>34500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11909</th>\n",
" <td>acura</td>\n",
" <td>zdx</td>\n",
" <td>2012</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>automatic</td>\n",
" <td>all_wheel_drive</td>\n",
" <td>4.0</td>\n",
" <td>crossover,hatchback,luxury</td>\n",
" <td>midsize</td>\n",
" <td>4dr_hatchback</td>\n",
" <td>23</td>\n",
" <td>16</td>\n",
" <td>204</td>\n",
" <td>46120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11910</th>\n",
" <td>acura</td>\n",
" <td>zdx</td>\n",
" <td>2012</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>automatic</td>\n",
" <td>all_wheel_drive</td>\n",
" <td>4.0</td>\n",
" <td>crossover,hatchback,luxury</td>\n",
" <td>midsize</td>\n",
" <td>4dr_hatchback</td>\n",
" <td>23</td>\n",
" <td>16</td>\n",
" <td>204</td>\n",
" <td>56670</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11911</th>\n",
" <td>acura</td>\n",
" <td>zdx</td>\n",
" <td>2012</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>automatic</td>\n",
" <td>all_wheel_drive</td>\n",
" <td>4.0</td>\n",
" <td>crossover,hatchback,luxury</td>\n",
" <td>midsize</td>\n",
" <td>4dr_hatchback</td>\n",
" <td>23</td>\n",
" <td>16</td>\n",
" <td>204</td>\n",
" <td>50620</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11912</th>\n",
" <td>acura</td>\n",
" <td>zdx</td>\n",
" <td>2013</td>\n",
" <td>premium_unleaded_(recommended)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>automatic</td>\n",
" <td>all_wheel_drive</td>\n",
" <td>4.0</td>\n",
" <td>crossover,hatchback,luxury</td>\n",
" <td>midsize</td>\n",
" <td>4dr_hatchback</td>\n",
" <td>23</td>\n",
" <td>16</td>\n",
" <td>204</td>\n",
" <td>50920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11913</th>\n",
" <td>lincoln</td>\n",
" <td>zephyr</td>\n",
" <td>2006</td>\n",
" <td>regular_unleaded</td>\n",
" <td>221.0</td>\n",
" <td>6.0</td>\n",
" <td>automatic</td>\n",
" <td>front_wheel_drive</td>\n",
" <td>4.0</td>\n",
" <td>luxury</td>\n",
" <td>midsize</td>\n",
" <td>sedan</td>\n",
" <td>26</td>\n",
" <td>17</td>\n",
" <td>61</td>\n",
" <td>28995</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>11914 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" make model year engine_fuel_type engine_hp \\\n",
"0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0 \n",
"1 bmw 1_series 2011 premium_unleaded_(required) 300.0 \n",
"2 bmw 1_series 2011 premium_unleaded_(required) 300.0 \n",
"3 bmw 1_series 2011 premium_unleaded_(required) 230.0 \n",
"4 bmw 1_series 2011 premium_unleaded_(required) 230.0 \n",
"... ... ... ... ... ... \n",
"11909 acura zdx 2012 premium_unleaded_(required) 300.0 \n",
"11910 acura zdx 2012 premium_unleaded_(required) 300.0 \n",
"11911 acura zdx 2012 premium_unleaded_(required) 300.0 \n",
"11912 acura zdx 2013 premium_unleaded_(recommended) 300.0 \n",
"11913 lincoln zephyr 2006 regular_unleaded 221.0 \n",
"\n",
" engine_cylinders transmission_type driven_wheels number_of_doors \\\n",
"0 6.0 manual rear_wheel_drive 2.0 \n",
"1 6.0 manual rear_wheel_drive 2.0 \n",
"2 6.0 manual rear_wheel_drive 2.0 \n",
"3 6.0 manual rear_wheel_drive 2.0 \n",
"4 6.0 manual rear_wheel_drive 2.0 \n",
"... ... ... ... ... \n",
"11909 6.0 automatic all_wheel_drive 4.0 \n",
"11910 6.0 automatic all_wheel_drive 4.0 \n",
"11911 6.0 automatic all_wheel_drive 4.0 \n",
"11912 6.0 automatic all_wheel_drive 4.0 \n",
"11913 6.0 automatic front_wheel_drive 4.0 \n",
"\n",
" market_category vehicle_size vehicle_style \\\n",
"0 factory_tuner,luxury,high-performance compact coupe \n",
"1 luxury,performance compact convertible \n",
"2 luxury,high-performance compact coupe \n",
"3 luxury,performance compact coupe \n",
"4 luxury compact convertible \n",
"... ... ... ... \n",
"11909 crossover,hatchback,luxury midsize 4dr_hatchback \n",
"11910 crossover,hatchback,luxury midsize 4dr_hatchback \n",
"11911 crossover,hatchback,luxury midsize 4dr_hatchback \n",
"11912 crossover,hatchback,luxury midsize 4dr_hatchback \n",
"11913 luxury midsize sedan \n",
"\n",
" highway_mpg city_mpg popularity msrp \n",
"0 26 19 3916 46135 \n",
"1 28 19 3916 40650 \n",
"2 28 20 3916 36350 \n",
"3 28 18 3916 29450 \n",
"4 28 18 3916 34500 \n",
"... ... ... ... ... \n",
"11909 23 16 204 46120 \n",
"11910 23 16 204 56670 \n",
"11911 23 16 204 50620 \n",
"11912 23 16 204 50920 \n",
"11913 26 17 61 28995 \n",
"\n",
"[11914 rows x 16 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Distribution of price"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='msrp', ylabel='Count'>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAATAklEQVR4nO3df/BddX3n8ecLIlRrC2iyLBtgQyVq0a4VUwTpdF3TVWS7ht1FirWauqHZtlR023VX3ZllauuMzjpScSpMJlBRWZFStsZdKsMCbS1T0YBWmkQlC1KSiRIIxFbXH9H3/nE/gdvk+83nAt/7vffL9/mYufM953M+59z3PVx4cX7cz0lVIUnSoRw26QIkSdPPsJAkdRkWkqQuw0KS1GVYSJK6lky6gHFYunRprVixYtJlSNKCcscddzxYVctmWvaUDIsVK1awefPmSZchSQtKkvtmW+ZpKElSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUtdT8hfcT9YvX/Br7Hpw70Htxy09io9tvHwCFUnSZBkWM9j14F6WnX3Rwe03XDqBaiRp8jwNJUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdY01LJL8xyRbkvxNko8n+ZEkJyW5Pcn2JJ9IckTre2Sb396Wrxjazjta+1eSvGqcNUuSDja2sEiyHLgIWFVVLwQOB84H3gtcUlUnAw8D69oq64CHW/slrR9JTmnrvQA4C/hQksPHVbck6WDjPg21BHh6kiXAM4BdwCuA69ryq4Bz2vSaNk9bvjpJWvs1VfXdqroX2A6cNua6JUlDxhYWVbUTeB/wtwxCYi9wB/BIVe1r3XYAy9v0cuD+tu6+1v/Zw+0zrPOoJOuTbE6yeffu3XP/gSRpERvnaahjGBwVnAT8E+BHGZxGGouq2lBVq6pq1bJly8b1NpK0KI3zNNTPA/dW1e6q+j5wPXAmcHQ7LQVwPLCzTe8ETgBoy48CHhpun2EdSdI8GGdY/C1wepJntGsPq4GtwK3Aua3PWuCTbXpTm6ctv6WqqrWf3+6WOglYCXxujHVLkg6wpN/liamq25NcB9wJ7AO+AGwA/jdwTZLfa21XtFWuAD6aZDuwh8EdUFTVliTXMgiafcCFVfWDcdUtSTrY2MICoKouBi4+oPkeZribqaq+A7x2lu28G3j3nBcoSRqJv+CWJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkrrGGhZJjk5yXZIvJ9mW5Iwkz0pyU5K7299jWt8kuTTJ9iRfSnLq0HbWtv53J1k7zpolSQcb95HFB4BPV9XzgRcB24C3AzdX1Urg5jYP8GpgZXutBy4DSPIs4GLgpcBpwMX7A0aSND/GFhZJjgJ+DrgCoKq+V1WPAGuAq1q3q4Bz2vQa4CM18Fng6CTHAa8CbqqqPVX1MHATcNa46pYkHWycRxYnAbuBP0zyhSQbk/wocGxV7Wp9vg4c26aXA/cPrb+jtc3W/g8kWZ9kc5LNu3fvnuOPIkmL2zjDYglwKnBZVb0Y+BaPnXICoKoKqLl4s6raUFWrqmrVsmXL5mKTkqRmnGGxA9hRVbe3+esYhMc32ukl2t8H2vKdwAlD6x/f2mZrlyTNk7GFRVV9Hbg/yfNa02pgK7AJ2H9H01rgk216E/DGdlfU6cDedrrqRuCVSY5pF7Zf2dokSfNkyZi3/2bg6iRHAP
cAb2IQUNcmWQfcB5zX+t4AnA1sB77d+lJVe5L8LvD51u9dVbVnzHVLkoaMNSyq6ovAqhkWrZ6hbwEXzrKdK4Er57Q4SdLI/AW3JKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHWNFBZJzhylTZL01DTqkcUHR2yTJD0FHXIgwSRnAC8DliX5raFFPw4cPs7CJEnTozfq7BHAM1u/Hxtq/yZw7riKkiRNl0OGRVX9OfDnST5cVffNU02SpCkz6vMsjkyyAVgxvE5VvWIcRUmSpsuoYfFHwOXARuAH4ytHkjSNRg2LfVV12VgrkSRNrVFvnf1Ukt9IclySZ+1/jbUySdLUGPXIYm37+7ahtgJ+Ym7LkSRNo5HCoqpOGnchkqTpNVJYJHnjTO1V9ZG5LUeSNI1GPQ31M0PTPwKsBu4EDAtJWgRGPQ315uH5JEcD14yjIEnS9HmiQ5R/C/A6hiQtEqNes/gUg7ufYDCA4E8C146rKEnSdBn1msX7hqb3AfdV1Y4x1CNJmkIjnYZqAwp+mcHIs8cA3xtnUZKk6TLqk/LOAz4HvBY4D7g9iUOUS9IiMeppqP8K/ExVPQCQZBnwf4DrxlWYJGl6jHo31GH7g6J56HGsK0la4EY9svh0khuBj7f5XwRuGE9JkqRp03sG98nAsVX1tiT/FvjZtuivgKvHXZwkaTr0jix+H3gHQFVdD1wPkOSn2rJ/PcbaJElTonfd4diquuvAxta2YiwVSZKmTi8sjj7EsqfPYR2SpCnWC4vNSX71wMYkFwB3jKckSdK06V2zeCvwP5O8nsfCYRVwBPBvRnmDJIcDm4GdVfULSU5iMGLts9s231BV30tyJIMhz1/C4NbcX6yqr7VtvANYB/wAuKiqbhz5E0qSnrRDHllU1Teq6mXA7wBfa6/fqaozqurrI77HW4BtQ/PvBS6pqpOBhxmEAO3vw639ktaPJKcA5wMvAM4CPtQCSJI0T0YdG+rWqvpge90y6saTHA/8K2Bjmw/wCh775fdVwDltek2bpy1f3fqvAa6pqu9W1b3AduC0UWuQJD154/4V9u8D/xn4YZt/NvBIVe1r8zuA5W16OXA/QFu+t/V/tH2GdR6VZH2SzUk27969e44/hiQtbmMLiyS/ADxQVfNyIbyqNlTVqqpatWzZsvl4S0laNEYd7uOJOBN4TZKzGTy3+8eBDwBHJ1nSjh6OB3a2/juBE4AdSZYARzG40L2/fb/hdSRJ82BsRxZV9Y6qOr6qVjC4QH1LVb0euBXYP7z5WuCTbXpTm6ctv6WqqrWfn+TIdifVSgbDpUuS5sk4jyxm81+Aa5L8HvAF4IrWfgXw0STbgT0MAoaq2pLkWmArg6f0XVhVP5j/siVp8ZqXsKiqPwP+rE3fwwx3M1XVdxg8XGmm9d8NvHt8FUqSDsVnUkiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXWMLiyQnJLk1ydYkW5K8pbU/K8lNSe5uf49p7UlyaZLtSb6U5NShba1t/e9OsnZcNUuSZjbOI4t9wG9X1SnA6cCFSU4B3g7cXFUrgZvbPMCrgZXttR64DAbhAlwMvBQ4Dbh4f8BIkubH2MKiqnZV1Z1t+u+AbcByYA1wVet2FXBOm14DfKQGPgscneQ44FXATVW1p6oeBm4CzhpX3ZKkgy2ZjzdJsgJ4MXA7cGxV7WqLvg4c26aXA/cPrbajtc3WfuB7rGdwRMKJJ544h9U/ZtvWLaw+53UHtR+39Cg+tvHysbynJE2DsYdFkmcCfwy8taq+meTRZVVVSW
ou3qeqNgAbAFatWjUn2zzQ9+swlp190UHtu264dBxvJ0lTY6x3QyV5GoOguLqqrm/N32inl2h/H2jtO4EThlY/vrXN1i5JmifjvBsqwBXAtqp6/9CiTcD+O5rWAp8can9juyvqdGBvO111I/DKJMe0C9uvbG2SpHkyztNQZwJvAO5K8sXW9k7gPcC1SdYB9wHntWU3AGcD24FvA28CqKo9SX4X+Hzr966q2jPGuiVJBxhbWFTVXwKZZfHqGfoXcOEs27oSuHLuqpMkPR7+gluS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktRlWEiSugwLSVKXYSFJ6jIsJEldhoUkqcuwkCR1GRaSpC7DQpLUZVhIkroMC0lSl2EhSeoyLCRJXYaFJKnLsJAkdRkWkqQuw0KS1GVYSJK6DAtJUpdhIUnqMiwkSV2GhSSpy7CQJHUZFpKkLsNCktS1ZNIFPBVs27qF1ee87qD245Yexcc2Xj6BiiRpbhkWc+D7dRjLzr7ooPZdN1w6gWokae55GkqS1GVYSJK6FsxpqCRnAR8ADgc2VtV7JlxS12zXMr72f7/Kiuc896B2r3FImlYLIiySHA78AfAvgR3A55Nsqqqtk63s0Ga7lvGl//7rM7bf8r7/4IVySVNpQYQFcBqwvaruAUhyDbAGmOqweLxmC5fZQmS2I5THe+Tyyxf8Grse3DtjTQaVJIBU1aRr6EpyLnBWVV3Q5t8AvLSqfnOoz3pgfZt9HvCVJ/h2S4EHn0S5i4X7aTTupz730WjmYz/906paNtOChXJk0VVVG4ANT3Y7STZX1ao5KOkpzf00GvdTn/toNJPeTwvlbqidwAlD88e3NknSPFgoYfF5YGWSk5IcAZwPbJpwTZK0aCyI01BVtS/JbwI3Mrh19sqq2jKmt3vSp7IWCffTaNxPfe6j0Ux0Py2IC9ySpMlaKKehJEkTZFhIkroWbVgkOSvJV5JsT/L2GZYfmeQTbfntSVZMoMyJG2E//UqS3Um+2F4XTKLOSUpyZZIHkvzNLMuT5NK2D7+U5NT5rnEajLCfXp5k79B36b/Nd42TluSEJLcm2ZpkS5K3zNBnIt+nRRkWQ8OHvBo4BXhdklMO6LYOeLiqTgYuAd47v1VO3oj7CeATVfXT7bVxXoucDh8GzjrE8lcDK9trPXDZPNQ0jT7MofcTwGeGvkvvmoeaps0+4Ler6hTgdODCGf6dm8j3aVGGBUPDh1TV94D9w4cMWwNc1aavA1YnyTzWOA1G2U+LXlX9BbDnEF3WAB+pgc8CRyc5bn6qmx4j7KdFr6p2VdWdbfrvgG3A8gO6TeT7tFjDYjlw/9D8Dg7+B/Jon6raB+wFnj0v1U2PUfYTwL9rh8PXJTlhhuWL3aj7UXBGkr9O8qdJXjDpYiapnfp+MXD7AYsm8n1arGGhufMpYEVV/TPgJh47GpMerzsZjE30IuCDwJ9MtpzJSfJM4I+Bt1bVNyddDyzesBhl+JBH+yRZAhwFPDQv1U2P7n6qqoeq6rttdiPwknmqbSFxuJoRVNU3q+rv2/QNwNOSLJ1wWfMuydMYBMXVVXX9DF0m8n1arGExyvAhm4C1bfpc4JZafL9g7O6nA86VvobBOVb9Q5uAN7a7WE4H9lbVrkkXNW2S/OP91wWTnMbgv0+L6n/Q2ue/AthWVe+fpdtEvk8LYriPuTbb8CFJ3gVsrqpNDP6BfTTJdgYX5c6fXMWTMeJ+uijJaxjcxbEH+JWJFTwhST4OvBxYmmQHcDHwNICquhy4ATgb2A58G3jTZCqdrBH207nAryfZB/w/4PxF+D9oZwJvAO5K8sXW9k7gRJjs98nhPiRJXYv1NJQk6XEwLCRJXYaFJKnLsJAkdRkWkrTA9QZpnKH/eUODFf6PkdbxbihJWtiS/Bzw9wzGjHphp+9K4FrgFVX1cJJ/VFUP9N7DIwtpCrRRAqQnZKZBGpM8J8mnk9yR5DNJnt8W/S
rwB1X1cFu3GxRgWEhPWpIVSb6c5MNJvprk6iQ/n+S2JHcnOS3JPx96TsMXkvxYe37DZ5JsArYObefqJNvawIzPmPTn04K1AXhzVb0E+E/Ah1r7c4Hntu/nZ5P0ho0HFukvuKUxOBl4LfDvGQyT8kvAzzIYAuWdDH4Bf2FV3dYGiftOW+9U4IVVdW8bZfR5wLrW70rgN4D3zesn0YLXvmMvA/5o6MkKR7a/Sxg8C+PlDMaV+oskP1VVjxxqmx5ZSHPj3qq6q6p+CGwBbm5DVdwFrABuA96f5CLg6DbsPcDnqureoe3cX1W3temPMQgc6fE6DHhk6EFSP11VP9mW7QA2VdX323fvqwzCo7tBSU/ed4emfzg0/0NgSVW9B7gAeDpw29D5428dsJ0D7zjxDhQ9bm1Y83uTvBYefRTri9riP2FwVEEb1fe5wD29bRoW0jxI8px25PFeBqepnj9L1xOTnNGmfwn4y3kpUAtaG6Txr4DnJdmRZB3wemBdkr9mcLS7/ymXNwIPJdkK3Aq8raq6o/t6zUKaH29N8i8YHGlsAf4UOGOGfl9h8NzlK4GtLN7ndetxqKrXzbLooIvX7fTob7XXyPydhTQl2gXu/9W7T16aBE9DSZK6PLKQJHV5ZCFJ6jIsJEldhoUkqcuwkCR1GRaSpK7/D5sufaGjZFscAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.histplot(df.msrp, bins=50)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='msrp', ylabel='Count'>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAEGCAYAAACdJRn3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAY+UlEQVR4nO3de7RedX3n8fenpOC1BOSUSQlMoqId61xkImJtOyoWkXGMM4syUK1RsRkVtdYuLdS1hk5b18LWVQvVQTOQCsqAlNqaKi2laHXqqkjwwv1yBJVkBRME4ywdL9Tv/LF/kceTnOTs5DyXc877tdazzt7f/Xv2/u3sk3zzu+y9U1VIktTHT4y7ApKkhcfkIUnqzeQhSerN5CFJ6s3kIUnqbdm4KzAMRxxxRK1atWrc1ZCkBeXGG298oKqm5lJ2USaPVatWsXnz5nFXQ5IWlCRfnWtZu60kSb2ZPCRJvQ0teSTZmGR7kltmxN+Y5I4ktyb5w4H4OUmmk9yZ5IUD8ZNbbDrJ2cOqryRp7oY55vEB4D3ApbsCSZ4HrAX+bVV9L8lPt/jTgNOBnwN+Bvj7JE9pX3sv8MvAFuCGJJuq6rYh1luStA9DSx5V9ekkq2aEXwecV1Xfa2W2t/ha4IoWvzfJNHB82zZdVfcAJLmilTV5SNIYjXrM4ynALya5PsmnkjyzxY8C7hsot6XFZovvJsn6JJuTbN6xY8cQqi5J2mXUyWMZcDhwAvBW4MokmY8dV9WGqlpTVWumpuY0TVmStJ9GfZ/HFuAj1T0H/nNJfggcAWwFjh4ot7LF2EtckjQmo255/BXwPIA2IH4w8ACwCTg9ySFJVgPHAp8DbgCOTbI6ycF0g+qbRlxnSdIMQ2t5JLkceC5wRJItwLnARmBjm777fWBda4XcmuRKuoHwh4Gzquqf237eAFwDHARsrKpbh1XnXV7+mtey7YGdu8VXHHEoH7rofcM+vCRNvGHOtjpjlk0vn6X8O4B37CF+NXD1PFZtn7Y9sJOpU960e/zqC0ZZDUmaWN5hLknqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSerN5CFJ6s3kIUnqzeQhSeptaMkjycYk29srZ2du+60kleSItp4kFySZTnJTkuMGyq5Lcnf7rBtWfSVJczfMlscHgJNnBpMcDZwEfG0g/CLg2PZZD1zYyh5O9+7zZwHHA+cmOWyIdZYkzcHQkkdVfRp4cA+b3g28DaiB2Frg0up8FlieZAXwQuDaqnqwqh4CrmUPCUmSNFojHfNIshbYWlVfmrHpKOC+gfUtLTZbfE/7Xp9kc5LNO3bsmMdaS5JmGlnySPIY4HeA/z6M/VfVhqpaU1VrpqamhnEISVIzypbHk4DVwJeSfAVYCXw+yb8AtgJHD5Rd2WKzxSVJYzSy5FFVN1fVT1fVqqpaRdcFdVxV3Q9sAl7RZl2dAOysqm3ANcBJSQ5rA+UntZgkaYyGOVX3cuCfgKcm2ZLkzL0Uvxq4B5gG/hfweoCqehD4feCG9vm9FpMkjdGyYe24qs7Yx/ZVA8sFnDVLuY3AxnmtnCTpgHiHuSSpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKk3k4ckqTeThySpN5OHJKm3Yb6GdmOS7UluGYj9UZI7ktyU5C+TLB/Ydk6S6SR3JnnhQPzkFptOcvaw6itJmrthtjw+AJw8I3Yt8PSq+jfAXcA5AEmeBpwO/Fz7zv9MclCSg4D3Ai8Cngac0cpKksZoaMmjqj4NPDgj9ndV9XBb/Sywsi2vBa6oqu9V1b3ANHB8+0xX1T1V9X3gilZWkjRG4xzzeDXwN235KOC+gW1bWmy2uCRpjMaSPJK8HXgYuGwe97k+yeYkm3fs2DFfu5
Uk7cHIk0eSVwIvBl5WVdXCW4GjB4qtbLHZ4rupqg1Vtaaq1kxNTc17vSVJjxhp8khyMvA24CVV9Z2BTZuA05MckmQ1cCzwOeAG4Ngkq5McTDeovmmUdZYk7W7ZsHac5HLgucARSbYA59LNrjoEuDYJwGer6rVVdWuSK4Hb6Lqzzqqqf277eQNwDXAQsLGqbh1WnSVJczO05FFVZ+whfPFeyr8DeMce4lcDV89j1SRJB8g7zCVJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb2ZPCRJvZk8JEm9mTwkSb0NLXkk2Zhke5JbBmKHJ7k2yd3t52EtniQXJJlOclOS4wa+s66VvzvJumHVV5I0d8NseXwAOHlG7Gzguqo6FriurQO8CDi2fdYDF0KXbIBzgWcBxwPn7ko4kqTxGVryqKpPAw/OCK8FLmnLlwAvHYhfWp3PAsuTrABeCFxbVQ9W1UPAteyekCRJIzbqMY8jq2pbW74fOLItHwXcN1BuS4vNFt9NkvVJNifZvGPHjvmttSTpx4xtwLyqCqh53N+GqlpTVWumpqbma7eSpD0YdfL4euuOov3c3uJbgaMHyq1ssdnikqQxGnXy2ATsmjG1DvjoQPwVbdbVCcDO1r11DXBSksPaQPlJLSZJGqNlw9pxksuB5wJHJNlCN2vqPODKJGcCXwVOa8WvBk4BpoHvAK8CqKoHk/w+cEMr93tVNXMQXpI0YkNLHlV1xiybTtxD2QLOmmU/G4GN81g1SdIB8g5zSVJvJg9JUm8mD0lSbyYPSVJvJg9JUm9zSh5JnjOXmCRpaZhry+NP5xiTJC0Be73PI8mzgZ8HppK8ZWDTTwEHDbNikqTJta+bBA8GHtfKPX4g/i3g1GFVSpI02faaPKrqU8Cnknygqr46ojpJkibcXB9PckiSDcCqwe9U1fOHUSlJ0mSba/L4c+B9wEXAPw+vOpKkhWCuyePhqrpwqDWRJC0Yc52q+9dJXp9kRZLDd32GWjNJ0sSaa8tj1wuc3joQK+CJ81sdSdJCMKfkUVWrh10RSdLCMafkkeQVe4pX1aXzWx1J0kIw1zGPZw58fhH4XeAl+3vQJL+Z5NYktyS5PMmjkqxOcn2S6SQfTnJwK3tIW59u21ft73ElSfNjTsmjqt448Pl14Di6O897S3IU8CZgTVU9ne4xJ6cD7wTeXVVPBh4CzmxfORN4qMXf3cpJksZofx/J/m3gQMZBlgGPTrIMeAywDXg+cFXbfgnw0ra8tq3Ttp+YJAdwbEnSAZrrmMdf082ugq6l8K+AK/fngFW1Ncm7gK8B/w/4O+BG4JtV9XArtgU4qi0fBdzXvvtwkp3AE4AHZtRxPbAe4JhjjtmfqkmS5miuU3XfNbD8MPDVqtqyPwdMchhda2I18E26u9dP3p99DaqqDcAGgDVr1tQ+ikuSDsBcxzw+BdxB92Tdw4DvH8AxXwDcW1U7quoHwEeA5wDLWzcWwEpga1veChwN0LYfCnzjAI4vSTpAc32T4GnA54BfAU4Drk+yv49k/xpwQpLHtLGLE4HbgE/yyGPe1wEfbcubeOQmxVOBT1SVLQtJGqO5dlu9HXhmVW0HSDIF/D2PDHDPWVVdn+Qq4PN0XWBfoOtu+jhwRZI/aLGL21cuBj6YZBp4kG5mliRpjOaaPH5iV+JovsH+z9Siqs4Fzp0Rvgc4fg9lv0vX4pEkTYi5Jo+/TXINcHlb/6/A1cOpkiRp0u3rHeZPBo6sqrcm+S/AL7RN/wRcNuzKSZIm075aHn8CnANQVR+hmxlFkn/dtv2nIdZNkjSh9jVucWRV3Twz2GKrhlIjSdLE21fyWL6XbY+ex3pIkhaQfSWPzUl+fWYwyWvoHikiSVqC9jXm8WbgL5O8jEeSxRrgYOA/D7FemnAvf81r2fbAzt3iK444lA9d9L4x1EjSKO01eV
TV14GfT/I84Okt/PGq+sTQa6aJtu2BnUyd8qbd41dfMIbaSBq1ub6G9pN0jw+RJGn/7xKXJC1dc73DXEvUbGMbd9x1N1OnjKFCkiaCyUN7NdvYxk23vm4MtZE0Key2kiT1ZvKQJPVm8pAk9WbykCT15oC5RsI70qXFxeShkfCOdGlxGUvySLIcuIjukScFvBq4E/gw3aPevwKcVlUPJQlwPnAK8B3glVX1+dHXenGbr/s5br/tVk586RkHvB9Jk21cLY/zgb+tqlOTHAw8Bvgd4LqqOi/J2cDZwG8DLwKObZ9nARe2n5pH83U/xw/qJ7wvRFoCRj5gnuRQ4JeAiwGq6vtV9U1gLXBJK3YJ8NK2vBa4tDqfBZYnWTHSSkuSfsw4ZlutBnYAf5bkC0kuSvJYurcWbmtl7geObMtHAfcNfH9Li/2YJOuTbE6yeceOHUOsviRpHMljGXAccGFVPQP4Nl0X1Y9UVdGNhcxZVW2oqjVVtWZqamreKitJ2t04xjy2AFuq6vq2fhVd8vh6khVVta11S21v27cCRw98f2WLaRGYbYDdKbzSZBt58qiq+5Pcl+SpVXUncCJwW/usA85rPz/avrIJeEOSK+gGyncOdG9pgZttgN0pvNJkG9dsqzcCl7WZVvcAr6LrQrsyyZnAV4HTWtmr6abpTtNN1X3V6KsrSRo0luRRVV+kexf6TCfuoWwBZw27TpKkufPZVpKk3kwekqTeTB6SpN58MKImklN4pclm8tBEcgqvNNnstpIk9WbLYwmZ7bHr4CPTJfVj8lhCZnvsOvjIdEn92G0lSerN5CFJ6s3kIUnqzTEPLSje/yFNBpOHFhTv/5Amg91WkqTeTB6SpN5MHpKk3kwekqTexpY8khyU5AtJPtbWVye5Psl0kg+3V9SS5JC2Pt22rxpXnSVJnXG2PH4DuH1g/Z3Au6vqycBDwJktfibwUIu/u5WTJI3RWKbqJlkJ/EfgHcBbkgR4PvCrrcglwO8CFwJr2zLAVcB7kqS921wCvP9DGrVx3efxJ8DbgMe39ScA36yqh9v6FuCotnwUcB9AVT2cZGcr/8DgDpOsB9YDHHPMMcOsuyaQ939IozXybqskLwa2V9WN87nfqtpQVWuqas3U1NR87lqSNMM4Wh7PAV6S5BTgUcBPAecDy5Msa62PlcDWVn4rcDSwJcky4FDgG6OvtiRpl5G3PKrqnKpaWVWrgNOBT1TVy4BPAqe2YuuAj7blTW2dtv0TjndI0nhN0rOtfhu4IskfAF8ALm7xi4EPJpkGHqRLONqL2d4Y6NsCJc2XsSaPqvoH4B/a8j3A8Xso813gV0ZasQVutjcGLsW3BToLSxqOSWp5SPPOWVjScPh4EklSbyYPSVJvdltpSZptLAQcD5HmwuShJWm2sRBwPESaC7utJEm9mTwkSb3ZbbWAeTOgpHExeSxg3gwoaVzstpIk9WbykCT1ZreVNIPPw5L2zeQhzeDzsKR9s9tKktSbLQ9pjuzOkh5h8pDmyO4s6RF2W0mSeht5yyPJ0cClwJFAARuq6vwkhwMfBlYBXwFOq6qHkgQ4HzgF+A7wyqr6/KjrPU7eSS5p0oyj2+ph4Leq6vNJHg/cmORa4JXAdVV1XpKzgbPp3mv+IuDY9nkWcGH7uWR4J7mkSTPybquq2rar5VBV/xe4HTgKWAtc0opdAry0La8FLq3OZ4HlSVaMttaSpEFjHTBPsgp4BnA9cGRVbWub7qfr1oIusdw38LUtLbZtIEaS9cB6gGOOOWZ4lZZmcBaWlqKxJY8kjwP+AnhzVX2rG9roVFUlqT77q6oNwAaANWvW9PqudCCchaWlaCzJI8lP0iWOy6rqIy389SQrqmpb65ba3uJbgaMHvr6yxaSJZotEi9k4ZlsFuBi4var+eGDTJmAdcF77+dGB+BuSXEE3UL5zoHtLmli2SLSYjaPl8Rzg14Cbk3yxxX6HLmlcmeRM4KvAaW3b1XTTdKfppuq+aqS1lS
TtZuTJo6r+Ecgsm0/cQ/kCzhpqpSRJvXiHuSSpN59tJY2YA+laDEwe0og5kK7FwG4rSVJvJg9JUm92W0kTwrEQLSQmD2lCOBaihcTkIU04WySaRCYPacLZItEkMnlIS9xsb6q0ZaO9MXlIi8xsyeArX76LVU96ym7xO+66m19885/uFrdlo70xeUgL1GxjIbMlg5v+6HW+zljzxuQhLVCzjYXMVzKYLTmBXVoyeUiaxWzJCezSksmjl2FPmZytr/qOu+5m6pQD3r00b5w+LJNHD8OeMrntgZ32SWtBmO3vwife9d9MKkuEyUPSvJmvpNJ3+vCwy2t3Jg9JQ9c3qfSdPjxbq71v+f1pOS3VRLRgkkeSk4HzgYOAi6rqvDFXab85tiF1+s4Y29v05D393elbvm+S27WvPSW62b4z2/028xUfVdJaEMkjyUHAe4FfBrYANyTZVFW3jbdmnb6Dh45tSPunb7KZr+nMe5t51vvYs91vM0/xUc2EWxDJAzgemK6qewCSXAGsBSYieexPk9wWhqSFLFU17jrsU5JTgZOr6jVt/deAZ1XVGwbKrAfWt9WnAnfuY7dHAA8MoboLwVI9d897afG8+/uXVTU1l4ILpeWxT1W1Adgw1/JJNlfVmiFWaWIt1XP3vJcWz3u4FspraLcCRw+sr2wxSdIYLJTkcQNwbJLVSQ4GTgc2jblOkrRkLYhuq6p6OMkbgGvopupurKpbD3C3c+7iWoSW6rl73kuL5z1EC2LAXJI0WRZKt5UkaYKYPCRJvS3J5JHk5CR3JplOcva467M/khyd5JNJbktya5LfaPHDk1yb5O7287AWT5IL2jnflOS4gX2ta+XvTrJuIP7vk9zcvnNBkoz+TPcsyUFJvpDkY219dZLrW10/3CZWkOSQtj7dtq8a2Mc5LX5nkhcOxCfy9yPJ8iRXJbkjye1Jnr0UrneS32y/47ckuTzJoxbj9U6yMcn2JLcMxIZ+fWc7xj5V1ZL60A24fxl4InAw8CXgaeOu136cxwrguLb8eOAu4GnAHwJnt/jZwDvb8inA3wABTgCub/HDgXvaz8Pa8mFt2+da2bTvvmjc5z1w/m8B/jfwsbZ+JXB6W34f8Lq2/HrgfW35dODDbflp7dofAqxuvxMHTfLvB3AJ8Jq2fDCwfLFfb+Ao4F7g0QPX+ZWL8XoDvwQcB9wyEBv69Z3tGPus77h/OcZwgZ4NXDOwfg5wzrjrNQ/n9VG6Z3/dCaxosRXAnW35/cAZA+XvbNvPAN4/EH9/i60A7hiI/1i5MZ/rSuA64PnAx9pfhgeAZTOvMd0MvWe35WWtXGZe913lJvX3Azi0/SOaGfFFfb3pksd97R/DZe16v3CxXm9gFT+ePIZ+fWc7xr4+S7Hbatcv4y5bWmzBak3zZwDXA0dW1ba26X7gyLY823nvLb5lD/FJ8CfA24AftvUnAN+sqofb+mBdf3R+bfvOVr7vn8e4rQZ2AH/WuusuSvJYFvn1rqqtwLuArwHb6K7fjSz+673LKK7vbMfYq6WYPBaVJI8D/gJ4c1V9a3Bbdf+VWFRzsZO8GNheVTeOuy4jtoyuS+PCqnoG8G26LoYfWaTX+zC6h6CuBn4GeCxw8lgrNSajuL59jrEUk8eiedRJkp+kSxyXVdVHWvjrSVa07SuA7S0+23nvLb5yD/Fxew7wkiRfAa6g67o6H1ieZNdNr4N1/dH5te2HAt+g/5/HuG0BtlTV9W39Krpkstiv9wuAe6tqR1X9APgI3e/AYr/eu4zi+s52jL1aisljUTzqpM2UuBi4var+eGDTJmDXDIt1dGMhu+KvaLM0TgB2tqbqNcBJSQ5r/8s7ia4PeBvwrSQntGO9YmBfY1NV51TVyqpaRXftPlFVLwM+CZzais08711/Hqe28tXip7fZOauBY+kGFCfy96Oq7gfuS/LUFjqR7pUEi/p603VXnZDkMa1eu857UV/vAaO4vrMdY+/GNTA0zg/dTIW76GZZvH
3c9dnPc/gFuublTcAX2+cUuv7d64C7gb8HDm/lQ/dCrS8DNwNrBvb1amC6fV41EF8D3NK+8x5mDNaO+wM8l0dmWz2R7h+DaeDPgUNa/FFtfbptf+LA99/ezu1OBmYWTervB/DvgM3tmv8V3WyaRX+9gf8B3NHq9kG6GVOL7noDl9ON6/yArqV55iiu72zH2NfHx5NIknpbit1WkqQDZPKQJPVm8pAk9WbykCT1ZvKQJPVm8pAk9WbykCbAwN3S0oJg8pAOUJJV6d6x8YEkdyW5LMkLknymvSPh+CT/IckX2+cLSR6f5LlJ/k+STcBtA/u5LN37Oq5K8phxn5+0J94kKB2g9lTjabonG99K98iLL9HdIfwS4FV07404r6o+0x5m+V26pwR8HHh6Vd3b9nMv8Aut3Ebgtqp614hPSdonWx7S/Li3qm6uqh/SJZDrqvuf2c1072j4DPDHSd4ELK9HHif+uaq6d2A/91XVZ9ryh+gSjDRxTB7S/PjewPIPB9Z/SPfSovOA1wCPBj6T5Gfb9m/P2M/MrgC7BjSRTB7SCCR5UmuZvJOuW+tnZyl6TJJnt+VfBf5xJBWUejJ5SKPx5iS3JLmJ7qmpfzNLuTuBs5LcTvfU3AtHVUGpDwfMpQnRBsw/VlVPH3ddpH2x5SFJ6s2WhySpN1sekqTeTB6SpN5MHpKk3kwekqTeTB6SpN7+P0fq7giiP7SsAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.histplot(df.msrp[df.msrp < 100000], bins=50)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0. , 0.69314718, 2.39789527, 6.90875478, 11.51293546])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.log1p([0, 1, 10, 1000, 100000])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0. , 0.69314718, 2.39789527, 6.90875478, 11.51293546])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.log([0 + 1, 1 + 1, 10 + 1, 1000 + 1, 100000 + 1])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"price_logs = np.log1p(df.msrp)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='msrp', ylabel='Count'>"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAW+ElEQVR4nO3dfbRddX3n8fcHM6BoC2iuNEKYMJpqHacdbYr40A7LWMSMY5iOIo5oRGgGiqW2ThV0rbKsdS1cumrB6YKVAQYoDEgdHVKbqhSfWpcg4UEgBCHDgyQTIBGMs8pYiXznj7MzPeTek30T7jn73Hvfr7XuOnt/9z7nfNGb+zn799tn71QVkiTtyX5dNyBJGn+GhSSplWEhSWplWEiSWhkWkqRWC7puYBgWLlxYS5Ys6boNSZpVbr755u1VNTHVtjkZFkuWLGH9+vVdtyFJs0qSBwdtcxhKktRqaGGR5JIkjya5c4ptH0xSSRY260lyfpJNSW5P8qq+fVclubf5WTWsfiVJgw3zyOJS4Ljdi0kWA8cCP+grvxlY2vysBi5o9n0+cA7wauAo4JwkhwyxZ0nSFIYWFlX1LeCxKTZ9BvgQ0H+dkZXA5dVzA3BwkkXAm4DrquqxqnocuI4pAkiSNFwjnbNIshLYUlXf223TYcBDfeubm9qg+lSvvTrJ+iTrt23bNoNdS5JGFhZJDgQ+AvzRMF6/qtZU1bKqWjYxMeWZX5KkfTTKI4sXA0cC30vyAHA4cEuSXwC2AIv79j28qQ2qS5JGaGRhUVV3VNULq2pJVS2hN6T0qqp6GFgLvKc5K+poYEdVbQW+Ahyb5JBmYvvYpiZJGqFhnjp7FfAd4KVJNic5ZQ+7rwPuAzYB/xX4HYCqegz4OHBT8/PHTU2SNEKZizc/WrZsWfkNbgGcdOppbN2+Y1J90cKDuOKiCzvoSBpfSW6uqmVTbZuTl/uQdtm6fQcTK86cXF93fgfdSLOXl/uQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiwkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrbxEueaEQfetuPuee5lY0UFD0hxjWGhOGHTfits3nN5BN9Lc4zCUJKmVYSFJauUwlOaljXdtYPnx75xym/fnliYzLDQvPVn7TTnHAd6fW5qKw1CSpFZDC4sklyR5NMmdfbVPJbk7ye1Jvpjk4L5tZyfZlOT7Sd7UVz+uqW1Kctaw+pUkDTbMYahLgf8CXN5Xuw44u6p2JvkkcDbw4SQvB04E/iXwIuBvk/xi85w/B34T2AzclGRtVd01xL4HnrPvWLak+WpoYVFV30qyZLfaV/tWbwDe1iyvBK6uqn8E7k+yCTiq2bapqu4DSHJ1s+9Qw2LQOfuOZUuar7qcs3gf8DfN8mHAQ33bNje1QfVJkqxOsj7J+m3btg2hXUmavzoJiyQfBXYCV87Ua1bVmqpaVlXLJiYmZuplJUl0cOpskvcCbwGWV1U15S3A4r7dDm9q7KEuSRqRkR5ZJDkO+BDw1qp6om/TWuDEJAckORJYCnwXuAlYmuTIJPvTmwRfO8qeJUlDPLJIchVwDLAwyWbgHHpnPx0AXJcE4IaqOq2qNiS5ht7E9U7gjKr6WfM67we+AjwLuKSqNgyrZ0nS1IZ5NtRU11K4eA/7fwL4xBT1dcC6GWxNkrSX/Aa3JKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKnVyG9+JI27jXdtYPnxky+avGjhQVxx0YUddCR1z7CQdvNk7cfEijMn1beuO7+DbqTx4DCUJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWo1tLBIckmSR5Pc2Vd7fpLrktzbPB7S1JPk/CSbktye5FV9z1nV7H9vklXD6leSNNgwjywuBY7brXYWcH1VLQWub9YB3gwsbX5WAx
dAL1yAc4BXA0cB5+wKGEnS6AwtLKrqW8Bju5VXApc1y5cBx/fVL6+eG4CDkywC3gRcV1WPVdXjwHVMDiBJ0pCN+hvch1bV1mb5YeDQZvkw4KG+/TY3tUH1SZKspndUwhFHHDGDLWucnHTqaWzdvmNS/e577mViRQcNSfNEZ5f7qKpKUjP4emuANQDLli2bsdfVeNm6fceUl+K4fcPpHXQjzR+jPhvqkWZ4iebx0aa+BVjct9/hTW1QXZI0QqMOi7XArjOaVgHX9tXf05wVdTSwoxmu+gpwbJJDmontY5uaJGmEhjYMleQq4BhgYZLN9M5qOhe4JskpwIPACc3u64AVwCbgCeBkgKp6LMnHgZua/f64qnafNJckDdnQwqKqJt8QoGf5FPsWcMaA17kEuGQGW5Mk7SW/wS1JamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqtaCLN03y+8CpQAF3ACcDi4CrgRcANwPvrqqfJjkAuBz4VeCHwDuq6oEu+tb8tvGuDSw//p2T6osWHsQVF13YQUfS6Iw8LJIcBpwJvLyq/m+Sa4ATgRXAZ6rq6iQXAqcAFzSPj1fVS5KcCHwSeMeo+5aerP2YWHHmpPrWded30I00Wl0NQy0AnpNkAXAgsBV4A/D5ZvtlwPHN8spmnWb78iQZXauSpGmFRZLXTac2HVW1Bfg08AN6IbGD3rDTj6pqZ7PbZuCwZvkw4KHmuTub/V8wRT+rk6xPsn7btm370pokaYDpHll8dpq1VkkOoXe0cCTwIuC5wHH78lr9qmpNVS2rqmUTExPP9OUkSX32OGeR5DXAa4GJJH/Qt+nngWft43u+Ebi/qrY17/EF4HXAwUkWNEcPhwNbmv23AIuBzc2w1UH0JrolSSPSdmSxP/A8eqHyc30/Pwbeto/v+QPg6CQHNnMPy4G7gK/3veYq4NpmeW2zTrP9a1VV+/jekqR9sMcji6r6JvDNJJdW1YMz8YZVdWOSzwO3ADuBW4E1wF8DVyf5k6Z2cfOUi4G/SLIJeIzemVOSpBGa7qmzByRZAyzpf05VvWFf3rSqzgHO2a18H3DUFPv+BHj7vryPJGlmTDcs/hK4ELgI+Nnw2pEkjaPphsXOqrpgqJ1IksbWdE+d/askv5NkUZLn7/oZameSpLEx3SOLXWcj/WFfrYB/MbPtSJLG0bTCoqqOHHYjkqTxNa2wSPKeqepVdfnMtiNJGkfTHYb6tb7lZ9P7It0t9C4dLkma46Y7DPW7/etJDqZ37wlJ0jywr5co/wd6FwKUJM0D052z+Ct6Zz9B7wKCvwRcM6ymJEnjZbpzFp/uW94JPFhVm4fQjyRpDE1rGKq5oODd9K44ewjw02E2JUkaL9O9U94JwHfpXdDvBODGJPt6iXJJ0iwz3WGojwK/VlWPAiSZAP6Wf7pntiRpDpvu2VD77QqKxg/34rmSpFluukcWX07yFeCqZv0dwLrhtCRJGjdt9+B+CXBoVf1hkt8CXt9s+g5w5bCbkySNh7Yjiz8Dzgaoqi8AXwBI8q+abf9uiL1JksZEW1gcWlV37F6sqjuSLBlOSxKcdOppbN2+Y1L97nvuZWJFBw1J81xbWBy8h23PmcE+pKfZun0HEyvOnFS/fcPpHXQjqe2MpvVJfnv3YpJTgZuH05Ikady0HVl8APhiknfxT+GwDNgf+Pf7+qbNVWsvAl5B75pT7wO+D3wOWAI8AJxQVY8nCXAesAJ4AnhvVd2yr+8tSdp7ezyyqKpHquq1wMfo/QF/APhYVb2mqh5+Bu97HvDlqnoZ8CvARuAs4PqqWgpc36wDvBlY2vysBi54Bu8rSdoH072fxdeBr8/EGyY5CPgN4L3Na/8U+GmSlcAxzW6XAd8APgysBC6vqgJuSHJwkkVVtXUm+pEkte
viW9hHAtuA/5bk1iQXJXkuvTOvdgXAw8ChzfJhwEN9z9/c1J4myeok65Os37Zt2xDbl6T5p4uwWAC8Crigql5J70ZKZ/Xv0BxF1BTPHaiq1lTVsqpaNjExMWPNSpK6CYvNwOaqurFZ/zy98HgkySKA5nHXtai2AIv7nn94U5MkjcjIw6KZGH8oyUub0nLgLmAtsKqprQKubZbXAu9Jz9HADucrJGm0pnshwZn2u8CVSfYH7gNOphdc1yQ5BXiQ3n0zoHfBwhXAJnqnzp48+nYlaX7rJCyq6jZ639fY3fIp9i3gjGH3JEkarKsjC2nO2HjXBpYf/85J9UULD+KKiy7soCNp5hkW0jP0ZO035XWstq47v4NupOHwbneSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlqZVhIkloZFpKkVoaFJKmVYSFJamVYSJJaGRaSpFaGhSSplWEhSWplWEiSWhkWkqRWhoUkqZVhIUlq1VlYJHlWkluTfKlZPzLJjUk2Jflckv2b+gHN+qZm+5Kuepak+arLI4vfAzb2rX8S+ExVvQR4HDilqZ8CPN7UP9PsJ0kaoU7CIsnhwL8FLmrWA7wB+Hyzy2XA8c3yymadZvvyZn9J0oh0dWTxZ8CHgKea9RcAP6qqnc36ZuCwZvkw4CGAZvuOZv+nSbI6yfok67dt2zbE1iVp/hl5WCR5C/BoVd08k69bVWuqallVLZuYmJjJl5akeW9BB+/5OuCtSVYAzwZ+HjgPODjJgubo4XBgS7P/FmAxsDnJAuAg4Iejb1uS5q+Rh0VVnQ2cDZDkGOA/V9W7kvwl8DbgamAVcG3zlLXN+nea7V+rqhpx2xqSk049ja3bd0yq333PvUys6KAhSVPq4shikA8DVyf5E+BW4OKmfjHwF0k2AY8BJ3bUn4Zg6/YdTKw4c1L99g2nd9CNpEE6DYuq+gbwjWb5PuCoKfb5CfD2kTYmSXoav8EtSWplWEiSWo3TnIU0p2y8awPLj3/npPqihQdxxUUXdtCRtO8MC2lInqz9ppy837ru/A66kZ4Zh6EkSa0MC0lSK8NCktTKsJAktTIsJEmtDAtJUivDQpLUyrCQJLUyLCRJrQwLSVIrw0KS1MprQ2kkvCOeNLsZFhoJ74gnzW4OQ0mSWhkWkqRWhoUkqZVzFtKIeQc9zUYjD4ski4HLgUOBAtZU1XlJng98DlgCPACcUFWPJwlwHrACeAJ4b1XdMuq+pZniHfQ0G3UxDLUT+GBVvRw4GjgjycuBs4Drq2opcH2zDvBmYGnzsxq4YPQtS9L8NvKwqKqtu44Mqur/ABuBw4CVwGXNbpcBxzfLK4HLq+cG4OAki0bbtSTNb51OcCdZArwSuBE4tKq2NpsepjdMBb0geajvaZubmiRpRDoLiyTPA/4H8IGq+nH/tqoqevMZe/N6q5OsT7J+27ZtM9ipJKmTsEjyz+gFxZVV9YWm/Miu4aXm8dGmvgVY3Pf0w5va01TVmqpaVlXLJiYmhte8JM1DXZwNFeBiYGNV/WnfprXAKuDc5vHavvr7k1wNvBrY0TdcJc0ZnlKrcdbF9yxeB7wbuCPJbU3tI/RC4pokpwAPAic029bRO212E71TZ08eabfSiHhKrcbZyMOiqv4eyIDNy6fYv4AzhtqUJGmPvNyHJKmVYSFJauW1oaQx58S3xoFhIY05J741DhyGkiS1MiwkSa0chtoLjh1Lmq8Mi73g2LGk+cphKElSK8NCktTKYagZ4FyGuuDvnUbJsJgBg+Yyvvbp/+Q/Zg1Nl3NoJ516Glu375hU93d77jIshsgJcc1VW7fv8Hd7njEspDnG4SkNg2EhzTEOi2oYDAtpnnBYVM+EYSFpSoMmsQHuvudeJlZMrjsENncZFppRg/7ADPrjou4N+gN/9z338usf+OyUz7l9w+lT1h0Cm7sMiw7M5U9fg86SGfTHRd0b9Ad+Jv8/cwhs9jMsOuA/HKlnLn9wmmsMC0md8Y
PT7GFYjJHZ9CnLuQlpfpk1YZHkOOA84FnARVV1bsctzbjZNDno3IS6MOhDygP/6x6WvPgXJ9XH8YPWbDUrwiLJs4A/B34T2AzclGRtVd3VbWej0WWIeAShLuztGVq3f+p0h7OGbFaEBXAUsKmq7gNIcjWwEpgXYTHI3obIoE9fe9o28B+nRxAaopk6Q2suDO2OS6+pqq57aJXkbcBxVXVqs/5u4NVV9f6+fVYDq5vVlwLfH3mjT7cQ2N5xD3vDfofLfodnNvUK493vP6+qiak2zJYji1ZVtQZY03UfuyRZX1XLuu5juux3uOx3eGZTrzD7+t1lttwpbwuwuG/98KYmSRqB2RIWNwFLkxyZZH/gRGBtxz1J0rwxK4ahqmpnkvcDX6F36uwlVbWh47bajM2Q2DTZ73DZ7/DMpl5h9vULzJIJbklSt2bLMJQkqUOGhSSplWExBEl+P8mGJHcmuSrJs7vuaZAkv9f0uSHJB7ruZypJLknyaJI7+2rPT3Jdknubx0O67HGXAb2+vfnf96kkY3XK5IB+P5Xk7iS3J/likoM7bPFpBvT78abX25J8NcmLuuyx31T99m37YJJKsrCL3vaWYTHDkhwGnAksq6pX0JuQP7HbrqaW5BXAb9P7hvyvAG9J8pJuu5rSpcBxu9XOAq6vqqXA9c36OLiUyb3eCfwW8K2Rd9PuUib3ex3wiqr6ZeAe4OxRN7UHlzK5309V1S9X1b8GvgT80aib2oNLmdwvSRYDxwI/GHVD+8qwGI4FwHOSLAAOBP53x/0M8kvAjVX1RFXtBL5J74/aWKmqbwGP7VZeCVzWLF8GHD/KngaZqteq2lhVXV9RYEoD+v1q8/sAcAO97zWNhQH9/rhv9bnA2Jy1M+B3F+AzwIcYo17bGBYzrKq2AJ+m94lhK7Cjqr7abVcD3Qn8epIXJDkQWMHTv/w4zg6tqq3N8sPAoV02M4e9D/ibrptok+QTSR4C3sV4HVlMkmQlsKWqvtd1L3vDsJhhzdj5SuBI4EXAc5Oc1G1XU6uqjcAnga8CXwZuA37WZU/7onrnf8+aT2izRZKPAjuBK7vupU1VfbSqFtPr9f1t+3el+VD2EcY80KZiWMy8NwL3V9W2qnoS+ALw2o57GqiqLq6qX62q3wAepzdGPRs8kmQRQPP4aMf9zClJ3gu8BXhXza4vY10J/Ieum9iDF9P7IPm9JA/QG+K7JckvdNrVNBgWM+8HwNFJDkwSYDmwseOeBkrywubxCHrzFf+9246mbS2wqlleBVzbYS9zSnOjsQ8Bb62qJ7rup02SpX2rK4G7u+qlTVXdUVUvrKolVbWE3v15XlVVD3fcWiu/wT0EST4GvIPeIfytwKlV9Y/ddjW1JH8HvAB4EviDqrq+45YmSXIVcAy9Szs/ApwD/E/gGuAI4EHghKqaaiJxpAb0+hjwWWAC+BFwW1W9qaMWn2ZAv2cDBwA/bHa7oapO66TB3QzodwW92xI8Re934bRm7rBzU/VbVRf3bX+A3pmT43rJ8v/PsJAktXIYSpLUyrCQJLUyLCRJrQwLSVIrw0KS1MqwkCS1MiykMdBcdFIaW4aF9AwlWdLc/+HSJPckuTLJG5N8u7nfxlFJ/k1zv4Xbktya5OeSHJPk75KsBe7qe50rk2xM8vnmWkJS5/xSnvQMJVkCbAJeCWwAbgK+B5wCvBU4md59Tc6tqm8neR7wE+D1wF/Tu3fE/c3r3A+8vtnvEuCuqvr0iP+TpEk8spBmxv3NdX+eohcY1zcX4LsDWAJ8G/jTJGcCB/fdL+K7VXV/3+s8VFXfbpavoBcoUucMC2lm9F/766m+9aeABVV1LnAq8Bzg20le1mz/h91eZ/dDfQ/9NRYMC2kEkry4OfL4JL1hqpcN2PWIJK9plv8j8PcjaVBqYVhIo/GBJHcmuZ3eFX4H3X3u+8AZSTYChwAXjKpBaU+c4JbGRDPB/aWqekXXvUi788hCktTKIwtJUiuPLCRJrQwLSV
Irw0KS1MqwkCS1MiwkSa3+H9MtYMrZEHyyAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"\n",
"sns.histplot(price_logs, bins=50)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Missing values"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make 0\n",
"model 0\n",
"year 0\n",
"engine_fuel_type 3\n",
"engine_hp 69\n",
"engine_cylinders 30\n",
"transmission_type 0\n",
"driven_wheels 0\n",
"number_of_doors 6\n",
"market_category 3742\n",
"vehicle_size 0\n",
"vehicle_style 0\n",
"highway_mpg 0\n",
"city_mpg 0\n",
"popularity 0\n",
"msrp 0\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.4 Setting up the validation framework"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's draw it"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"n = len(df)\n",
"\n",
"n_val = int(n * 0.2)\n",
"n_test = int(n * 0.2)\n",
"n_train = n - n_val - n_test"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11914"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2382, 2382, 7150)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n_val, n_test, n_train"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>make</th>\n",
" <th>model</th>\n",
" <th>year</th>\n",
" <th>engine_fuel_type</th>\n",
" <th>engine_hp</th>\n",
" <th>engine_cylinders</th>\n",
" <th>transmission_type</th>\n",
" <th>driven_wheels</th>\n",
" <th>number_of_doors</th>\n",
" <th>market_category</th>\n",
" <th>vehicle_size</th>\n",
" <th>vehicle_style</th>\n",
" <th>highway_mpg</th>\n",
" <th>city_mpg</th>\n",
" <th>popularity</th>\n",
" <th>msrp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2013</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>300.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury,high-performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>28</td>\n",
" <td>20</td>\n",
" <td>3916</td>\n",
" <td>39600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>bmw</td>\n",
" <td>1_series_m</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>335.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>factory_tuner,luxury,high-performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>26</td>\n",
" <td>19</td>\n",
" <td>3916</td>\n",
" <td>46135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2011</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>230.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury,performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>28</td>\n",
" <td>18</td>\n",
" <td>3916</td>\n",
" <td>29450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>bmw</td>\n",
" <td>1_series</td>\n",
" <td>2012</td>\n",
" <td>premium_unleaded_(required)</td>\n",
" <td>230.0</td>\n",
" <td>6.0</td>\n",
" <td>manual</td>\n",
" <td>rear_wheel_drive</td>\n",
" <td>2.0</td>\n",
" <td>luxury,performance</td>\n",
" <td>compact</td>\n",
" <td>coupe</td>\n",
" <td>28</td>\n",
" <td>18</td>\n",
" <td>3916</td>\n",
" <td>31200</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" make model year engine_fuel_type engine_hp \\\n",
"10 bmw 1_series 2013 premium_unleaded_(required) 300.0 \n",
"0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0 \n",
"3 bmw 1_series 2011 premium_unleaded_(required) 230.0 \n",
"5 bmw 1_series 2012 premium_unleaded_(required) 230.0 \n",
"\n",
" engine_cylinders transmission_type driven_wheels number_of_doors \\\n",
"10 6.0 manual rear_wheel_drive 2.0 \n",
"0 6.0 manual rear_wheel_drive 2.0 \n",
gitextract_3sp31muz/
├── .github/
│ └── FUNDING.yml
├── .gitignore
├── 01-intro/
│ ├── 01-what-is-ml.md
│ ├── 02-ml-vs-rules.md
│ ├── 03-supervised-ml.md
│ ├── 04-crisp-dm.md
│ ├── 05-model-selection.md
│ ├── 06-environment.md
│ ├── 07-numpy.md
│ ├── 08-linear-algebra.md
│ ├── 09-pandas.md
│ ├── 10-summary.md
│ ├── README.md
│ ├── homework.md
│ └── notebooks/
│ ├── 07-numpy.ipynb
│ ├── 08-linear-algebra.ipynb
│ └── 09-pandas.ipynb
├── 02-regression/
│ ├── 01-car-price-intro.md
│ ├── 02-data-preparation.md
│ ├── 03-eda.md
│ ├── 04-validation-framework.md
│ ├── 05-linear-regression-simple.md
│ ├── 06-linear-regression-vector.md
│ ├── 07-linear-regression-training.md
│ ├── 08-baseline-model.md
│ ├── 09-rmse.md
│ ├── 10-car-price-validation.md
│ ├── 11-feature-engineering.md
│ ├── 12-categorical-variables.md
│ ├── 13-regularization.md
│ ├── 14-tuning-model.md
│ ├── 15-using-model.md
│ ├── 16-summary.md
│ ├── 17-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.json
│ └── notebook.ipynb
├── 03-classification/
│ ├── 01-churn-project.md
│ ├── 02-data-preparation.md
│ ├── 03-validation.md
│ ├── 04-eda.md
│ ├── 05-risk.md
│ ├── 06-mutual-info.md
│ ├── 07-correlation.md
│ ├── 08-ohe.md
│ ├── 09-logistic-regression.md
│ ├── 10-training-log-reg.md
│ ├── 11-log-reg-interpretation.md
│ ├── 12-using-log-reg.md
│ ├── 13-summary.md
│ ├── 14-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ ├── notebook-scaling-ohe.ipynb
│ └── notebook.ipynb
├── 04-evaluation/
│ ├── 01-overview.md
│ ├── 02-accuracy.md
│ ├── 03-confusion-table.md
│ ├── 04-precision-recall.md
│ ├── 05-roc.md
│ ├── 06-auc.md
│ ├── 07-cross-validation.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── notebook.ipynb
├── 05-deployment/
│ ├── 01-intro.md
│ ├── 02-pickle.md
│ ├── 03-flask-intro.md
│ ├── 04-flask-deployment.md
│ ├── 05-pipenv.md
│ ├── 06-docker.md
│ ├── 07-aws-eb.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── 05-train-churn-model.ipynb
│ │ ├── Dockerfile
│ │ ├── Pipfile
│ │ ├── ping.py
│ │ ├── plan.md
│ │ ├── predict-test.py
│ │ ├── predict.py
│ │ └── train.py
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── workshop/
│ ├── .dockerignore
│ ├── .python-version
│ ├── Dockerfile
│ ├── README.md
│ ├── fly.toml
│ ├── ping.py
│ ├── predict.py
│ ├── predict_old.py
│ ├── pyproject.toml
│ ├── starter.ipynb
│ ├── test.py
│ ├── train.py
│ └── workshop-uv-fastapi.ipynb
├── 06-trees/
│ ├── 01-credit-risk.md
│ ├── 02-data-prep.md
│ ├── 03-decision-trees.md
│ ├── 04-decision-tree-learning.md
│ ├── 05-decision-tree-tuning.md
│ ├── 06-random-forest.md
│ ├── 07-boosting.md
│ ├── 08-xgb-tuning.md
│ ├── 09-final-model.md
│ ├── 10-summary.md
│ ├── 11-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── notebook.ipynb
├── 08-deep-learning/
│ ├── 01-fashion-classification.md
│ ├── 02-tensorflow-keras.md
│ ├── 03-pretrained-models.md
│ ├── 04-conv-neural-nets.md
│ ├── 05-transfer-learning.md
│ ├── 06-learning-rate.md
│ ├── 07-checkpointing.md
│ ├── 08-more-layers.md
│ ├── 09-dropout.md
│ ├── 10-augmentation.md
│ ├── 11-large-model.md
│ ├── 12-using-model.md
│ ├── 13-summary.md
│ ├── 14-explore-more.md
│ ├── README.md
│ ├── homework.md
│ ├── install.md
│ ├── meta.csv
│ ├── meta.json
│ ├── notebook.ipynb
│ └── pytorch/
│ ├── README.md
│ └── install_pytorch.md
├── 09-serverless/
│ ├── 01-intro.md
│ ├── 02-aws-lambda.md
│ ├── 03-tensorflow-lite.md
│ ├── 04-preparing-code.md
│ ├── 05-docker-image.md
│ ├── 06-creating-lambda.md
│ ├── 07-api-gateway.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── Dockerfile
│ │ ├── convert-model.py
│ │ ├── lambda_function.py
│ │ ├── plan.md
│ │ ├── tensorflow-model.ipynb
│ │ └── test.py
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ ├── updates.md
│ └── workshop/
│ ├── README.md
│ ├── lambda-keras/
│ │ ├── .gitignore
│ │ ├── Dockerfile
│ │ ├── convert/
│ │ │ ├── .dockerignore
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── convert-saved-model.py
│ │ ├── lambda_function.py
│ │ ├── test.ipynb
│ │ └── test.py
│ ├── lambda-onnx/
│ │ ├── .gitignore
│ │ ├── Dockerfile
│ │ ├── lambda_function.py
│ │ ├── test.ipynb
│ │ └── test.py
│ ├── lambda-sklearn/
│ │ ├── .dockerignore
│ │ ├── .python-version
│ │ ├── Dockerfile
│ │ ├── customer.json
│ │ ├── deploy.sh
│ │ ├── invoke.py
│ │ ├── lambda_function.py
│ │ ├── pyproject.toml
│ │ └── test.py
│ └── train/
│ ├── .python-version
│ ├── README.md
│ ├── pyproject.toml
│ └── train.py
├── 10-kubernetes/
│ ├── 01-overview.md
│ ├── 02-tensorflow-serving.md
│ ├── 03-preprocessing.md
│ ├── 04-docker-compose.md
│ ├── 05-kubernetes-intro.md
│ ├── 06-kubernetes-simple-service.md
│ ├── 07-kubernetes-tf-serving.md
│ ├── 08-eks.md
│ ├── 09-summary.md
│ ├── 10-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── Pipfile
│ │ ├── README.md
│ │ ├── docker-compose.yaml
│ │ ├── gateway.py
│ │ ├── image-gateway.dockerfile
│ │ ├── image-model.dockerfile
│ │ ├── kube-config/
│ │ │ ├── eks-config.yaml
│ │ │ ├── gateway-deployment.yaml
│ │ │ ├── gateway-service.yaml
│ │ │ ├── model-deployment.yaml
│ │ │ └── model-service.yaml
│ │ ├── ping/
│ │ │ ├── Dockerfile
│ │ │ ├── Pipfile
│ │ │ ├── deployment.yaml
│ │ │ ├── metallb-config.yaml
│ │ │ ├── ping.py
│ │ │ └── service.yaml
│ │ ├── plan.md
│ │ ├── proto.py
│ │ ├── test.py
│ │ └── tf-serving-connect.ipynb
│ ├── homework.md
│ ├── meta.csv
│ ├── meta.json
│ └── workshop/
│ ├── README.md
│ ├── k8s/
│ │ ├── deployment.yaml
│ │ ├── hpa.yaml
│ │ └── service.yaml
│ ├── load_test.py
│ └── service/
│ ├── .gitignore
│ ├── .python-version
│ ├── Dockerfile
│ ├── README.md
│ ├── app.py
│ ├── pyproject.toml
│ └── test.py
├── 11-kserve/
│ ├── 01-overview.md
│ ├── 02-kserve-local.md
│ ├── 03-kserve-sklearn.md
│ ├── 04-kserve-custom-image.md
│ ├── 05-tensorflow-kserve.md
│ ├── 06-kserve-transformers.md
│ ├── 07-kserve-eks-upd.md
│ ├── 07-kserve-eks.md
│ ├── 08-summary.md
│ ├── 09-explore-more.md
│ ├── README.md
│ ├── code/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── churn/
│ │ │ ├── Pipfile
│ │ │ ├── churn-service.yaml
│ │ │ ├── churn-test.py
│ │ │ ├── churn-train.py
│ │ │ └── model.joblib
│ │ ├── clothes/
│ │ │ ├── clothes-service.yaml
│ │ │ ├── convert.py
│ │ │ ├── test-transformer.py
│ │ │ ├── test.ipynb
│ │ │ └── test.py
│ │ ├── eks/
│ │ │ ├── clothes-service.yaml
│ │ │ ├── cluster.yaml
│ │ │ └── test-transformer.py
│ │ ├── image_transfomer/
│ │ │ ├── Dockerfile
│ │ │ ├── Pipfile
│ │ │ └── image_transformer.py
│ │ ├── iris/
│ │ │ ├── iris-example.yaml
│ │ │ ├── iris-request.json
│ │ │ └── iris-test.py
│ │ └── plan.md
│ ├── meta.csv
│ └── meta.json
├── README.md
├── after-sign-up.md
├── article/
│ └── README.md
├── asking-questions.md
├── bento.md
├── certificates.md
├── cohorts/
│ ├── 2021/
│ │ ├── 01-intro/
│ │ │ ├── homework-1.ipynb
│ │ │ └── homework.md
│ │ ├── 02-regression/
│ │ │ ├── homework.ipynb
│ │ │ └── homework.md
│ │ ├── 03-classification/
│ │ │ ├── homework.ipynb
│ │ │ └── homework.md
│ │ ├── 04-evaluation/
│ │ │ ├── homework-4-solution.ipynb
│ │ │ ├── homework-4-starter.ipynb
│ │ │ └── homework.md
│ │ ├── 05-deployment/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── Pipfile
│ │ │ │ ├── homework.md
│ │ │ │ ├── q3_test.py
│ │ │ │ ├── q4_predict.py
│ │ │ │ ├── q4_test.py
│ │ │ │ ├── q6_predict.py
│ │ │ │ └── q6_test.py
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ ├── homework-6-solution.ipynb
│ │ │ ├── homework-6-starter.ipynb
│ │ │ └── homework.md
│ │ ├── 07-midterm-project/
│ │ │ ├── README.md
│ │ │ ├── week10-office-hours.ipynb
│ │ │ ├── week8-office-hours.ipynb
│ │ │ └── week9-office-hours.ipynb
│ │ ├── 08-deep-learning/
│ │ │ ├── CNN_solution.ipynb
│ │ │ ├── homework.md
│ │ │ └── week-11-office-hours.ipynb
│ │ ├── 09-serverless/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── homework.ipynb
│ │ │ │ ├── homework.py
│ │ │ │ └── test.py
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ ├── homework/
│ │ │ │ ├── deployment.yaml
│ │ │ │ └── service.yaml
│ │ │ └── homework.md
│ │ ├── 12-capstone/
│ │ │ └── README.md
│ │ ├── 13-article/
│ │ │ └── README.md
│ │ ├── 14-project/
│ │ │ └── README.md
│ │ ├── leaderboard.md
│ │ └── office-hours.md
│ ├── 2022/
│ │ ├── 01-intro/
│ │ │ ├── homework.md
│ │ │ └── homework_1.ipynb
│ │ ├── 02-regression/
│ │ │ ├── homework.md
│ │ │ └── homework_2.ipynb
│ │ ├── 03-classification/
│ │ │ ├── homework.md
│ │ │ └── homework_3.ipynb
│ │ ├── 04-evaluation/
│ │ │ ├── homework.md
│ │ │ └── homework_4.ipynb
│ │ ├── 05-deployment/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── Pipfile
│ │ │ │ ├── q3_test.py
│ │ │ │ ├── q4_predict.py
│ │ │ │ ├── q4_test.py
│ │ │ │ ├── q6_predict.py
│ │ │ │ └── q6_test.py
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ ├── homework.md
│ │ │ ├── homework_6.ipynb
│ │ │ └── homework_6_starter.ipynb
│ │ ├── 07-bento-production/
│ │ │ ├── homework.md
│ │ │ └── locustfile.py
│ │ ├── 08-deep-learning/
│ │ │ ├── homework.md
│ │ │ └── homework_8.ipynb
│ │ ├── 09-serverless/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── homework.ipynb
│ │ │ │ ├── homework.py
│ │ │ │ └── test.py
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ ├── homework/
│ │ │ │ ├── deployment.yaml
│ │ │ │ ├── hpa.yaml
│ │ │ │ ├── service.yaml
│ │ │ │ └── test.py
│ │ │ └── homework.md
│ │ ├── README.md
│ │ ├── article.md
│ │ ├── leaderboard.md
│ │ └── projects.md
│ ├── 2023/
│ │ ├── 01-intro/
│ │ │ ├── homework.md
│ │ │ └── homework_1.ipynb
│ │ ├── 02-regression/
│ │ │ └── homework.md
│ │ ├── 03-classification/
│ │ │ ├── homework.md
│ │ │ └── homework_3.ipynb
│ │ ├── 04-evaluation/
│ │ │ └── homework.md
│ │ ├── 05-deployment/
│ │ │ ├── homework/
│ │ │ │ ├── Dockerfile
│ │ │ │ ├── Pipfile
│ │ │ │ ├── q3_test.py
│ │ │ │ ├── q4_predict.py
│ │ │ │ ├── q4_test.py
│ │ │ │ ├── q6_predict.py
│ │ │ │ └── q6_test.py
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ └── homework.md
│ │ ├── 08-deep-learning/
│ │ │ ├── homework.ipynb
│ │ │ └── homework.md
│ │ ├── 09-serverless/
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ └── homework.md
│ │ ├── README.md
│ │ ├── article.md
│ │ ├── leaderboard.md
│ │ └── projects.md
│ ├── 2024/
│ │ ├── 01-intro/
│ │ │ └── homework.md
│ │ ├── 02-regression/
│ │ │ └── homework.md
│ │ ├── 03-classification/
│ │ │ └── homework.md
│ │ ├── 04-evaluation/
│ │ │ └── homework.md
│ │ ├── 05-deployment/
│ │ │ └── homework.md
│ │ ├── 06-trees/
│ │ │ └── homework.md
│ │ ├── 08-deep-learning/
│ │ │ └── homework.md
│ │ ├── 09-serverless/
│ │ │ └── homework.md
│ │ ├── 10-kubernetes/
│ │ │ └── homework.md
│ │ ├── README.md
│ │ ├── article.md
│ │ └── projects.md
│ └── 2025/
│ ├── 01-intro/
│ │ ├── homework.md
│ │ └── homework_1.ipynb
│ ├── 02-regression/
│ │ ├── homework.md
│ │ └── homework_2.ipynb
│ ├── 03-classification/
│ │ ├── homework.md
│ │ └── homework_3.ipynb
│ ├── 04-evaluation/
│ │ ├── homework.md
│ │ └── homework_4.ipynb
│ ├── 05-deployment/
│ │ ├── homework/
│ │ │ ├── .python-version
│ │ │ ├── Dockerfile_base
│ │ │ ├── Dockerfile_full
│ │ │ ├── Dockerfile_hw
│ │ │ ├── README.md
│ │ │ ├── main.py
│ │ │ ├── pyproject.toml
│ │ │ ├── q3_test.py
│ │ │ ├── q4_predict.py
│ │ │ ├── q4_test.py
│ │ │ ├── q6_predict.py
│ │ │ └── q6_test.py
│ │ └── homework.md
│ ├── 06-trees/
│ │ ├── homework.ipynb
│ │ └── homework.md
│ ├── 08-deep-learning/
│ │ └── homework.md
│ ├── 09-serverless/
│ │ └── homework.md
│ ├── 10-kubernetes/
│ │ └── homework.md
│ ├── README.md
│ ├── article.md
│ └── projects.md
├── generate-description.ipynb
├── generate-pages.ipynb
├── learning-in-public.md
└── projects/
├── README.md
├── how-to.md
└── project-tips.md
SYMBOL INDEX (76 symbols across 32 files)
FILE: 05-deployment/code/ping.py
function ping (line 6) | def ping():
FILE: 05-deployment/code/predict.py
function predict (line 16) | def predict():
FILE: 05-deployment/code/train.py
function train (line 66) | def train(df_train, y_train, C=1.0):
function predict (line 78) | def predict(df, dv, model):
FILE: 05-deployment/workshop/ping.py
function ping (line 7) | def ping():
FILE: 05-deployment/workshop/predict.py
class Customer (line 11) | class Customer(BaseModel):
class PredictResponse (line 38) | class PredictResponse(BaseModel):
function predict_single (line 49) | def predict_single(customer):
function predict (line 55) | def predict(customer: Customer) -> PredictResponse:
FILE: 05-deployment/workshop/train.py
function load_data (line 20) | def load_data():
function train_model (line 41) | def train_model(df):
function save_model (line 77) | def save_model(pipeline, output_file):
FILE: 09-serverless/code/lambda_function.py
function predict (line 33) | def predict(url):
function lambda_handler (line 45) | def lambda_handler(event, context):
FILE: 09-serverless/workshop/lambda-keras/lambda_function.py
function preprocess_pytorch (line 6) | def preprocess_pytorch(X):
function predict (line 46) | def predict(url):
function lambda_handler (line 53) | def lambda_handler(event, context):
FILE: 09-serverless/workshop/lambda-onnx/lambda_function.py
function preprocess_pytorch_style (line 11) | def preprocess_pytorch_style(X):
function predict (line 55) | def predict(url):
function lambda_handler (line 62) | def lambda_handler(event, context):
FILE: 09-serverless/workshop/lambda-sklearn/lambda_function.py
function predict_single (line 6) | def predict_single(customer):
function lambda_handler (line 10) | def lambda_handler(event, context):
FILE: 09-serverless/workshop/train/train.py
function load_data (line 15) | def load_data():
function train_model (line 34) | def train_model(df):
function save_model (line 70) | def save_model(pipeline, output_file):
FILE: 10-kubernetes/code/gateway.py
function prepare_request (line 28) | def prepare_request(X):
function prepare_response (line 51) | def prepare_response(pb_response):
function predict (line 56) | def predict(url):
function predict_endpoint (line 68) | def predict_endpoint():
FILE: 10-kubernetes/code/ping/ping.py
function ping (line 6) | def ping():
FILE: 10-kubernetes/code/proto.py
function dtypes_as_dtype (line 4) | def dtypes_as_dtype(dtype):
function make_tensor_proto (line 10) | def make_tensor_proto(data):
function np_to_protobuf (line 23) | def np_to_protobuf(data):
FILE: 10-kubernetes/workshop/load_test.py
function send_request (line 13) | def send_request(_):
FILE: 10-kubernetes/workshop/service/app.py
function preprocess_pytorch_style (line 10) | def preprocess_pytorch_style(X):
class PredictRequest (line 46) | class PredictRequest(BaseModel):
class PredictResponse (line 50) | class PredictResponse(BaseModel):
function predict (line 56) | def predict(url: str):
function root (line 69) | def root():
function health (line 74) | def health():
function predict_endpoint (line 79) | def predict_endpoint(request: PredictRequest):
FILE: 11-kserve/code/image_transfomer/image_transformer.py
class ImageTransformer (line 7) | class ImageTransformer(kserve.KFModel):
method __init__ (line 8) | def __init__(self, name: str, predictor_host: str):
method prepare_input (line 25) | def prepare_input(self, url: str) -> List:
method preprocess (line 29) | def preprocess(self, request: Dict) -> Dict:
method postprocess (line 38) | def postprocess(self, response: Dict) -> Dict:
FILE: cohorts/2021/05-deployment/homework/q3_test.py
function load (line 3) | def load(filename):
FILE: cohorts/2021/05-deployment/homework/q4_predict.py
function load (line 7) | def load(filename):
function predict (line 18) | def predict():
FILE: cohorts/2021/05-deployment/homework/q6_predict.py
function load (line 7) | def load(filename):
function predict (line 18) | def predict():
FILE: cohorts/2021/09-serverless/homework/homework.py
function download_image (line 19) | def download_image(url):
function prepare_image (line 27) | def prepare_image(img, target_size):
function prepare_input (line 34) | def prepare_input(x):
function predict (line 47) | def predict(url):
function lambda_handler (line 63) | def lambda_handler(event, context):
FILE: cohorts/2022/05-deployment/homework/q3_test.py
function load (line 4) | def load(filename: str):
FILE: cohorts/2022/05-deployment/homework/q4_predict.py
function load (line 8) | def load(filename: str):
function predict (line 20) | def predict():
FILE: cohorts/2022/05-deployment/homework/q6_predict.py
function load (line 8) | def load(filename: str):
function predict (line 20) | def predict():
FILE: cohorts/2022/07-bento-production/locustfile.py
class MLZoomUser (line 7) | class MLZoomUser(HttpUser):
method classify (line 19) | def classify(self):
FILE: cohorts/2022/09-serverless/homework/homework.py
function download_image (line 19) | def download_image(url):
function prepare_image (line 27) | def prepare_image(img, target_size):
function prepare_input (line 34) | def prepare_input(x):
function predict (line 47) | def predict(url):
function lambda_handler (line 63) | def lambda_handler(event, context):
FILE: cohorts/2023/05-deployment/homework/q3_test.py
function load (line 4) | def load(filename: str):
FILE: cohorts/2023/05-deployment/homework/q4_predict.py
function load (line 8) | def load(filename: str):
function predict (line 20) | def predict():
FILE: cohorts/2023/05-deployment/homework/q6_predict.py
function load (line 8) | def load(filename: str):
function predict (line 20) | def predict():
FILE: cohorts/2025/05-deployment/homework/main.py
function main (line 1) | def main():
FILE: cohorts/2025/05-deployment/homework/q4_predict.py
function predict (line 13) | def predict(lead: dict) -> dict:
FILE: cohorts/2025/05-deployment/homework/q6_predict.py
function predict (line 13) | def predict(lead: dict) -> dict:
Condensed preview — 428 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (7,159K chars).
[
{
"path": ".github/FUNDING.yml",
"chars": 23,
"preview": "github: alexeygrigorev\n"
},
{
"path": ".gitignore",
"chars": 408,
"preview": "# generated\n.ipynb_checkpoints/\n__pycache__/\n**my_dir/\n**logs/\n**models/\n\n# file types\n*.h5\n*.tflite\n*.keras\n*.zip\n*.pdf"
},
{
"path": "01-intro/01-what-is-ml.md",
"chars": 1482,
"preview": "## 1.1 Introduction to Machine Learning\n\n<a href=\"https://www.youtube.com/watch?v=Crm_5n4mvmg&list=PL3MmuxUbc_hIhxl5Ji8t"
},
{
"path": "01-intro/02-ml-vs-rules.md",
"chars": 2090,
"preview": "## 1.2 ML vs Rule-Based Systems\n\n<a href=\"https://www.youtube.com/watch?v=CeukwyUdaz8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
},
{
"path": "01-intro/03-supervised-ml.md",
"chars": 2121,
"preview": "## 1.3 Supervised Machine Learning\n\n<a href=\"https://www.youtube.com/watch?v=j9kcEuGcC2Y&list=PL3MmuxUbc_hIhxl5Ji8t4O6lP"
},
{
"path": "01-intro/04-crisp-dm.md",
"chars": 2232,
"preview": "## 1.4 CRISP-DM\n\n<a href=\"https://www.youtube.com/watch?v=dCa3JvmJbr0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=5\"><"
},
{
"path": "01-intro/05-model-selection.md",
"chars": 2099,
"preview": "## 1.5 Model Selection Process\n\n<a href=\"https://www.youtube.com/watch?v=OH_R0Sl9neM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
},
{
"path": "01-intro/06-environment.md",
"chars": 5442,
"preview": "## Setting up the Environment\n\nIn this section, we'll prepare the environment\n\n\nYou need:\n\n* Python 3.11 (note that vid"
},
{
"path": "01-intro/07-numpy.md",
"chars": 5568,
"preview": "## 1.7 Introduction to NumPy\n\n<a href=\"https://www.youtube.com/watch?v=Qa0-jYtRdbY&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaC"
},
{
"path": "01-intro/08-linear-algebra.md",
"chars": 3004,
"preview": "## 1.8 Linear Algebra Refresher\n\n<a href=\"https://www.youtube.com/watch?v=zZyKUeOR4Gg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
},
{
"path": "01-intro/09-pandas.md",
"chars": 1168,
"preview": "## 1.9 Introduction to Pandas\n\n<a href=\"https://www.youtube.com/watch?v=0j3XK5PsnxA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHa"
},
{
"path": "01-intro/10-summary.md",
"chars": 3044,
"preview": "## 1.10 Summary\n\n<a href=\"https://www.youtube.com/watch?v=VRrEEVeJ440&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=10\">"
},
{
"path": "01-intro/README.md",
"chars": 2155,
"preview": "## 1. Introduction to Machine Learning\n\n- 1.1 [Introduction to Machine Learning](01-what-is-ml.md)\n- 1.2 [ML vs Rule-Bas"
},
{
"path": "01-intro/homework.md",
"chars": 661,
"preview": "## Homework\r\n\r\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/01-intro/homework.md)\r\n* For 2"
},
{
"path": "01-intro/notebooks/07-numpy.ipynb",
"chars": 13219,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"502da6a2\",\n \"metadata\": {},\n \"source\": [\n \"# Machine Lear"
},
{
"path": "01-intro/notebooks/08-linear-algebra.ipynb",
"chars": 10618,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"3aace4b5\",\n \"metadata\": {},\n \"source\": [\n \"# Machine Lear"
},
{
"path": "01-intro/notebooks/09-pandas.ipynb",
"chars": 56323,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"3473239e\",\n \"metadata\": {},\n \"source\": [\n \"# Machine Lear"
},
{
"path": "02-regression/01-car-price-intro.md",
"chars": 1361,
"preview": "\n## 2.1 Car price prediction project\n\n<a href=\"https://www.youtube.com/watch?v=vM3SqPNlStE&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
},
{
"path": "02-regression/02-data-preparation.md",
"chars": 1309,
"preview": "\n## 2.2 Data preparation\n\n<a href=\"https://www.youtube.com/watch?v=Kd74oR4QWGM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&i"
},
{
"path": "02-regression/03-eda.md",
"chars": 1718,
"preview": "\n## 2.3 Exploratory data analysis\n\n<a href=\"https://www.youtube.com/watch?v=k6k8sQ0GhPM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPA"
},
{
"path": "02-regression/04-validation-framework.md",
"chars": 1815,
"preview": "\n## 2.4 Setting up the validation framework\n\n<a href=\"https://www.youtube.com/watch?v=ck0IfiPaQi0&list=PL3MmuxUbc_hIhxl5"
},
{
"path": "02-regression/05-linear-regression-simple.md",
"chars": 2427,
"preview": "\n## 2.5 Linear regression\n\n<a href=\"https://www.youtube.com/watch?v=Dn1eTQLsOdA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&"
},
{
"path": "02-regression/06-linear-regression-vector.md",
"chars": 1464,
"preview": "\n## 2.6 Linear regression: vector form\n\n<a href=\"https://www.youtube.com/watch?v=YkyevnYyAww&list=PL3MmuxUbc_hIhxl5Ji8t4"
},
{
"path": "02-regression/07-linear-regression-training.md",
"chars": 1617,
"preview": "\n## 2.7 Training linear regression: Normal equation\n\n<a href=\"https://www.youtube.com/watch?v=hx6nak-Y11g&list=PL3MmuxUb"
},
{
"path": "02-regression/08-baseline-model.md",
"chars": 1858,
"preview": "\n## 2.8 Baseline model for car price prediction project\n\n<a href=\"https://www.youtube.com/watch?v=SvPpMMYtYbU&list=PL3Mm"
},
{
"path": "02-regression/09-rmse.md",
"chars": 1689,
"preview": "\n## 2.9 Root Mean Squared Error (RMSE)\n\n<a href=\"https://www.youtube.com/watch?v=0LWoFtbzNUM&list=PL3MmuxUbc_hIhxl5Ji8t4"
},
{
"path": "02-regression/10-car-price-validation.md",
"chars": 1143,
"preview": "\n## 2.10 Computing RMSE on validation data\n\n<a href=\"https://www.youtube.com/watch?v=rawGPXg2ofE&list=PL3MmuxUbc_hIhxl5J"
},
{
"path": "02-regression/11-feature-engineering.md",
"chars": 1348,
"preview": "## 2.11 Feature engineering\n\nFeature engineering is the process of creating new features\n\n<a href=\"https://www.youtube.c"
},
{
"path": "02-regression/12-categorical-variables.md",
"chars": 1643,
"preview": "\n## 2.12 Categorical variables\n\n<a href=\"https://www.youtube.com/watch?v=sGLAToAAMa4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
},
{
"path": "02-regression/13-regularization.md",
"chars": 2665,
"preview": "## 2.13 Regularization\n\n<a href=\"https://www.youtube.com/watch?v=91ve3EJlHBc&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&ind"
},
{
"path": "02-regression/14-tuning-model.md",
"chars": 1150,
"preview": "\n## 2.14 Tuning the model\n\n<a href=\"https://www.youtube.com/watch?v=lW-YVxPgzQw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&"
},
{
"path": "02-regression/15-using-model.md",
"chars": 1207,
"preview": "\n## 2.15 Using the model\n\n<a href=\"https://www.youtube.com/watch?v=KT--uIJozes&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&i"
},
{
"path": "02-regression/16-summary.md",
"chars": 1152,
"preview": "\n## 2.16 Car price prediction project summary\n\n<a href=\"https://www.youtube.com/watch?v=_qI01YXbyro&list=PL3MmuxUbc_hIhx"
},
{
"path": "02-regression/17-explore-more.md",
"chars": 963,
"preview": "\n## 2.17 Explore more\n\n### Questions\n\n* In this project, we included only 5 top features. What happens if we include 10?"
},
{
"path": "02-regression/README.md",
"chars": 3088,
"preview": "## 2. Machine Learning for Regression\n\n- 2.1 [Car price prediction project](01-car-price-intro.md)\n- 2.2 [Data preparati"
},
{
"path": "02-regression/homework.md",
"chars": 681,
"preview": "## Homework\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/02-regression/homework.md)\n* For "
},
{
"path": "02-regression/meta.json",
"chars": 91,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 2,\n \"name\": \"Machine Learning for Regression\"\n}"
},
{
"path": "02-regression/notebook.ipynb",
"chars": 116429,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"## 2. Machine Learning for Regressi"
},
{
"path": "03-classification/01-churn-project.md",
"chars": 1612,
"preview": "# 3.1 Churn prediction project\n\n<a href=\"https://www.youtube.com/watch?v=0Zw04wdeTQo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
},
{
"path": "03-classification/02-data-preparation.md",
"chars": 1979,
"preview": "\n## 3.2 Data preparation\n\n<a href=\"https://www.youtube.com/watch?v=VSGGU9gYvdg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
},
{
"path": "03-classification/03-validation.md",
"chars": 1502,
"preview": "\n## 3.3 Setting up the validation framework\n\n<a href=\"https://www.youtube.com/watch?v=_lwz34sOnSE&list=PL3MmuxUbc_hIhxl5"
},
{
"path": "03-classification/04-eda.md",
"chars": 1689,
"preview": "\n## 3.4 EDA\n\n<a href=\"https://www.youtube.com/watch?v=BNF1wjBwTQA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\"ima"
},
{
"path": "03-classification/05-risk.md",
"chars": 1933,
"preview": "\n## 3.5 Feature importance: Churn rate and risk ratio\n\n<a href=\"https://www.youtube.com/watch?v=fzdzPLlvs40&list=PL3Mmux"
},
{
"path": "03-classification/06-mutual-info.md",
"chars": 1760,
"preview": "\n## 3.6 Feature importance: Mutual information\n\n<a href=\"https://www.youtube.com/watch?v=_u2YaGT6RN0&list=PL3MmuxUbc_hIh"
},
{
"path": "03-classification/07-correlation.md",
"chars": 1967,
"preview": "\n## 3.7 Feature importance: Correlation\n\n<a href=\"https://www.youtube.com/watch?v=mz1707QVxiY&list=PL3MmuxUbc_hIhxl5Ji8t"
},
{
"path": "03-classification/08-ohe.md",
"chars": 1635,
"preview": "\n## 3.8 One-hot encoding\n\n<a href=\"https://www.youtube.com/watch?v=L-mjQFN5aR0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
},
{
"path": "03-classification/09-logistic-regression.md",
"chars": 2123,
"preview": "\n## 3.9 Logistic regression\n\n<a href=\"https://www.youtube.com/watch?v=7KFE2ltnBAg&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
},
{
"path": "03-classification/10-training-log-reg.md",
"chars": 1747,
"preview": "\n## 3.10 Training logistic regression with Scikit-Learn\n\n<a href=\"https://www.youtube.com/watch?v=hae_jXe2fN0&list=PL3Mm"
},
{
"path": "03-classification/11-log-reg-interpretation.md",
"chars": 1491,
"preview": "\n## 3.11 Model interpretation\n\n<a href=\"https://www.youtube.com/watch?v=OUrlxnUAAEA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHa"
},
{
"path": "03-classification/12-using-log-reg.md",
"chars": 1172,
"preview": "\n## 3.12 Using the model\n\n<a href=\"https://www.youtube.com/watch?v=Y-NGmnFpNuM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
},
{
"path": "03-classification/13-summary.md",
"chars": 999,
"preview": "\n## 3.13 Summary\n\n<a href=\"https://www.youtube.com/watch?v=Zz6oRGsJkW4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src"
},
{
"path": "03-classification/14-explore-more.md",
"chars": 1058,
"preview": "\n## 3.14 Explore more\n\nMore things\n\n* Try to exclude least useful features\n\nUse scikit-learn in project of last week\n\n* "
},
{
"path": "03-classification/README.md",
"chars": 2888,
"preview": "## 3. Machine Learning for Classification\n\n- 3.1 [Churn prediction project](01-churn-project.md)\n- 3.2 [Data preparatio"
},
{
"path": "03-classification/homework.md",
"chars": 705,
"preview": "## Homework\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/03-classification/homework.md)\n* "
},
{
"path": "03-classification/meta.csv",
"chars": 2893,
"preview": "lesson,name,page_name,video,slides,notebook\r\n1,Churn prediction project,01-churn-project.md,https://www.youtube.com/watc"
},
{
"path": "03-classification/meta.json",
"chars": 95,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 3,\n \"name\": \"Machine Learning for Classification\"\n}"
},
{
"path": "03-classification/notebook-scaling-ohe.ipynb",
"chars": 10770,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 2,\n \"metadata\": {},\n \"outputs\": [],\n \"source\": [\n "
},
{
"path": "03-classification/notebook.ipynb",
"chars": 115411,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# 3. Machine Learning for Classific"
},
{
"path": "04-evaluation/01-overview.md",
"chars": 1218,
"preview": "\n## 4.1 Evaluation metrics: session overview\n\n<a href=\"https://www.youtube.com/watch?v=gmg5jw1bM8A&list=PL3MmuxUbc_hIhxl"
},
{
"path": "04-evaluation/02-accuracy.md",
"chars": 2230,
"preview": "\n## 4.2 Accuracy and dummy model\n\n<a href=\"https://www.youtube.com/watch?v=FW_l7lB0HUI&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAO"
},
{
"path": "04-evaluation/03-confusion-table.md",
"chars": 2529,
"preview": "## 4.3 Confusion table\n\n<a href=\"https://www.youtube.com/watch?v=Jt2dDLSlBng&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><i"
},
{
"path": "04-evaluation/04-precision-recall.md",
"chars": 1913,
"preview": "## 4.4 Precision and Recall\n\n<a href=\"https://www.youtube.com/watch?v=gRLP_mlglMM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
},
{
"path": "04-evaluation/05-roc.md",
"chars": 2920,
"preview": "\n## 4.5 ROC Curves\n\n<a href=\"https://www.youtube.com/watch?v=dnBZLk53sQI&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img s"
},
{
"path": "04-evaluation/06-auc.md",
"chars": 1773,
"preview": "## 4.6 ROC AUC\n\n<a href=\"https://www.youtube.com/watch?v=hvIQPAwkVZo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
},
{
"path": "04-evaluation/07-cross-validation.md",
"chars": 2414,
"preview": "## 4.7 Cross-Validation\n\n<a href=\"https://www.youtube.com/watch?v=BIIZaVtUbf4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><"
},
{
"path": "04-evaluation/08-summary.md",
"chars": 1794,
"preview": "## 4.8 Summary\n\n<a href=\"https://www.youtube.com/watch?v=-v8XEQ2AHvQ&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
},
{
"path": "04-evaluation/09-explore-more.md",
"chars": 606,
"preview": "## 4.9 Explore more\n\n* Check the precision and recall of the dummy classifier that always predict \"FALSE\"\n* F1 score = 2"
},
{
"path": "04-evaluation/README.md",
"chars": 1760,
"preview": "## 4. Evaluation Metrics for Classification\n\n- 4.1 [Evaluation metrics: session overview](01-overview.md)\n- 4.2 [Accurac"
},
{
"path": "04-evaluation/homework.md",
"chars": 687,
"preview": "## Homework\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/04-evaluation/homework.md)\n* For "
},
{
"path": "04-evaluation/meta.csv",
"chars": 1660,
"preview": "lesson,name,page_name,video,slides,notebook\r\n1,Evaluation metrics: session overview,01-overview.md,https://www.youtube.c"
},
{
"path": "04-evaluation/meta.json",
"chars": 97,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 4,\n \"name\": \"Evaluation Metrics for Classification\"\n}"
},
{
"path": "04-evaluation/notebook.ipynb",
"chars": 153777,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"3fb05700\",\n \"metadata\": {},\n \"outputs\":"
},
{
"path": "05-deployment/01-intro.md",
"chars": 2795,
"preview": "\n## 5.1 Intro / Session overview\n\n<a href=\"https://www.youtube.com/watch?v=agIFak9A3m8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAO"
},
{
"path": "05-deployment/02-pickle.md",
"chars": 2196,
"preview": "\n## 5.2 Saving and loading the model\n\n<a href=\"https://www.youtube.com/watch?v=EJpqZ7OlwFU&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
},
{
"path": "05-deployment/03-flask-intro.md",
"chars": 3476,
"preview": "\n## 5.3 Web services: introduction to Flask\n\n<a href=\"https://www.youtube.com/watch?v=W7ubna1Rfv8&list=PL3MmuxUbc_hIhxl5"
},
{
"path": "05-deployment/04-flask-deployment.md",
"chars": 5624,
"preview": "\n## 5.4 Serving the churn model with Flask\n\n<a href=\"https://www.youtube.com/watch?v=Q7ZWPgPnRz8&list=PL3MmuxUbc_hIhxl5J"
},
{
"path": "05-deployment/05-pipenv.md",
"chars": 4181,
"preview": "## 5.5 Python virtual environment: Pipenv\n\n<a href=\"https://www.youtube.com/watch?v=BMXh8JGROHM&list=PL3MmuxUbc_hIhxl5Ji"
},
{
"path": "05-deployment/06-docker.md",
"chars": 4429,
"preview": "## 5.6 Environment management: Docker\n\n<a href=\"https://www.youtube.com/watch?v=wAtyYZ6zvAs&list=PL3MmuxUbc_hIhxl5Ji8t4O"
},
{
"path": "05-deployment/07-aws-eb.md",
"chars": 5271,
"preview": "## 5.7 Deployment to the cloud: AWS Elastic Beanstalk (optional)\n\n<a href=\"https://www.youtube.com/watch?v=HGPJ4ekhcLg&l"
},
{
"path": "05-deployment/08-summary.md",
"chars": 1285,
"preview": "## 5.8 Summary\n\n<a href=\"https://www.youtube.com/watch?v=sSAqYSk7Br4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
},
{
"path": "05-deployment/09-explore-more.md",
"chars": 715,
"preview": "\n## 5.9 Explore more\n\n* Flask is not the only framework for creating web services. Try others, e.g. FastAPI.\n* Experimen"
},
{
"path": "05-deployment/README.md",
"chars": 3188,
"preview": "## 5. Deploying Machine Learning Models\n\nNote: these materials are partly outdated, which\nis why we recorded a workshop "
},
{
"path": "05-deployment/code/05-train-churn-model.ipynb",
"chars": 12744,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"464b9b4a\",\n \"metadata\": {},\n \"source\": [\n \"In the previou"
},
{
"path": "05-deployment/code/Dockerfile",
"chars": 275,
"preview": "FROM python:3.8.12-slim\r\n\r\nRUN pip install pipenv\r\n\r\nWORKDIR /app\r\n\r\nCOPY [\"Pipfile\", \"Pipfile.lock\", \"./\"]\r\n\r\nRUN pipen"
},
{
"path": "05-deployment/code/Pipfile",
"chars": 203,
"preview": "[[source]]\nurl = \"https://pypi.org/simple\"\nverify_ssl = true\nname = \"pypi\"\n\n[packages]\nnumpy = \"*\"\nscikit-learn = \"==0.2"
},
{
"path": "05-deployment/code/ping.py",
"chars": 200,
"preview": "from flask import Flask\r\n\r\napp = Flask('ping')\r\n\r\n@app.route('/ping', methods=['GET'])\r\ndef ping():\r\n return \"PONG\"\r\n"
},
{
"path": "05-deployment/code/plan.md",
"chars": 1714,
"preview": "# 5. Deploying Machine Learning models \r\n\r\nWe'll use the same model we trained and evaluated\r\npreviously - the churn pre"
},
{
"path": "05-deployment/code/predict-test.py",
"chars": 892,
"preview": "#!/usr/bin/env python\n# coding: utf-8\n\nimport requests\n\n\nurl = 'http://localhost:9696/predict'\n\ncustomer_id = 'xyz-123'\n"
},
{
"path": "05-deployment/code/predict.py",
"chars": 644,
"preview": "import pickle\r\n\r\nfrom flask import Flask\r\nfrom flask import request\r\nfrom flask import jsonify\r\n\r\n\r\nmodel_file = 'model_"
},
{
"path": "05-deployment/code/train.py",
"chars": 2864,
"preview": "#!/usr/bin/env python\n# coding: utf-8\n\nimport pickle\n\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.model_selecti"
},
{
"path": "05-deployment/homework.md",
"chars": 610,
"preview": "## Homework\n\n\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/05-deployment/homework.md)\n* Fo"
},
{
"path": "05-deployment/meta.csv",
"chars": 1434,
"preview": "lesson,name,page_name,video,slides\r\n1,Intro / Session overview,01-intro.md,https://www.youtube.com/watch?v=agIFak9A3m8,h"
},
{
"path": "05-deployment/meta.json",
"chars": 93,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 5,\n \"name\": \"Deploying Machine Learning Models\"\n}"
},
{
"path": "05-deployment/workshop/.dockerignore",
"chars": 64,
"preview": "# flyctl launch added from .venv\\.gitignore\n.venv\\**\\*\nfly.toml\n"
},
{
"path": "05-deployment/workshop/.python-version",
"chars": 5,
"preview": "3.13\n"
},
{
"path": "05-deployment/workshop/Dockerfile",
"chars": 337,
"preview": "FROM python:3.13.5-slim-bookworm\n\nCOPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/\nWORKDIR /code\n\nENV PATH=\"/code/"
},
{
"path": "05-deployment/workshop/README.md",
"chars": 15969,
"preview": "# Deploying ML Models with FastAPI and uv\n\n* Video: https://www.youtube.com/watch?v=jzGzw98Eikk\n\nIn this workshop we wil"
},
{
"path": "05-deployment/workshop/fly.toml",
"chars": 483,
"preview": "# fly.toml app configuration file generated for mlzoomcamp-flask-uv on 2025-08-11T15:21:50+02:00\n#\n# See https://fly.io/"
},
{
"path": "05-deployment/workshop/ping.py",
"chars": 197,
"preview": "from fastapi import FastAPI\nimport uvicorn\n\napp = FastAPI(title=\"ping\")\n\n@app.get(\"/ping\")\ndef ping():\n return \"PONG\""
},
{
"path": "05-deployment/workshop/predict.py",
"chars": 1835,
"preview": "import pickle\nfrom typing import Literal\nfrom pydantic import BaseModel, Field\n\n\nfrom fastapi import FastAPI\nimport uvic"
},
{
"path": "05-deployment/workshop/predict_old.py",
"chars": 730,
"preview": "import pickle\n\nwith open('model.bin', 'rb') as f_in:\n pipeline = pickle.load(f_in)\n\ndatapoint = {\n 'gender': 'fema"
},
{
"path": "05-deployment/workshop/pyproject.toml",
"chars": 295,
"preview": "[project]\nname = \"mlzoomcamp-flask-uv\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\n"
},
{
"path": "05-deployment/workshop/starter.ipynb",
"chars": 34623,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"4458df13-d0f7-462e-bc80-42169bb1a62b\",\n \"metadata\": {},\n \"so"
},
{
"path": "05-deployment/workshop/test.py",
"chars": 864,
"preview": "import requests\n\nurl = 'http://localhost:9696/predict'\n# url = 'https://mlzoomcamp-flask-uv.fly.dev/predict'\n\ncustomer ="
},
{
"path": "05-deployment/workshop/train.py",
"chars": 1966,
"preview": "#!/usr/bin/env python\n\nimport pickle\n\nimport pandas as pd\nimport numpy as np\nimport sklearn\n\n\nfrom sklearn.feature_extra"
},
{
"path": "05-deployment/workshop/workshop-uv-fastapi.ipynb",
"chars": 82015,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"4458df13-d0f7-462e-bc80-42169bb1a62b\",\n \"metadata\": {\n \"_sp"
},
{
"path": "06-trees/01-credit-risk.md",
"chars": 1654,
"preview": "\n## 6.1 Credit risk scoring project\n\n<a href=\"https://www.youtube.com/watch?v=GJGmlfZoCoU&list=PL3MmuxUbc_hIhxl5Ji8t4O6l"
},
{
"path": "06-trees/02-data-prep.md",
"chars": 1953,
"preview": "## 6.2 Data cleaning and preparation\n\n<a href=\"https://www.youtube.com/watch?v=tfuQdI3YO2c&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
},
{
"path": "06-trees/03-decision-trees.md",
"chars": 2126,
"preview": "## 6.3 Decision trees\n\n<a href=\"https://www.youtube.com/watch?v=YGiQvFbSIg8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><im"
},
{
"path": "06-trees/04-decision-tree-learning.md",
"chars": 4380,
"preview": "## 6.4 Decision tree learning algorithm\n\n<a href=\"https://www.youtube.com/watch?v=XODz6LwKY7g&list=PL3MmuxUbc_hIhxl5Ji8t"
},
{
"path": "06-trees/05-decision-tree-tuning.md",
"chars": 3564,
"preview": "## 6.5 Decision trees parameter tuning\n\n<a href=\"https://www.youtube.com/watch?v=XJaxwH50Qok&list=PL3MmuxUbc_hIhxl5Ji8t4"
},
{
"path": "06-trees/06-random-forest.md",
"chars": 3043,
"preview": "## 6.6 Ensemble learning and random forest\n\n<a href=\"https://www.youtube.com/watch?v=FZhcmOfNNZE&list=PL3MmuxUbc_hIhxl5J"
},
{
"path": "06-trees/07-boosting.md",
"chars": 5979,
"preview": "## 6.7 Gradient boosting and XGBoost\n\n<a href=\"https://www.youtube.com/watch?v=xFarGClszEM&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
},
{
"path": "06-trees/08-xgb-tuning.md",
"chars": 3655,
"preview": "## 6.8 XGBoost parameter tuning\n\n<a href=\"https://www.youtube.com/watch?v=VX6ftRzYROM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
},
{
"path": "06-trees/09-final-model.md",
"chars": 1492,
"preview": "## 6.9 Selecting the best model\n\n<a href=\"https://www.youtube.com/watch?v=lqdnyIVQq-M&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOp"
},
{
"path": "06-trees/10-summary.md",
"chars": 1222,
"preview": "## 6.10 Summary\n\n<a href=\"https://www.youtube.com/watch?v=JZ6sRZ_5j_c&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src="
},
{
"path": "06-trees/11-explore-more.md",
"chars": 1665,
"preview": "\n## 6.11 Explore more\n\n* For this dataset we didn't do EDA or feature engineering. You can do it to get more insights in"
},
{
"path": "06-trees/README.md",
"chars": 1573,
"preview": "## 6. Decision Trees and Ensemble Learning\n\n- 6.1 [Credit risk scoring project](01-credit-risk.md)\n- 6.2 [Data cleaning "
},
{
"path": "06-trees/homework.md",
"chars": 622,
"preview": "## Homework\n\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/06-trees/homework.md)\n* For 2024"
},
{
"path": "06-trees/meta.csv",
"chars": 2184,
"preview": "lesson,name,page_name,video,slides,notebook\r\n1,Credit risk scoring project,01-credit-risk.md,https://www.youtube.com/wat"
},
{
"path": "06-trees/meta.json",
"chars": 96,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 6,\n \"name\": \"Decision Trees and Ensemble Learning\"\n}"
},
{
"path": "06-trees/notebook.ipynb",
"chars": 321921,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"52472024\",\n \"metadata\": {},\n \"source\": [\n \"# 6. Decision "
},
{
"path": "08-deep-learning/01-fashion-classification.md",
"chars": 2200,
"preview": "## 8.1 Fashion classification\n\n<a href=\"https://www.youtube.com/watch?v=it1Lu7NmMpw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHa"
},
{
"path": "08-deep-learning/02-tensorflow-keras.md",
"chars": 3279,
"preview": "## 8.2 TensorFlow and Keras\n\n<a href=\"https://www.youtube.com/watch?v=R6o_CUmoN9Q&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
},
{
"path": "08-deep-learning/03-pretrained-models.md",
"chars": 3404,
"preview": "## 8.3 Pre-trained convolutional neural networks\r\n\r\n<a href=\"https://www.youtube.com/watch?v=qGDXEz-cr6M&list=PL3MmuxUbc"
},
{
"path": "08-deep-learning/04-conv-neural-nets.md",
"chars": 5941,
"preview": "## 8.4 Convolutional neural networks\n\n<a href=\"https://www.youtube.com/watch?v=BN-fnYzbdc8&list=PL3MmuxUbc_hIhxl5Ji8t4O6"
},
{
"path": "08-deep-learning/05-transfer-learning.md",
"chars": 5814,
"preview": "## 8.5 Transfer learning\n\n<a href=\"https://www.youtube.com/watch?v=WKHylqfNmq4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\">"
},
{
"path": "08-deep-learning/06-learning-rate.md",
"chars": 4599,
"preview": "## 8.6 Adjusting the learning rate\n\n<a href=\"https://www.youtube.com/watch?v=2gPmRRGz0Hc&list=PL3MmuxUbc_hIhxl5Ji8t4O6lP"
},
{
"path": "08-deep-learning/07-checkpointing.md",
"chars": 1679,
"preview": "## 8.7 Checkpointing\n\n<a href=\"https://www.youtube.com/watch?v=NRpGUx0o3Ps&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img"
},
{
"path": "08-deep-learning/08-more-layers.md",
"chars": 3472,
"preview": "## 8.8 Adding more layers\n\n<a href=\"https://www.youtube.com/watch?v=bSRRrorvAZs&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\""
},
{
"path": "08-deep-learning/09-dropout.md",
"chars": 4249,
"preview": "## 8.9 Regularization and dropout\n\n<a href=\"https://www.youtube.com/watch?v=74YmhVM6FTM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPA"
},
{
"path": "08-deep-learning/10-augmentation.md",
"chars": 2893,
"preview": "## 8.10 Data augmentation\r\n\r\n<a href=\"https://www.youtube.com/watch?v=aoPfVsS3BDE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCL"
},
{
"path": "08-deep-learning/11-large-model.md",
"chars": 1089,
"preview": "## 8.11 Training a larger model\r\n\r\n<a href=\"https://www.youtube.com/watch?v=_QpDGJwFjYA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPA"
},
{
"path": "08-deep-learning/12-using-model.md",
"chars": 2695,
"preview": "## 8.12 Using the model\r\n\r\n<a href=\"https://www.youtube.com/watch?v=cM1WHKae1wo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\""
},
{
"path": "08-deep-learning/13-summary.md",
"chars": 1213,
"preview": "## 8.13 Summary\r\n\r\n<a href=\"https://www.youtube.com/watch?v=mn0BcXJlRFM&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img sr"
},
{
"path": "08-deep-learning/14-explore-more.md",
"chars": 525,
"preview": "## 8.14 Explore more\r\n\r\n**TODO**\r\n\r\n- Add more data, e.g, Zalando etc\r\n- Albumentations - another way of generating augm"
},
{
"path": "08-deep-learning/README.md",
"chars": 3123,
"preview": "## 8. Neural Networks and Deep Learning\n\nNote: in the module we use TensorFlow+Keras. These videos \nwere recorded a whil"
},
{
"path": "08-deep-learning/homework.md",
"chars": 661,
"preview": "## Homework\r\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/08-deep-learning/homework.md)\r\n*"
},
{
"path": "08-deep-learning/install.md",
"chars": 7109,
"preview": "# Installation of tensorflow\n\ndate: 2023 Nov 12\n\nThis installation guide is specific to the use case as in the pre-requi"
},
{
"path": "08-deep-learning/meta.csv",
"chars": 3908,
"preview": "lesson,name,page_name,video,slides,notebook\r\n1,Fashion classification,01-fashion-classification.md,https://www.youtube.c"
},
{
"path": "08-deep-learning/meta.json",
"chars": 93,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 8,\n \"name\": \"Neural Networks and Deep Learning\"\n}"
},
{
"path": "08-deep-learning/notebook.ipynb",
"chars": 528405,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"a8f9405b\",\n \"metadata\": {},\n \"outputs\":"
},
{
"path": "08-deep-learning/pytorch/README.md",
"chars": 27125,
"preview": "# Deep Learning with PyTorch Workshop\n\n\n* [Video](https://www.youtube.com/watch?v=Ne25VujHRLA)\n* [Notebook](https://cola"
},
{
"path": "08-deep-learning/pytorch/install_pytorch.md",
"chars": 7171,
"preview": "# **Complete PyTorch Installation Guide (GPU & CPU Support)**\n\nHi 👋\nThis guide walks you through installing PyTorch with"
},
{
"path": "09-serverless/01-intro.md",
"chars": 1298,
"preview": "\n## 9.1 Introduction to Serverless\n\n<a href=\"https://www.youtube.com/watch?v=JLIVwIsU6RA&list=PL3MmuxUbc_hIhxl5Ji8t4O6lP"
},
{
"path": "09-serverless/02-aws-lambda.md",
"chars": 2863,
"preview": "\n## 9.2 AWS Lambda\n\n<a href=\"https://www.youtube.com/watch?v=_UX8-2WhHZo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img s"
},
{
"path": "09-serverless/03-tensorflow-lite.md",
"chars": 2519,
"preview": "\n## 9.3 TensorFlow Lite\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop](workshop/) f"
},
{
"path": "09-serverless/04-preparing-code.md",
"chars": 868,
"preview": "\n## 9.4 Preparing the code for Lambda\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop"
},
{
"path": "09-serverless/05-docker-image.md",
"chars": 2301,
"preview": "## 9.5 Preparing a Docker image\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop](work"
},
{
"path": "09-serverless/06-creating-lambda.md",
"chars": 889,
"preview": "\n## 9.6 Creating the lambda function\n\n> Note: the materials in this unit are outdated.\n> \n> Refer to the [ONNX Workshop]"
},
{
"path": "09-serverless/07-api-gateway.md",
"chars": 745,
"preview": "\n## 9.7 API Gateway: exposing the lambda function\n\n<a href=\"https://www.youtube.com/watch?v=wyZ9aqQOXvs&list=PL3MmuxUbc_"
},
{
"path": "09-serverless/08-summary.md",
"chars": 624,
"preview": "## 9.8 Summary\n\n<a href=\"https://www.youtube.com/watch?v=bu3nPiHCNLU&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img src=\""
},
{
"path": "09-serverless/09-explore-more.md",
"chars": 457,
"preview": "## 9.9 Explore more\n\n* Try similar serverless services from Google Cloud and Microsoft Azure\n* Deploy cats vs dogs and o"
},
{
"path": "09-serverless/README.md",
"chars": 1656,
"preview": "## 9. Serverless Deep Learning\n\n\nUse the [workshop](workshop/) for most of the content.\n\nThe content in the module still"
},
{
"path": "09-serverless/code/Dockerfile",
"chars": 307,
"preview": "FROM public.ecr.aws/lambda/python:3.10\n\nRUN pip install keras-image-helper\nRUN pip install https://github.com/alexeygrig"
},
{
"path": "09-serverless/code/convert-model.py",
"chars": 285,
"preview": "import tensorflow as tf\nfrom tensorflow import keras\n\nmodel = keras.models.load_model('clothing-model.h5')\n\nconverter = "
},
{
"path": "09-serverless/code/lambda_function.py",
"chars": 995,
"preview": "#!/usr/bin/env python\n# coding: utf-8\n\nimport tflite_runtime.interpreter as tflite\nfrom keras_image_helper import create"
},
{
"path": "09-serverless/code/plan.md",
"chars": 1407,
"preview": "# 9. Serverless Deep Learning\n\nWe'll deploy the clothes classification model we trained previously. \n\n## 9.1 Introductio"
},
{
"path": "09-serverless/code/tensorflow-model.ipynb",
"chars": 19570,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"37a8e270\",\n \"metadata\": {},\n \"outputs\":"
},
{
"path": "09-serverless/code/test.py",
"chars": 200,
"preview": "import requests\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\ndata = {'url': 'http://bit.ly/"
},
{
"path": "09-serverless/homework.md",
"chars": 522,
"preview": "## Homework\n\n* For 2025 cohort homework, check [the 2025 cohort folder](../cohorts/2025/09-serverless/homework.md)\n* For"
},
{
"path": "09-serverless/meta.csv",
"chars": 851,
"preview": "lesson,name,page_name,video,slides,notebook\n1,Introduction to Serverless,01-intro.md,https://www.youtube.com/watch?v=JLI"
},
{
"path": "09-serverless/meta.json",
"chars": 84,
"preview": "{\n \"data\": \"meta.csv\",\n \"session\": 9,\n \"name\": \"Serverless Deep Learning\"\n}"
},
{
"path": "09-serverless/updates.md",
"chars": 1026,
"preview": "## Python 3.12 vs TF Lite 2.17\n\nThe latest versions of TF Lite don't support Python 3.12 yet. \n\nAs a workaround, we can "
},
{
"path": "09-serverless/workshop/README.md",
"chars": 16318,
"preview": "# Machine Learning and Deep Learning Model Deployment with Serverless\n\n* Video: https://www.youtube.com/watch?v=sHQaeVm5"
},
{
"path": "09-serverless/workshop/lambda-keras/.gitignore",
"chars": 47,
"preview": "*.keras\n*saved_model\n*.onnx\n*.h5\nconvert/models"
},
{
"path": "09-serverless/workshop/lambda-keras/Dockerfile",
"chars": 207,
"preview": "FROM public.ecr.aws/lambda/python:3.13\n\nRUN pip install onnxruntime keras-image-helper\n\nCOPY clothing-model-new.onnx clo"
},
{
"path": "09-serverless/workshop/lambda-keras/convert/.dockerignore",
"chars": 6,
"preview": "models"
},
{
"path": "09-serverless/workshop/lambda-keras/convert/Dockerfile",
"chars": 439,
"preview": "FROM python:3.13.5-slim\n\nARG COMMIT_ID\n\nENV PYTHONDONTWRITEBYTECODE=1 \\\n PYTHONUNBUFFERED=1 \\\n PIP_NO_CACHE_DIR=1\n"
},
{
"path": "09-serverless/workshop/lambda-keras/convert/README.md",
"chars": 495,
"preview": "\n```bash\n# you can update it to the latest commit\nCOMMIT_ID=c34ac1d751427cf5d98023a21cce4c82b0cf96a1\nTAG=${COMMIT_ID:0:7"
},
{
"path": "09-serverless/workshop/lambda-keras/convert/convert-saved-model.py",
"chars": 136,
"preview": "from tensorflow import keras\n\nmodel = keras.models.load_model('clothing-model-new.keras')\nmodel.export(\"clothing-model-n"
},
{
"path": "09-serverless/workshop/lambda-keras/lambda_function.py",
"chars": 1283,
"preview": "import numpy as np\nimport onnxruntime as ort\nfrom keras_image_helper import create_preprocessor\n\n\ndef preprocess_pytorch"
},
{
"path": "09-serverless/workshop/lambda-keras/test.ipynb",
"chars": 3487,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"id\": \"ced7eb3e-93b2-47b5-9c5a-ef2cf91baf4e\",\n \""
},
{
"path": "09-serverless/workshop/lambda-keras/test.py",
"chars": 211,
"preview": "import requests\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\nrequest = {\n \"url\": \"http:/"
},
{
"path": "09-serverless/workshop/lambda-onnx/.gitignore",
"chars": 6,
"preview": "*.onnx"
},
{
"path": "09-serverless/workshop/lambda-onnx/Dockerfile",
"chars": 285,
"preview": "FROM public.ecr.aws/lambda/python:3.13\n\nRUN pip install onnxruntime keras-image-helper==0.0.2\n\nARG MODEL_NAME=clothing_c"
},
{
"path": "09-serverless/workshop/lambda-onnx/lambda_function.py",
"chars": 1370,
"preview": "import os\n\nimport numpy as np\nimport onnxruntime as ort\nfrom keras_image_helper import create_preprocessor\n\n\nmodel_name "
},
{
"path": "09-serverless/workshop/lambda-onnx/test.ipynb",
"chars": 5232,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 5,\n \"id\": \"b2b3e76f-09ac-42c1-bb0a-c93fe73e7167\",\n \""
},
{
"path": "09-serverless/workshop/lambda-onnx/test.py",
"chars": 211,
"preview": "import requests\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\nrequest = {\n \"url\": \"http:/"
},
{
"path": "09-serverless/workshop/lambda-sklearn/.dockerignore",
"chars": 406,
"preview": "\n# Project specific\ninvoke.py\ncustomer.json\nREADME.md\ntest.py\n\n# Git\n.git\n.gitignore\n\n# Python\n__pycache__\n*.pyc\n*.pyo\n*"
},
{
"path": "09-serverless/workshop/lambda-sklearn/.python-version",
"chars": 5,
"preview": "3.13\n"
},
{
"path": "09-serverless/workshop/lambda-sklearn/Dockerfile",
"chars": 268,
"preview": "FROM public.ecr.aws/lambda/python:3.13\nCOPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/\n\nCOPY pyproject.toml uv.lock ./"
},
{
"path": "09-serverless/workshop/lambda-sklearn/customer.json",
"chars": 557,
"preview": "{\n \"customer\": {\n \"gender\": \"female\",\n \"seniorcitizen\": 0,\n \"partner\": \"yes\",\n \"dependents\": \"no\",\n \"pho"
},
{
"path": "09-serverless/workshop/lambda-sklearn/deploy.sh",
"chars": 910,
"preview": "#!/bin/bash\n\nIMAGE_NAME=\"churn-prediction-lambda\"\nAWS_REGION=\"eu-west-1\"\n\nAWS_ACCOUNT_ID=$(aws sts get-caller-identity |"
},
{
"path": "09-serverless/workshop/lambda-sklearn/invoke.py",
"chars": 959,
"preview": "import boto3\nimport json\n\nlambda_client = boto3.client('lambda')\n\ncustomer_data = {\n \"customer\": {\n \"gender\": "
},
{
"path": "09-serverless/workshop/lambda-sklearn/lambda_function.py",
"chars": 425,
"preview": "import pickle\n\nwith open('model.bin', 'rb') as f_in:\n pipeline = pickle.load(f_in)\n\ndef predict_single(customer):\n "
},
{
"path": "09-serverless/workshop/lambda-sklearn/pyproject.toml",
"chars": 189,
"preview": "[project]\nname = \"lambda-1-simple\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\nrequ"
},
{
"path": "09-serverless/workshop/lambda-sklearn/test.py",
"chars": 242,
"preview": "import requests\nimport json\n\nurl = 'http://localhost:8080/2015-03-31/functions/function/invocations'\n\n\nwith open('custom"
},
{
"path": "09-serverless/workshop/train/.python-version",
"chars": 5,
"preview": "3.13\n"
},
{
"path": "09-serverless/workshop/train/README.md",
"chars": 0,
"preview": ""
},
{
"path": "09-serverless/workshop/train/pyproject.toml",
"chars": 200,
"preview": "[project]\nname = \"train\"\nversion = \"0.1.0\"\ndescription = \"Add your description here\"\nreadme = \"README.md\"\nrequires-pytho"
},
{
"path": "09-serverless/workshop/train/train.py",
"chars": 1887,
"preview": "import pickle\n\nimport pandas as pd\nimport sklearn\n\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.li"
},
{
"path": "10-kubernetes/01-overview.md",
"chars": 1410,
"preview": "\n## 10.1 Overview\n\n<a href=\"https://www.youtube.com/watch?v=mvPER7YfTkw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR\"><img sr"
},
{
"path": "10-kubernetes/02-tensorflow-serving.md",
"chars": 3058,
"preview": "\r\n## 10.2 TensorFlow Serving\r\n\r\n<a href=\"https://www.youtube.com/watch?v=deXR2fThYDw&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpH"
},
{
"path": "10-kubernetes/03-preprocessing.md",
"chars": 3906,
"preview": "\n## 10.3 Creating a pre-processing service\n\n<a href=\"https://www.youtube.com/watch?v=OIlrS14Zi0o&list=PL3MmuxUbc_hIhxl5J"
},
{
"path": "10-kubernetes/04-docker-compose.md",
"chars": 4126,
"preview": "\r\n## 10.4 Running everything locally with Docker-compose\r\n\r\n<a href=\"https://www.youtube.com/watch?v=ZhQQfpWfkKY&list=PL"
},
{
"path": "10-kubernetes/05-kubernetes-intro.md",
"chars": 3947,
"preview": "\n## 10.5 Introduction to Kubernetes\n\n<a href=\"https://www.youtube.com/watch?v=UjVkpszDzgk&list=PL3MmuxUbc_hIhxl5Ji8t4O6l"
},
{
"path": "10-kubernetes/06-kubernetes-simple-service.md",
"chars": 7753,
"preview": "\n## 10.6 Deploying a simple service to Kubernetes\n\n<a href=\"https://www.youtube.com/watch?v=PPUCVRIV9t8&list=PL3MmuxUbc_"
},
{
"path": "10-kubernetes/07-kubernetes-tf-serving.md",
"chars": 4728,
"preview": "## 10.7 Deploying TensorFlow models to Kubernetes\n\n<a href=\"https://www.youtube.com/watch?v=6vHLMdnjO2w&list=PL3MmuxUbc_"
},
{
"path": "10-kubernetes/08-eks.md",
"chars": 3576,
"preview": "## 10.8 Deploying to EKS\r\n\r\n<a href=\"https://www.youtube.com/watch?v=89jxeddZtC0&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR"
}
]
// ... and 228 more files (download for full content)
About this extraction
This page contains the full source code of the DataTalksClub/machine-learning-zoomcamp GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 428 files (6.6 MB), approximately 1.7M tokens, and a symbol index with 76 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.