Repository: minsuk-heo/pandas Branch: master Commit: 85331d806d59 Files: 6 Total size: 418.1 KB Directory structure: gitextract_520hdf_4/ ├── Pandas_Cheatsheet.ipynb ├── data/ │ ├── friend_list.csv │ ├── friend_list.txt │ ├── friend_list_no_head.csv │ └── friend_list_tab.txt └── 팬더스_명령어_꿀팁.ipynb ================================================ FILE CONTENTS ================================================ ================================================ FILE: Pandas_Cheatsheet.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# What is Pandas?\n", "python library for data manipulation and analysis" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "data_frame = pd.read_csv('data/friend_list.csv')" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager\n", "5 Chris 25 intern" ] }, "execution_count": 179, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_frame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# What is DataFrame?\n", "dataframe is a 2-dimensional labeled data structure with columns" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

	name	age	job
0	John	20	STUDENT
1	Jenny	30	DEVELOPER
2	Nate	30	TEACHER
3	Julia	40	DENTIST
4	Brian	45	MANAGER

\n", "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	num	word
0	1	one
1	2	two
2	3	three

\n", "

" ], "text/plain": [ " num word\n", "0 1 one\n", "1 2 two\n", "2 3 three" ] }, "execution_count": 183, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build each column as a Series through the public pd.Series constructor\n", "# (pd.core.* is a private module path), then combine them by column name.\n", "s1 = pd.Series(['one', 'two', 'three'])\n", "s2 = pd.Series([1, 2, 3])\n", "pd.DataFrame(data=dict(word=s1, num=s2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Why Pandas?\n", "\n", "Very similar to Excel spreadsheet view, \n", "supports various functions for data manipulation and analysis. \n", "Fast based on Numpy. \n", "Easy to manipulate data for your purpose" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Read File to DataFrame\n", "A **Data frame** is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "by default, pandas supports the csv format" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/friend_list.csv')" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager\n", "5 Chris 25 intern" ] }, "execution_count": 185, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can read txt file like below, if the txt file data are comma separated" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/friend_list.txt')" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 191, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can add column header after you create dataframe" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "df.columns = ['name', 'age', 'job']" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create DataFrame\n", "when you want to create dataframe from your python code" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## from dictionary" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 197, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "if you need fixed column order, you can adjust column order like below," ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [], "source": [ "df = df[['name', 'age', 'job']]" ] }, { "cell_type": "code", "execution_count": 199, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 Jone 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 199, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## from OrderedDict\n", "OrderedDict helps you to have fixed column order at once" ] }, { "cell_type": "code", "execution_count": 200, "metadata": {}, "outputs": [], "source": [ "from collections import OrderedDict" ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [], "source": [ "friend_ordered_dict = OrderedDict([ ('name', ['John', 'Jenny', 'Nate']),\n", " ('age', [20, 30, 30]),\n", " ('job', ['student', 'developer', 'teacher']) ] )\n", "df = pd.DataFrame.from_dict(friend_ordered_dict)" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 202, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## from list" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [], "source": [ "friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]\n", "column_name = ['name', 'age', 'job']\n", "df = pd.DataFrame.from_records(friend_list, columns=column_name)" ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 204, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [], "source": [ "# Each entry is a [column name, column values] pair.\n", "friend_list = [ \n", " ['name',['John', 'Jenny', 'Nate']],\n", " ['age',[20,30,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "# DataFrame.from_items was removed in pandas 1.0; an OrderedDict built from the\n", "# pairs (OrderedDict is imported in the 'from OrderedDict' section) keeps the column order.\n", "df = pd.DataFrame.from_dict(OrderedDict(friend_list))" ] }, { "cell_type": "code", "execution_count": 206, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 206, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Write DataFrame to File" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "here is one dataframe example with header" ] }, { "cell_type": "code", "execution_count": 207, "metadata": {}, "outputs": [], "source": [ "# Each entry is a [column name, column values] pair; the names become the header.\n", "friend_list = [ \n", " ['name',['John', 'Jenny', 'nate']],\n", " ['age',[20,30,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "# DataFrame.from_items was removed in pandas 1.0; an OrderedDict built from the\n", "# pairs (OrderedDict is imported in the 'from OrderedDict' section) keeps the column order.\n", "df = pd.DataFrame.from_dict(OrderedDict(friend_list))" ] }, { "cell_type": "code", "execution_count": 208, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 nate 30 teacher" ] }, "execution_count": 208, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can create a csv file using the command below," ] }, { "cell_type": "code", "execution_count": 209, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "below is an example of a dataframe that **doesn't** have a header" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [], "source": [ "friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]\n", "df = pd.DataFrame.from_records(friend_list)" ] }, { "cell_type": "code", "execution_count": 211, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 211, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can write csv file using below command," ] }, { "cell_type": "code", "execution_count": 212, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you also can write txt file using same command" ] }, { "cell_type": "code", "execution_count": 213, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "by default, header and index are True like below, even if you don't mention it in the command" ] }, { "cell_type": "code", "execution_count": 214, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', header = True, index = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**header = False** means you don't want to create column names. no 0,1,2 at column name \n", "**index = False** means you don't want to create row names. 
no 0,1,2 at row name" ] }, { "cell_type": "code", "execution_count": 215, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', header = False, index = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can specify column names by passing a list to **header**" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', header = ['name', 'age', 'job'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "below is a dataframe that has **None** values" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [], "source": [ "# Each entry is a [column name, column values] pair; None marks a missing value.\n", "friend_list = [ \n", " ['name',['John', None, 'nate']],\n", " ['age',[20,None,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "# DataFrame.from_items was removed in pandas 1.0; an OrderedDict built from the\n", "# pairs (OrderedDict is imported in the 'from OrderedDict' section) keeps the column order.\n", "df = pd.DataFrame.from_dict(OrderedDict(friend_list))" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20.0	student
1	None	NaN	developer
2	nate	30.0	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20.0 student\n", "1 None NaN developer\n", "2 nate 30.0 teacher" ] }, "execution_count": 218, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**na_rep** replaces **None** with the provided value" ] }, { "cell_type": "code", "execution_count": 220, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', na_rep = '-')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Select Row" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## by index" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [], "source": [ "# Each entry is a [column name, column values] pair.\n", "friend_list = [ \n", " ['name',['John', 'Jenny', 'Nate']],\n", " ['age',[20,30,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "# DataFrame.from_items was removed in pandas 1.0; an OrderedDict built from the\n", "# pairs (OrderedDict is imported in the 'from OrderedDict' section) keeps the column order.\n", "df = pd.DataFrame.from_dict(OrderedDict(friend_list))" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 222, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "select rows from index 1 to index 2" ] }, { "cell_type": "code", "execution_count": 223, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 223, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[1:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "select row index 0 and index 2" ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
0	John	20	student
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "2 Nate 30 teacher" ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[[0,2]]" ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 225, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## by column condition" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [], "source": [ "df_filtered = df[df.age > 25]" ] }, { "cell_type": "code", "execution_count": 227, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 227, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 228, "metadata": {}, "outputs": [], "source": [ "df_filtered = df.query('age>25')" ] }, { "cell_type": "code", "execution_count": 229, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 230, "metadata": {}, "outputs": [], "source": [ "# Combine two boolean masks with &; each comparison needs its own parentheses.\n", "df_filtered = df[(df['age'] > 25) & (df['name'] == 'Nate')]" ] }, { "cell_type": "code", "execution_count": 231, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "2 Nate 30 teacher" ] }, "execution_count": 231, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 232, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 232, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter Column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## by index" ] }, { "cell_type": "code", "execution_count": 233, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 233, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]\n", "df = pd.DataFrame.from_records(friend_list)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "select all rows, from column 0 to column 1" ] }, { "cell_type": "code", "execution_count": 234, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	0	1
0	John	20
1	Jenny	30
2	Nate	30

\n", "

" ], "text/plain": [ " 0 1\n", "0 John 20\n", "1 Jenny 30\n", "2 Nate 30" ] }, "execution_count": 234, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:, 0:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "select all rows, column 0 and column 2" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	0	2
0	John	student
1	Jenny	developer
2	Nate	teacher

\n", "

" ], "text/plain": [ " 0 2\n", "0 John student\n", "1 Jenny developer\n", "2 Nate teacher" ] }, "execution_count": 235, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:,[0,2]]" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 236, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## by column name" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager\n", "5 Chris 25 intern" ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# you can create column header for no header data at once\n", "df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name', 'age', 'job'])\n", "df" ] }, { "cell_type": "code", "execution_count": 238, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age
0	John	20
1	Jenny	30
2	Nate	30
3	Julia	40
4	Brian	45
5	Chris	25

\n", "

" ], "text/plain": [ " name age\n", "0 John 20\n", "1 Jenny 30\n", "2 Nate 30\n", "3 Julia 40\n", "4 Brian 45\n", "5 Chris 25" ] }, "execution_count": 238, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered = df[['name', 'age']]\n", "df_filtered" ] }, { "cell_type": "code", "execution_count": 239, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job
0	20	student
1	30	developer
2	30	teacher
3	40	dentist
4	45	manager
5	25	intern

\n", "

" ], "text/plain": [ " age job\n", "0 20 student\n", "1 30 developer\n", "2 30 teacher\n", "3 40 dentist\n", "4 45 manager\n", "5 25 intern" ] }, "execution_count": 239, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filter(items=['age', 'job'])" ] }, { "cell_type": "code", "execution_count": 240, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

	name	age
0	John	20
1	Jenny	30
2	Nate	30
3	Julia	40
4	Brian	45
5	Chris	25

\n", "

" ], "text/plain": [ " name age\n", "0 John 20\n", "1 Jenny 30\n", "2 Nate 30\n", "3 Julia 40\n", "4 Brian 45\n", "5 Chris 25" ] }, "execution_count": 241, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select columns containing 'a'\n", "df.filter(like='a',axis=1)" ] }, { "cell_type": "code", "execution_count": 242, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	job
0	student
1	developer
2	teacher
3	dentist
4	manager
5	intern

\n", "

" ], "text/plain": [ " job\n", "0 student\n", "1 developer\n", "2 teacher\n", "3 dentist\n", "4 manager\n", "5 intern" ] }, "execution_count": 242, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select columns using regex\n", "df.filter(regex='b$',axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drop rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## by row name (index name)" ] }, { "cell_type": "code", "execution_count": 243, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'age': 20, 'job': 'student'},\n", " {'age': 30, 'job': 'developer'},\n", " {'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, index = ['John', 'Jenny', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 244, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
John	20	student
Jenny	30	developer
Nate	30	teacher

\n", "

" ], "text/plain": [ " age job\n", "John 20 student\n", "Jenny 30 developer\n", "Nate 30 teacher" ] }, "execution_count": 244, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### drop row\n", "dropped result will be shown, but dataframe keeps the dropped row" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
Jenny	30	developer

\n", "

" ], "text/plain": [ " age job\n", "Jenny 30 developer" ] }, "execution_count": 245, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop(['John', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 246, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
John	20	student
Jenny	30	developer
Nate	30	teacher

\n", "

" ], "text/plain": [ " age job\n", "John 20 student\n", "Jenny 30 developer\n", "Nate 30 teacher" ] }, "execution_count": 246, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can assign the result to dataframe to keep the dropped result like below," ] }, { "cell_type": "code", "execution_count": 247, "metadata": {}, "outputs": [], "source": [ "df = df.drop(['John', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 248, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
Jenny	30	developer

\n", "

" ], "text/plain": [ " age job\n", "Jenny 30 developer" ] }, "execution_count": 248, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### drop row in place\n", "The dropped row will be deleted from dataframe with inplace keyword parameter" ] }, { "cell_type": "code", "execution_count": 249, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'age': 20, 'job': 'student'},\n", " {'age': 30, 'job': 'developer'},\n", " {'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, index = ['John', 'Jenny', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 250, "metadata": {}, "outputs": [], "source": [ "df.drop(['John', 'Nate'], inplace = True)" ] }, { "cell_type": "code", "execution_count": 251, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
Jenny	30	developer

\n", "

" ], "text/plain": [ " age job\n", "Jenny 30 developer" ] }, "execution_count": 251, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## by row id (index number)" ] }, { "cell_type": "code", "execution_count": 252, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)" ] }, { "cell_type": "code", "execution_count": 253, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 253, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "you can drop rows by its index" ] }, { "cell_type": "code", "execution_count": 254, "metadata": {}, "outputs": [], "source": [ "df = df.drop(df.index[[0,2]])" ] }, { "cell_type": "code", "execution_count": 255, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job	name
1	30	developer	Jenny

\n", "

" ], "text/plain": [ " age job name\n", "1 30 developer Jenny" ] }, "execution_count": 255, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## By Column value" ] }, { "cell_type": "code", "execution_count": 256, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 256, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 257, "metadata": {}, "outputs": [], "source": [ "df = df[df.age != 30]" ] }, { "cell_type": "code", "execution_count": 258, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job	name
0	20	student	Jone

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone" ] }, "execution_count": 258, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drop column" ] }, { "cell_type": "code", "execution_count": 259, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 259, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 260, "metadata": {}, "outputs": [], "source": [ "df = df.drop('age', axis=1)" ] }, { "cell_type": "code", "execution_count": 261, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	job	name
0	student	Jone
1	developer	Jenny
2	teacher	Nate

\n", "

" ], "text/plain": [ " job name\n", "0 student Jone\n", "1 developer Jenny\n", "2 teacher Nate" ] }, "execution_count": 261, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Add Column / Update Column" ] }, { "cell_type": "code", "execution_count": 262, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	15	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 Jone 15 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 262, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 15, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'age', 'job'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Add New Column with default value" ] }, { "cell_type": "code", "execution_count": 263, "metadata": {}, "outputs": [], "source": [ "df['salary'] = 0" ] }, { "cell_type": "code", "execution_count": 264, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	15	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job salary\n", "0 Jone 15 student 0\n", "1 Jenny 30 developer 0\n", "2 Nate 30 teacher 0" ] }, "execution_count": 264, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Add New Column derived from existing value" ] }, { "cell_type": "code", "execution_count": 265, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	15	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 Jone 15 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 265, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 15, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'age', 'job'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## one liner adding column by true or false condition" ] }, { "cell_type": "code", "execution_count": 266, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "df['salary'] = np.where(df['job'] != 'student' , 'yes', 'no')" ] }, { "cell_type": "code", "execution_count": 267, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job	salary
0	Jone	15	student	no
1	Jenny	30	developer	yes
2	Nate	30	teacher	yes

\n", "

" ], "text/plain": [ " name age job salary\n", "0 Jone 15 student no\n", "1 Jenny 30 developer yes\n", "2 Nate 30 teacher yes" ] }, "execution_count": 267, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 268, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final
0	John	95	85
1	Jenny	85	80
2	Nate	10	30

\n", "

" ], "text/plain": [ " name midterm final\n", "0 John 95 85\n", "1 Jenny 85 80\n", "2 Nate 10 30" ] }, "execution_count": 268, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'John', 'midterm': 95, 'final': 85},\n", " {'name': 'Jenny', 'midterm': 85, 'final': 80},\n", " {'name': 'Nate', 'midterm': 10, 'final': 30}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'midterm', 'final'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## column derived from adding two existing columns" ] }, { "cell_type": "code", "execution_count": 269, "metadata": {}, "outputs": [], "source": [ "df['total'] = df['midterm'] + df['final']" ] }, { "cell_type": "code", "execution_count": 270, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total
0	John	95	85	180
1	Jenny	85	80	165
2	Nate	10	30	40

\n", "

" ], "text/plain": [ " name midterm final total\n", "0 John 95 85 180\n", "1 Jenny 85 80 165\n", "2 Nate 10 30 40" ] }, "execution_count": 270, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## columm from existing column" ] }, { "cell_type": "code", "execution_count": 271, "metadata": {}, "outputs": [], "source": [ "df['average'] = df['total'] / 2" ] }, { "cell_type": "code", "execution_count": 272, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total	average
0	John	95	85	180	90.0
1	Jenny	85	80	165	82.5
2	Nate	10	30	40	20.0

\n", "

" ], "text/plain": [ " name midterm final total average\n", "0 John 95 85 180 90.0\n", "1 Jenny 85 80 165 82.5\n", "2 Nate 10 30 40 20.0" ] }, "execution_count": 272, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## column by conditional condition" ] }, { "cell_type": "code", "execution_count": 273, "metadata": {}, "outputs": [], "source": [ "grades = []\n", "\n", "for row in df['average']:\n", " if row >= 90:\n", " grades.append('A')\n", " elif row >= 80:\n", " grades.append('B')\n", " elif row >= 70:\n", " grades.append('C')\n", " else:\n", " grades.append('F')\n", " \n", "df['grade'] = grades" ] }, { "cell_type": "code", "execution_count": 274, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total	average	grade
0	John	95	85	180	90.0	A
1	Jenny	85	80	165	82.5	B
2	Nate	10	30	40	20.0	F

\n", "

" ], "text/plain": [ " name midterm final total average grade\n", "0 John 95 85 180 90.0 A\n", "1 Jenny 85 80 165 82.5 B\n", "2 Nate 10 30 40 20.0 F" ] }, "execution_count": 274, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## how to use apply function\n", "apply function helps you code concisely.\n", "the function will be applied to selected column(s) on all rows" ] }, { "cell_type": "code", "execution_count": 275, "metadata": {}, "outputs": [], "source": [ "def pass_or_fail(row):\n", " print(row)\n", " if row != \"F\":\n", " return 'Pass'\n", " else:\n", " return 'Fail'" ] }, { "cell_type": "code", "execution_count": 276, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A\n", "B\n", "F\n" ] } ], "source": [ "df.grade = df.grade.apply(pass_or_fail)" ] }, { "cell_type": "code", "execution_count": 277, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total	average	grade
0	John	95	85	180	90.0	Pass
1	Jenny	85	80	165	82.5	Pass
2	Nate	10	30	40	20.0	Fail

\n", "

" ], "text/plain": [ " name midterm final total average grade\n", "0 John 95 85 180 90.0 Pass\n", "1 Jenny 85 80 165 82.5 Pass\n", "2 Nate 10 30 40 20.0 Fail" ] }, "execution_count": 277, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## info extraction using df.apply" ] }, { "cell_type": "code", "execution_count": 278, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd
0	2000-06-27
1	2002-09-24
2	2005-12-20

\n", "

" ], "text/plain": [ " yyyy-mm-dd\n", "0 2000-06-27\n", "1 2002-09-24\n", "2 2005-12-20" ] }, "execution_count": 278, "metadata": {}, "output_type": "execute_result" } ], "source": [ "date_list = [{'yyyy-mm-dd': '2000-06-27'},\n", " {'yyyy-mm-dd': '2002-09-24'},\n", " {'yyyy-mm-dd': '2005-12-20'}]\n", "df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])\n", "df" ] }, { "cell_type": "code", "execution_count": 279, "metadata": {}, "outputs": [], "source": [ "def extract_year(row):\n", " return row.split('-')[0]" ] }, { "cell_type": "code", "execution_count": 280, "metadata": {}, "outputs": [], "source": [ "df['year'] = df['yyyy-mm-dd'].apply(extract_year)" ] }, { "cell_type": "code", "execution_count": 281, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd	year
0	2000-06-27	2000
1	2002-09-24	2002
2	2005-12-20	2005

\n", "

" ], "text/plain": [ " yyyy-mm-dd year\n", "0 2000-06-27 2000\n", "1 2002-09-24 2002\n", "2 2005-12-20 2005" ] }, "execution_count": 281, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## passing keyword parameter to apply function\n", "you also can send parameter to apply function" ] }, { "cell_type": "code", "execution_count": 282, "metadata": {}, "outputs": [], "source": [ "def extract_year(year, current_year):\n", " return current_year - int(year)" ] }, { "cell_type": "code", "execution_count": 283, "metadata": {}, "outputs": [], "source": [ "df['age'] = df['year'].apply(extract_year, current_year=2018)" ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	yyyy-mm-dd	year	age
0	2000-06-27	2000	18
1	2002-09-24	2002	16
2	2005-12-20	2005	13

\n", "

" ], "text/plain": [ " yyyy-mm-dd year age\n", "0 2000-06-27 2000 18\n", "1 2002-09-24 2002 16\n", "2 2005-12-20 2005 13" ] }, "execution_count": 284, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## passing multiple keyword parameter to apply function\n", "you also can send multiple parameter to apply function" ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [], "source": [ "def get_introduce(age, prefix, suffix):\n", " return prefix + str(age) + suffix" ] }, { "cell_type": "code", "execution_count": 286, "metadata": {}, "outputs": [], "source": [ "df['introduce'] = df['age'].apply(get_introduce, prefix=\"I am \", suffix=\" years old\")" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	yyyy-mm-dd	year	age	introduce
0	2000-06-27	2000	18	I am 18 years old
1	2002-09-24	2002	16	I am 16 years old
2	2005-12-20	2005	13	I am 13 years old

\n", "

	yyyy-mm-dd	year	age	introduce
0	2000-06-27	2000	18	I was born in 2000 my age is 18
1	2002-09-24	2002	16	I was born in 2002 my age is 16
2	2005-12-20	2005	13	I was born in 2005 my age is 13

\n", "

" ], "text/plain": [ " yyyy-mm-dd year age introduce\n", "0 2000-06-27 2000 18 I was born in 2000 my age is 18\n", "1 2002-09-24 2002 16 I was born in 2002 my age is 16\n", "2 2005-12-20 2005 13 I was born in 2005 my age is 13" ] }, "execution_count": 288, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_introduce2(row):\n", " return \"I was born in \"+str(row.year)+\" my age is \"+str(row.age)\n", "df.introduce = df.apply(get_introduce2, axis=1)\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## how to use map function\n", "if you give function as parameter, it works same as apply function on the column" ] }, { "cell_type": "code", "execution_count": 289, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd
0	2000-06-27
1	2002-09-24
2	2005-12-20

\n", "

" ], "text/plain": [ " yyyy-mm-dd\n", "0 2000-06-27\n", "1 2002-09-24\n", "2 2005-12-20" ] }, "execution_count": 289, "metadata": {}, "output_type": "execute_result" } ], "source": [ "date_list = [{'yyyy-mm-dd': '2000-06-27'},\n", " {'yyyy-mm-dd': '2002-09-24'},\n", " {'yyyy-mm-dd': '2005-12-20'}]\n", "df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])\n", "df" ] }, { "cell_type": "code", "execution_count": 290, "metadata": {}, "outputs": [], "source": [ "def extract_year(row):\n", " return row.split('-')[0]" ] }, { "cell_type": "code", "execution_count": 291, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd	year
0	2000-06-27	2000
1	2002-09-24	2002
2	2005-12-20	2005

\n", "

" ], "text/plain": [ " yyyy-mm-dd year\n", "0 2000-06-27 2000\n", "1 2002-09-24 2002\n", "2 2005-12-20 2005" ] }, "execution_count": 291, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['year'] = df['yyyy-mm-dd'].map(extract_year)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "if you give dictionary as parameter, \n", "column will be updated with new value like \n", "new value = dict['old value']" ] }, { "cell_type": "code", "execution_count": 292, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
0	20	student
1	30	developer
2	30	teacher

\n", "

" ], "text/plain": [ " age job\n", "0 20 student\n", "1 30 developer\n", "2 30 teacher" ] }, "execution_count": 292, "metadata": {}, "output_type": "execute_result" } ], "source": [ "job_list = [{'age': 20, 'job': 'student'},\n", " {'age': 30, 'job': 'developer'},\n", " {'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(job_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 293, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
0	20	1
1	30	2
2	30	3

\n", "

" ], "text/plain": [ " age job\n", "0 20 1\n", "1 30 2\n", "2 30 3" ] }, "execution_count": 293, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.job = df.job.map({\"student\":1,\"developer\":2,\"teacher\":3})\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Applymap\n", "update all elements in the dataframe at once" ] }, { "cell_type": "code", "execution_count": 294, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	x	y
0	5.5	-5.6
1	-5.2	5.5
2	-1.6	-4.5

\n", "

" ], "text/plain": [ " x y\n", "0 5.5 -5.6\n", "1 -5.2 5.5\n", "2 -1.6 -4.5" ] }, "execution_count": 294, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_y = [{'x': 5.5, 'y': -5.6},\n", " {'x': -5.2, 'y': 5.5},\n", " {'x': -1.6, 'y': -4.5}]\n", "df = pd.DataFrame(x_y)\n", "df" ] }, { "cell_type": "code", "execution_count": 295, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	x	y
0	6.0	-6.0
1	-5.0	6.0
2	-2.0	-4.0

\n", "

" ], "text/plain": [ " x y\n", "0 6.0 -6.0\n", "1 -5.0 6.0\n", "2 -2.0 -4.0" ] }, "execution_count": 295, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.applymap(np.around)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Add Row" ] }, { "cell_type": "code", "execution_count": 296, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final
0	John	95	85
1	Jenny	85	80
2	Nate	10	30

\n", "

" ], "text/plain": [ " name midterm final\n", "0 John 95 85\n", "1 Jenny 85 80\n", "2 Nate 10 30" ] }, "execution_count": 296, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'John', 'midterm': 95, 'final': 85},\n", " {'name': 'Jenny', 'midterm': 85, 'final': 80},\n", " {'name': 'Nate', 'midterm': 10, 'final': 30}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'midterm', 'final'])\n", "df" ] }, { "cell_type": "code", "execution_count": 297, "metadata": {}, "outputs": [], "source": [ "df2 = pd.DataFrame([['Ben', 50,50]], columns = ['name', 'midterm', 'final'])" ] }, { "cell_type": "code", "execution_count": 298, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	midterm	final
0	Ben	50	50

\n", "

" ], "text/plain": [ " name midterm final\n", "0 Ben 50 50" ] }, "execution_count": 298, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.head()" ] }, { "cell_type": "code", "execution_count": 299, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final
0	John	95	85
1	Jenny	85	80
2	Nate	10	30
3	Ben	50	50

\n", "

" ], "text/plain": [ " name midterm final\n", "0 John 95 85\n", "1 Jenny 85 80\n", "2 Nate 10 30\n", "3 Ben 50 50" ] }, "execution_count": 299, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.append(df2, ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Group by\n", "group by command helps to get more information from given data" ] }, { "cell_type": "code", "execution_count": 300, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Sera	Psychology	female

\n", "

" ], "text/plain": [ " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Sera Psychology female" ] }, "execution_count": 300, "metadata": {}, "output_type": "execute_result" } ], "source": [ "student_list = [{'name': 'John', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Nate', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Abraham', 'major': \"Physics\", 'sex': \"male\"},\n", " {'name': 'Brian', 'major': \"Psychology\", 'sex': \"male\"},\n", " {'name': 'Janny', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Yuna', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Jeniffer', 'major': \"Computer Science\", 'sex': \"female\"},\n", " {'name': 'Edward', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Zara', 'major': \"Psychology\", 'sex': \"female\"},\n", " {'name': 'Wendy', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Sera', 'major': \"Psychology\", 'sex': \"female\"}\n", " ]\n", "df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])\n", "df" ] }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [], "source": [ "groupby_major = df.groupby('major')" ] }, { "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Computer Science': Int64Index([0, 1, 6, 7], dtype='int64'),\n", " 'Economics': Int64Index([4, 5, 9], dtype='int64'),\n", " 'Physics': Int64Index([2], dtype='int64'),\n", " 'Psychology': Int64Index([3, 8, 10], dtype='int64')}" ] }, "execution_count": 302, "metadata": {}, "output_type": "execute_result" } ], "source": [ "groupby_major.groups" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"here we can see, computer science has mostly man, while economic has mostly woman students" ] }, { "cell_type": "code", "execution_count": 303, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computer Science: 4\n", " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "\n", "Economics: 3\n", " name major sex\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "9 Wendy Economics female\n", "\n", "Physics: 1\n", " name major sex\n", "2 Abraham Physics male\n", "\n", "Psychology: 3\n", " name major sex\n", "3 Brian Psychology male\n", "8 Zara Psychology female\n", "10 Sera Psychology female\n", "\n" ] } ], "source": [ "for name, group in groupby_major:\n", " print(name + \": \" + str(len(group)))\n", " print(group)\n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### group object to dataframe" ] }, { "cell_type": "code", "execution_count": 304, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	major	count
0	Computer Science	4
1	Economics	3
2	Physics	1
3	Psychology	3

\n", "

" ], "text/plain": [ " major count\n", "0 Computer Science 4\n", "1 Economics 3\n", "2 Physics 1\n", "3 Psychology 3" ] }, "execution_count": 304, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_major_cnt = pd.DataFrame({'count' : groupby_major.size()}).reset_index()\n", "df_major_cnt" ] }, { "cell_type": "code", "execution_count": 305, "metadata": {}, "outputs": [], "source": [ "groupby_sex = df.groupby('sex')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "here we can see, this school has balanced woman and man ratio" ] }, { "cell_type": "code", "execution_count": 306, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "female: 6\n", " name major sex\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Sera Psychology female\n", "\n", "male: 5\n", " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "7 Edward Computer Science male\n", "\n" ] } ], "source": [ "for name, group in groupby_sex:\n", " print(name + \": \" + str(len(group)))\n", " print(group)\n", " print()" ] }, { "cell_type": "code", "execution_count": 307, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	sex	count
0	female	6
1	male	5

\n", "

" ], "text/plain": [ " sex count\n", "0 female 6\n", "1 male 5" ] }, "execution_count": 307, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sex_cnt = pd.DataFrame({'count' : groupby_sex.size()}).reset_index()\n", "df_sex_cnt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drop Duplicate\n", "sometimes you need to drop duplicate rows and here is elegant way to to it" ] }, { "cell_type": "code", "execution_count": 308, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Sera	Psychology	female
11	John	Computer Science	male

\n", "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Sera	Psychology	female

\n", "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Nate	None	male
11	John	Computer Science	None

\n", "

	name	major	sex
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Nate	None	male
11	John	Computer Science	None

\n", "

" ], "text/plain": [ " name major sex\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Nate None male\n", "11 John Computer Science None" ] }, "execution_count": 314, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates(['name'], keep='last')" ] }, { "cell_type": "code", "execution_count": 315, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Nate	None	male
11	John	Computer Science	None

\n", "

	name	job	age
0	John	teacher	40.0
1	Nate	teacher	35.0
2	Yuna	teacher	37.0
3	Abraham	student	10.0
4	Brian	student	12.0
5	Janny	student	11.0
6	Nate	teacher	NaN
7	John	student	NaN

\n", "

	name	job	age
0	False	False	False
1	False	False	False
2	False	False	False
3	False	False	False
4	False	False	False
5	False	False	False
6	False	False	True
7	False	False	True

\n", "

	name	job	age
0	False	False	False
1	False	False	False
2	False	False	False
3	False	False	False
4	False	False	False
5	False	False	False
6	False	False	True
7	False	False	True

\n", "

" ], "text/plain": [ " name job age\n", "0 False False False\n", "1 False False False\n", "2 False False False\n", "3 False False False\n", "4 False False False\n", "5 False False False\n", "6 False False True\n", "7 False False True" ] }, "execution_count": 319, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## how to fill Null or NaN" ] }, { "cell_type": "code", "execution_count": 320, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	job	age
0	John	teacher	40.0
1	Nate	teacher	35.0
2	Yuna	teacher	37.0
3	Abraham	student	10.0
4	Brian	student	12.0
5	Janny	student	11.0
6	Nate	teacher	0.0
7	John	student	0.0

\n", "

	name	job	age
0	John	teacher	40.0
1	Nate	teacher	35.0
2	Yuna	teacher	37.0
3	Abraham	student	10.0
4	Brian	student	12.0
5	Janny	student	11.0
6	Nate	teacher	0.0
7	John	student	0.0

\n", "

" ], "text/plain": [ " name job age\n", "0 John teacher 40.0\n", "1 Nate teacher 35.0\n", "2 Yuna teacher 37.0\n", "3 Abraham student 10.0\n", "4 Brian student 12.0\n", "5 Janny student 11.0\n", "6 Nate teacher 0.0\n", "7 John student 0.0" ] }, "execution_count": 322, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Unique" ] }, { "cell_type": "code", "execution_count": 323, "metadata": {}, "outputs": [], "source": [ "job_list = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"teacher\"},\n", " {'name': 'Fred', 'job': \"teacher\"},\n", " {'name': 'Abraham', 'job': \"student\"},\n", " {'name': 'Brian', 'job': \"student\"},\n", " {'name': 'Janny', 'job': \"developer\"},\n", " {'name': 'Nate', 'job': \"teacher\"},\n", " {'name': 'Obrian', 'job': \"dentist\"},\n", " {'name': 'Yuna', 'job': \"teacher\"},\n", " {'name': 'Rob', 'job': \"lawyer\"},\n", " {'name': 'Brian', 'job': \"student\"},\n", " {'name': 'Matt', 'job': \"student\"},\n", " {'name': 'Wendy', 'job': \"banker\"},\n", " {'name': 'Edward', 'job': \"teacher\"},\n", " {'name': 'Ian', 'job': \"teacher\"},\n", " {'name': 'Chris', 'job': \"banker\"},\n", " {'name': 'Philip', 'job': \"lawyer\"},\n", " {'name': 'Janny', 'job': \"basketball player\"},\n", " {'name': 'Gwen', 'job': \"teacher\"},\n", " {'name': 'Jessy', 'job': \"student\"}\n", " ]\n", "df = pd.DataFrame(job_list, columns = ['name', 'job'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "unique() gives you unique values of the column in list format" ] }, { "cell_type": "code", "execution_count": 324, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['teacher' 'student' 'developer' 'dentist' 'lawyer' 'banker'\n", " 'basketball player']\n" ] } ], "source": [ "print( df.job.unique() )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "value_counts() gives you the number of item for each 
unique value in the column" ] }, { "cell_type": "code", "execution_count": 325, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "teacher 8\n", "student 5\n", "lawyer 2\n", "banker 2\n", "developer 1\n", "dentist 1\n", "basketball player 1\n", "Name: job, dtype: int64" ] }, "execution_count": 325, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.job.value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Concatenate two dataframes" ] }, { "cell_type": "code", "execution_count": 326, "metadata": {}, "outputs": [], "source": [ "l1 = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"student\"},\n", " {'name': 'Fred', 'job': \"developer\"}]\n", "\n", "l2 = [{'name': 'Ed', 'job': \"dentist\"},\n", " {'name': 'Jack', 'job': \"farmer\"},\n", " {'name': 'Ted', 'job': \"designer\"}]\n", " \n", "df1 = pd.DataFrame(l1, columns = ['name', 'job'])\n", "df2 = pd.DataFrame(l2, columns = ['name', 'job'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pd.concat\n", "the code below adds the second dataframe as new rows of the first dataframe" ] }, { "cell_type": "code", "execution_count": 327, "metadata": {}, "outputs": [], "source": [ "frames = [df1, df2]\n", "result = pd.concat(frames, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 328, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	job
0	John	teacher
1	Nate	student
2	Fred	developer
3	Ed	dentist
4	Jack	farmer
5	Ted	designer

\n", "

" ], "text/plain": [ " name job\n", "0 John teacher\n", "1 Nate student\n", "2 Fred developer\n", "3 Ed dentist\n", "4 Jack farmer\n", "5 Ted designer" ] }, "execution_count": 328, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## df.append\n", "below is to add second dataframe as new rows in first dataframe" ] }, { "cell_type": "code", "execution_count": 329, "metadata": {}, "outputs": [], "source": [ "l1 = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"student\"},\n", " {'name': 'Fred', 'job': \"developer\"}]\n", "\n", "l2 = [{'name': 'Ed', 'job': \"dentist\"},\n", " {'name': 'Jack', 'job': \"farmer\"},\n", " {'name': 'Ted', 'job': \"designer\"}]\n", " \n", "df1 = pd.DataFrame(l1, columns = ['name', 'job'])\n", "df2 = pd.DataFrame(l2, columns = ['name', 'job'])\n", "result = df1.append(df2, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 330, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	job
0	John	teacher
1	Nate	student
2	Fred	developer
3	Ed	dentist
4	Jack	farmer
5	Ted	designer

\n", "

" ], "text/plain": [ " name job\n", "0 John teacher\n", "1 Nate student\n", "2 Fred developer\n", "3 Ed dentist\n", "4 Jack farmer\n", "5 Ted designer" ] }, "execution_count": 330, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pd.concat\n", "The example below adds the second dataframe to the first dataframe as new columns. Because `ignore_index=True` is passed together with `axis=1`, the column labels in the result are replaced by 0, 1, 2, 3." ] }, { "cell_type": "code", "execution_count": 331, "metadata": {}, "outputs": [], "source": [ "l1 = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"student\"},\n", " {'name': 'Jack', 'job': \"developer\"}]\n", "\n", "l2 = [{'age': 25, 'country': \"U.S\"},\n", " {'age': 30, 'country': \"U.K\"},\n", " {'age': 45, 'country': \"Korea\"}]\n", " \n", "df1 = pd.DataFrame(l1, columns = ['name', 'job'])\n", "df2 = pd.DataFrame(l2, columns = ['age', 'country'])\n", "result = pd.concat([df1, df2], axis=1, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 332, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2	3
0	John	teacher	25	U.S
1	Nate	student	30	U.K
2	Jack	developer	45	Korea

\n", "

" ], "text/plain": [ " 0 1 2 3\n", "0 John teacher 25 U.S\n", "1 Nate student 30 U.K\n", "2 Jack developer 45 Korea" ] }, "execution_count": 332, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Concatenate two lists as a dataframe" ] }, { "cell_type": "code", "execution_count": 333, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	label	prediction
0	1	1
1	2	2
2	3	2
3	4	5
4	5	5

\n", "

" ], "text/plain": [ " label prediction\n", "0 1 1\n", "1 2 2\n", "2 3 2\n", "3 4 5\n", "4 5 5" ] }, "execution_count": 333, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label = [1,2,3,4,5]\n", "prediction = [1,2,2,5,5]\n", "\n", "comparison = pd.DataFrame(\n", " {'label': label,\n", " 'prediction': prediction\n", " })\n", "\n", "comparison" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: data/friend_list.csv ================================================ name,age,job John,20,student Jenny,30,developer Nate,30,teacher Julia,40,dentist Brian,45,manager Chris,25,intern ================================================ FILE: data/friend_list.txt ================================================ name,age,job John,20,student Jenny,30,developer Nate,30,teacher Julia,40,dentist Brian,45,manager Chris,25,intern ================================================ FILE: data/friend_list_no_head.csv ================================================ John,20,student Jenny,30,developer Nate,30,teacher Julia,40,dentist Brian,45,manager Chris,25,intern ================================================ FILE: data/friend_list_tab.txt ================================================ name age job John 20 student Jenny 30 developer Nate 30 teacher Julia 40 dentist Brian 45 manager Chris 25 intern ================================================ FILE: 팬더스_명령어_꿀팁.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas는 무엇인가요?\n", "데이터 분석 및 가공에 사용되는 파이썬 라이브러리입니다" ] }, { "cell_type": "code", "execution_count": 156, 
"metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "data_frame = pd.read_csv('data/friend_list.csv')\n", "data_frame.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 데이터프레임은 무엇인가요?\n", "가로축과 세로축이 있는 엑셀과 유사한 데이터구조입니다. 가로축은 로우(행), 세로축은 컬럼(열)이라고 합니다." ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 157, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 데이터프레임이 가지고 있는 함수의 예제입니다.\n", "data_frame.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 시리즈는 무엇인가요?\n", "데이터프레임의 컬럼(열)은 모두 시리즈입니다. 위의 예제는 3개의 시리즈로 구성된 데이터프레임입니다." ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(data_frame.job)" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	STUDENT
1	Jenny	30	DEVELOPER
2	Nate	30	TEACHER
3	Julia	40	DENTIST
4	Brian	45	MANAGER

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 STUDENT\n", "1 Jenny 30 DEVELOPER\n", "2 Nate 30 TEACHER\n", "3 Julia 40 DENTIST\n", "4 Brian 45 MANAGER" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 시리즈의 함수 예제입니다.\n", "data_frame.job = data_frame.job.str.upper()\n", "data_frame.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "시리즈는 단순히 파이썬 리스트를 간직한 오브젝트입니다. \n", "리스트를 파라미터로 주면 바로 시리즈가 생성됩니다. \n", "시리즈는 데이터 가공 및 분석이 파이썬 리스트보다 훨씬 쉽습니다." ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [], "source": [ "s1 = pd.core.series.Series(['one', 'two', 'three'])\n", "s2 = pd.core.series.Series([1, 2, 3])" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	num	word
0	1	one
1	2	two
2	3	three

\n", "

" ], "text/plain": [ " num word\n", "0 1 one\n", "1 2 two\n", "2 3 three" ] }, "execution_count": 161, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(data=dict(word=s1, num=s2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 왜 팬더스를 쓰나요?\n", "\n", "엑셀과 상당히 유사합니다, 데이터의 수정/가공 및 분석이 용이합니다. \n", "데이터 가공을 위한 수많은 함수를 지원합니다. \n", "Numpy 기반으로 데이터 처리가 상당히 빠릅니다. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 파일을 데이터프레임으로 불러오기\n", "데이터프레임 (dataframe)은 2차원 자료구조입니다. 로우와 컬럼으로 엑셀 형식과 유사합니다. \n", "기본적으로 csv 포맷을 지원하지만, 구분자로 컬럼이 구분되어 있는 데이터는 모두 지원합니다. \n", "read_csv 함수로 파일을 데이터프레임으로 호출할 수 있습니다." ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/friend_list.csv')" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager\n", "5 Chris 25 intern" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 csv 파일이 아닌 파일을 호출하는 예제입니다. \n", "파일명은 txt이지만, 쉼표로 컬럼이 구분되어 있는 파일입니다." ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/friend_list.txt')" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 165, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "만약 파일의 컬럼들이 쉼표로 구분되어 있지 않을 경우라도, delimiter 파라미터에 구분자를 지정해줘서 \n", "컬럼을 제대로 나눠줄 수 있습니다. 아래는 탭으로 컬럼이 구분된 경우의 예제입니다." ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/friend_list_tab.txt', delimiter = \"\\t\")" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 167, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "만약 파일에 데이터 헤더가 없을 경우, header = None으로 지정해줘서, \n", "첫번째 데이터가 데이터 헤더로 들어가는 것을 방지해줘야합니다." ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data/friend_list_no_head.csv', header = None)" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 169, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "헤더가 없는 데이터를 데이터프레임으로 호출했을 경우, \n", "아래와 같이 데이터프레임 생성 후에, 컬럼 헤더를 지정해주실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [], "source": [ "df.columns = ['name', 'age', 'job']" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager" ] }, "execution_count": 173, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 데이터프레임을 파이썬 코드로 생성하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 딕셔너리로 데이터프레임 생성하기\n", "파이썬의 기본 자료구조로 데이터프레임 생성이 가능합니다. \n", "아래의 예제는 딕셔너리로 데이터프레임을 생성하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 175, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "데이터프레임 생성 시, 컬럼의 순서가 뒤바뀔 수 있습니다. \n", "아래와 같이 컬럼을 원하시는 순서로 지정하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [], "source": [ "df = df[['name', 'age', 'job']]" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 Jone 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 177, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## OrderedDict로 데이터프레임 생성하기\n", "OrderedDict 자료구조로 데이터프레임을 생성하면, 컬럼의 순서가 뒤바뀌지 않습니다." ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "from collections import OrderedDict" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [], "source": [ "friend_ordered_dict = OrderedDict([ ('name', ['John', 'Jenny', 'Nate']),\n", " ('age', [20, 30, 30]),\n", " ('job', ['student', 'developer', 'teacher']) ] )\n", "df = pd.DataFrame.from_dict(friend_ordered_dict)" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 180, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## list로 데이터프레임 생성하기\n", "리스트로 데이터프레임을 생성하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [], "source": [ "friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]\n", "column_name = ['name', 'age', 'job']\n", "df = pd.DataFrame.from_records(friend_list, columns=column_name)" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [], "source": [ "friend_list = [ \n", " ['name',['John', 'Jenny', 'Nate']],\n", " ['age',[20,30,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "df = pd.DataFrame.from_items(friend_list)" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 파일로 데이터프레임을 저장하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 데이터프레임을 헤더와 함께 저장하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [], "source": [ "friend_list = [ \n", " ['name',['John', 'Jenny', 'nate']],\n", " ['age',[20,30,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "df = pd.DataFrame.from_items(friend_list)" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 nate 30 teacher" ] }, "execution_count": 186, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "to_csv 함수를 사용하여 파일로 저장하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 헤더가 없는 데이터프레임의 예제입니다." ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [], "source": [ "friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]\n", "df = pd.DataFrame.from_records(friend_list)" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 189, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "to_csv 함수로 파일로 저장하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "사실 파일의 확장자명은 원하시는대로 주셔도 무방합니다." ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "헤더와 인덱스값은 따로 지정하지 않아도, 기본적으로 True로 설정되어 있습니다." ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', header = True, index = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**header = False** 는 컬럼 이름을 파일에 저장하지 않겠다라는 의미입니다. 예제에서 0,1,2가 헤더에 저장되지 않습니다. \n", "**index = False** 는 로우 인덱스를 파일에 저장하지 않겠다라는 의미입니다. 예제에서 0,1,2가 로우 인덱스에 저장되지 않습니다." ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', header = False, index = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "헤더 정보를 원하실 경우, header 키워드로 컬럼 이름을 파일에 저장하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', header = ['name', 'age', 'job'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 **None** 값이 있는 데이터프레임의 예제입니다." 
] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [], "source": [ "friend_list = [ \n", " ['name',['John', None, 'nate']],\n", " ['age',[20,None,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "df = pd.DataFrame.from_items(friend_list)" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20.0	student
1	None	NaN	developer
2	nate	30.0	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20.0 student\n", "1 None NaN developer\n", "2 nate 30.0 teacher" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**na_rep** 을 사용하시면 **None** 을 원하시는 값으로 쉽게 변경하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [], "source": [ "df.to_csv('friend_list_from_df.csv', na_rep = '-')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 로우 선택하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 인덱스로 로우 선택하기" ] }, { "cell_type": "code", "execution_count": 199, "metadata": {}, "outputs": [], "source": [ "friend_list = [ \n", " ['name',['John', 'Jenny', 'Nate']],\n", " ['age',[20,30,30]],\n", " ['job',['student', 'developer', 'teacher']] \n", " ]\n", "df = pd.DataFrame.from_items(friend_list)" ] }, { "cell_type": "code", "execution_count": 200, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 200, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 로우 인덱스를 사용하여 로우 1부터 2까지 순차적으로 선택하는 예제입니다. 슬라이스의 끝 인덱스(3)는 포함되지 않습니다." ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 201, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[1:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 순차적이지 않은 로우를 선택하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
0	John	20	student
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "2 Nate 30 teacher" ] }, "execution_count": 202, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[[0,2]]" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 203, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 컬럼값에 따른 로우 선택하기\n", "마치 데이터베이스에 쿼리를 전달하듯, 특정한 컬럼값을 충족하는 로우만 선택하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [], "source": [ "df_filtered = df[df.age > 25]" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 205, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 206, "metadata": {}, "outputs": [], "source": [ "df_filtered = df.query('age>25')" ] }, { "cell_type": "code", "execution_count": 207, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 207, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 208, "metadata": {}, "outputs": [], "source": [ "df_filtered = df[(df.age >25) & (df.name == 'Nate')]" ] }, { "cell_type": "code", "execution_count": 209, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	age	job
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "2 Nate 30 teacher" ] }, "execution_count": 209, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 210, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 컬럼 필터하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 인덱스로 필터하기" ] }, { "cell_type": "code", "execution_count": 211, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 211, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_list = [ ['John', 20, 'student'],['Jenny', 30, 'developer'],['Nate', 30, 'teacher'] ]\n", "df = pd.DataFrame.from_records(friend_list)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "모든 로우를 보여주되, 컬럼은 0부터 1까지만 출력하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 212, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	0	1
0	John	20
1	Jenny	30
2	Nate	30

\n", "

" ], "text/plain": [ " 0 1\n", "0 John 20\n", "1 Jenny 30\n", "2 Nate 30" ] }, "execution_count": 212, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:, 0:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "모든 로우를 보여주되, 컬럼 0과 2만 출력하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 213, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	0	2
0	John	student
1	Jenny	developer
2	Nate	teacher

\n", "

" ], "text/plain": [ " 0 2\n", "0 John student\n", "1 Jenny developer\n", "2 Nate teacher" ] }, "execution_count": 213, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:,[0,2]]" ] }, { "cell_type": "code", "execution_count": 214, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " 0 1 2\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 컬럼 이름으로 필터하기" ] }, { "cell_type": "code", "execution_count": 215, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

" ], "text/plain": [ " name age job\n", "0 John 20 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher\n", "3 Julia 40 dentist\n", "4 Brian 45 manager\n", "5 Chris 25 intern" ] }, "execution_count": 215, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# you can create column header for no header data at once\n", "df = pd.read_csv('data/friend_list_no_head.csv', header = None, names=['name', 'age', 'job'])\n", "df" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age
0	John	20
1	Jenny	30
2	Nate	30
3	Julia	40
4	Brian	45
5	Chris	25

\n", "

" ], "text/plain": [ " name age\n", "0 John 20\n", "1 Jenny 30\n", "2 Nate 30\n", "3 Julia 40\n", "4 Brian 45\n", "5 Chris 25" ] }, "execution_count": 216, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filtered = df[['name', 'age']]\n", "df_filtered" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job
0	20	student
1	30	developer
2	30	teacher
3	40	dentist
4	45	manager
5	25	intern

\n", "

" ], "text/plain": [ " age job\n", "0 20 student\n", "1 30 developer\n", "2 30 teacher\n", "3 40 dentist\n", "4 45 manager\n", "5 25 intern" ] }, "execution_count": 217, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filter(items=['age', 'job'])" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	John	20	student
1	Jenny	30	developer
2	Nate	30	teacher
3	Julia	40	dentist
4	Brian	45	manager
5	Chris	25	intern

\n", "

	name	age
0	John	20
1	Jenny	30
2	Nate	30
3	Julia	40
4	Brian	45
5	Chris	25

\n", "

" ], "text/plain": [ " name age\n", "0 John 20\n", "1 Jenny 30\n", "2 Nate 30\n", "3 Julia 40\n", "4 Brian 45\n", "5 Chris 25" ] }, "execution_count": 219, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select columns containing 'a'\n", "df.filter(like='a',axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "정규식으로 필터도 가능합니다." ] }, { "cell_type": "code", "execution_count": 220, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	job
0	student
1	developer
2	teacher
3	dentist
4	manager
5	intern

\n", "

" ], "text/plain": [ " job\n", "0 student\n", "1 developer\n", "2 teacher\n", "3 dentist\n", "4 manager\n", "5 intern" ] }, "execution_count": 220, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select columns using regex\n", "df.filter(regex='b$',axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 로우 드롭하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "로우 인덱스로 로우를 드롭할 수 있습니다." ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'age': 20, 'job': 'student'},\n", " {'age': 30, 'job': 'developer'},\n", " {'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, index = ['John', 'Jenny', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
John	20	student
Jenny	30	developer
Nate	30	teacher

\n", "

" ], "text/plain": [ " age job\n", "John 20 student\n", "Jenny 30 developer\n", "Nate 30 teacher" ] }, "execution_count": 222, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "드롭된 결과는 데이터프레임에 저장되지 않습니다. 저장하고 싶으실 경우, 결과를 데이터프레임에 따로 저장하셔야 합니다." ] }, { "cell_type": "code", "execution_count": 223, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
Jenny	30	developer

\n", "

" ], "text/plain": [ " age job\n", "Jenny 30 developer" ] }, "execution_count": 223, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop(['John', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
John	20	student
Jenny	30	developer
Nate	30	teacher

\n", "

" ], "text/plain": [ " age job\n", "John 20 student\n", "Jenny 30 developer\n", "Nate 30 teacher" ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "드롭된 결과를 데이터프레임에 저장하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [], "source": [ "df = df.drop(['John', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
Jenny	30	developer

\n", "

" ], "text/plain": [ " age job\n", "Jenny 30 developer" ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 드롭된 결과를 바로 데이터프레임에 저장하는 방법\n", "inplace 키워드를 사용하시면, 따로 저장할 필요없이, 드롭된 결과가 데이터프레임에 반영됩니다." ] }, { "cell_type": "code", "execution_count": 227, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'age': 20, 'job': 'student'},\n", " {'age': 30, 'job': 'developer'},\n", " {'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, index = ['John', 'Jenny', 'Nate'])" ] }, { "cell_type": "code", "execution_count": 228, "metadata": {}, "outputs": [], "source": [ "df.drop(['John', 'Nate'], inplace = True)" ] }, { "cell_type": "code", "execution_count": 229, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
Jenny	30	developer

\n", "

" ], "text/plain": [ " age job\n", "Jenny 30 developer" ] }, "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 로우 인덱스로 드롭하기" ] }, { "cell_type": "code", "execution_count": 230, "metadata": {}, "outputs": [], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)" ] }, { "cell_type": "code", "execution_count": 231, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 231, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "로우 인덱스로 드롭하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 232, "metadata": {}, "outputs": [], "source": [ "df = df.drop(df.index[[0,2]])" ] }, { "cell_type": "code", "execution_count": 233, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job	name
1	30	developer	Jenny

\n", "

" ], "text/plain": [ " age job name\n", "1 30 developer Jenny" ] }, "execution_count": 233, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 컬럼값으로 로우 드롭하기" ] }, { "cell_type": "code", "execution_count": 234, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 234, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "df = df[df.age != 30]" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job	name
0	20	student	Jone

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone" ] }, "execution_count": 236, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 컬럼 드롭하기" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	age	job	name
0	20	student	Jone
1	30	developer	Jenny
2	30	teacher	Nate

\n", "

" ], "text/plain": [ " age job name\n", "0 20 student Jone\n", "1 30 developer Jenny\n", "2 30 teacher Nate" ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 20, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 238, "metadata": {}, "outputs": [], "source": [ "df = df.drop('age', axis=1)" ] }, { "cell_type": "code", "execution_count": 239, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	job	name
0	student	Jone
1	developer	Jenny
2	teacher	Nate

\n", "

" ], "text/plain": [ " job name\n", "0 student Jone\n", "1 developer Jenny\n", "2 teacher Nate" ] }, "execution_count": 239, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# 컬럼 추가 또는 변경하기" ] }, { "cell_type": "code", "execution_count": 240, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	15	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 Jone 15 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 240, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 15, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'age', 'job'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래와 같은 방법으로 새로운 컬럼을 기본값과 함께 추가하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 241, "metadata": {}, "outputs": [], "source": [ "df['salary'] = 0" ] }, { "cell_type": "code", "execution_count": 242, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	15	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job salary\n", "0 Jone 15 student 0\n", "1 Jenny 30 developer 0\n", "2 Nate 30 teacher 0" ] }, "execution_count": 242, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "기존 컬럼값을 가지고 새로운 컬럼을 생성하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 243, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job
0	Jone	15	student
1	Jenny	30	developer
2	Nate	30	teacher

\n", "

" ], "text/plain": [ " name age job\n", "0 Jone 15 student\n", "1 Jenny 30 developer\n", "2 Nate 30 teacher" ] }, "execution_count": 243, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'Jone', 'age': 15, 'job': 'student'},\n", " {'name': 'Jenny', 'age': 30, 'job': 'developer'},\n", " {'name': 'Nate', 'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'age', 'job'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "넘파이를 사용하셔서, 한줄에 새로운 컬럼값을 생성하실 수도 있습니다." ] }, { "cell_type": "code", "execution_count": 244, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "df['salary'] = np.where(df['job'] != 'student' , 'yes', 'no')" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	age	job	salary
0	Jone	15	student	no
1	Jenny	30	developer	yes
2	Nate	30	teacher	yes

\n", "

" ], "text/plain": [ " name age job salary\n", "0 Jone 15 student no\n", "1 Jenny 30 developer yes\n", "2 Nate 30 teacher yes" ] }, "execution_count": 245, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 246, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final
0	John	95	85
1	Jenny	85	80
2	Nate	10	30

\n", "

" ], "text/plain": [ " name midterm final\n", "0 John 95 85\n", "1 Jenny 85 80\n", "2 Nate 10 30" ] }, "execution_count": 246, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'John', 'midterm': 95, 'final': 85},\n", " {'name': 'Jenny', 'midterm': 85, 'final': 80},\n", " {'name': 'Nate', 'midterm': 10, 'final': 30}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'midterm', 'final'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 기존에 있는 두 컬럼값을 더해서 새로운 컬럼을 만드는 예제입니다." ] }, { "cell_type": "code", "execution_count": 247, "metadata": {}, "outputs": [], "source": [ "df['total'] = df['midterm'] + df['final']" ] }, { "cell_type": "code", "execution_count": 248, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total
0	John	95	85	180
1	Jenny	85	80	165
2	Nate	10	30	40

\n", "

" ], "text/plain": [ " name midterm final total\n", "0 John 95 85 180\n", "1 Jenny 85 80 165\n", "2 Nate 10 30 40" ] }, "execution_count": 248, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "기존의 컬럼을 사용하여 새로운 컬럼을 만드는 예제입니다." ] }, { "cell_type": "code", "execution_count": 249, "metadata": {}, "outputs": [], "source": [ "df['average'] = df['total'] / 2" ] }, { "cell_type": "code", "execution_count": 250, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total	average
0	John	95	85	180	90.0
1	Jenny	85	80	165	82.5
2	Nate	10	30	40	20.0

\n", "

" ], "text/plain": [ " name midterm final total average\n", "0 John 95 85 180 90.0\n", "1 Jenny 85 80 165 82.5\n", "2 Nate 10 30 40 20.0" ] }, "execution_count": 250, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래와 같이, 리스트에 조건별 값을 담아서, 새로운 컬럼으로 추가시킬 수 있습니다." ] }, { "cell_type": "code", "execution_count": 251, "metadata": {}, "outputs": [], "source": [ "grades = []\n", "\n", "for row in df['average']:\n", " if row >= 90:\n", " grades.append('A')\n", " elif row >= 80:\n", " grades.append('B')\n", " elif row >= 70:\n", " grades.append('C')\n", " else:\n", " grades.append('F')\n", " \n", "df['grade'] = grades" ] }, { "cell_type": "code", "execution_count": 252, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total	average	grade
0	John	95	85	180	90.0	A
1	Jenny	85	80	165	82.5	B
2	Nate	10	30	40	20.0	F

\n", "

" ], "text/plain": [ " name midterm final total average grade\n", "0 John 95 85 180 90.0 A\n", "1 Jenny 85 80 165 82.5 B\n", "2 Nate 10 30 40 20.0 F" ] }, "execution_count": 252, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "apply 함수 사용 예제입니다. \n", "apply를 사용하시면, 깔끔하게 컬럼의 값을 변경하는 코드를 구현하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 253, "metadata": {}, "outputs": [], "source": [ "def pass_or_fail(row):\n", " print(row)\n", " if row != \"F\":\n", " return 'Pass'\n", " else:\n", " return 'Fail'" ] }, { "cell_type": "code", "execution_count": 254, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A\n", "B\n", "F\n" ] } ], "source": [ "df.grade = df.grade.apply(pass_or_fail)" ] }, { "cell_type": "code", "execution_count": 255, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final	total	average	grade
0	John	95	85	180	90.0	Pass
1	Jenny	85	80	165	82.5	Pass
2	Nate	10	30	40	20.0	Fail

\n", "

" ], "text/plain": [ " name midterm final total average grade\n", "0 John 95 85 180 90.0 Pass\n", "1 Jenny 85 80 165 82.5 Pass\n", "2 Nate 10 30 40 20.0 Fail" ] }, "execution_count": 255, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "apply를 사용해서 연월일의 정보에서 연도만 빼보는 예제입니다." ] }, { "cell_type": "code", "execution_count": 256, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd
0	2000-06-27
1	2002-09-24
2	2005-12-20

\n", "

" ], "text/plain": [ " yyyy-mm-dd\n", "0 2000-06-27\n", "1 2002-09-24\n", "2 2005-12-20" ] }, "execution_count": 256, "metadata": {}, "output_type": "execute_result" } ], "source": [ "date_list = [{'yyyy-mm-dd': '2000-06-27'},\n", " {'yyyy-mm-dd': '2002-09-24'},\n", " {'yyyy-mm-dd': '2005-12-20'}]\n", "df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])\n", "df" ] }, { "cell_type": "code", "execution_count": 257, "metadata": {}, "outputs": [], "source": [ "def extract_year(row):\n", " return row.split('-')[0]" ] }, { "cell_type": "code", "execution_count": 258, "metadata": {}, "outputs": [], "source": [ "df['year'] = df['yyyy-mm-dd'].apply(extract_year)" ] }, { "cell_type": "code", "execution_count": 259, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd	year
0	2000-06-27	2000
1	2002-09-24	2002
2	2005-12-20	2005

\n", "

" ], "text/plain": [ " yyyy-mm-dd year\n", "0 2000-06-27 2000\n", "1 2002-09-24 2002\n", "2 2005-12-20 2005" ] }, "execution_count": 259, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### apply 함수에 파라미터 전달하기\n", "키워드 파라미터를 사용하시면, apply가 적용된 함수에 파라미터를 전달하실 수 있습니다." ] }, { "cell_type": "code", "execution_count": 260, "metadata": {}, "outputs": [], "source": [ "def extract_year(year, current_year):\n", " return current_year - int(year)" ] }, { "cell_type": "code", "execution_count": 261, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	yyyy-mm-dd	year	age
0	2000-06-27	2000	18
1	2002-09-24	2002	16
2	2005-12-20	2005	13

\n", "

" ], "text/plain": [ " yyyy-mm-dd year age\n", "0 2000-06-27 2000 18\n", "1 2002-09-24 2002 16\n", "2 2005-12-20 2005 13" ] }, "execution_count": 261, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['age'] = df['year'].apply(extract_year, current_year=2018)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### apply 함수에 한 개 이상의 파라미터 전달하기\n", "키워드 파라미터를 추가해주시면, 원하시는만큼의 파라미터를 함수에 전달 가능합니다." ] }, { "cell_type": "code", "execution_count": 262, "metadata": {}, "outputs": [], "source": [ "def get_introduce(age, prefix, suffix):\n", " return prefix + str(age) + suffix" ] }, { "cell_type": "code", "execution_count": 263, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	yyyy-mm-dd	year	age	introduce
0	2000-06-27	2000	18	I am 18 years old
1	2002-09-24	2002	16	I am 16 years old
2	2005-12-20	2005	13	I am 13 years old

\n", "

" ], "text/plain": [ " yyyy-mm-dd year age introduce\n", "0 2000-06-27 2000 18 I am 18 years old\n", "1 2002-09-24 2002 16 I am 16 years old\n", "2 2005-12-20 2005 13 I am 13 years old" ] }, "execution_count": 263, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['introduce'] = df['age'].apply(get_introduce, prefix=\"I am \", suffix=\" years old\")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### apply 함수에 여러개의 컬럼을 동시에 전달하기\n", "axis=1이라는 키워드 파라미터를 apply 함수에 전달해주면, 모든 컬럼을 지정된 함수에서 사용 가능합니다." ] }, { "cell_type": "code", "execution_count": 264, "metadata": {}, "outputs": [], "source": [ "def get_introduce2(row):\n", " return \"I was born in \"+str(row.year)+\" my age is \"+str(row.age)\n", "\n", "df.introduce = df.apply(get_introduce2, axis=1)" ] }, { "cell_type": "code", "execution_count": 265, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	yyyy-mm-dd	year	age	introduce
0	2000-06-27	2000	18	I was born in 2000 my age is 18
1	2002-09-24	2002	16	I was born in 2002 my age is 16
2	2005-12-20	2005	13	I was born in 2005 my age is 13

\n", "

" ], "text/plain": [ " yyyy-mm-dd year age introduce\n", "0 2000-06-27 2000 18 I was born in 2000 my age is 18\n", "1 2002-09-24 2002 16 I was born in 2002 my age is 16\n", "2 2005-12-20 2005 13 I was born in 2005 my age is 13" ] }, "execution_count": 265, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Map 함수로 컬럼 추가 및 변경하기\n", "파라미터로 함수를 전달하면 apply 함수와 동일하게 컬럼값을 추가 및 변경할 수 있습니다." ] }, { "cell_type": "code", "execution_count": 266, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd
0	2000-06-27
1	2002-09-24
2	2005-12-20

\n", "

" ], "text/plain": [ " yyyy-mm-dd\n", "0 2000-06-27\n", "1 2002-09-24\n", "2 2005-12-20" ] }, "execution_count": 266, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def extract_year(row):\n", " return row.split('-')[0]\n", "\n", "date_list = [{'yyyy-mm-dd': '2000-06-27'},\n", " {'yyyy-mm-dd': '2002-09-24'},\n", " {'yyyy-mm-dd': '2005-12-20'}]\n", "df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])\n", "df" ] }, { "cell_type": "code", "execution_count": 267, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	yyyy-mm-dd	year
0	2000-06-27	2000
1	2002-09-24	2002
2	2005-12-20	2005

\n", "

" ], "text/plain": [ " yyyy-mm-dd year\n", "0 2000-06-27 2000\n", "1 2002-09-24 2002\n", "2 2005-12-20 2005" ] }, "execution_count": 267, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['year'] = df['yyyy-mm-dd'].map(extract_year)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "파라미터로 딕셔너리를 전달하면 컬럼값을 쉽게 원하는 값으로 변경 가능합니다. \n", "기존의 컬럼값은 딕셔너리의 key로 사용되고, 해당되는 value의 값으로 컬럼값이 변경됩니다." ] }, { "cell_type": "code", "execution_count": 268, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
0	20	student
1	30	developer
2	30	teacher

\n", "

" ], "text/plain": [ " age job\n", "0 20 student\n", "1 30 developer\n", "2 30 teacher" ] }, "execution_count": 268, "metadata": {}, "output_type": "execute_result" } ], "source": [ "job_list = [{'age': 20, 'job': 'student'},\n", " {'age': 30, 'job': 'developer'},\n", " {'age': 30, 'job': 'teacher'}]\n", "df = pd.DataFrame(job_list)\n", "df" ] }, { "cell_type": "code", "execution_count": 269, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	age	job
0	20	1
1	30	2
2	30	3

\n", "

" ], "text/plain": [ " age job\n", "0 20 1\n", "1 30 2\n", "2 30 3" ] }, "execution_count": 269, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.job = df.job.map({\"student\":1,\"developer\":2,\"teacher\":3})\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Applymap\n", "데이터프레임 전체의 각각의 값을 한번에 변경시키실 때 사용하시면 좋습니다." ] }, { "cell_type": "code", "execution_count": 270, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	x	y
0	5.5	-5.6
1	-5.2	5.5
2	-1.6	-4.5

\n", "

" ], "text/plain": [ " x y\n", "0 5.5 -5.6\n", "1 -5.2 5.5\n", "2 -1.6 -4.5" ] }, "execution_count": 270, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_y = [{'x': 5.5, 'y': -5.6},\n", " {'x': -5.2, 'y': 5.5},\n", " {'x': -1.6, 'y': -4.5}]\n", "df = pd.DataFrame(x_y)\n", "df" ] }, { "cell_type": "code", "execution_count": 271, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	x	y
0	6.0	-6.0
1	-5.0	6.0
2	-2.0	-4.0

\n", "

" ], "text/plain": [ " x y\n", "0 6.0 -6.0\n", "1 -5.0 6.0\n", "2 -2.0 -4.0" ] }, "execution_count": 271, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.applymap(np.around)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 데이터프레임에 로우 추가하기" ] }, { "cell_type": "code", "execution_count": 272, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final
0	John	95	85
1	Jenny	85	80
2	Nate	10	30

\n", "

" ], "text/plain": [ " name midterm final\n", "0 John 95 85\n", "1 Jenny 85 80\n", "2 Nate 10 30" ] }, "execution_count": 272, "metadata": {}, "output_type": "execute_result" } ], "source": [ "friend_dict_list = [{'name': 'John', 'midterm': 95, 'final': 85},\n", " {'name': 'Jenny', 'midterm': 85, 'final': 80},\n", " {'name': 'Nate', 'midterm': 10, 'final': 30}]\n", "df = pd.DataFrame(friend_dict_list, columns = ['name', 'midterm', 'final'])\n", "df" ] }, { "cell_type": "code", "execution_count": 273, "metadata": {}, "outputs": [], "source": [ "df2 = pd.DataFrame([['Ben', 50,50]], columns = ['name', 'midterm', 'final'])" ] }, { "cell_type": "code", "execution_count": 274, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	name	midterm	final
0	Ben	50	50

\n", "

" ], "text/plain": [ " name midterm final\n", "0 Ben 50 50" ] }, "execution_count": 274, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.head()" ] }, { "cell_type": "code", "execution_count": 275, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	midterm	final
0	John	95	85
1	Jenny	85	80
2	Nate	10	30
3	Ben	50	50

\n", "

" ], "text/plain": [ " name midterm final\n", "0 John 95 85\n", "1 Jenny 85 80\n", "2 Nate 10 30\n", "3 Ben 50 50" ] }, "execution_count": 275, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every version\n", "pd.concat([df, df2], ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Group by\n", "데이터에서 정보를 취하기 위해서 그룹별로 묶는 방법에 대해 알아보겠습니다." ] }, { "cell_type": "code", "execution_count": 276, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Sera	Psychology	female

\n", "

" ], "text/plain": [ " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Sera Psychology female" ] }, "execution_count": 276, "metadata": {}, "output_type": "execute_result" } ], "source": [ "student_list = [{'name': 'John', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Nate', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Abraham', 'major': \"Physics\", 'sex': \"male\"},\n", " {'name': 'Brian', 'major': \"Psychology\", 'sex': \"male\"},\n", " {'name': 'Janny', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Yuna', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Jeniffer', 'major': \"Computer Science\", 'sex': \"female\"},\n", " {'name': 'Edward', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Zara', 'major': \"Psychology\", 'sex': \"female\"},\n", " {'name': 'Wendy', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Sera', 'major': \"Psychology\", 'sex': \"female\"}\n", " ]\n", "df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])\n", "df" ] }, { "cell_type": "code", "execution_count": 277, "metadata": {}, "outputs": [], "source": [ "groupby_major = df.groupby('major')" ] }, { "cell_type": "code", "execution_count": 278, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Computer Science': Int64Index([0, 1, 6, 7], dtype='int64'),\n", " 'Economics': Int64Index([4, 5, 9], dtype='int64'),\n", " 'Physics': Int64Index([2], dtype='int64'),\n", " 'Psychology': Int64Index([3, 8, 10], dtype='int64')}" ] }, "execution_count": 278, "metadata": {}, "output_type": "execute_result" } ], "source": [ "groupby_major.groups" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"In the output below, we can see that Computer Science has mostly male students, while Economics has mostly female students." ] }, { "cell_type": "code", "execution_count": 279, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computer Science: 4\n", " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "\n", "Economics: 3\n", " name major sex\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "9 Wendy Economics female\n", "\n", "Physics: 1\n", " name major sex\n", "2 Abraham Physics male\n", "\n", "Psychology: 3\n", " name major sex\n", "3 Brian Psychology male\n", "8 Zara Psychology female\n", "10 Sera Psychology female\n", "\n" ] } ], "source": [ "for name, group in groupby_major:\n", " print(name + \": \" + str(len(group)))\n", " print(group)\n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "그룹 객체를 다시 데이터프레임으로 생성하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 280, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	major	count
0	Computer Science	4
1	Economics	3
2	Physics	1
3	Psychology	3

\n", "

" ], "text/plain": [ " major count\n", "0 Computer Science 4\n", "1 Economics 3\n", "2 Physics 1\n", "3 Psychology 3" ] }, "execution_count": 280, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_major_cnt = pd.DataFrame({'count' : groupby_major.size()}).reset_index()\n", "df_major_cnt" ] }, { "cell_type": "code", "execution_count": 281, "metadata": {}, "outputs": [], "source": [ "groupby_sex = df.groupby('sex')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래의 출력을 통해, 이 학교의 남녀 성비가 균등하다는 정보를 알 수 있습니다." ] }, { "cell_type": "code", "execution_count": 282, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "female: 6\n", " name major sex\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Sera Psychology female\n", "\n", "male: 5\n", " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "7 Edward Computer Science male\n", "\n" ] } ], "source": [ "for name, group in groupby_sex:\n", " print(name + \": \" + str(len(group)))\n", " print(group)\n", " print()" ] }, { "cell_type": "code", "execution_count": 283, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "

	sex	count
0	female	6
1	male	5

\n", "

" ], "text/plain": [ " sex count\n", "0 female 6\n", "1 male 5" ] }, "execution_count": 283, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sex_cnt = pd.DataFrame({'count' : groupby_sex.size()}).reset_index()\n", "df_sex_cnt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 중복 데이터 드롭하기\n", "중복된 데이터 드롭하는 방법에 대해 알아보겠습니다." ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Sera	Psychology	female
11	John	Computer Science	male

\n", "

" ], "text/plain": [ " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Sera Psychology female\n", "11 John Computer Science male" ] }, "execution_count": 284, "metadata": {}, "output_type": "execute_result" } ], "source": [ "student_list = [{'name': 'John', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Nate', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Abraham', 'major': \"Physics\", 'sex': \"male\"},\n", " {'name': 'Brian', 'major': \"Psychology\", 'sex': \"male\"},\n", " {'name': 'Janny', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Yuna', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Jeniffer', 'major': \"Computer Science\", 'sex': \"female\"},\n", " {'name': 'Edward', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Zara', 'major': \"Psychology\", 'sex': \"female\"},\n", " {'name': 'Wendy', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Sera', 'major': \"Psychology\", 'sex': \"female\"},\n", " {'name': 'John', 'major': \"Computer Science\", 'sex': \"male\"},\n", " ]\n", "df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 중복된 데이터 확인 하기" ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 False\n", "9 False\n", "10 False\n", "11 True\n", "dtype: bool" ] }, "execution_count": 285, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"drop_duplicates 함수로 중복 데이터를 삭제하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 286, "metadata": {}, "outputs": [], "source": [ "df = df.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Sera	Psychology	female

\n", "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Nate	None	male
11	John	Computer Science	None

\n", "

" ], "text/plain": [ " name major sex\n", "0 John Computer Science male\n", "1 Nate Computer Science male\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Nate None male\n", "11 John Computer Science None" ] }, "execution_count": 288, "metadata": {}, "output_type": "execute_result" } ], "source": [ "student_list = [{'name': 'John', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Nate', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Abraham', 'major': \"Physics\", 'sex': \"male\"},\n", " {'name': 'Brian', 'major': \"Psychology\", 'sex': \"male\"},\n", " {'name': 'Janny', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Yuna', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Jeniffer', 'major': \"Computer Science\", 'sex': \"female\"},\n", " {'name': 'Edward', 'major': \"Computer Science\", 'sex': \"male\"},\n", " {'name': 'Zara', 'major': \"Psychology\", 'sex': \"female\"},\n", " {'name': 'Wendy', 'major': \"Economics\", 'sex': \"female\"},\n", " {'name': 'Nate', 'major': None, 'sex': \"male\"},\n", " {'name': 'John', 'major': \"Computer Science\", 'sex': None},\n", " ]\n", "df = pd.DataFrame(student_list, columns = ['name', 'major', 'sex'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "name 컬럼이 똑같을 경우, 중복된 데이터라고 표시하라는 예제입니다." 
] }, { "cell_type": "code", "execution_count": 289, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 False\n", "9 False\n", "10 True\n", "11 True\n", "dtype: bool" ] }, "execution_count": 289, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated(['name'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "keep 값을 first 또는 last로 지정하여, 중복된 값 중 어느 값을 남길지 결정할 수 있습니다. \n", "기본적으로 first로 설정되어 있습니다." ] }, { "cell_type": "code", "execution_count": 290, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Nate	None	male
11	John	Computer Science	None

\n", "

" ], "text/plain": [ " name major sex\n", "2 Abraham Physics male\n", "3 Brian Psychology male\n", "4 Janny Economics female\n", "5 Yuna Economics female\n", "6 Jeniffer Computer Science female\n", "7 Edward Computer Science male\n", "8 Zara Psychology female\n", "9 Wendy Economics female\n", "10 Nate None male\n", "11 John Computer Science None" ] }, "execution_count": 290, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates(['name'], keep='last')" ] }, { "cell_type": "code", "execution_count": 291, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	major	sex
0	John	Computer Science	male
1	Nate	Computer Science	male
2	Abraham	Physics	male
3	Brian	Psychology	male
4	Janny	Economics	female
5	Yuna	Economics	female
6	Jeniffer	Computer Science	female
7	Edward	Computer Science	male
8	Zara	Psychology	female
9	Wendy	Economics	female
10	Nate	None	male
11	John	Computer Science	None

\n", "

	name	job	age
0	John	teacher	40.0
1	Nate	teacher	35.0
2	Yuna	teacher	37.0
3	Abraham	student	10.0
4	Brian	student	12.0
5	Janny	student	11.0
6	Nate	teacher	NaN
7	John	student	NaN

\n", "

	name	job	age
0	False	False	False
1	False	False	False
2	False	False	False
3	False	False	False
4	False	False	False
5	False	False	False
6	False	False	True
7	False	False	True

\n", "

	name	job	age
0	False	False	False
1	False	False	False
2	False	False	False
3	False	False	False
4	False	False	False
5	False	False	False
6	False	False	True
7	False	False	True

\n", "

" ], "text/plain": [ " name job age\n", "0 False False False\n", "1 False False False\n", "2 False False False\n", "3 False False False\n", "4 False False False\n", "5 False False False\n", "6 False False True\n", "7 False False True" ] }, "execution_count": 295, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Null 또는 NaN 값 변경하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "아래는 Null을 0으로 설정하는 예제입니다." ] }, { "cell_type": "code", "execution_count": 296, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	job	age
0	John	teacher	40.0
1	Nate	teacher	35.0
2	Yuna	teacher	37.0
3	Abraham	student	10.0
4	Brian	student	12.0
5	Janny	student	11.0
6	Nate	teacher	0.0
7	John	student	0.0

\n", "

	name	job	age
0	John	teacher	40.0
1	Nate	teacher	35.0
2	Yuna	teacher	37.0
3	Abraham	student	10.0
4	Brian	student	12.0
5	Janny	student	11.0
6	Nate	teacher	0.0
7	John	student	0.0

\n", "

" ], "text/plain": [ " name job age\n", "0 John teacher 40.0\n", "1 Nate teacher 35.0\n", "2 Yuna teacher 37.0\n", "3 Abraham student 10.0\n", "4 Brian student 12.0\n", "5 Janny student 11.0\n", "6 Nate teacher 0.0\n", "7 John student 0.0" ] }, "execution_count": 298, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Unique\n", "컬럼에 여러 값이 있을 때, 중복 없이 어떤 값들이 있는 지 확인하는 방법입니다." ] }, { "cell_type": "code", "execution_count": 299, "metadata": {}, "outputs": [], "source": [ "job_list = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"teacher\"},\n", " {'name': 'Fred', 'job': \"teacher\"},\n", " {'name': 'Abraham', 'job': \"student\"},\n", " {'name': 'Brian', 'job': \"student\"},\n", " {'name': 'Janny', 'job': \"developer\"},\n", " {'name': 'Nate', 'job': \"teacher\"},\n", " {'name': 'Obrian', 'job': \"dentist\"},\n", " {'name': 'Yuna', 'job': \"teacher\"},\n", " {'name': 'Rob', 'job': \"lawyer\"},\n", " {'name': 'Brian', 'job': \"student\"},\n", " {'name': 'Matt', 'job': \"student\"},\n", " {'name': 'Wendy', 'job': \"banker\"},\n", " {'name': 'Edward', 'job': \"teacher\"},\n", " {'name': 'Ian', 'job': \"teacher\"},\n", " {'name': 'Chris', 'job': \"banker\"},\n", " {'name': 'Philip', 'job': \"lawyer\"},\n", " {'name': 'Janny', 'job': \"basketball player\"},\n", " {'name': 'Gwen', 'job': \"teacher\"},\n", " {'name': 'Jessy', 'job': \"student\"}\n", " ]\n", "df = pd.DataFrame(job_list, columns = ['name', 'job'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "컬럼(시리즈)의 unique() 함수를 사용하여, 중복 없이, 컬럼에 있는 모든 값들을 출력할 수 있습니다." 
] }, { "cell_type": "code", "execution_count": 300, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['teacher' 'student' 'developer' 'dentist' 'lawyer' 'banker'\n", " 'basketball player']\n" ] } ], "source": [ "print( df.job.unique() )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "각 유니크한 값별로 몇 개의 데이터가 속하는지 value_counts() 함수로 확인할 수 있습니다." ] }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "teacher 8\n", "student 5\n", "banker 2\n", "lawyer 2\n", "basketball player 1\n", "dentist 1\n", "developer 1\n", "Name: job, dtype: int64" ] }, "execution_count": 301, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.job.value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 두개의 데이터프레임 합치기" ] }, { "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [], "source": [ "l1 = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"student\"},\n", " {'name': 'Fred', 'job': \"developer\"}]\n", "\n", "l2 = [{'name': 'Ed', 'job': \"dentist\"},\n", " {'name': 'Jack', 'job': \"farmer\"},\n", " {'name': 'Ted', 'job': \"designer\"}]\n", " \n", "df1 = pd.DataFrame(l1, columns = ['name', 'job'])\n", "df2 = pd.DataFrame(l2, columns = ['name', 'job'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pd.concat\n", "두번째 데이터프레임을 첫번째 데이터프레임의 새로운 로우(행)로 합칩니다." ] }, { "cell_type": "code", "execution_count": 303, "metadata": {}, "outputs": [], "source": [ "frames = [df1, df2]\n", "result = pd.concat(frames, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 304, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	job
0	John	teacher
1	Nate	student
2	Fred	developer
3	Ed	dentist
4	Jack	farmer
5	Ted	designer

\n", "

" ], "text/plain": [ " name job\n", "0 John teacher\n", "1 Nate student\n", "2 Fred developer\n", "3 Ed dentist\n", "4 Jack farmer\n", "5 Ted designer" ] }, "execution_count": 304, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## df.append\n", "두번째 데이터프레임을 첫번째 데이터프레임의 새로운 로우(행)로 합칩니다." ] }, { "cell_type": "code", "execution_count": 305, "metadata": {}, "outputs": [], "source": [ "l1 = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"student\"},\n", " {'name': 'Fred', 'job': \"developer\"}]\n", "\n", "l2 = [{'name': 'Ed', 'job': \"dentist\"},\n", " {'name': 'Jack', 'job': \"farmer\"},\n", " {'name': 'Ted', 'job': \"designer\"}]\n", " \n", "df1 = pd.DataFrame(l1, columns = ['name', 'job'])\n", "df2 = pd.DataFrame(l2, columns = ['name', 'job'])\n", "result = df1.append(df2, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 306, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	name	job
0	John	teacher
1	Nate	student
2	Fred	developer
3	Ed	dentist
4	Jack	farmer
5	Ted	designer

\n", "

" ], "text/plain": [ " name job\n", "0 John teacher\n", "1 Nate student\n", "2 Fred developer\n", "3 Ed dentist\n", "4 Jack farmer\n", "5 Ted designer" ] }, "execution_count": 306, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pd.concat\n", "두번째 데이터프레임을 첫번째 데이터프레임의 새로운 컬럼(열)으로 합칩니다." ] }, { "cell_type": "code", "execution_count": 307, "metadata": {}, "outputs": [], "source": [ "l1 = [{'name': 'John', 'job': \"teacher\"},\n", " {'name': 'Nate', 'job': \"student\"},\n", " {'name': 'Jack', 'job': \"developer\"}]\n", "\n", "l2 = [{'age': 25, 'country': \"U.S\"},\n", " {'age': 30, 'country': \"U.K\"},\n", " {'age': 45, 'country': \"Korea\"}]\n", " \n", "df1 = pd.DataFrame(l1, columns = ['name', 'job'])\n", "df2 = pd.DataFrame(l2, columns = ['age', 'country'])\n", "result = pd.concat([df1, df2], axis=1, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 308, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	0	1	2	3
0	John	teacher	25	U.S
1	Nate	student	30	U.K
2	Jack	developer	45	Korea

\n", "

" ], "text/plain": [ " 0 1 2 3\n", "0 John teacher 25 U.S\n", "1 Nate student 30 U.K\n", "2 Jack developer 45 Korea" ] }, "execution_count": 308, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 두개의 리스트를 묶어서 데이터프레임으로 생성하기" ] }, { "cell_type": "code", "execution_count": 309, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

	label	prediction
0	1	1
1	2	2
2	3	2
3	4	5
4	5	5

\n", "

" ], "text/plain": [ " label prediction\n", "0 1 1\n", "1 2 2\n", "2 3 2\n", "3 4 5\n", "4 5 5" ] }, "execution_count": 309, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label = [1,2,3,4,5]\n", "prediction = [1,2,2,5,5]\n", "\n", "comparison = pd.DataFrame(\n", " {'label': label,\n", " 'prediction': prediction\n", " })\n", "\n", "comparison" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }