Pandas

In [2]:
import pandas as pd

Load Data

In [ ]:
# Load csv from colab
input_path = 'sample_data/california_housing_train.csv'
input_df = pd.read_csv(input_path)

# Set header           (header=0)
# No header            (header=None)
# Set header name      (names=('c1', 'c2'))
# Select column index  (usecols=[1, 2])
# Select column name   (usecols=['col1', 'col2'])
# Set column index     (index_col=0)
# Set str datatype     (dtype=str)
# Set encode           (encoding='shift_jis')
In [ ]:
# Load json from colab
import json

input_path = 'sample_data/anscombe.json'
input_df = json.load(open(input_path))
In [ ]:
# Load csv from github

input_path = 'https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/sklearn/datasets/data/breast_cancer.csv'
input_df = pd.read_csv(input_path)

Convert to Dataframe

In [ ]:
# Numpy to dataframe
import numpy as np
import pandas as pd

my_array = np.array([1, 2, 3])
df = pd.DataFrame(my_array,  columns=['col1']) 
df  
Out[ ]:
col1
0 1
1 2
2 3

Crate Dataframe

In [ ]:
# Create Empty Dataframe
df = pd.DataFrame()
df
Out[ ]:
In [ ]:
# Create Dataframe with date type

df = pd.DataFrame({'col1': [1.0],
                   'col2': [1],
                   'col3': [pd.Timestamp('20180310')],
                   'col4': ['A']})
print(df.dtypes)
df
col1           float64
col2             int64
col3    datetime64[ns]
col4            object
dtype: object
Out[ ]:
col1 col2 col3 col4
0 1.0 1 2018-03-10 A
In [ ]:
# Create Dataframe

df = pd.DataFrame(np.array([[1, 2], [3, 4]]), columns=['col1', 'col2'])
df
Out[ ]:
col1 col2
0 1 2
1 3 4

Exploration

In [ ]:
# Show data

df = pd.DataFrame(np.array([[1, 2], [3, 4]]), columns=['col1', 'col2'])
print(df)
# df.head()
# df.tail()
   col1  col2
0     1     2
1     3     4
In [ ]:
# Show dataframe's shape

df = pd.DataFrame(np.array([[1, 2], [3, 4]]), columns=['col1', 'col2'])
df.shape
Out[ ]:
(2, 2)

Data Type

In [ ]:
# Show data type

df = pd.DataFrame(np.array([['A', 1], ['B', 2]]), columns=['col1', 'col2'])
df.dtypes
Out[ ]:
col1    object
col2    object
dtype: object
In [ ]:
df = pd.DataFrame({'col1': [1.0],
                   'col2': [1],
                   'col3': [pd.Timestamp('20180310')],
                   'col4': ['A']})
df.dtypes
Out[ ]:
col1           float64
col2             int64
col3    datetime64[ns]
col4            object
dtype: object
In [ ]:
# Change data type

df = pd.DataFrame(np.array([['A', 1, 1.1]]), columns=['col1', 'col2', 'col3'])
df['col1'] = df['col1'].astype(str)
df['col2'] = df['col2'].astype(int)
df['col3'] = df['col3'].astype(float)

df.dtypes
Out[ ]:
col1     object
col2      int64
col3    float64
dtype: object

Data Seletion

Column Edition

In [ ]:
# Select Column

df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', 'b', 'c'], 
    'col3': [1.1, '1.0', '1.3'] }) 

filtered_df = df[['col1', 'col3']]
filtered_df.head()
Out[ ]:
col1 col3
0 1 1.1
1 2 1.0
2 3 1.3
In [ ]:
# Create unique list

df = pd.DataFrame({ 
    'col1': [1, 2, 2, 3, 3, 3]}) 
item_list = list(set(df['col1']))

print('item_count: ',len(item_list))
print(item_list)
item_count:  3
[1, 2, 3]
In [ ]:
# Drop column

df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', 'b', 'c'], 
    'col3': [1.1, 1.2, 1.3] }) 
df.drop(columns=['col2'], axis=1, inplace=True)
df
Out[ ]:
col1 col3
0 1 1.1
1 2 1.2
2 3 1.3
In [ ]:
# Change column name

df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', 'b', 'c'], 
    'col3': [1.1, 1.2, 1.3] }) 
df = df.rename(columns={"col1": "c1", "col2": "c2"})
df
Out[ ]:
c1 c2 col3
0 1 a 1.1
1 2 b 1.2
2 3 c 1.3
In [3]:
# Column rename : Replace space with "_"
df = pd.DataFrame({ 
    'col 1': [1, 2, 3,], 
    'col 2': ['a', 'b', 'c'], 
    'col 3': [1.1, 1.2, 1.3] }) 
print(df)

df.columns = [label.replace(' ', '_') for label in df.columns]
print(df)
   col 1 col 2  col 3
0      1     a    1.1
1      2     b    1.2
2      3     c    1.3
   col_1 col_2  col_3
0      1     a    1.1
1      2     b    1.2
2      3     c    1.3
In [ ]:
# Set column to index

df = pd.DataFrame({ 
    'col1': ['A', 'B', 'C'],
    'col2': [10, 20, 30]}) 

df = df.set_index('col2')
df
Out[ ]:
col1
col2
10 A
20 B
30 C
In [ ]:
# Set index to column

df = pd.DataFrame({ 
    'col1': ['A', 'B', 'C'],
    'col2': [10, 20, 30]}) 

df['col3'] = df.index
df
Out[ ]:
col1 col2 col3
0 A 10 0
1 B 20 1
2 C 30 2
In [ ]:
# Reset index

df = pd.DataFrame({ 
    'col1': ['A', 'B', 'C'],
    'col2': [10, 20, 30]}) 
df.reset_index(drop=True, inplace=True)
df
Out[ ]:
col1
0 A
1 B
2 C
In [ ]:
# Reset index

df = pd.DataFrame({ 
    'col1': ['B', 'A', 'C'],
    'col2': [10, 30, 40]}) 
df = df.sort_values(by=['col1'])
# df = df.sort_values(by=['col1', 'col2'])        # multiple column sort
# df = df.sort_values(by=['col1'], ascending=False) # sort Descending
df
Out[ ]:
col1 col2
2 C 40
0 B 10
1 A 30

2.3.8 Calculation

In [ ]:
# Create unique list
result = {}
df = pd.DataFrame({ 'col1': [1, 2, 2, 3, 3, 3]}) 

result[('row', 'Max')] = df['col1'].max()
result[('row', 'Min')] = df['col1'].min()
result[('row', 'Median')] = df['col1'].median()
result[('row', 'Mean')] = df['col1'].mean()  
result[('row', 'Stdev')] = df['col1'].std()  
result[('row', '25th_percentile')] = np.percentile(df['col1'],25)
result[('row', '50th_percentile')] = np.percentile(df['col1'],50)
result[('row', '75th_percentile')] = np.percentile(df['col1'],75)

result_df = pd.Series(result).unstack()
result_df
Out[ ]:
25th_percentile 50th_percentile 75th_percentile Max Mean Median Min Stdev
row 2.0 2.5 3.0 3.0 2.333333 2.5 1.0 0.816497
In [ ]:
# Calculate time

from datetime import datetime
result = {}
start_time = datetime.now()

result[('row', 'col')] = datetime.now() - start_time

# Show result
result_df = pd.Series(result).unstack()
result_df
Out[ ]:
col
row 00:00:00.000084

2.3.9 Edition Filter

In [ ]:
# Count Null

df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', np.NaN, 'c'], 
    'col3': [1.1, np.NaN, np.NaN] }) 
df.isnull().sum()
Out[ ]:
col1    0
col2    1
col3    2
dtype: int64
In [ ]:
df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', np.NaN, 'c'], 
    'col3': [1.1, np.NaN, np.NaN] }) 

dropna_df = df.dropna()
dropna_df
Out[ ]:
col1 col2 col3
0 1 a 1.1
In [ ]:
 # Where flag
 
 df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', 'b', 'c'], 
    'col3': [1.1, 1.2, 1.3] }) 

df['col4'] = np.where((df['col1'] >= 2 ), True, False)
df['col5'] = np.where((df['col1'] >= 2 ) & (df['col3'] <= 1.2), True, False)
df['col6'] = np.where((df['col1'] >= 2 ) | (df['col3'] <= 1.2), True, False)
df['col7'] = np.where(df['col2'] == "c", True, False)  

df
Out[ ]:
col1 col2 col3 col4 col5 col6 col7
0 1 a 1.1 False False True False
1 2 b 1.2 True True True False
2 3 c 1.3 True False True True
In [ ]:
# Isin Flag

df = pd.DataFrame({ 
    'col1': [1, 2, 3,], 
    'col2': ['a', 'b', 'c'], 
    'col3': [1.1, 1.2, 1.3] }) 
my_list = ['a', 'b']
df['col4'] = df['col2'].isin(my_list)
df
Out[ ]:
col1 col2 col3 col4
0 1 a 1.1 True
1 2 b 1.2 True
2 3 c 1.3 False

2.3.10 Condition

In [ ]:
 

▶ Image

In [ ]:
# Upload file from wikimedia

import urllib
img_src = "https://upload.wikimedia.org/wikipedia/commons/thumb/2/21/64_365_Color_Macro_%285498808099%29.jpg/320px-64_365_Color_Macro_%285498808099%29.jpg"
img_path = 'input_image.jpg'
urllib.request.urlretrieve(img_src, img_path)
Out[ ]:
('input_image.jpg', <http.client.HTTPMessage at 0x7f151e4a9128>)
In [ ]:
# Show image

from IPython.display import Image,display_jpeg
img_path = 'input_image.jpg'
display_jpeg(Image(img_path))
In [ ]: