import pandas as pd
# Load csv from colab
input_path = 'sample_data/california_housing_train.csv'
input_df = pd.read_csv(input_path)
# Commonly used pd.read_csv options:
#   header=0                   -> use the first row as the header
#   header=None                -> the file has no header row
#   names=('c1', 'c2')         -> set the column names explicitly
#   usecols=[1, 2]             -> load only these column positions
#   usecols=['col1', 'col2']   -> load only these column names
#   index_col=0                -> use this column as the index
#   dtype=str                  -> read every column as str
#   encoding='shift_jis'       -> decode the file with this encoding

# Load json from colab
import json
input_path = 'sample_data/anscombe.json'
# NOTE: json.load returns plain Python objects, not a DataFrame, even though
# the result is bound to input_df.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open(input_path, encoding='utf-8') as json_file:
    input_df = json.load(json_file)

# Load csv from github
input_path = 'https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/sklearn/datasets/data/breast_cancer.csv'
input_df = pd.read_csv(input_path)
# Numpy to dataframe
import numpy as np
import pandas as pd
my_array = np.array([1, 2, 3])
# A 1-D array becomes a single column named 'col1'.
df = pd.DataFrame(data=my_array, columns=['col1'])
df

# Create Empty Dataframe
df = pd.DataFrame()
df

# Create Dataframe with date type
# One column per kind of value: float, int, timestamp and string.
df = pd.DataFrame(
    data={
        'col1': [1.0],
        'col2': [1],
        'col3': [pd.Timestamp('20180310')],
        'col4': ['A'],
    },
)
print(df.dtypes)
df

# Create Dataframe
# A 2x2 array gives two rows and the two named columns.
df = pd.DataFrame(
    data=np.array([[1, 2], [3, 4]]),
    columns=['col1', 'col2'],
)
df
# Show data
df = pd.DataFrame(
    data=np.array([[1, 2], [3, 4]]),
    columns=['col1', 'col2'],
)
print(df)
# df.head()
# df.tail()

# Show dataframe's shape
df = pd.DataFrame(
    data=np.array([[1, 2], [3, 4]]),
    columns=['col1', 'col2'],
)
df.shape

# Show data type
# Built from a string array, so both columns hold text.
df = pd.DataFrame(
    data=np.array([['A', 1], ['B', 2]]),
    columns=['col1', 'col2'],
)
df.dtypes

# Built from a dict, so each column keeps its own type.
df = pd.DataFrame(
    data={
        'col1': [1.0],
        'col2': [1],
        'col3': [pd.Timestamp('20180310')],
        'col4': ['A'],
    },
)
df.dtypes
# Change data type
# The source array stores every value as text; convert all three columns in
# one call by mapping each column name to its target type.
df = pd.DataFrame(
    data=np.array([['A', 1, 1.1]]),
    columns=['col1', 'col2', 'col3'],
)
df = df.astype({'col1': str, 'col2': int, 'col3': float})
df.dtypes
# Select Column
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'c'],
        'col3': [1.1, '1.0', '1.3'],
    },
)
# Keep every row, but only the listed columns.
filtered_df = df.loc[:, ['col1', 'col3']]
filtered_df.head()
# Create unique list
df = pd.DataFrame({
    'col1': [1, 2, 2, 3, 3, 3],
})
# Series.unique() keeps values in order of first appearance; going through
# set() gives no ordering guarantee.
item_list = df['col1'].unique().tolist()
print('item_count: ', len(item_list))
print(item_list)
# Drop column
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'c'],
        'col3': [1.1, 1.2, 1.3],
    },
)
df = df.drop(columns=['col2'])
df

# Change column name
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'c'],
        'col3': [1.1, 1.2, 1.3],
    },
)
# Columns not listed in the mapping keep their names.
df.rename(columns={"col1": "c1", "col2": "c2"}, inplace=True)
df

# Column rename : Replace space with "_"
df = pd.DataFrame(
    data={
        'col 1': [1, 2, 3],
        'col 2': ['a', 'b', 'c'],
        'col 3': [1.1, 1.2, 1.3],
    },
)
print(df)
# The callable is applied to every column label.
df = df.rename(columns=lambda label: label.replace(' ', '_'))
print(df)
# Set column to index
df = pd.DataFrame(
    data={
        'col1': ['A', 'B', 'C'],
        'col2': [10, 20, 30],
    },
)
df.set_index('col2', inplace=True)
df

# Set index to column
df = pd.DataFrame(
    data={
        'col1': ['A', 'B', 'C'],
        'col2': [10, 20, 30],
    },
)
# Copy the current index values into a new regular column.
df['col3'] = df.index
df

# Reset index
df = pd.DataFrame(
    data={
        'col1': ['A', 'B', 'C'],
        'col2': [10, 20, 30],
    },
)
# drop=True discards the old index instead of keeping it as a column.
df = df.reset_index(drop=True)
df

# Sort values
df = pd.DataFrame(
    data={
        'col1': ['B', 'A', 'C'],
        'col2': [10, 30, 40],
    },
)
df.sort_values(by=['col1'], inplace=True)
# df = df.sort_values(by=['col1', 'col2']) # multiple column sort
# df = df.sort_values(by=['col1'], ascending=False) # sort Descending
df
# Summary statistics of a column
df = pd.DataFrame({'col1': [1, 2, 2, 3, 3, 3]})
# Keys are (row label, column label) pairs, so unstack() below turns the
# flat mapping into a one-row table with one column per statistic.
result = {
    ('row', 'Max'): df['col1'].max(),
    ('row', 'Min'): df['col1'].min(),
    ('row', 'Median'): df['col1'].median(),
    ('row', 'Mean'): df['col1'].mean(),
    ('row', 'Stdev'): df['col1'].std(),
    ('row', '25th_percentile'): np.percentile(df['col1'], 25),
    ('row', '50th_percentile'): np.percentile(df['col1'], 50),
    ('row', '75th_percentile'): np.percentile(df['col1'], 75),
}
result_df = pd.Series(result).unstack()
result_df
# Calculate time
from datetime import datetime
start_time = datetime.now()
# Time elapsed since start_time, stored under a (row label, column label) key.
result = {('row', 'col'): datetime.now() - start_time}
# Show result
result_df = pd.Series(result).unstack()
result_df
# Count Null
# np.nan is the supported spelling; the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': ['a', np.nan, 'c'],
    'col3': [1.1, np.nan, np.nan],
})
# Number of missing values in each column.
df.isnull().sum()

# Drop Null
df = pd.DataFrame({
    'col1': [1, 2, 3],
    'col2': ['a', np.nan, 'c'],
    'col3': [1.1, np.nan, np.nan],
})
# Keep only the rows that have no missing value in any column.
dropna_df = df.dropna()
dropna_df
# Where flag
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'c'],
        'col3': [1.1, 1.2, 1.3],
    },
)
# Each flag column is True where its condition holds and False elsewhere:
# a single condition, two combined with AND (&), two combined with OR (|),
# and a string comparison.
df = df.assign(
    col4=np.where(df['col1'] >= 2, True, False),
    col5=np.where((df['col1'] >= 2) & (df['col3'] <= 1.2), True, False),
    col6=np.where((df['col1'] >= 2) | (df['col3'] <= 1.2), True, False),
    col7=np.where(df['col2'] == "c", True, False),
)
df
# Isin Flag
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'c'],
        'col3': [1.1, 1.2, 1.3],
    },
)
my_list = ['a', 'b']
# True for the rows whose col2 value appears in my_list.
df = df.assign(col4=df['col2'].isin(my_list))
df
# Download file from wikimedia
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded, so the attribute access below could raise
# AttributeError.
import urllib.request
img_src = "https://upload.wikimedia.org/wikipedia/commons/thumb/2/21/64_365_Color_Macro_%285498808099%29.jpg/320px-64_365_Color_Macro_%285498808099%29.jpg"
img_path = 'input_image.jpg'
# Saves the remote image to img_path in the current working directory.
urllib.request.urlretrieve(img_src, img_path)
# Show image
from IPython.display import Image, display_jpeg
img_path = 'input_image.jpg'
display_jpeg(Image(img_path))