# By Kritdikoon Woraitthinan, 2021-01-15
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# The bare 'seaborn' style name was renamed to 'seaborn-v0_8' in Matplotlib 3.6
# and the old name was later removed; use whichever this installation offers.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
import math
import statistics
import numpy as np
import scipy.stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset and build a single frame holding every
# feature column followed by the target column.
data = fetch_california_housing()
column_names = data.feature_names + ["target"]
df = pd.DataFrame(np.column_stack((data.data, data.target)), columns=column_names)
# Keep the 'Population' feature as an (n_samples, 1) column array.
X = df['Population'].to_numpy().reshape(-1, 1)
df.head(2)
# Print summary statistics of X, one "name = value" pair per line, rounded to
# two decimals. Skewness and kurtosis both use bias=False so the printed
# numbers agree with the statistics get_plot writes into its subplot titles.
summary = (
    ('mean', np.mean(X)),
    ('median', np.median(X)),
    ('min', np.min(X)),
    ('max', np.max(X)),
    ('SD', np.std(X)),
    ('skew', scipy.stats.skew(X, bias=False)[0]),
    ('kurtosis', scipy.stats.kurtosis(X, bias=False)[0]),
)
print(''.join(f'{label} = {round(value, 2)}\n' for label, value in summary))
# Histogram of the raw values.
plt.hist(X, bins=30, alpha=0.5);
def get_plot(data, title):
    """Plot one histogram per dataset in a single row, titled with summary stats.

    Each subplot title is the matching entry of ``title`` followed by the
    mean, median, min, max, standard deviation, skewness and kurtosis of the
    dataset, rounded to two decimals. Skewness and kurtosis are computed with
    ``bias=False``.

    Args:
        data: Sequence of array-likes, one per subplot.
        title: Sequence of heading strings, indexed in step with ``data``.
    """
    subplot_n = len(data)
    # squeeze=False keeps ``axs`` a 2-D array even for a single dataset;
    # by default plt.subplots returns a bare Axes then, and indexing it fails.
    _, axs = plt.subplots(1, subplot_n, figsize=(subplot_n * 5, 4), squeeze=False)
    for i, x in enumerate(data):
        # skew/kurtosis give one value per column for 2-D input and a scalar
        # for 1-D input; atleast_1d lets both cases be indexed the same way.
        skewness = np.atleast_1d(scipy.stats.skew(x, bias=False))[0]
        kurt = np.atleast_1d(scipy.stats.kurtosis(x, bias=False))[0]
        stats = (
            ('mean', np.mean(x)),
            ('median', np.median(x)),
            ('min', np.min(x)),
            ('max', np.max(x)),
            ('SD', np.std(x)),
            ('skew', skewness),
            ('kurtosis', kurt),
        )
        lines = [title[i]] + [f'{name} = {round(value, 2)}' for name, value in stats]
        ax = axs[0, i]
        ax.hist(x, bins=30, alpha=0.5)
        ax.set_title('\n'.join(lines) + '\n')
# Rescale X with MinMaxScaler and MaxAbsScaler, then plot each result beside
# the original values.
X_MinMaxScaler = MinMaxScaler().fit_transform(X)
X_MaxAbsScaler = MaxAbsScaler().fit_transform(X)
for scaled, label in (
    (X_MinMaxScaler, 'MinMaxScaler'),
    (X_MaxAbsScaler, 'MaxAbsScaler'),
):
    get_plot([X, scaled], ['Original', label])
# Standardization
# StandardScaler: fit on X and plot beside the original values.
standard_scaler = StandardScaler()
X_StandardScaler = standard_scaler.fit_transform(X)
get_plot([X, X_StandardScaler], ['Original', 'StandardScaler'])

# RobustScaler using the 25th-75th percentile range.
robust_scaler = RobustScaler(quantile_range=(25, 75))
X_RobustScaler = robust_scaler.fit_transform(X)
get_plot([X, X_RobustScaler], ['Original', 'RobustScaler'])

# PowerTransformer with both supported methods, compared in one figure.
boxcox_transformer = PowerTransformer(method='box-cox')
yeojohnson_transformer = PowerTransformer(method='yeo-johnson')
X_PowerTransformer_boxcox = boxcox_transformer.fit_transform(X)
X_PowerTransformer_yeojohnson = yeojohnson_transformer.fit_transform(X)
power_datasets = [X, X_PowerTransformer_boxcox, X_PowerTransformer_yeojohnson]
power_titles = ['Original', 'PowerTransformer_boxcox', 'PowerTransformer_yeojohnson']
get_plot(power_datasets, power_titles)
# QuantileTransformer applies a non-linear transformation.
# - Uniform
# - Normal
# QuantileTransformer draws random numbers when it subsamples the data to
# estimate quantiles; a fixed random_state makes the transformed values and
# the resulting plots reproducible from run to run.
X_QuantileTransformer_uniform = QuantileTransformer(
    output_distribution='uniform', random_state=0
).fit_transform(X)
X_QuantileTransformer_normal = QuantileTransformer(
    output_distribution='normal', random_state=0
).fit_transform(X)
get_plot(
    [X, X_QuantileTransformer_uniform, X_QuantileTransformer_normal],
    ['Original', 'QuantileTransformer_uniform', 'QuantileTransformer_normal'],
)
# First row of the frame as a column vector (one value per frame column).
row1 = df.iloc[0].values.reshape(-1, 1)
# Normalizer rescales each *row* (sample) to unit norm. Passing the column
# vector directly would treat every value as its own sample and map each one
# to +/-1, so the row is normalised as a single sample and then turned back
# into a column for plotting.
X_Normalizer = Normalizer().fit_transform(row1.T).T
get_plot([row1, X_Normalizer], ['Original', 'Normalizer'])
# Element-wise natural log and square root of X, each plotted beside the
# original values.
X_log = np.log(X)
X_sqrt = np.sqrt(X)
for transformed, label in ((X_log, 'Log'), (X_sqrt, 'Square root')):
    get_plot([X, transformed], ['Original', label])