머신러닝 05 - sklearn 알아 보기 (회귀

머신러닝 05 - sklearn 알아 보기 (회귀 - 2)

H-V 2022. 2. 14. 17:23

패스트캠퍼스 '직장인을 위한 파이썬 데이터분석 올인원 패키지 Online' 참조

* 모델 성능 확인을 위한 함수

from sklearn.metrics import mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

# Registry of evaluated models: model name -> MSE, filled in by mse_eval().
my_predictions = dict()

# Matplotlib colour names from which mse_eval() picks bar colours at random.
colors = [
    'r', 'c', 'm', 'y', 'k', 'khaki',
    'teal', 'orchid', 'sandybrown', 'greenyellow', 'dodgerblue', 'deepskyblue',
    'rosybrown', 'firebrick', 'deeppink', 'crimson', 'salmon', 'darkred',
    'olivedrab', 'olive', 'forestgreen', 'royalblue', 'indigo', 'navy',
    'mediumpurple', 'chocolate', 'gold', 'darkorange', 'seagreen', 'turquoise',
    'steelblue', 'slategray', 'peru', 'midnightblue', 'slateblue', 'dimgray',
    'cadetblue', 'tomato',
]

def plot_predictions(name_, pred, actual):
    """Scatter-plot predicted and actual targets, ordered by actual value.

    Args:
        name_: Title shown on the figure (typically the model name).
        pred: Predicted target values, one per sample.
        actual: Ground-truth target values aligned with ``pred``.
    """
    # Use the ``actual`` argument rather than the notebook-level ``y_test``
    # so the helper plots whatever data the caller passed in.
    df = pd.DataFrame({'prediction': pred, 'actual': actual})
    # Sorting by the actual value makes the gap between the two series
    # easy to read from left to right.
    df = df.sort_values(by='actual').reset_index(drop=True)

    plt.figure(figsize=(12, 9))
    plt.scatter(df.index, df['prediction'], marker='x', color='r')
    plt.scatter(df.index, df['actual'], alpha=0.7, marker='o', color='black')
    plt.title(name_, fontsize=15)
    plt.legend(['prediction', 'actual'], fontsize=12)
    plt.show()

def mse_eval(name_, pred, actual):
    """Plot one model's predictions, record its MSE and redraw the comparison chart.

    The score is stored in the module-level ``my_predictions`` dict under
    ``name_`` (an existing entry with the same name is overwritten). All
    scores recorded so far are then printed as a table and drawn as a
    horizontal bar chart.

    Args:
        name_: Label identifying the model.
        pred: Predicted target values.
        actual: Ground-truth target values aligned with ``pred``.
    """
    # No ``global`` statement is needed: ``my_predictions`` is mutated in
    # place and ``colors`` is only read.
    plot_predictions(name_, pred, actual)

    # scikit-learn's signature is (y_true, y_pred).
    mse = mean_squared_error(actual, pred)
    my_predictions[name_] = mse

    # Rows are ordered from the largest error to the smallest.
    y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)

    df = pd.DataFrame(y_value, columns=['model', 'mse'])
    print(df)
    # Pad the x-axis by 10 on both sides of the observed score range.
    min_ = df['mse'].min() - 10
    max_ = df['mse'].max() + 10

    length = len(df)
    positions = np.arange(length)

    # Figure height grows with the number of models.
    plt.figure(figsize=(10, length))
    ax = plt.subplot()
    ax.set_yticks(positions)
    ax.set_yticklabels(df['model'], fontsize=15)
    bars = ax.barh(positions, df['mse'])

    for i, v in enumerate(df['mse']):
        # Each bar gets a randomly chosen colour from the shared palette.
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        # Write the rounded score just to the right of the bar end.
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=15, fontweight='bold')

    plt.title('MSE Error', fontsize=18)
    plt.xlim(min_, max_)

    plt.show()

def remove_model(name_):
    """Drop a model's recorded score from ``my_predictions``.

    Args:
        name_: Label under which the score was stored.

    Returns:
        True if an entry was removed, False if no entry had that name.
    """
    if name_ in my_predictions:
        my_predictions.pop(name_)
        return True
    return False

01 선형회귀 (Linear Regression)

from sklearn.linear_model import LinearRegression

# Plain linear regression; per the scikit-learn docs n_jobs=-1 means "use all processors".
model = LinearRegression(n_jobs=-1)

# Fit on the training data.
model.fit(x_train, y_train)
# NOTE(review): the next line looks like the notebook's echo of the fitted
# estimator pasted in as code; run as a script it only builds and discards an object.
LinearRegression(n_jobs=-1)


# Predict on x_test, then plot the predictions and record the MSE.
pred = model.predict(x_test)

mse_eval('LinearRegression', pred, y_test)

검은색 점 = 실제 데이터
빨간색 x = 예측 데이터
검은색점과 x점 사이의 거리를 '오차'라고 부름
오차에 제곱을하고 평균을 낸게 MSE

02 규제 (Regularization)

학습이 과대적합 되는 것을 방지하고자 일종의 'penalty'를 부여하는 것
L1,L2 규제가 있음

L2 규제 (L2 Regularization)
각 가중치 제곱의 합에 규제 강도(Regularization Strength) - 'penalty' 를 곱한다
규제강도를 크게하면 가중치가 더 많이 감소되고(규제를 중요시하게됨), 규제강도를 작게하면 가중치가 증가한다(규제를 중요시 하지 않게 됨)
L1 규제 (L1 Regularization)
가중치의 제곱의 합이 아닌 가중치의 절댓값의 합에 규제강도를 곱하여 오차에 더함
어떤 가중치(w)는 실제로 0이 되기도 한다. 즉, 모델에서 완전히 제외되는 특성이 생김

L2규제가 L1규제에 비해 더 안정적이라 L2를 많이 씀

릿지(Ridge) - L2규제를 활용한 선형회귀모델

# Ridge - L2규제를 활용한 선형회귀모델
from sklearn.linear_model import Ridge

# Regularization strengths to try; a larger alpha means a stronger penalty.
alphas = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]

# Fit one Ridge model per strength and record/plot its MSE on the test data.
for alpha in alphas:
    ridge = Ridge(alpha=alpha).fit(x_train, y_train)
    pred = ridge.predict(x_test)
    mse_eval('Ridge(alpha={})'.format(alpha), pred, y_test)

▶ 가중치를 눈으로 확인도 가능 하다

# 학습 완료 후 각각의 가중치를 볼 수 있다. 컬럼과 매치되어 적용
ridge.coef_ #13개
array([ -0.10552463,   0.02605621,   0.06979038,   1.828221  ,
       -12.33153449,   5.5654287 ,  -0.03086006,  -1.1530249 ,
         0.20078576,  -0.01342223,  -0.85449981,   0.00850256,
        -0.32504348])
        

# 학습데이터 컬럼과 매치되어 가중치가 계산됨
x_train.columns # 13개
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')


def plot_coef(columns, coef):
    """Draw a horizontal bar chart of model coefficients.

    Args:
        columns: Feature names, in the same order as ``coef``.
        coef: Coefficient values, one per feature.
    """
    # Pair each feature name with its coefficient and sort by coefficient,
    # largest first.
    pairs = list(zip(columns, coef))
    table = pd.DataFrame(pairs)
    table.columns = ['feature', 'coef']
    table = table.sort_values('coef', ascending=False)
    table = table.reset_index(drop=True)

    positions = np.arange(len(table))

    fig, ax = plt.subplots(figsize=(9, 7))
    ax.barh(positions, table['coef'])
    # Label each bar with its feature name.
    ax.set_yticks(positions)
    ax.set_yticklabels(table['feature'])
    fig.tight_layout()
    plt.show()
    
plot_coef(x_train.columns, ridge.coef_)

계수가 0에 가깝다는 것은 가중치가 거의 없다는 말, 즉 계수가 0에 가까운 특성(feature)들은 예측에 아주 미세한 영향만 끼친다는 말

하이퍼파라미터 값으로 가중치를 어떻게 조절하느냐에 따라 성능이 달라진다. 학습 데이터가 엄청나게 방대하면 규제가 사실 필요 없다. 그만큼 데이터가 이미 많이 풀려서 학습이 되어 있다는 것. 하지만 대회, 소규모 기업 등에서 데이터가 부족할 때는 실제 사례에 맞게 일반화할 필요가 있고, 이때 규제를 많이 줘야 일반화에 가까워진다

라쏘 - L1규제를 활용한 선형회귀모델

# Lasso - L1규제를 활용한 선형회귀모델
from sklearn.linear_model import Lasso

# Regularization strengths to try; a larger value means a stronger penalty.
alphas = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]

# Fit one Lasso model per strength so the results can be compared.
for alpha in alphas:
    lasso = Lasso(alpha=alpha).fit(x_train, y_train)
    pred = lasso.predict(x_test)
    mse_eval('Lasso(alpha={})'.format(alpha), pred, y_test)

L2와 L1을 비교해서 성능을 볼 수 있다. L2는 100일때 L1은 0.001일때 성능이 제일 좋다

▶ 위에서 언급했던 라쏘의 단점을 보자

# Refit Lasso with the strongest penalty from the sweep (alpha=100) so its
# coefficients can be inspected below.
lasso_100 = Lasso(alpha=100).fit(x_train, y_train)
lasso_pred_100 = lasso_100.predict(x_test)

lasso_100.coef_
array([-0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.02428582,
       -0.        ,  0.00041901, -0.        ])
       
plot_coef(x_train.columns, lasso_100.coef_)

규제 강도(alpha)를 100으로 주면 규제가 지나치게 강해져 대부분의 특성(feature) 가중치가 0이 되고, 해당 특성들이 예측에 전혀 영향을 주지 못하게 되는 단점이 있다.

ElasticNet - 릿지와 라쏘를 섞은 하이브리드형 모델

l1_ratio (default=0.5)

l1_ratio = 0 (L2 규제만 사용).
l1_ratio = 1 (L1 규제만 사용).
0 < l1_ratio < 1 (L1 and L2 규제의 혼합사용)

03 Scaler 다시 보기

Scaler - 특성(feature)들의 값 범위(스케일)를 맞춰주는 전처리 도구. 예를 들어 StandardScaler는 평균(Mean)을 0, 표준편차(Std)를 1로 만들어준다

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
x_train.describe()

# StandardScaler rescales each feature to mean 0 and standard deviation 1.
std_scaler = StandardScaler()

# fit() learns the statistics and transform() applies them;
# fit_transform() does both in a single call.
std_scaled = std_scaler.fit_transform(x_train)

pd.DataFrame(std_scaled).describe().round(2)

MinMaxScaler - min값과 max값을 0~1사이로 정규화

# MinMaxScaler rescales each feature so its minimum maps to 0 and its maximum to 1.
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(x_train)

pd.DataFrame(minmax_scaled).describe().round(2)

RobustScaler - 중앙값(median)이 0, IQR(interquartile range)이 1이 되도록 변환
RobustScaler는 중앙값과 IQR을 사용하므로 아웃라이어의 영향을 덜 받는다 (아웃라이어를 제거하지는 않음)

# RobustScaler centres each feature on its median and scales it by the
# interquartile range (IQR), which makes it less sensitive to outliers.
robust_scaler = RobustScaler()
robust_scaled = robust_scaler.fit_transform(x_train)

pd.DataFrame(robust_scaled).median().round(2)
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
dtype: float64

04 파이프라인(Pipeline)

스케일러로 값들을 변환하고 그 값으로 다시 .fit()을 해줘야 하는데, 이러한 번거로움을 없애기 위한 라이브러리

# ElasticNet is used below but was never imported anywhere in this post.
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline

# Chain the scaler and the regressor so one fit()/predict() call runs both steps.
elasticnet_pipeline = make_pipeline(
    StandardScaler(),  # any other scaler can be put here instead
    ElasticNet(alpha=0.1, l1_ratio=0.2)
)

elasticnet_pred = elasticnet_pipeline.fit(x_train, y_train).predict(x_test)

mse_eval('Standard ElasticNet', elasticnet_pred, y_test)

05 다항식 모델 (Polynomial Features)

특성(feature)들 간의 곱(상호작용)과 거듭제곱으로 새로운 'Feature'를 생성하는 전처리기
예로 [a,b] 2개의 feature가 존재한다고 가정하고, degree=2로 설정하면 [1,a,b,a^2,ab,b^2]를 만듦 (include_bias=False로 설정하면 상수항 1은 제외됨)

from sklearn.preprocessing import PolynomialFeatures

# Expand the features to degree-2 polynomial terms (without the constant
# column), scale them, then fit ElasticNet on the expanded features.
poly_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),  # any other scaler can be put here instead
    ElasticNet(alpha=0.1, l1_ratio=0.2),
)

# fit() returns the pipeline itself, so fitting and predicting as two
# statements matches the chained call.
poly_pipeline.fit(x_train, y_train)
poly_pred = poly_pipeline.predict(x_test)

mse_eval('Poly ElasticNet', poly_pred, y_test)

저작자표시 (새창열림)