Machine Learning, Week 4
1. Logistic regression by gradient descent
Sample dataset
import numpy as np
import matplotlib.pyplot as plt
# x_train is the vector of input variables
# y_train is the vector of target values (class labels)
x_train = np.array([0.5, 2.5, 4.0, 6.0, 7.5, 9.5])
y_train = np.array([0, 0, 0, 1, 1, 1])
print('x_train =', x_train)
print('y_train =', y_train)
x_train = [0.5 2.5 4. 6. 7.5 9.5]
y_train = [0 0 0 1 1 1]
The six points are plotted in two groups of three: class 0 and class 1.
# plot the data points: first 3 points (class 0) in red, last 3 points (class 1) in blue
plt.scatter(x_train[:3], y_train[:3],
            color='red', marker='o', label='class 0')
plt.scatter(x_train[3:], y_train[3:],
            color='blue', marker='x', label='class 1')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc='upper left')
plt.show()
Classification and decision boundary
# the sigmoid function
def sigmoid(z):
    # sigmoid: maps any real-valued z into the range (0, 1)
    y = 1 / (1 + np.exp(-z))
    return y
The sigmoid takes an input value and maps it to an output between 0 and 1, i.e., a value that can be interpreted like a probability.
(Because the curve is S-shaped, the class boundary shows up clearly.)
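As a quick sanity check, a minimal sketch pushing a few sample inputs through the sigmoid defined above:
# sanity check: large negative inputs go toward 0, large positive inputs toward 1
print(sigmoid(np.array([-5.0, 0.0, 5.0])))   # roughly [0.0067, 0.5, 0.9933]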
# plot the data points
plt.scatter(x_train[:3], y_train[:3],
            color='red', marker='o', label='class 0')
plt.scatter(x_train[3:], y_train[3:],
            color='blue', marker='x', label='class 1')
# predict y_hat for x in [0, 10]
x = np.arange(0, 10, 0.1)
y_hat = sigmoid(x - 5)
# plot y_hat
plt.plot(x, y_hat)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Prob. of y = 1')
plt.legend(loc='upper left')
plt.show()
Computing y_hat with the sigmoid and plotting it gives the S-shaped curve shown in the figure.
# plot the data points on the x axis
plt.scatter(x_train[:3], [0, 0, 0],
            color='red', marker='o', label='class 0')
plt.scatter(x_train[3:], [0, 0, 0],
            color='blue', marker='x', label='class 1')
# show decision boundary
x = 5
plt.plot(x, 0, marker='^')
plt.xlabel('x')
plt.title('Decision Boundary')
plt.legend(loc='upper left')
plt.show()
The point x = 5 is marked as the value where the classes separate (chosen arbitrarily here).
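More generally, the decision boundary of a 1-D model sigmoid(w*x + b) is the point where w*x + b = 0, i.e., x = -b/w. A minimal sketch, where w = 1 and b = -5 are illustrative values matching the curve sigmoid(x - 5) used above:
# decision boundary of sigmoid(w*x + b) is where w*x + b = 0
w_demo, b_demo = 1.0, -5.0   # hypothetical parameters, matching sigmoid(x - 5)
print('decision boundary at x =', -b_demo / w_demo)   # 5.0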
Sample dataset-2D
- Task: predict y (0 or 1) from the two features x0 and x1
- Be careful! The dataset is named X_train, not x_train!
# x_train is the vector of input variables
# y_train is the vector of target values (class labels)
X_train = np.array([[0.5, 1.5], [1.0, 1.0], [1.0, 0.5],
                    [2.5, 1.5], [2.0, 2.0], [1.0, 2.5]])
y_train = np.array([0, 0, 0, 1, 1, 1])
print('X_train =', X_train)
print('y_train =', y_train)
This is an ordinary 2D dataset. The difference from the previous one is that two features, x0 and x1, jointly determine y.
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# plot the data points in 3D
ax.scatter(X_train[:3,0],X_train[:3,1], y_train[:3], marker='o', c='red')
ax.scatter(X_train[3:,0],X_train[3:,1] , y_train[3:], marker='x', c='blue')
plt.xlabel('x0')
plt.ylabel('x1')
plt.show()
The data points are visualized in 3D.
Classification by a linear model
- model: y = sigmoid(x⋅w + b) = sigmoid(w0⋅x0 + w1⋅x1 + b)
- decision boundary: x⋅w + b = w0⋅x0 + w1⋅x1 + b = 0
# predict class by a linear model
# x : array (m, n) - m examples with n features
# y : vector (m, )
# w : model parameters (n, )
# b : model parameter scalar
def predict(x, w, b):
    # predict y_hat by the model y = sigmoid(xw + b)
    y_hat = sigmoid(np.dot(x, w) + b)
    # class = 1 if y_hat >= 0.5. use np.where()
    y = np.where(y_hat >= 0.5, 1, 0)
    return y
This defines the predict function: compute y_hat and threshold it at 0.5.
Prediction - single data
# arbitrary model parameter
w = [1.0, 1.0]
b = 1.0
print('x = ', X_train[0])
print('y = ', y_train[0])
# print y_hat = sigmoid(z), z = xw + b
z = np.dot(X_train[0],w)+b
y_hat = sigmoid(z)
print('y_hat = sigmoid(xw + b) = ', y_hat)
# predict class
print('class prediction = ', predict(X_train[0],w,b))
This example practices computing the prediction and printing the classification result for a single training example.
Prediction - all data
Now the same computation is done for all training data at once.
# arbitrary model parameter
w = [1.0, 1.0]
b = 1.0
print('X = \n', X_train)
print('y = \n', y_train)
# print y_hat = sigmoid(z), z = xw + b
z = np.dot(X_train,w)+b
y_hat = sigmoid(z)
print('y_hat = sigmoid(xw + b) = \n', y_hat)
# predict class
print('class prediction = \n', predict(X_train,w,b))
Every y_hat is above the 0.5 threshold, so the model outputs class 1 for all examples.
Cost function
- Binary Cross Entropy
The closer the prediction y_hat is to the true value y, the lower the cost; the further away it is, the higher the cost. During training, the model adjusts its parameters in the direction that minimizes this cost.
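Written out (matching the code in compute_cost below), the mean binary cross entropy over m examples is:
cost = -(1/m) ⋅ Σ [ y⋅log(y_hat) + (1-y)⋅log(1-y_hat) ],  where y_hat = sigmoid(x⋅w + b)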
# Computes the cost function - BCE
def compute_cost(X, y, w, b):
    """
    Computes cost
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
    Returns:
      cost (scalar): cost
    """
    m = X.shape[0]
    cost = 0.0
    # compute y_hat = sigmoid(z), z = xw + b
    z = np.dot(X, w) + b
    y_hat = sigmoid(z)
    # compute cost = binary cross entropy. Use np.log()
    cost = -y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)
    cost = np.sum(cost) / m
    return cost
print('w =', w)
print('b =', b)
print('y_train =', y_train)
z = np.dot(X_train, w) + b
y_hat = sigmoid(z)
print('y_hat = ', y_hat)
# check the cost for the model with current w, b
cost = compute_cost(X_train,y_train,w,b)
print('--> cost =', cost)
Computing gradients
The gradients of the cost function J with respect to the weights w and the bias b are computed here; they are needed to minimize J.
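Concretely (matching compute_gradient below), with y_hat = sigmoid(X⋅w + b) and err = y_hat - y:
- dJ/dw = (1/m) ⋅ Xᵀ⋅err
- dJ/db = (1/m) ⋅ Σ err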
def compute_gradient(X, y, w, b):
    """
    Computes the gradients of the cost w.r.t. w and b
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
    Returns:
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w
      dj_db (scalar)      : The gradient of the cost w.r.t. the parameter b
    """
    m, n = X.shape
    dj_dw = np.zeros((n,))
    dj_db = 0.
    # compute y_hat
    y_hat = sigmoid(np.dot(X, w) + b)
    err = y_hat - y
    # compute gradients
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m
    return dj_dw, dj_db
print('w =', w)
print('b =', b)
print('X_train =', X_train)
print('y_train =', y_train)
# check the gradients for current w, b
dj_dw, dj_db = compute_gradient(X_train,y_train,w,b)
print('dj_dw =', dj_dw)
print('dj_db =', dj_db)
Gradient descent
Repeatedly update w and b so that the cost decreases.
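Each iteration applies the update rule with learning rate alpha:
- w := w - alpha ⋅ dJ/dw
- b := b - alpha ⋅ dJ/db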
def gradient_descent(X, y, w, b, alpha, num_iters):
    """
    Performs gradient descent to learn w and b
    Args:
      X (ndarray (m,n))  : Data, m examples with n features
      y (ndarray (m,))   : target values
      w (ndarray (n,))   : initial model parameters
      b (scalar)         : initial model parameter
      alpha (float)      : learning rate
      num_iters (scalar) : number of iterations to run gradient descent
    Returns:
      w, b, J_history : learned parameters and the cost at each iteration
    """
    # a list to store cost J at each iteration
    J_history = []
    for i in range(num_iters):
        # compute the gradient
        dj_dw, dj_db = compute_gradient(X, y, w, b)
        # update parameters
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        # save cost to J_history list
        J_history.append(compute_cost(X, y, w, b))
        # print cost every 1000 iterations
        if (i % 1000) == 0:
            print('Iteration %5d: Cost %0.2e ' % (i, J_history[-1]))
    return w, b, J_history
Logistic regression by gradient descent
- Starting from the initial parameters
- Use gradient descent (GD) to learn the parameters
# initialize parameters
w_init = np.zeros(X_train.shape[1])
b_init = 0.
# set the hyperparameters, 0.1 and 10000
alpha = 0.1
iterations = 10000
# run gradient descent
w_final, b_final, J_hist = gradient_descent(X_train,y_train,w_init,b_init,alpha,iterations)
# print the learned parameters
print(f"\n parameters: w={w_final},b={b_final}")
Plotting cost change
# plot cost during iteration 1 ~ 10000
plt.plot(J_hist[:10000])
plt.title("Cost vs. Iteration")
plt.ylabel('cost')
plt.xlabel('iteration')
plt.show()
The cost keeps decreasing as the iterations proceed, which shows that training was successful.
Plotting the learned model
An example that visualizes the logistic regression model's decision surface in 3D.
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# plot the data points
# ax.scatter(...): visualize the training data as points in 3D
ax.scatter(X_train[:3,0], X_train[:3,1], y_train[:3], marker='o', c='red')
ax.scatter(X_train[3:,0], X_train[3:,1], y_train[3:], marker='x', c='blue')
# compute y_hat for all meshgrid using learned w and b
# meshgrid + sigmoid(...): compute the predicted probability y_hat over the grid
x0 = np.arange(0, 3, 0.1)
x1 = np.arange(0, 3, 0.1)
x0, x1 = np.meshgrid(x0, x1)
y_hat = sigmoid(x0*w_final[0] + x1*w_final[1]+b_final)
# show the model by plotting y_hat
# ax.plot_surface(...): draw the sigmoid surface to show the classification boundary
ax.plot_surface(x0, x1, y_hat, cmap=cm.coolwarm, alpha=0.5)
plt.xlabel('x0')
plt.ylabel('x1')
plt.title('Prob. of y = 1')
plt.show()
Decision boundary in feature space
The decision boundary drawn in feature space.
Above this boundary the model predicts y = 1, below it y = 0.
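The boundary line plotted below follows from w0⋅x0 + w1⋅x1 + b = 0, solved for x1:
- x1 = -(w0/w1)⋅x0 - b/w1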
# plot the data points in feature space (x0, x1 plane)
plt.scatter(X_train[:3, 0], X_train[:3, 1], marker='o', c='red')
plt.scatter(X_train[3:, 0], X_train[3:, 1], marker='x', c='blue')
plt.axis([0, 3, 0, 3])
# decision boundary
x0 = np.arange(0,4)
x1 = -w_final[0]/w_final[1]*x0-b_final/w_final[1]
plt.plot(x0, x1)
plt.xlabel('x0')
plt.ylabel('x1')
plt.title('Decision Boundary')
plt.show()
Accuracy of the model
# Compute the class prediction and accuracy of the model
y_pred = predict(X_train,w_final,b_final)
accuracy = np.sum(y_train == y_pred)/len(y_train)
print("Accuracy on the training set =", accuracy)
An accuracy of 1.0 means every prediction matches the true label.
2. Logistic regression using scikit-learn
The earlier part is similar to week 3; the main difference is that the data is split into a training set and a test set.
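X and y below are assumed to be the Iris features and labels prepared as in week 3 (only petal length and petal width are used); a minimal sketch under that assumption:
from sklearn import datasets
# assumed setup (as in week 3): Iris dataset, petal length and petal width only
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]   # petal length, petal width
y = iris.target
print('Class labels:', np.unique(y))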
Splitting data into 70% training data & 30% test data
from sklearn.model_selection import train_test_split
# make training set and test set. use train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)
# check the shape of training and test data
print(X_train.shape)
print(X_test.shape)
3. Logistic Regression-Multinomial Classification
Training the model using LogisticRegression
from sklearn.linear_model import LogisticRegression
# training the model. use LogisticRegression with C=100, multi_class='ovr'
lr = LogisticRegression(C=100.0, random_state=1,multi_class='ovr')
lr.fit(X_train,y_train)
C=100.0: the inverse of the regularization strength; a larger C means weaker regularization, i.e., more model complexity is allowed.
random_state=1: fixes the randomness so the results are reproducible.
multi_class='ovr': One-vs-Rest strategy, where one binary classifier is trained per class against the rest; it is specified explicitly here.
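The next cell calls plot_decision_regions, which is not defined in this notebook; it is assumed to be the helper from the week 3 material. A minimal sketch of such a helper (the details are assumptions, not the exact original):
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, resolution=0.02):
    # assumed helper, similar to the one used in week 3
    markers = ('o', 's', '^', 'v', '<')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # classify every point on a grid covering the feature space
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    # color the predicted regions and overlay the data points
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=f'class {cl}')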
Plotting decision regions
#X_combined_std = np.vstack((X_train_std, X_test_std))
#y_combined = np.hstack((y_train, y_test))
# decision boundary of the model
plot_decision_regions(X_train,y_train,classifier=lr)
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.show()
Probability of class 0,1,2
# probability of class 0, 1, 2 for first 5 test data. use .predict_proba()
print(lr.predict_proba(X_test[:5]))
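Each row of predict_proba is a probability distribution over the three classes; a quick check that every row sums to 1:
# each row of predict_proba sums to 1
print(lr.predict_proba(X_test[:5]).sum(axis=1))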
Prediction of class labels
# predict class labels and compare the first 5 test examples
y_pred = lr.predict(X_test)
print('True test labels :', y_test[:5])
print('Predicted labels :', y_pred[:5])
Effect of regularization
# list of C values, learned weights, test accuracy
params, weights, test_acc = [], [], []
# learn the class-1 weights (lr.coef_[1]) and the test accuracy for C from 10**-5 to 10**4
for c in np.arange(-5, 5):
    # learn the weights using LogisticRegression, multi_class='ovr'
    lr = LogisticRegression(C=10.**c, random_state=1, multi_class='ovr')
    lr.fit(X_train, y_train)
    # record the C value, learned weights, test accuracy
    params.append(10.**c)
    weights.append(lr.coef_[1])
    test_acc.append(lr.score(X_test, y_test))
weights = np.array(weights)
# plotting weights for each C
plt.plot(params, weights[:, 0], label='petal length')
plt.plot(params, weights[:, 1], label='petal width', linestyle='--')
plt.ylabel('weights')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')
plt.show()
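The test accuracies collected in test_acc are not plotted above; a minimal sketch of how they could be shown against C (this plot is an addition, not part of the original cell):
# plot test accuracy for each C, using the test_acc list collected above
plt.plot(params, test_acc, marker='o')
plt.ylabel('test accuracy')
plt.xlabel('C')
plt.xscale('log')
plt.show()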