머신러닝 3주차
Sample Dataset
import numpy as np
import matplotlib.pyplot as plt
# x_train is the vector of input variables (size in 1000 square feet)
# y_train is the vector of target values (price in 1000s of dollars)
x_train = np.array([1.8, 1.3, 1.7, 1.9, 1.4])
y_train = np.array([430., 320., 390., 490., 400.])
print('x_train =', x_train)
print('y_train =', y_train)
x_train = [1.8 1.3 1.7 1.9 1.4]
y_train = [430. 320. 390. 490. 400.]
# plot the data points
plt.scatter(x_train,y_train, marker='o',c='blue')
plt.title("Housing Prices")
plt.ylabel('Price (1000 dollars)')
plt.xlabel('Size (1000 sqft)')
plt.show()
Prediction by a linear model
-Model : y=w x + b
# Computes the prediction of a linear model
def predict(x, w, b):
    """Return the prediction of the single-feature linear model y = w*x + b.

    x    : input value; a scalar, or an ndarray of inputs (the arithmetic is
           elementwise, so an array yields one prediction per entry)
    w, b : model parameters (slope and intercept)
    """
    return w * x + b
# predict with arbitrary model parameter
w = 100
b = 10
print('size = ', x_train[0])
print('price = ', y_train[0])
# predict the y value of first data(x_train[0]) using w and b
y_hat = predict(x_train[0],w,b)
print('predicted =', y_hat)
w와 b를 설정한 후에 예측값을 y_hat으로 정의하여 출력
size = 1.8
price = 430.0
predicted = 190.0
# Computes the cost function - MSE
def compute_cost(x, y, w, b):
    """Return the halved mean squared error of the model y_hat = w*x + b.

    x    : vector of training inputs (ndarray (m,))
    y    : vector of target values, one per entry of x
    w, b : model parameters

    The returned value is sum((y_hat - y)**2) / (2*m).
    """
    # number of training examples
    m = x.shape[0]
    squared_error_sum = 0
    for x_i, y_i in zip(x, y):
        # predicted y of this example by the linear model
        y_hat = w * x_i + b
        # square (predicted y - true y), and accumulate
        squared_error_sum += (y_hat - y_i) ** 2
    return squared_error_sum / (2 * m)
print('w =', w)
print('b =', b)
print('y_train =', y_train)
print('y_hat = ', predict(x_train, w, b))
# check the cost for the model with w = 100, b = 10
cost = compute_cost(x_train,y_train,w,b)
print('--> cost =', cost)
w = 100
b = 10
y_train = [430. 320. 390. 490. 400.]
y_hat = [190. 140. 180. 200. 150.]
--> cost = 28070.0
Computing gradients
# Compute the gradient for linear regression
def compute_gradient(x, y, w, b):
    """Return the gradient of the cost for the model y_hat = w*x + b.

    x    : vector of training inputs (ndarray (m,))
    y    : vector of target values, one per entry of x
    w, b : model parameters

    Returns (dj_dw, dj_db):
    dj_dw : mean of (y_hat - y) * x, the gradient of the cost w.r.t. w
    dj_db : mean of (y_hat - y), the gradient of the cost w.r.t. b
    """
    # number of training examples
    m = x.shape[0]
    dj_dw = 0
    dj_db = 0
    for x_i, y_i in zip(x, y):
        # prediction error of this example
        err = (w * x_i + b) - y_i
        # accumulate the per-example gradient contributions
        dj_dw += err * x_i
        dj_db += err
    # average over the training examples
    dj_dw = dj_dw / m
    dj_db = dj_db / m
    return dj_dw, dj_db
# check the gradients for current w, b
print('w =', w)
print('b =', b)
print('x_train =', x_train)
print('y_train =', y_train)
dj_dw, dj_db =compute_gradient(x_train,y_train,w,b)
print('dj_dw =', dj_dw)
print('dj_db =', dj_db)
w = 100
b = 10
x_train = [1.8 1.3 1.7 1.9 1.4]
y_train = [430. 320. 390. 490. 400.]
dj_dw = -384.8
dj_db = -234.0
Gradient descent
num_iters는 몇번 반복할지를 뜻하고 alpha는 얼마나 빠르게 w와 b를 업데이트할지를 결정하는 하이퍼파라미터
alpha값이 크면 큰 폭으로 이동하고 작으면 천천히 이동한다. 너무 작으면 수렴 속도가 느리고 너무 크면 진동하거나 발산해서 학습에 실패한다.
# Performs gradient descent to fit w, b
def gradient_descent(x, y, w, b, alpha, num_iters):
    """Run num_iters gradient-descent updates on w and b.

    x (ndarray (m,)) : data, m examples
    y (ndarray (m,)) : target values
    w, b             : initial model parameters
    alpha (float)    : learning rate
    num_iters (int)  : number of iterations

    Returns (w, b, J_history): the updated parameters and the list of cost
    values, one recorded after each update.
    """
    J_history = []
    for i in range(num_iters):
        # compute the gradient at the current parameters
        dj_dw, dj_db = compute_gradient(x, y, w, b)
        # update parameters; both gradients were evaluated before either update
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        # save the cost after this update
        J_history.append(compute_cost(x, y, w, b))
        # print cost every 1000 iterations
        if i % 1000 == 0:
            print('Iteration %5d: Cost %0.2e ' % (i, J_history[-1]))
    return w, b, J_history
Linear regression by gradient descent
- Starting from initial parameters
- Use GD to get the parameters
w와 b를 찾는 과정이고 이를 iteration값인 10000번 반복하며 천번마다 출력
# initialize parameters
w_init = 0
b_init = 0
# set the hyper parameters, 0.01 and 10000
alpha = 0.01
iterations = 10000
# run gradient descent
w_final, b_final, J_hist = gradient_descent(x_train,y_train,w_init,b_init,alpha,iterations)
# print the learned parameters
print('w =',w_final)
print('b =',b_final)
Iteration 0: Cost 7.69e+04
Iteration 1000: Cost 4.58e+02
Iteration 2000: Cost 4.58e+02
Iteration 3000: Cost 4.58e+02
Iteration 4000: Cost 4.58e+02
Iteration 5000: Cost 4.58e+02
Iteration 6000: Cost 4.58e+02
Iteration 7000: Cost 4.58e+02
Iteration 8000: Cost 4.58e+02
Iteration 9000: Cost 4.58e+02
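w = 155.60725919032078
b = 155.60725919032078
(주의: 위 출력은 현재 코드와 맞지 않는다. w와 b가 완전히 같은 값 155.607…이 되는 것은 b를 dj_db가 아니라 dj_dw로 갱신했을 때의 수렴값(Σxy / Σ(x² + x) = 3344 / 21.49 ≈ 155.607)이다. 위의 gradient_descent 코드(b = b - alpha*dj_db)를 그대로 실행하면 w와 b는 서로 다른 값이 나오므로(참고: 최소제곱 해는 w ≈ 206.7, b ≈ 71.1), 셀을 다시 실행해서 이 출력과 아래의 예측값 389.02를 갱신해야 한다.)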
Plotting cost change
# plot cost during iteration 1 ~ 100
plt.plot(J_hist[:100])
plt.title("Cost vs. Iteration")
plt.ylabel('cost')
plt.xlabel('iteration')
plt.show()
Plotting the learned model
# plot the data points
plt.scatter(x_train, y_train, marker='o', c='blue')
# predict the y value for x_train using learned w and b
y_hat = predict(x_train,w_final,b_final)
# show the model by plotting x_train and y_hat
plt.plot(x_train,y_hat,color='red')
plt.title('SIZE vs. PRICE')
plt.xlabel('size')
plt.ylabel('price')
plt.grid()
plt.show()
# test price prediction for 1.5 thousand sqft house
size = 1.5
price = predict(size,w_final,b_final)
print('Prediction: %.2f sqft house --> %.2f thousand dollars' % (size*1000, price))
Prediction: 1500.00 sqft house --> 389.02 thousand dollars
2. Linear Regression by gradient descent-multiple variables, vector form
Sample dataset
이번에는 x0와 x1 두 개가 입력값일 때 y값을 예측하는 과정임
일단 데이터를 알아보자
import numpy as np
import matplotlib.pyplot as plt
# X_train is the vector of input variables (size in 1000 square feet, number of rooms)
# y_train is the vector of target values (price in 1000s of dollars)
X_train = np.array([[1.8, 4], [1.3, 3], [1.7, 4], [1.9, 5], [1.4, 3]])
y_train = np.array([430., 320., 390., 490., 400.])
print(X_train)
print(y_train)
[[1.8 4. ]
[1.3 3. ]
[1.7 4. ]
[1.9 5. ]
[1.4 3. ]]
[430. 320. 390. 490. 400.]
3d로 구현하는 방법 암기
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# plot the data points in 3D
ax.scatter(X_train[:,0], X_train[:,1], y_train, marker='o', c='blue')
plt.show()
Prediction by a linear model
$y = \mathbf{w} \cdot \mathbf{x} + b = w_0 x_0 + w_1 x_1 + b$
# Compute the prediction of a linear model
def predict(x, w, b):
    """Return the prediction of the linear model y = x . w + b.

    x : array (m, n) - m examples with n features; a single example of
        shape (n,) is also accepted and gives one prediction
    w : model parameters (n, )
    b : model parameter scalar

    Returns a vector (m, ) of predictions for (m, n) input.
    """
    return np.dot(x, w) + b
1번과 달리 2번에서는 x가 2개 이므로 행렬의 곱 형태로 w와 곱해서 계산됨
(1번은 w x + b의 형태였음)
# arbitrary model parameter
w = np.array([100, 100])
b = 10
# first row from training data
print('X_train[0] =', X_train[0])
print('y_train[0] =', y_train[0])
# predict the y value of first data(X_train[0]) using w and b
y_hat = predict(X_train[0],w,b)
print('y_hat = ', y_hat)
임의로 w값과 b값을 정해두고 X_train의 첫 번째 값과 y_train의 첫 번째 값을 가져와서 y_hat을 계산해보는 과정임
w = np.array([100, 100])
b = 10
# all training data
print('X_train =', X_train)
print('y_train =', y_train)
# predict the y values of all data(X_train) using w and b
y_hat = predict(X_train,w,b)
print('y_hat = ', y_hat)
X_train = [[1.8 4. ]
[1.3 3. ]
[1.7 4. ]
[1.9 5. ]
[1.4 3. ]]
y_train = [430. 320. 390. 490. 400.]
y_hat = [590. 440. 580. 700. 450.]
앞에서 한개의 데이터만을 비교했다면 이번에는 모든 데이터를 비교한 결과이다
y_train값과 y_hat값의 차이를 보면 된다.
Cost function
# Computes the cost function - MSE
def compute_cost(X, y, w, b):
    """Return the halved mean squared error of the model y_hat = X . w + b.

    X (ndarray (m,n)): dataset, m examples with n features
    y (ndarray (m,)) : target values
    w (ndarray (n,)) : model parameters
    b (scalar)       : model parameter

    The returned value is sum((y_hat - y)**2) / (2*m).
    """
    # number of training examples
    m = X.shape[0]
    # prediction error of every example at once
    residual = np.dot(X, w) + b - y
    # np.sum adds up the whole squared-error array in one call
    return np.sum(residual ** 2) / (2 * m)
sum함수를 통해서 한번에 배열끼리 더하는게 가능함
print('w =', w)
print('b =', b)
print('y_train =', y_train)
print('y_hat = ', predict(X_train, w, b))
# check the cost for the model with w = [100, 100], b = 10
cost = compute_cost(X_train,y_train,w,b)
print('--> cost =', cost)
w = [100 100]
b = 10
y_train = [430. 320. 390. 490. 400.]
y_hat = [590. 440. 580. 700. 450.]
--> cost = 12270.0
Computing gradients
# Computes the gradient for linear regression
def compute_gradient(X, y, w, b):
    """Return the gradient of the cost for the model y_hat = X . w + b.

    X (ndarray (m,n)): Data, m examples with n features
    y (ndarray (m,)) : target values
    w (ndarray (n,)) : model parameters
    b (scalar)       : model parameter

    Returns (dj_dw, dj_db):
    dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
    dj_db (scalar): The gradient of the cost w.r.t. the parameter b.
    """
    # number of training examples and number of features
    m, n = X.shape
    # placeholders showing the result shapes (one entry per feature for w,
    # a single scalar for b); both are overwritten by the vectorized
    # expressions below
    dj_dw = np.zeros((n,))
    dj_db = 0.
    # prediction error of every example
    y_hat = np.dot(X, w) + b
    err = y_hat - y
    # X.T . err sums err * feature over the examples, per feature
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m
    return dj_dw, dj_db
dj_dw = np.zeros((n,))
w는 여러 개의 파라미터값을 가지고 있으니 dj_dw도 그만큼 여러 개의 값이 있어야 한다. 그래서 크기가 n인 1차원 배열을 0으로 초기화 한다.
dj_db = 0.
b는 bias(편향)이므로 값이 하나뿐이다. 따라서 비용 함수를 b에 대해 편미분한 dj_db도 스칼라 값 하나이며, 0으로 초기화해 둔다. (루프에서 누적해서 구하는 방식으로 구현할 수도 있기 때문에)
# check the gradients for current w, b
print('w =', w)
print('b =', b)
print('X_train =', X_train)
print('y_train =', y_train)
dj_dw, dj_db = compute_gradient(X_train,y_train,w,b)
print('dj_dw =', dj_dw)
print('dj_db =', dj_db)
w = [100 100]
b = 10
X_train = [[1.8 4. ]
[1.3 3. ]
[1.7 4. ]
[1.9 5. ]
[1.4 3. ]]
y_train = [430. 320. 390. 490. 400.]
dj_dw = [247.2 592. ]
dj_db = 146.0
Gradient descent
비용 함수가 최소가 되는 방향으로 w와 b를 조금씩 갱신하여 알맞은 값을 찾는 과정
# Performs gradient descent to fit w, b
def gradient_descent(X, y, w, b, alpha, num_iters):
    """Run num_iters gradient-descent updates on w and b.

    X (ndarray (m,n)) : data, m examples with n features
    y (ndarray (m,))  : target values
    w (ndarray (n,))  : initial model parameters
    b (scalar)        : initial model parameter
    alpha (float)     : learning rate
    num_iters (int)   : number of iterations

    Returns (w, b, J_history): the updated parameters and the list of cost
    values, one recorded after each update.
    """
    # a list to store cost J at each iteration
    J_history = []
    for i in range(num_iters):
        # compute the gradient at the current parameters
        dj_dw, dj_db = compute_gradient(X, y, w, b)
        # update parameters; both gradients were evaluated before either update
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        # save the cost after this update
        J_history.append(compute_cost(X, y, w, b))
        # print cost every 1000 iterations
        if i % 1000 == 0:
            print('Iteration %5d: Cost %0.2e ' % (i, J_history[-1]))
    return w, b, J_history
Linear regression by gradient descent
- Starting from initial parameters
- Use GD to get the parameters
# initialize parameters
w_init = np.zeros(X_train.shape[1])
b_init = 0.
# set the hyperparameters, 0.1 and 10000
alpha = 0.1
iterations = 10000
# run gradient descent
w_final, b_final, J_hist = gradient_descent(X_train,y_train,w_init,b_init,alpha,iterations)
# print the learned parameters
print('w =',w_final)
print('b =',b_final)
Iteration 0: Cost 6.27e+04
Iteration 1000: Cost 3.76e+02
Iteration 2000: Cost 3.74e+02
Iteration 3000: Cost 3.73e+02
Iteration 4000: Cost 3.73e+02
Iteration 5000: Cost 3.72e+02
Iteration 6000: Cost 3.72e+02
Iteration 7000: Cost 3.72e+02
Iteration 8000: Cost 3.72e+02
Iteration 9000: Cost 3.72e+02
w = [137.51483137 22.57217992]
b = 97.45385409256511
Plotting cost change
# plot cost during iteration 1 ~ 100
plt.plot(J_hist[:100])
plt.title("Cost vs. Iteration")
plt.ylabel('cost')
plt.xlabel('iteration')
plt.show()
Plotting the learned model
from matplotlib import cm
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# plot the data points
ax.scatter(X_train[:,0], X_train[:,1], y_train, marker='o', c='blue')
# predict the y values for all meshgrid using learned w and b
x0 = np.arange(1, 2, 0.1)
x1 = np.arange(2, 6, 0.1)
x0, x1 = np.meshgrid(x0, x1)
y_hat = w_final[0]*x0+w_final[1]*x1+b_final
# show the model by plotting Z surface
ax.plot_surface(x0, x1, y_hat, cmap=cm.coolwarm, alpha=0.5)
plt.show()
from sklearn.metrics import mean_squared_error
# compute y_hat for X_train
y_hat = predict(X_train,w_final,b_final)
# print the mean squared error(MSE) between y and y_hat
print('MSE : %.2f' % mean_squared_error(y_train, y_hat))
mean_squared_error(): 평균 제곱 오차 계산
MSE는 값이 작을수록 실제 데이터와 잘 맞다는 뜻임
3. Linear Regression using scikit learn
(이렇게 하면 쉽게 구할 수 있다!)
X_train = np.array([[1.8, 4], [1.3, 3], [1.7, 4], [1.9, 5], [1.4, 3]])
y_train = np.array([430., 320., 390., 490., 400.])
print(X_train)
print(y_train)
[[1.8 4. ]
[1.3 3. ]
[1.7 4. ]
[1.9 5. ]
[1.4 3. ]]
[430. 320. 390. 490. 400.]
from sklearn.linear_model import LinearRegression
# training the model - use .fit()
lr = LinearRegression()  # create the empty (not yet fitted) model object
lr.fit(X_train, y_train)  # fit() internally finds the w, b that minimize the cost function (MSE)
# print the learned model parameters
print('w =', lr.coef_)
print('b =', lr.intercept_)
w = [138.46153846 22.30769231]
b = 96.92307692307696
from sklearn.metrics import mean_squared_error
# predict y values for X - use .predict()
y_hat = lr.predict(X_train)
# print the mean squared error(MSE) between y and y_hat
print('MSE : %.2f' % mean_squared_error(y_train, y_hat))
MSE : 744.62
4.Polynomial Regression and Regularization
from sklearn.preprocessing import PolynomialFeatures # 다항식 피처 생성을 위한 모듈
from sklearn.linear_model import LinearRegression # 선형 회귀 모델
from sklearn.linear_model import Ridge # 릿지 회귀 (L2 정규화 포함된 선형 회귀)
import numpy as np
# Ground-truth function: y = cos(1.5 * pi * X)
# Used as the "correct answer" function for the data the models learn from
def true_fun(X):
    """Return cos(1.5 * pi * X); evaluated elementwise for array input."""
    return np.cos(1.5 * np.pi * X)
# 랜덤 시드 고정 (실험 결과를 항상 동일하게 재현 가능하게 함)
np.random.seed(0)
# 샘플 수 설정
n_samples = 20
# 0과 1 사이에서 20개의 랜덤한 X값 생성 후 정렬 (1차원 배열)
X = np.sort(np.random.rand(n_samples))
# y = 실제 함수 값 + 약간의 랜덤 노이즈 추가
# 현실의 데이터처럼 완벽하게 딱 떨어지지 않게 만들기 위해 노이즈를 섞음
y = true_fun(X) + np.random.randn(n_samples) * 0.1
# sklearn에서 요구하는 입력 형태로 변경 (20, 1) 형태의 2차원 배열로 변환
X = X.reshape(-1, 1)
# 확인용: 앞부분 3개 X값과 그에 대한 y값 출력
print(X[:3])
print(y[:3])
[[0.0202184 ]
[0.07103606]
[0.0871293 ]]
[1.14487249 0.9239768 0.94819339]
# plot the data points
plt.scatter(X, y, edgecolor="b", s=20)
plt.xlabel("x")
plt.ylabel("y")
plt.show()
# training the linear regression model with X
lr = LinearRegression()
lr.fit(X,y)
plt.scatter(X, y, edgecolor="b", s=20)
# show the model by plotting x_test and predicted values
X_test = np.arange(0, 1, 0.1).reshape(-1, 1)
plt.plot(X_test,lr.predict(X_test),c='r')
plt.xlabel("x")
plt.ylabel("y")
plt.show()
Generating Polynomial features
이렇게 n차까지 있는 경우
from sklearn.preprocessing import PolynomialFeatures
# X 데이터를 기반으로 차수가 10인 다항식 특성 생성 객체 생성
# include_bias=False는 상수항(1)은 포함하지 않겠다는 의미
poly = PolynomialFeatures(degree=10, include_bias=False)
# 기존 X를 10차 다항 특성으로 변환
# 예: x → [x, x^2, x^3, ..., x^10]
X_poly = poly.fit_transform(X)
# 변환된 데이터의 앞 3개 샘플 출력
print(X_poly[:3])
[[2.02183974e-02 4.08783595e-04 8.26494919e-06 1.67104028e-07
3.37857564e-09 6.83093851e-11 1.38110630e-12 2.79237560e-14
5.64573598e-16 1.14147734e-17]
[7.10360582e-02 5.04612156e-03 3.58456585e-04 2.54633428e-05
1.80881550e-06 1.28491123e-07 9.12750292e-09 6.48381829e-10
4.60584893e-11 3.27181353e-12]
[8.71292997e-02 7.59151487e-03 6.61443374e-04 5.76310980e-05
5.02135721e-06 4.37507337e-07 3.81197079e-08 3.32134345e-09
2.89386329e-10 2.52140282e-11]]
Polynomial regression
이제는 가중치까지 있고 y값이 있음
# Train a linear regression model on the polynomial features
lr = LinearRegression()
lr.fit(X_poly, y)  # fit on the polynomial-transformed X and on y
# Scatter-plot the original data points (blue-edged markers)
plt.scatter(X, y, edgecolor="b", s=20)
# Test x values: from 0 up to (not including) 1 in steps of 0.01 (100 samples)
X_test = np.arange(0, 1, 0.01).reshape(-1, 1)
# .reshape(-1, 1) is needed because scikit-learn expects the input X to be
# 2-D, shaped (number of samples, number of features).
# A 1-D X such as [0.1, 0.2, 0.3] becomes a (3, 1) 2-D array
# after X.reshape(-1, 1).
# Transform the test data the same way as the training data. Use transform()
# on the already-fitted object rather than fit_transform(), so the
# transformer is not refitted on test data.
X_test_poly = poly.transform(X_test)
# Plot the model's predictions as a curve (red line)
plt.plot(X_test, lr.predict(X_test_poly), c='r')
# Axis labels
plt.xlabel("x")
plt.ylabel("y")
# Show the figure
plt.show()
빨간색 그래프가 너무 복잡해 보인다
Polynomial regression with regularization
Ridge 회귀는 비용 함수에 가중치 w에 대한 L2 정규화 항을 추가해서 모델이 과도하게 복잡해지는 것(과적합, overfitting)을 방지한다.
alpha값이 클수록 제약이 더 강하다
# training the ridge regression model with polynomial features of X
# set regularization parameter alpha = 0.01
lr = Ridge(alpha=0.01)
lr.fit(X_poly, y)
plt.scatter(X, y, edgecolor="b", s=20)
X_test = np.arange(0, 1, 0.01).reshape(-1, 1)
# generate polynomial features of X_test with the already-fitted transformer;
# transform() (not fit_transform()) avoids refitting it on test data
X_test_poly = poly.transform(X_test)
# show the model by plotting X_test and predicted values from polynomial features of X_test
plt.plot(X_test, lr.predict(X_test_poly), c='r')
plt.xlabel("x")
plt.ylabel("y")
plt.show()