머신러닝 3주차

카테고리 없음

머신러닝 3주차

용학사 2025. 3. 24. 14:57

Sample Dataset

import numpy as np
import matplotlib.pyplot as plt

# x_train is the vector of input variables (size in 1000 square feet)
# y_train is the vector of target values (price in 1000s of dollars)

x_train = np.array([1.8, 1.3, 1.7, 1.9, 1.4])
y_train = np.array([430., 320., 390., 490., 400.])

print('x_train =', x_train)
print('y_train =', y_train)

x_train = [1.8 1.3 1.7 1.9 1.4]
y_train = [430. 320. 390. 490. 400.]

# plot the data points
plt.scatter(x_train,y_train, marker='o',c='blue')

plt.title("Housing Prices")
plt.ylabel('Price (1000 dollars)')
plt.xlabel('Size (1000 sqft)')
plt.show()

Prediction by a linear model

-Model : y=w x + b

# Computes the prediction of a linear model
# w, b : model parameters  
# x, y : scalar 

def predict(x, w, b):
    """Return the prediction of the linear model ``w * x + b``.

    x    : input value (the post also passes a whole ndarray of inputs,
           in which case the arithmetic is applied element-wise)
    w, b : model parameters (slope and intercept)
    """
    return w * x + b

# predict with arbitrary model parameter 
w = 100
b = 10

print('size =     ', x_train[0])
print('price =    ', y_train[0])

# predict the y value of first data(x_train[0]) using w and b
y_hat = predict(x_train[0],w,b)

print('predicted =', y_hat)

w와 b를 설정한 후에 예측값을 y_hat으로 정의하여 출력

size =      1.8
price =     430.0
predicted = 190.0

# Computes the cost function - MSE
# x is a vector of training data (ndarray (m,))

def compute_cost(x, y, w, b):
    """Return the halved mean squared error of the model ``w * x + b``.

    x    : training inputs (ndarray (m,))
    y    : target values, indexed in step with x
    w, b : model parameters

    The result is ``sum((w*x[i] + b - y[i])**2 for all i) / (2*m)``.
    """
    # number of training examples
    m = x.shape[0]

    # squared prediction error of every example, added up in order
    squared_error_sum = sum((w * x[i] + b - y[i]) ** 2 for i in range(m))

    return (1 / (2 * m)) * squared_error_sum

print('w =', w)
print('b =', b)
print('y_train =', y_train)
print('y_hat =  ', predict(x_train, w, b))

# check the cost for the model with w = 100, b = 10
cost = compute_cost(x_train,y_train,w,b)

print('--> cost =', cost)

w = 100
b = 10
y_train = [430. 320. 390. 490. 400.]
y_hat =   [190. 140. 180. 200. 150.]
--> cost = 28070.0

Computing gradients

# Compute the gradient for linear regression 
# x is a vector of training data (ndarray (m,))
# dj_dw : The gradient of the cost w.r.t. the parameters w
# dj_db : The gradient of the cost w.r.t. the parameter b     

def compute_gradient(x, y, w, b):
    """Return the gradients of the cost with respect to w and b.

    x    : training inputs (ndarray (m,))
    y    : target values, indexed in step with x
    w, b : model parameters

    Returns (dj_dw, dj_db): the prediction error times the input, and the
    prediction error itself, each averaged over the m examples.
    """
    # number of training examples
    m = x.shape[0]

    grad_w = 0
    grad_b = 0

    for i in range(m):
        # prediction error of the i-th example
        error = w * x[i] + b - y[i]

        grad_w += error * x[i]
        grad_b += error

    return grad_w / m, grad_b / m

# check the gradients for current w, b
print('w =', w)
print('b =', b)
print('x_train =', x_train)
print('y_train =', y_train)

dj_dw, dj_db =compute_gradient(x_train,y_train,w,b)

print('dj_dw =', dj_dw)
print('dj_db =', dj_db)

w = 100
b = 10
x_train = [1.8 1.3 1.7 1.9 1.4]
y_train = [430. 320. 390. 490. 400.]
dj_dw = -384.8
dj_db = -234.0

Gradient descent

num_iters는 몇번 반복할지를 뜻하고 alpha는 얼마나 빠르게 w와 b를 업데이트할지를 결정하는 하이퍼파라미터

alpha값이 크면 큰 폭으로 이동하고 작으면 천천히 이동한다. 너무 작으면 수렴 속도가 느리고 너무 크면 진동하거나 발산해서 학습에 실패한다.

# Performs gradient descent to fit w, b

def gradient_descent(x, y, w, b, alpha, num_iters):
    """
    Fit w and b by applying the gradient descent update num_iters times.

    x (ndarray (m,))  : data, m examples
    y (ndarray (m,))  : target values
    w, b              : initial model parameters
    alpha (float)     : learning rate
    num_iters (int)   : number of iterations

    Returns the final w, the final b, and J_history (list), the cost
    recorded after every update.
    """
    J_history = []

    for step in range(num_iters):
        # gradient at the current parameters
        grad_w, grad_b = compute_gradient(x, y, w, b)

        # move both parameters against their gradients
        w, b = w - alpha * grad_w, b - alpha * grad_b

        # record the cost of the updated parameters
        cost = compute_cost(x, y, w, b)
        J_history.append(cost)

        # report progress on every 1000th iteration
        if step % 1000 == 0:
            print('Iteration %5d: Cost %0.2e ' % (step, cost))

    return w, b, J_history

Linear regression by gradient descent

Starting from initial parameters
Use GD to get the parameters

w와 b를 찾는 과정이고 이를 iteration값인 10000번 반복하며 천번마다 출력

# initialize parameters
w_init = 0
b_init = 0

# set the hyper parameters, 0.01 and 10000
alpha = 0.01
iterations = 10000

# run gradient descent
w_final, b_final, J_hist = gradient_descent(x_train,y_train,w_init,b_init,alpha,iterations)

# print the learned parameters
print('w =',w_final)
print('b =',b_final)

Iteration     0: Cost 7.69e+04 
Iteration  1000: Cost 4.58e+02 
Iteration  2000: Cost 4.58e+02 
Iteration  3000: Cost 4.58e+02 
Iteration  4000: Cost 4.58e+02 
Iteration  5000: Cost 4.58e+02 
Iteration  6000: Cost 4.58e+02 
Iteration  7000: Cost 4.58e+02 
Iteration  8000: Cost 4.58e+02 
Iteration  9000: Cost 4.58e+02 
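w = 155.60725919032078
b = 155.60725919032078

(주의: 위 출력은 본문에 실린 코드의 실행 결과와 일치하지 않는다. w와 b가 완전히 같은 값이고 첫 번째 cost가 7.69e+04인 것은 b를 dj_db가 아니라 dj_dw로 업데이트했을 때(b = b - alpha*dj_dw) 나오는 결과이다. 본문 코드처럼 b = b - alpha*dj_db로 업데이트하면 첫 번째 cost는 약 7.79e+04가 되고, w와 b는 서로 다른 값이 되며 충분히 반복하면 이 데이터의 최소제곱 해인 w ≈ 206.7, b ≈ 71.1에 가까워진다.)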

Plotting cost change

# plot cost during iteration 1 ~ 100 
plt.plot(J_hist[:100])

plt.title("Cost vs. Iteration")
plt.ylabel('cost') 
plt.xlabel('iteration')  
plt.show()

Plotting the learned model

# plot the data points
plt.scatter(x_train, y_train, marker='o', c='blue')

# predict the y value for x_train using learned w and b
y_hat = predict(x_train,w_final,b_final)

# show the model by plotting x_train and y_hat
plt.plot(x_train,y_hat,color='red')  

plt.title('SIZE vs. PRICE') 
plt.xlabel('size') 
plt.ylabel('price') 
plt.grid()
plt.show()

# test price prediction for 1.5 thousand sqft house
size = 1.5
price = predict(size,w_final,b_final)

print('Prediction: %.2f sqft house --> %.2f thousand dollars' % (size*1000, price))

Prediction: 1500.00 sqft house --> 389.02 thousand dollars

2. Linear Regression by gradient descent-multiple variables, vector form

Sample dataset

이번엔 x0와x1 두개가 입력값일때 y값을 찾는 과정임

일단 데이터를 알아보자

import numpy as np
import matplotlib.pyplot as plt

# X_train is the vector of input variables (size in 1000 square feet, number of rooms)
# y_train is the vector of target values (price in 1000s of dollars)

X_train = np.array([[1.8, 4], [1.3, 3], [1.7, 4], [1.9, 5], [1.4, 3]])
y_train = np.array([430., 320., 390., 490., 400.])

print(X_train)
print(y_train)

[[1.8 4. ]
 [1.3 3. ]
 [1.7 4. ]
 [1.9 5. ]
 [1.4 3. ]]
[430. 320. 390. 490. 400.]

3d로 구현하는 방법 암기

fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

# plot the data points in 3D
ax.scatter(X_train[:,0], X_train[:,1], y_train, marker='o', c='blue')
plt.show()

Prediction by a linear model

y=w x + b = w0 x0 + w1x1 + b

# Compute the prediction of a linear model
# x : array (m, n) - m examples with n features
# y : vector (m, ) 
# w : model parameters (n, )
# b : model parameter scalar 

def predict(x, w, b):
    """Return the linear-model prediction ``np.dot(x, w) + b``.

    x : one example (n,) or a batch of examples (m, n)
    w : model parameters (n,)
    b : model parameter, scalar
    """
    return np.dot(x, w) + b

1번과 달리 2번에서는 x가 2개 이므로 행렬의 곱 형태로 w와 곱해서 계산됨

(1번은 w x + b의 형태였음)

# arbitrary model parameter 
w = np.array([100, 100])
b = 10

# first row from training data
print('X_train[0] =', X_train[0])
print('y_train[0] =', y_train[0])

# predict the y value of first data(X_train[0]) using w and b
y_hat = predict(X_train[0],w,b)

print('y_hat =     ', y_hat)

임의로 w값과 b값을 정해두고 X_train의 첫번째 값과 y_train의 첫번째 값을 가져와서 y_hat을 계산해보는 과정임

w = np.array([100, 100])
b = 10

# all training data
print('X_train =', X_train)
print('y_train =', y_train)

# predict the y values of all data(X_train) using w and b
y_hat = predict(X_train,w,b)

print('y_hat =  ', y_hat)

X_train = [[1.8 4. ]
 [1.3 3. ]
 [1.7 4. ]
 [1.9 5. ]
 [1.4 3. ]]
y_train = [430. 320. 390. 490. 400.]
y_hat =   [590. 440. 580. 700. 450.]

앞에서 한개의 데이터만을 비교했다면 이번에는 모든 데이터를 비교한 결과이다

y_train값과 y_hat값의 차이를 보면 된다.

Cost function

# Computes the cost function - MSE

def compute_cost(X, y, w, b):
    """
    Return the halved mean squared error of the linear model on X.

      X (ndarray (m,n)): dataset, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter
    """
    # only the number of examples is used; the unpacking requires a 2-D X
    m, _ = X.shape

    # prediction error of every example at once
    residual = np.dot(X, w) + b - y

    return np.sum(residual ** 2) / (2 * m)

sum함수를 통해서 한번에 배열끼리 더하는게 가능함

print('w =', w)
print('b =', b)
print('y_train =', y_train)
print('y_hat =  ', predict(X_train, w, b))

# check the cost for the model with w = [100, 100], b = 10
cost = compute_cost(X_train,y_train,w,b)

print('--> cost =', cost)

w = [100 100]
b = 10
y_train = [430. 320. 390. 490. 400.]
y_hat =   [590. 440. 580. 700. 450.]
--> cost = 12270.0

Computing gradients

# Computes the gradient for linear regression 

def compute_gradient(X, y, w, b):
    """
    Return the gradient of the cost with respect to w and b.

      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b.
    """
    # number of training examples and number of features
    m, n = X.shape

    # zero-valued starting points with the shapes of the two results;
    # both are replaced by the vectorised results computed below
    dj_dw = np.zeros((n,))
    dj_db = 0.

    # prediction error of every example at once
    err = (np.dot(X, w) + b) - y

    # average the error-weighted features and the error itself
    dj_dw = X.T.dot(err) / m
    dj_db = err.sum() / m

    return dj_dw, dj_db

dj_dw = np.zeros((n,))

w는 여러 개의 파라미터값을 가지고 있으니 dj_dw도 그만큼 여러 개의 값이 있어야 한다. 그래서 크기가 n인 1차원 배열을 0으로 초기화 한다.

dj_db = 0.

b는 bias(편향)로 값이 하나뿐이므로, 비용 함수를 b에 대해 편미분한 dj_db도 스칼라 값 하나이다. 따라서 0으로 초기화해 둔다. (누적해서 구하는 방식으로 구현할 수도 있기 때문에)

# check the gradients for current w, b
print('w =', w)
print('b =', b)
print('X_train =', X_train)
print('y_train =', y_train)

dj_dw, dj_db = compute_gradient(X_train,y_train,w,b)

print('dj_dw =', dj_dw)
print('dj_db =', dj_db)

w = [100 100]
b = 10
X_train = [[1.8 4. ]
 [1.3 3. ]
 [1.7 4. ]
 [1.9 5. ]
 [1.4 3. ]]
y_train = [430. 320. 390. 490. 400.]
dj_dw = [247.2 592. ]
dj_db = 146.0

Gradient descent

비용 함수가 최소가 되는 방향으로 알맞는 w값을 찾는 과정

# Performs gradient descent to fit w, b

def gradient_descent(X, y, w, b, alpha, num_iters):
    """
    Fit w and b by applying the gradient descent update num_iters times.

      X (ndarray (m,n))   : data, m examples with n features
      y (ndarray (m,))    : target values
      w (ndarray (n,))    : initial model parameters
      b (scalar)          : initial model parameter
      alpha (float)       : learning rate
      num_iters (int)     : number of iterations

    Returns the final w, the final b, and the list of costs recorded
    after every update.
    """
    # cost J after each iteration
    J_history = []

    for step in range(num_iters):
        # gradient at the current parameters
        grad_w, grad_b = compute_gradient(X, y, w, b)

        # move both parameters against their gradients
        # (new objects are bound, so the caller's w is not modified)
        w, b = w - alpha * grad_w, b - alpha * grad_b

        # record the cost of the updated parameters
        cost = compute_cost(X, y, w, b)
        J_history.append(cost)

        # report progress on every 1000th iteration
        if step % 1000 == 0:
            print('Iteration %5d: Cost %0.2e ' % (step, cost))

    return w, b, J_history

Linear regression by gradient descent

Starting from initial parameters
Use GD to get the parameters

# initialize parameters
w_init = np.zeros(X_train.shape[1])
b_init = 0.

# set the hyperparameters, 0.1 and 10000
alpha = 0.1
iterations = 10000

# run gradient descent
w_final, b_final, J_hist = gradient_descent(X_train,y_train,w_init,b_init,alpha,iterations)

# print the learned parameters
print('w =',w_final)
print('b =',b_final)

Iteration     0: Cost 6.27e+04 
Iteration  1000: Cost 3.76e+02 
Iteration  2000: Cost 3.74e+02 
Iteration  3000: Cost 3.73e+02 
Iteration  4000: Cost 3.73e+02 
Iteration  5000: Cost 3.72e+02 
Iteration  6000: Cost 3.72e+02 
Iteration  7000: Cost 3.72e+02 
Iteration  8000: Cost 3.72e+02 
Iteration  9000: Cost 3.72e+02 
w = [137.51483137  22.57217992]
b = 97.45385409256511

Plotting cost change

# plot cost during iteration 1 ~ 100 
plt.plot(J_hist[:100])

plt.title("Cost vs. Iteration")
plt.ylabel('cost') 
plt.xlabel('iteration')  
plt.show()

Plotting the learned model

from matplotlib import cm

fig, ax = plt.subplots(subplot_kw={"projection": "3d"})

# plot the data points
ax.scatter(X_train[:,0], X_train[:,1], y_train, marker='o', c='blue')

# predict the y values for all meshgrid using learned w and b
x0 = np.arange(1, 2, 0.1)
x1 = np.arange(2, 6, 0.1)
x0, x1 = np.meshgrid(x0, x1)
y_hat = w_final[0]*x0+w_final[1]*x1+b_final

# show the model by plotting Z surface
ax.plot_surface(x0, x1, y_hat, cmap=cm.coolwarm, alpha=0.5)

plt.show()

from sklearn.metrics import mean_squared_error

# compute y_hat for X_train
y_hat = predict(X_train,w_final,b_final)

# print the mean squared error(MSE) between y and y_hat 
print('MSE : %.2f' % mean_squared_error(y_train, y_hat))

mean_squared_error(): 평균 제곱 오차 계산

MSE는 값이 작을수록 실제 데이터와 잘 맞다는 뜻임

3. Linear Regression using scikit learn

(이렇게 하면 쉽게 구할 수 있다!)

X_train = np.array([[1.8, 4], [1.3, 3], [1.7, 4], [1.9, 5], [1.4, 3]])
y_train = np.array([430., 320., 390., 490., 400.])

print(X_train)
print(y_train)

[[1.8 4. ]
 [1.3 3. ]
 [1.7 4. ]
 [1.9 5. ]
 [1.4 3. ]]
[430. 320. 390. 490. 400.]

from sklearn.linear_model import LinearRegression

# training the model - use .fit()
lr = LinearRegression()   # create the empty (untrained) model object
lr.fit(X_train, y_train)  # internally finds the w, b that minimize the cost function (MSE)

# print the learned model parameters
print('w =',lr.coef_)
print('b =',lr.intercept_)

w = [138.46153846  22.30769231]
b = 96.92307692307696

from sklearn.metrics import mean_squared_error

# predict y values for X - use .predict()
y_hat = lr.predict(X_train)

# print the mean squared error(MSE) between y and y_hat 
print('MSE : %.2f' % mean_squared_error(y_train, y_hat))

MSE : 744.62

4.Polynomial Regression and Regularization

from sklearn.preprocessing import PolynomialFeatures  # 다항식 피처 생성을 위한 모듈
from sklearn.linear_model import LinearRegression     # 선형 회귀 모델
from sklearn.linear_model import Ridge                # 릿지 회귀 (L2 정규화 포함된 선형 회귀)

import numpy as np

# 실제 함수 정의: y = cos(1.5 * pi * X)
# 이 함수를 우리가 학습할 데이터의 "정답" 함수로 사용함
def true_fun(X):
    """Return the noise-free target ``cos(1.5 * pi * X)`` for input X."""
    angle = 1.5 * np.pi * X
    return np.cos(angle)

# 랜덤 시드 고정 (실험 결과를 항상 동일하게 재현 가능하게 함)
np.random.seed(0)

# 샘플 수 설정
n_samples = 20

# 0과 1 사이에서 20개의 랜덤한 X값 생성 후 정렬 (1차원 배열)
X = np.sort(np.random.rand(n_samples))

# y = 실제 함수 값 + 약간의 랜덤 노이즈 추가
# 현실의 데이터처럼 완벽하게 딱 떨어지지 않게 만들기 위해 노이즈를 섞음
y = true_fun(X) + np.random.randn(n_samples) * 0.1

# sklearn에서 요구하는 입력 형태로 변경 (20, 1) 형태의 2차원 배열로 변환
X = X.reshape(-1, 1)

# 확인용: 앞부분 3개 X값과 그에 대한 y값 출력
print(X[:3])
print(y[:3])

[[0.0202184 ]
 [0.07103606]
 [0.0871293 ]]
[1.14487249 0.9239768  0.94819339]

# plot the data points
plt.scatter(X, y, edgecolor="b", s=20)

plt.xlabel("x")
plt.ylabel("y")
plt.show()

# training the linear regression model with X
lr = LinearRegression()
lr.fit(X,y)

plt.scatter(X, y, edgecolor="b", s=20)

# show the model by plotting x_test and predicted values
X_test = np.arange(0, 1, 0.1).reshape(-1, 1)
plt.plot(X_test,lr.predict(X_test),c='r')

plt.xlabel("x")
plt.ylabel("y")
plt.show()

Generating Polynomial features

이렇게 n차까지 있는 경우

from sklearn.preprocessing import PolynomialFeatures

# X 데이터를 기반으로 차수가 10인 다항식 특성 생성 객체 생성
# include_bias=False는 상수항(1)은 포함하지 않겠다는 의미
poly = PolynomialFeatures(degree=10, include_bias=False)

# 기존 X를 10차 다항 특성으로 변환
# 예: x → [x, x^2, x^3, ..., x^10]
X_poly = poly.fit_transform(X)

# 변환된 데이터의 앞 3개 샘플 출력
print(X_poly[:3])

[[2.02183974e-02 4.08783595e-04 8.26494919e-06 1.67104028e-07
  3.37857564e-09 6.83093851e-11 1.38110630e-12 2.79237560e-14
  5.64573598e-16 1.14147734e-17]
 [7.10360582e-02 5.04612156e-03 3.58456585e-04 2.54633428e-05
  1.80881550e-06 1.28491123e-07 9.12750292e-09 6.48381829e-10
  4.60584893e-11 3.27181353e-12]
 [8.71292997e-02 7.59151487e-03 6.61443374e-04 5.76310980e-05
  5.02135721e-06 4.37507337e-07 3.81197079e-08 3.32134345e-09
  2.89386329e-10 2.52140282e-11]]

Polynomial regression

이제는 가중치까지 있고 y값이 있음

# train a linear regression model on the polynomial features
lr = LinearRegression()
lr.fit(X_poly, y)  # fit on the polynomial-transformed X and on y

# scatter plot of the original data points (blue edges)
plt.scatter(X, y, edgecolor="b", s=20)

# test inputs: from 0 up to (not including) 1 in steps of 0.01 (100 samples)
X_test = np.arange(0, 1, 0.01).reshape(-1, 1)
# .reshape(-1, 1) is needed because scikit-learn expects the input X to be
# 2-D, shaped (number of samples, number of features).
# A 1-D X such as [0.1, 0.2, 0.3] becomes a (3, 1) array
# after X.reshape(-1, 1).

# expand the test inputs the same way as the training data, using the
# transformer that was fitted on X: call transform(), not fit_transform(),
# so that test data never re-fits the preprocessing step
X_test_poly = poly.transform(X_test)

# draw the model's predictions as a curve (red line)
plt.plot(X_test, lr.predict(X_test_poly), c='r')

# axis labels
plt.xlabel("x")
plt.ylabel("y")

# show the figure
plt.show()

빨간색 그래프가 너무 복잡해 보인다

Polynomial regression with regularization

Ridge 회귀는 비용 함수에 가중치 w에 대한 L2 정규화 항을 추가해서 모델이 과도하게 복잡해지는 것(Overfitting, 과적합)을 방지한다.

alpha값이 클수록 제약이 더 강하다

# training the ridge regression model with polynomial features of X
# set regularization parameter alpha = 0.01
lr = Ridge(alpha=0.01)
lr.fit(X_poly, y)

plt.scatter(X, y, edgecolor="b", s=20)

X_test = np.arange(0, 1, 0.01).reshape(-1, 1)
# generate polynomial features of X_test with the transformer fitted on X:
# transform() only, so the test data never re-fits the preprocessing step
X_test_poly = poly.transform(X_test)

# show the model by plotting X_test and predicted values from polynomial features of X_test
plt.plot(X_test, lr.predict(X_test_poly), c='r')

plt.xlabel("x")
plt.ylabel("y")
plt.show()