EEM-272 Makine Öğrenmesine Giriş

2024-2025 Bahar Dönemi

Ders 2

Univariate (Tek değişkenli) Lineer (Doğrusal) Regresyon

import matplotlib.pyplot as plt
import numpy as np

# Training data: one feature column (house size) and one target column (price).
# Both are written directly as (5, 1) column vectors instead of reshaping 1-D arrays.
sizes = np.array([[2600], [3000], [3200], [3600], [4000]])
prices = np.array([[550000], [565000], [610000], [680000], [725000]])
sizes.shape, prices.shape

    ((5, 1), (5, 1))

# Scatter plot of the raw observations (size on x, price on y).
plt.scatter(sizes, prices, color='b', marker='x')

# Axis labels are runtime strings drawn on the figure, so they are kept as-is.
plt.ylabel('Fiyat')
plt.xlabel('Boyut')

plt.grid(True)
plt.show()

png

Lineer regresyon

from sklearn import linear_model

# Build an ordinary least-squares linear regression model.
reg = linear_model.LinearRegression()

# Train the model on the size -> price data.
reg.fit(X=sizes, y=prices)

# Predicted price for one house of size 3300 (one row, one feature).
reg.predict(X=[[3300]])

    array([[628715.75342466]])

# theta_0: the fitted intercept of the model
reg.intercept_

    array([180616.43835616])

# theta_1: the fitted coefficient of the single input feature
reg.coef_

    array([[135.78767123]])

# Pull the scalar parameters out of the fitted model's arrays.
theta_0 = reg.intercept_[0]
theta_1 = reg.coef_[0, 0]

# The hypothesis is therefore:  h(size) = theta_0 + theta_1 * size
# It is written as a comment because `size` is not a defined variable here;
# executing the expression as code would raise NameError.

# Evaluate the hypothesis by hand for a house of size 3300.
h = theta_0 + theta_1 * 3300
h

    628715.7534246575

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Predict the price of every house in the data set.
prices_pred = reg.predict(sizes)

# Error metrics of those predictions against the known prices.
mse_label = "Ortalama kare hatası: "
mape_label = "Ortalama mutlak yüzde  hatası: "
print(mse_label, mean_squared_error(y_true=prices, y_pred=prices_pred))
print(mape_label, mean_absolute_percentage_error(y_true=prices, y_pred=prices_pred))

    Ortalama kare hatası:  186815068.4931509
    Ortalama mutlak yüzde  hatası:  0.019201242020345188

# Observed data points.
plt.scatter(sizes, prices, color='b', marker='x')
# Fitted regression line, drawn through the model's predictions.
plt.plot(sizes, prices_pred, linewidth=3, color='blue')

plt.title('Price vs Size of Properties')
plt.xlabel('Size (sq ft)')
plt.ylabel('Price ($)')
plt.grid(True)
plt.show()

png

Multivariate (Çok değişkenli) Lineer (Doğrusal) Regresyon

# Suppose we have a data set like the one below.
# Columns: area, number of rooms, age, price
data = np.array([
    [2600, 3, 20, 550000],
    [3000, 4, 15, 565000],
    [3200, 4, 18, 610000],
    [3600, 3, 30, 595000],
    [4000, 5, 8, 760000],
    [4100, 6, 8, 810000],
])

# Feature matrix: every column except the last one.
X = data[:, :-1]
X

    array([[2600,    3,   20],
           [3000,    4,   15],
           [3200,    4,   18],
           [3600,    3,   30],
           [4000,    5,    8],
           [4100,    6,    8]])

# Target vector: the last column of `data`.
y=data[:,-1]
y

    array([550000, 565000, 610000, 595000, 760000, 810000])

# Fit a fresh linear regression on the feature matrix X and target y.
reg = linear_model.LinearRegression()
reg.fit(X,y)

# theta_0: the fitted intercept
reg.intercept_

    221323.0018654043

# theta_1, theta_2, theta_3: one fitted coefficient per feature column
reg.coef_

    array([  112.06244194, 23388.88007794, -3231.71790863])

# What should a 40-year-old, 3-room house with an area of 3000 square feet cost?
reg.predict([[3000, 3, 40]])

    array([498408.25158031])

# Keep the fitted intercept and coefficient vector under the hypothesis notation.
theta_0 = reg.intercept_
theta = reg.coef_

# or, equivalently, evaluate the hypothesis by hand for the same house
theta_0 + theta[0]*3000 + theta[1]*3 + theta[2]*40 

    498408.2515803071

Polinomsal Regresyon

Örnek 1

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-3 polynomial regression: expand the single feature into polynomial
# terms, then fit a linear regression on the expanded features.
cubic_features = PolynomialFeatures(degree=3)
model = make_pipeline(cubic_features, LinearRegression())
model.fit(sizes, prices)

# Predictions for the same inputs the model was fitted on.
predictions = model.predict(sizes)

predictions

    array([[549309.43705744],
           [569834.06801092],
           [603957.3844045 ],
           [682417.03703898],
           [724482.07348817]])

# Actual prices, displayed for comparison with the model's predictions.
prices

    array([[550000],
           [565000],
           [610000],
           [680000],
           [725000]])

# Look up the pipeline's final estimator under the step name 'linearregression'
# and read its fitted parameters.
linear_step = model.named_steps['linearregression']
intercept, coefficients = linear_step.intercept_, linear_step.coef_

coefficients

    array([[ 0.00000000e+00, -5.08191437e+03,  1.57417101e+00,
            -1.56287425e-04]])

# Fitted intercept of the pipeline's linear regression step.
intercept

    array([5867798.52203121])

# Render the fitted polynomial as a readable expression. The coefficient at
# index 0 is skipped; the constant term is taken from `intercept` instead.
terms = [f"y = {intercept[0]:.4f}"]
terms.extend(
    f"({coef:.18f}) * x**{power}"
    for power, coef in enumerate(coefficients[0][1:], start=1)
)
formula = " + ".join(terms)

formula

    'y = 5867798.5220 + (-5081.914368269133774447) * x**1 + (1.574171013805353070) * x**2 + (-0.000156287424943002) * x**3'

# Evaluate the printed cubic by hand at x = 2600, using coefficients copied
# from the formula string (the constant term there is rounded to 4 decimals).
x = 2600
c0 = 5867798.5220
c1 = -5081.914368269133774447
c2 = 1.574171013805353070
c3 = -0.000156287424943002

# Same left-to-right term order as the formula, so the float result is unchanged.
y = c0 + c1 * x + c2 * x**2 + c3 * x**3
y

    549309.4370262353

# Cross-check: the pipeline's own prediction for the input 2600.
model.predict([[2600]])

    array([[549309.43705744]])

# Mean squared error of the polynomial model's predictions.
# scikit-learn's metric signature is (y_true, y_pred), so the known prices go
# first. MSE is symmetric, so the value is unchanged; only the call is corrected.
mse = mean_squared_error(prices, predictions)
mse

    13293721.973269027

# Mean absolute percentage error of the polynomial model's predictions.
# MAPE divides each absolute error by |y_true|, so argument order matters:
# the known prices must come first and the predictions second. Passing them
# the other way round divides by the predictions and yields a different value.
mape = mean_absolute_percentage_error(prices, predictions)
mape

    0.004800448655029275

# Actual prices (blue x) against the polynomial model's predictions (red +).
plt.scatter(sizes, prices, color='b', marker='x')
plt.scatter(sizes, predictions, color='r', marker='+')

plt.title('Price vs Size of Properties')
plt.xlabel('Size (sq ft)')
plt.ylabel('Price ($)')

plt.grid(True)
plt.show()

png

# Grid of sizes from 2600 to 4000 in steps of 10, as a single-column 2-D array,
# so the fitted polynomial can be drawn as a curve instead of isolated points.
line_vals = np.arange(2600, 4001, 10).reshape(-1, 1)
line_preds = model.predict(line_vals)

# Observed data (blue x) with the model's curve (red line).
plt.scatter(sizes, prices, color='b', marker='x')
plt.plot(line_vals, line_preds, color='r')

plt.title('Price vs Size of Properties')
plt.xlabel('Size (sq ft)')
plt.ylabel('Price ($)')

plt.grid(True)
plt.show()

png

Örnek 2

Veri dosyasını indirmek için tıklayınız.

import csv
filename = "02_data.csv"

# Read the CSV, keeping only rows that have exactly two fields and converting
# both fields to float.
with open(filename, "r", newline="", encoding="utf-8") as csv_file:
    pairs = [
        (float(row[0]), float(row[1]))
        for row in csv.reader(csv_file)
        if len(row) == 2
    ]

# Split the pairs into two 1-D arrays: first field -> x, second field -> y.
x = np.array([first for first, _ in pairs])
y = np.array([second for _, second in pairs])

# Scatter plot of the values loaded into x and y.
plt.scatter(x, y, marker='x', color='b')
plt.show()

png

# Reshape both arrays into single-column 2-D arrays before fitting.
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)

# Degree-4 polynomial regression pipeline, fitted on the loaded data.
quartic_features = PolynomialFeatures(degree=4)
model = make_pipeline(quartic_features, LinearRegression())
model.fit(x, y)

# Evaluate the fitted model on the integers 1..749 (as a column) to draw its curve.
values = np.arange(1, 750).reshape(-1, 1)
pred_vals = model.predict(values)

# Loaded data (blue x) with the model's curve (red line).
plt.scatter(x, y, color='b', marker='x')
plt.plot(values, pred_vals, color="r")
plt.show()

png

# Find the best polynomial degree.
#
# Candidates are compared on a held-out validation split instead of on the
# data they were fitted on. Selecting on training error favours over-fitted,
# high-degree polynomials, because extra flexibility is never penalized there.
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the split, and therefore the chosen degree, reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.25, random_state=0
)

best = np.inf
best_degree = 0
for degree in range(50):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    # Metric signature is (y_true, y_pred).
    error = mean_squared_error(y_val, y_pred)
    if error < best:
        best = error
        best_degree = degree

best_degree

Modelleri saklamak ve yüklemek için

# Fit a degree-4 pipeline on the full data so that `model` holds the model to be saved.
model = make_pipeline(PolynomialFeatures(4), LinearRegression())
model.fit(x, y)

import pickle

MODEL_PATH = 'model_pickle'

# Serialize the fitted pipeline to disk.
with open(MODEL_PATH, 'wb') as sink:
    pickle.dump(model, sink)

# Load it back.
# NOTE: only unpickle files you trust; unpickling can execute arbitrary code.
with open(MODEL_PATH, 'rb') as source:
    model = pickle.load(source)

# Show the restored model's first three predictions.
model.predict(x)[:3]

    array([[161.62963009],
           [151.50199079],
           [156.52255932]])