EEM-272 Makine Öğrenmesine Giriş
2024-2025 Bahar Dönemi
Ders 2
Dersin slaytı için tıklayınız.
Univariate (Tek değişkenli) Lineer (Doğrusal) Regresyon
import matplotlib.pyplot as plt
import numpy as np
# Training data as (5, 1) column vectors: house sizes and their prices.
sizes = np.array([[2600], [3000], [3200], [3600], [4000]])
prices = np.array([[550000], [565000], [610000], [680000], [725000]])
sizes.shape, prices.shape
((5, 1), (5, 1))
# Scatter plot of the raw data: size on the x axis, price on the y axis.
plt.scatter(sizes, prices, marker="x", color="b")
plt.grid(True)
plt.xlabel("Boyut")
plt.ylabel("Fiyat")
plt.show()
Lineer regresyon
from sklearn import linear_model
# Create the linear regression model.
reg = linear_model.LinearRegression()
# Train the model on the data.
reg.fit(sizes, prices)
# Predict the price for one sample with a single feature value of 3300.
query = [[3300]]
reg.predict(query)
array([[628715.75342466]])
# theta_0 value: the fitted intercept of the model
reg.intercept_
array([180616.43835616])
# theta_1 value: the fitted coefficient of the size feature
reg.coef_
array([[135.78767123]])
# Extract the scalar parameters from the fitted model.
theta_0 = reg.intercept_[0]
theta_1 = reg.coef_[0, 0]
# Hence the hypothesis formula is h(size) = theta_0 + theta_1 * size.
# Bind a concrete size first so that the expression can actually be evaluated.
size = 3300
h = theta_0 + theta_1 * size
h
628715.7534246575
# Predict the price of every sample in the data set.
prices_pred = reg.predict(sizes)
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
# Compute both error metrics first (true values, then predictions), then report them.
linear_mse = mean_squared_error(prices, prices_pred)
linear_mape = mean_absolute_percentage_error(prices, prices_pred)
print("Ortalama kare hatası: ", linear_mse)
print("Ortalama mutlak yüzde hatası: ", linear_mape)
Ortalama kare hatası: 186815068.4931509 Ortalama mutlak yüzde hatası: 0.019201242020345188
# Plot the data points together with the fitted regression line.
plt.scatter(sizes, prices, marker="x", color="b")
plt.plot(sizes, prices_pred, color="blue", linewidth=3)
plt.title("Price vs Size of Properties")
plt.xlabel("Size (sq ft)")
plt.ylabel("Price ($)")
plt.grid(True)
plt.show()
Multivariate (Çok değişkenli) Lineer (Doğrusal) Regresyon
# Suppose we have a data set like the one below.
# Columns: area, number of rooms, age, price
data = np.array(
    [
        [2600, 3, 20, 550000],
        [3000, 4, 15, 565000],
        [3200, 4, 18, 610000],
        [3600, 3, 30, 595000],
        [4000, 5, 8, 760000],
        [4100, 6, 8, 810000],
    ]
)
# Feature matrix: every column except the last one.
X = data[:, :-1]
X
array([[2600, 3, 20], [3000, 4, 15], [3200, 4, 18], [3600, 3, 30], [4000, 5, 8], [4100, 6, 8]])
# Target vector: the last column of the data array.
y = data[:, -1]
y
array([550000, 565000, 610000, 595000, 760000, 810000])
# Fit a fresh linear regression model on the multi-feature data.
reg = linear_model.LinearRegression()
reg.fit(X, y)
# theta_0: the fitted intercept
reg.intercept_
221323.0018654043
# theta_1, theta_2, theta_3: one fitted coefficient per feature column of X
reg.coef_
array([ 112.06244194, 23388.88007794, -3231.71790863])
# What should the price be for a 40-year-old, 3-room house with an area of 3000 square feet?
reg.predict([[3000, 3, 40]])
array([498408.25158031])
theta_0 = reg.intercept_
theta = reg.coef_
# Alternatively, evaluate the hypothesis by hand for the same house.
area, rooms, age = 3000, 3, 40
theta_0 + theta[0] * area + theta[1] * rooms + theta[2] * age
498408.2515803071
Polinomsal Regresyon
Örnek 1
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# Polynomial regression: expand the input into polynomial features of
# degree 3, then fit an ordinary linear regression on those features.
cubic_features = PolynomialFeatures(degree=3)
model = make_pipeline(cubic_features, LinearRegression())
model.fit(sizes, prices)
predictions = model.predict(sizes)
predictions
array([[549309.43705744], [569834.06801092], [603957.3844045 ], [682417.03703898], [724482.07348817]])
# Actual prices, displayed for comparison with the predictions
prices
array([[550000], [565000], [610000], [680000], [725000]])
# Pull the fitted LinearRegression step out of the pipeline and read its parameters.
linear_step = model.named_steps['linearregression']
intercept = linear_step.intercept_
coefficients = linear_step.coef_
coefficients
array([[ 0.00000000e+00, -5.08191437e+03, 1.57417101e+00, -1.56287425e-04]])
# Fitted intercept of the linear regression step
intercept
array([5867798.52203121])
# Build a readable formula string from the fitted parameters. The first
# coefficient is skipped; the constant term comes from the intercept instead.
terms = [
    f" + ({coef:.18f}) * x**{i}"
    for i, coef in enumerate(coefficients[0][1:], start=1)
]
formula = f"y = {intercept[0]:.4f}" + "".join(terms)
formula
'y = 5867798.5220 + (-5081.914368269133774447) * x**1 + (1.574171013805353070) * x**2 + (-0.000156287424943002) * x**3'
# Evaluate the printed cubic formula by hand for x = 2600.
x = 2600
c0 = 5867798.5220
c1 = -5081.914368269133774447
c2 = 1.574171013805353070
c3 = -0.000156287424943002
y = c0 + c1 * x + c2 * x**2 + c3 * x**3
y
549309.4370262353
# Cross-check: the pipeline's own prediction for an input of 2600
model.predict([[2600]])
array([[549309.43705744]])
# Mean squared error of the polynomial fit. scikit-learn metrics take the
# true values first and the predictions second.
mse = mean_squared_error(prices, predictions)
mse
13293721.973269027
# MAPE divides each absolute error by the true value, so the argument order
# matters: true prices first, predictions second.
mape = mean_absolute_percentage_error(prices, predictions)
mape
0.004800448655029275
# Compare actual prices (blue x) with the polynomial predictions (red +).
plt.scatter(sizes, prices, marker="x", color="b")
plt.scatter(sizes, predictions, marker="+", color="r")
plt.title("Price vs Size of Properties")
plt.xlabel("Size (sq ft)")
plt.ylabel("Price ($)")
plt.grid(True)
plt.show()
# Evaluate the model on a grid from 2600 to 4000 in steps of 10 and draw
# the resulting curve over the data points.
line_vals = np.arange(2600, 4001, 10).reshape(-1, 1)
line_preds = model.predict(line_vals)
plt.scatter(sizes, prices, marker="x", color="b")
plt.plot(line_vals, line_preds, color="r")
plt.title("Price vs Size of Properties")
plt.xlabel("Size (sq ft)")
plt.ylabel("Price ($)")
plt.grid(True)
plt.show()
Örnek 2
Veri dosyasını indirmek için tıklayınız.
import csv
filename = "02_data.csv"
x, y = [], []
with open(filename, "r", newline="", encoding="utf-8") as file:
    reader = csv.reader(file)
    for row in reader:
        # Only rows containing exactly two values are processed.
        if len(row) != 2:
            continue
        x_text, y_text = row
        x.append(float(x_text))
        y.append(float(y_text))
x = np.array(x)
y = np.array(y)
# Quick look at the loaded points.
plt.scatter(x, y, marker="x", color="b")
plt.show()
# Reshape both arrays into column vectors before fitting.
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)
# Fit a degree-4 polynomial regression pipeline.
model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression())
model.fit(x, y)
# Evaluate the fit on the integers 1..749 and draw the curve over the data.
values = np.arange(1, 750).reshape(-1, 1)
pred_vals = model.predict(values)
plt.scatter(x, y, marker="x", color="b")
plt.plot(values, pred_vals, color="r")
plt.show()
# Search degrees 0..49 for the polynomial degree with the lowest mean squared error.
# NOTE(review): the error is measured on the same data the model was fitted
# on, so this picks the best training fit; consider scoring on held-out data
# if the goal is generalization.
best = np.inf
best_degree = 0
for degree in range(50):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(x, y)
    y_pred = model.predict(x)
    # scikit-learn metrics take the true values first, then the predictions.
    error = mean_squared_error(y, y_pred)
    # Strict comparison: on ties the lower degree found first is kept.
    if error < best:
        best = error
        best_degree = degree
best_degree
6
Modelleri saklamak ve yüklemek için
# Refit a degree-4 polynomial regression pipeline; this is the model that gets saved.
model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression())
model.fit(x, y)
import pickle
# Serialize the fitted pipeline to the file 'model_pickle'.
with open('model_pickle', 'wb') as sink:
    pickle.dump(model, sink)
# Load it back. NOTE: only unpickle files that come from a trusted source.
with open('model_pickle', 'rb') as source:
    model = pickle.load(source)
# Show the first three predictions of the restored model.
model.predict(x)[:3]
array([[161.62963009], [151.50199079], [156.52255932]])