# -*- coding: utf-8 -*-
"""Linear Regression: Least Squares Regression

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/gist/azfar154/6f5c2d622f6075b5df2e919c92fbf95f/linear-regression.ipynb

**IMPORTANT, READ ME!**
You must connect to a hosted runtime; don't use your local environment.
You can choose to run cells with GPU or TPU acceleration.
Dark mode is amazing: on the top bar go to Tools > Preferences, then change the theme to dark.

AZFAR MOHAMED © High School South

Import the important libraries.
"""
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
"""Make a Linear Regression Set"""
from sklearn.datasets import make_regression
X_R1,y_R1=make_regression(n_samples=100,n_features=1,n_informative=1,bias=150.0,noise=30,random_state=0)
"""Make a 75% test and 25% train ratio"""
X_train,x_test,y_train,y_test=train_test_split(X_R1,y_R1,random_state=0)
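# A quick check (not in the original notebook) that the default split really
# is 75% train / 25% test: 100 samples should divide into 75 and 25.
print("Train shape: {}   Test shape: {}".format(X_train.shape, x_test.shape))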
"""Linear Regression Least Squares"""
from sklearn.linear_model import LinearRegression
# Plain linear regression doesn't need an alpha parameter.
linreg = LinearRegression().fit(X_train, y_train)
print("The linear coef/w hat is {}".format(linreg.coef_))
print("The intercept/bias term is {}".format(linreg.intercept_))
print("The score for the training data is {:.2f}".format(linreg.score(X_train, y_train)))
print("The score for the testing data is {:.2f}".format(linreg.score(x_test, y_test)))
"""Plotting this Linear Regression"""
plt.figure(figsize=(5, 4))
plt.title("Linear Regression: Least Squares")
plt.scatter(X_R1, y_R1, marker='o', s=50, alpha=0.8)
plt.plot(X_R1, X_R1 * linreg.coef_ + linreg.intercept_, 'r-')
plt.show()
"""Ridge Regression"""
from sklearn.linear_model import Ridge
## Ridge regression uses the L2 penalty, which works best with feature
## normalization; this matters when some features have a much larger scale,
## and therefore a much larger impact on the output y hat, than others.
# The L2 penalty helps guard against overfitting (overfitting only benefits
# the training data). Ridge regression uses the least-squares criterion to
# calculate w and b, but adds a penalty on large coefficients.
# "In other words, all things being equal, if ridge regression finds two possible linear models that predict the training data values equally well, it will prefer the linear model that has a smaller overall sum of squared feature weights"
ridgeregress = Ridge(alpha=20).fit(X_train, y_train)
print('ridge regression linear model intercept: {}'
      .format(ridgeregress.intercept_))
print('ridge regression linear model coeff:\n{}'
      .format(ridgeregress.coef_))
print('R-squared score (training): {:.3f}'
      .format(ridgeregress.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
      .format(ridgeregress.score(x_test, y_test)))
print('Number of non-zero features: {}'.format(np.sum(ridgeregress.coef_ != 0)))
"""Finding the "best" alpha variable"""
for i in [1,10,20,50,100,200]:
ridgeregress=Ridge(alpha=i).fit(X_train,y_train)
print("When the alpha is",i)
print("The score is {:.2f} for the training data".format(ridgeregress.score(X_train,y_train)))
print("The score is {:.2f} for the training data".format(ridgeregress.score(x_test,y_test)))
"""Ridge Regression with feature normalization"""
# A scaler normalizes the training and test data; here we use MinMaxScaler.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(X_train)
x_test_scaled = scaler.transform(x_test)
ridgeregressnorm = Ridge(alpha=20.0).fit(x_train_scaled, y_train)
print("Ridge regression with normalization scores. Training: {:.2f}\nTest: {:.2f}".format(
    ridgeregressnorm.score(x_train_scaled, y_train),
    ridgeregressnorm.score(x_test_scaled, y_test)))
"""Normalization will help your data.
LOOK AT THE DIFFERENCE BETWEEN THESE GRAPHS
https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
If you want to find the amount of time to run a cell you can use the function %%time
"""
# Commented out IPython magic to ensure Python compatibility.
# %%time
# scaler=MinMaxScaler()
# x_train_scaled=scaler.fit_transform(X_train)
# x_test_scaled=scaler.transform(x_test)
# ridgeregressnorm=Ridge(alpha=20.0).fit(x_train_scaled,y_train)
#
# print("Ridge regression with normalization scores. Training: {:.2f}\nTest: {:.2f}".format(ridgeregressnorm.score(x_train_scaled,y_train),ridgeregressnorm.score(x_test_scaled,y_test)))
"""Without an external gpu it is usually impossible to achieve such speeds.
Formula:
Input=(x0,x1...x n)
Function:
y hat= w hat 0 x 0 ... w hat n x n +b hat
y hat is the output
w hat is the model coefficent/slope
b hat is the y intercept
Least Squares
RSS=(y-(w i*x+b)
Least Squares Ridge Regression:
RSS=(y-(w i*x+b)+a wj squared
a wj squared is the alpha L2 which prefers the linear model which has a smaller squared sum of feature weights
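# A minimal sketch (not in the original notebook) computing both objectives
# from the formulas above with numpy, using the fitted least-squares model.
preds = X_train @ linreg.coef_ + linreg.intercept_
rss = np.sum((y_train - preds) ** 2)                # least-squares RSS
ridge_obj = rss + 20 * np.sum(linreg.coef_ ** 2)    # RSS + alpha * sum_j w_j^2, alpha=20
print("RSS: {:.1f}   Ridge objective (alpha=20): {:.1f}".format(rss, ridge_obj))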
"""Lasso Regression"""
# Lasso uses the L1 penalty: the sum of the absolute values of w_j rather than
# their squares. This drives many coefficients to exactly zero, so it favors a
# few features with medium-to-large effects.
from sklearn.linear_model import Lasso
lassoregress = Lasso().fit(X_train, y_train)
print("The average score for the training data is {:.2f}".format(lassoregress.score(X_train, y_train)))
print("The average score for the testing data is {:.2f}".format(lassoregress.score(x_test, y_test)))
print("The number of non-zero features is {}.".format(np.sum(lassoregress.coef_ != 0)))
# The non-zero count above shows how sparse the Lasso solution is, which helps
# you judge whether Lasso regression suits your data.
"""Polynomial Features"""
# PolynomialFeatures generates new features from products of the original ones.
# Adding many polynomial features often leads to overfitting, so we usually pair
# them with a regression that has a regularization penalty, like ridge
# regression. This allows a much richer class of functions to fit unusual data.
# The degree of the polynomial controls how many variables can participate in a
# single generated feature.
from sklearn.preprocessing import PolynomialFeatures
# Making a more complex regression problem where these features are important in
from sklearn.datasets import make_friedman1
X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)
# If it doesn't plot, copy and paste this cell into another window.
plt.figure()
plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)
""""""
# Let's try doing this with linear regression and polynomial features.
poly = PolynomialFeatures(degree=2)
x_f1_poly = poly.fit_transform(X_F1)
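# A small check (not in the original notebook): a degree-2 expansion of 7
# features yields 1 bias + 7 linear + 28 quadratic/interaction columns = 36.
print("Shape before: {}   after: {}".format(X_F1.shape, x_f1_poly.shape))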
X_train, X_test, y_train, y_test = train_test_split(x_f1_poly, y_F1,
                                                    random_state=0)
linreg=LinearRegression().fit(X_train,y_train)
print("For linear regression the score is:")
print("The training score is {:.2f}".format(linreg.score(X_train,y_train)))
print("The testing score is {:.2f}".format(linreg.score(X_test,y_test)))
## Let's try this with ridge regression.
linridge = Ridge(alpha=2).fit(X_train, y_train)
print("The score for ridge regression is:")
print("The training score is {:.2f}".format(linridge.score(X_train, y_train)))
print("The testing score is {:.2f}".format(linridge.score(X_test, y_test)))
# With regularization, there are not many medium-sized feature weights left.
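# A follow-up sketch (not in the original notebook): running Lasso on the same
# polynomial features to see how many of the expanded columns it actually keeps.
lassopoly = Lasso(alpha=0.1, max_iter=10000).fit(X_train, y_train)
print("Lasso kept {} of {} polynomial features".format(
    np.sum(lassopoly.coef_ != 0), X_train.shape[1]))
print("Training score {:.2f}, testing score {:.2f}".format(
    lassopoly.score(X_train, y_train), lassopoly.score(X_test, y_test)))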