HW5
Fengrui Liu
2020/10/1
## 3. (e)
library(ISLR)
attach(Weekly)
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.2
weekly_training <- Weekly[Year <= 2008,]
weekly_test <- Weekly[Year > 2008,]
LDA_data <- lda(Direction ~ Lag2, data = weekly_training)
Predicted_LDA <- predict(LDA_data, weekly_test)$class
table( weekly_test$Direction, Predicted_LDA )
## Predicted_LDA
## Down Up
## Down 9 34
## Up 5 56
mean(weekly_test$Direction == Predicted_LDA)
## [1] 0.625
(f)
QDA_data <- qda(Direction ~ Lag2, data=weekly_training)
Predicted_QDA <- predict(QDA_data, data.frame(weekly_test))$class
table( weekly_test$Direction, Predicted_QDA )
## Predicted_QDA
## Down Up
## Down 0 43
## Up 0 61
mean(weekly_test$Direction == Predicted_QDA)
## [1] 0.5865385
(g)
X.train <- matrix( Lag2[ Year <= 2008 ], sum(Year <= 2008), 1 )
X.test <- matrix( Lag2[ Year > 2008 ], sum(Year >= 2009), 1 )
Y.train <- Direction[ Year <= 2008 ]
Y.test <- Direction[ Year > 2008 ]
knn.fit <- class::knn( X.train, X.test, Y.train, 1 )
table( Y.test, knn.fit )
## knn.fit
## Y.test Down Up
## Down 21 22
## Up 30 31
mean( Y.test == knn.fit )
## [1] 0.5
(h) LDA: test classification rate is 62.5%. QDA: test classification rate is about 58.7%. KNN (K = 1): test classification
rate is 50%. LDA performs better than QDA, and both LDA and QDA perform better than KNN.
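A quick side-by-side comparison (a minimal sketch, assuming the objects fitted above are still in the workspace):
acc <- c(LDA = mean(weekly_test$Direction == Predicted_LDA),
         QDA = mean(weekly_test$Direction == Predicted_QDA),
         KNN = mean(Y.test == knn.fit))
round(acc, 3)  # from the runs above: LDA 0.625, QDA 0.587, KNN 0.500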
## 4.
attach(Boston)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
Crime <- rep(0, length(crim))
Crime[crim > median(crim)] <- 1
Boston<- data.frame(Boston,Crime)
train <- 1:(dim(Boston)[1]/2)
test <- (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston_train <- Boston[train, ]
Boston_test <- Boston[test, ]
crim_test <- Crime[test]
## Logistic regression.
reg_model1 <- glm(Crime ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
                    ptratio + black + lstat + medv, family = "binomial")
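# Note: no data= argument is supplied here, so with attach(Boston) this model is fit on
# all 506 observations (null deviance on 505 df); reg_model2 below refits on Boston_train only.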
summary(reg_model1)
##
## Call:
## glm(formula = Crime ~ zn + indus + chas + nox + rm + age + dis +
## rad + tax + ptratio + black + lstat + medv, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3946 -0.1585 -0.0004 0.0023 3.4239
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -34.103704 6.530014 -5.223 1.76e-07 ***
## zn -0.079918 0.033731 -2.369 0.01782 *
## indus -0.059389 0.043722 -1.358 0.17436
## chas 0.785327 0.728930 1.077 0.28132
## nox 48.523782 7.396497 6.560 5.37e-11 ***
## rm -0.425596 0.701104 -0.607 0.54383
## age 0.022172 0.012221 1.814 0.06963 .
## dis 0.691400 0.218308 3.167 0.00154 **
## rad 0.656465 0.152452 4.306 1.66e-05 ***
## tax -0.006412 0.002689 -2.385 0.01709 *
## ptratio 0.368716 0.122136 3.019 0.00254 **
## black -0.013524 0.006536 -2.069 0.03853 *
## lstat 0.043862 0.048981 0.895 0.37052
## medv 0.167130 0.066940 2.497 0.01254 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 211.93 on 492 degrees of freedom
## AIC: 239.93
##
## Number of Fisher Scoring iterations: 9
Based on the p-values (using roughly a 10% cutoff), the significant predictors are zn, nox, age, dis, rad, tax, ptratio, black, and medv, so the model is refit on the training data using only these.
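The same set can be pulled out programmatically (a minimal sketch using the 10% cutoff applied above):
coefs <- summary(reg_model1)$coefficients
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.10 & rownames(coefs) != "(Intercept)"]
# expected from the summary above: zn, nox, age, dis, rad, tax, ptratio, black, medv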
reg_model2<- glm(Crime ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = Boston_train, family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(reg_model2)
##
## Call:
## glm(formula = Crime ~ zn + nox + age + dis + rad + tax + ptratio +
## black + medv, family = "binomial", data = Boston_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1131 -0.1780 -0.0009 0.2415 3.0885
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -66.233571 11.236568 -5.894 3.76e-09 ***
## zn -0.428786 0.098572 -4.350 1.36e-05 ***
## nox 81.501997 14.107158 5.777 7.59e-09 ***
## age 0.020096 0.017923 1.121 0.2622
## dis 2.199728 0.463544 4.745 2.08e-06 ***
## rad 1.523621 0.312386 4.877 1.08e-06 ***
## tax -0.012168 0.005396 -2.255 0.0241 *
## ptratio 0.639813 0.164406 3.892 9.96e-05 ***
## black -0.009557 0.005961 -1.603 0.1089
## medv 0.094511 0.048743 1.939 0.0525 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.37 on 252 degrees of freedom
## Residual deviance: 91.88 on 243 degrees of freedom
## AIC: 111.88
##
## Number of Fisher Scoring iterations: 9
p <- predict(reg_model2, newdata = Boston_test, type = "response")
p_crime <- 1*(p > 0.5)
table(Boston_test$Crime, p_crime)
## p_crime
## 0 1
## 0 71 19
## 1 21 142
mean(Boston_test$Crime != p_crime)
## [1] 0.1581028
## LDA
lda.crime<- lda(Crime ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data=Boston_train)
predict_lda <- predict(lda.crime, data.frame(Boston_test))$class
table(Boston_test$Crime, predict_lda)
## predict_lda
## 0 1
## 0 80 10
## 1 22 141
mean(Boston_test$Crime != predict_lda)
## [1] 0.1264822
## QDA
qda.crime<- qda(Crime ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data=Boston_train)
predict_qda <- predict(qda.crime, data.frame(Boston_test))$class
table(Boston_test$Crime, predict_qda)
## predict_qda
## 0 1
## 0 81 9
## 1 155 8
81+9+8+155
## [1] 253
155+9
## [1] 164
164/253
## [1] 0.6482213
## KNN
x_test <- Boston_test[ , c(2, 5, 7, 8, 9, 10, 11, 12, 14)]
x_train <- Boston_train[ , c(2, 5, 7, 8, 9, 10, 11, 12, 14)]
knn <- class::knn(x_test, x_train, crim_test, 1)
table(crim_test, knn)
## knn
## crim_test 0 1
## 0 57 33
## 1 125 38
57+33+125+38
## [1] 253
125+33
## [1] 158
158/253
## [1] 0.6245059
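One caveat on the call above: class::knn() takes its arguments in the order (train, test, cl, k), so passing x_test first trains the classifier on the test half. A corrected fit would look like the following sketch (it will give a different confusion table than the one shown):
crime_train <- Crime[train]                                       # training-half labels
knn_corrected <- class::knn(x_train, x_test, crime_train, k = 1)  # train on train, predict on test
mean(crim_test != knn_corrected)                                  # corrected KNN test error rate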
According to the test error rates, LDA is the best model in this case because it has the lowest error rate, 12.65%.
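For reference, the four test error rates can be collected in one place (a minimal sketch, assuming the fitted objects above are still in the workspace):
err <- c(logistic = mean(Boston_test$Crime != p_crime),
         LDA = mean(Boston_test$Crime != predict_lda),
         QDA = mean(Boston_test$Crime != predict_qda),
         KNN = mean(crim_test != knn))
round(err, 4)  # from the runs above: roughly 0.1581, 0.1265, 0.6482, 0.6245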