Chapter 4 kNN (k-Nearest Neighbors)
2021-12-25 updated
Ref: distance
Ref: tutorial
4.1 Difference calculating the distance: (1) Euclidean (2) Manhatten (3) Cosine (4) Jaccard Coefficient (5) MinKowski
# Euclidean distance
<- function(a, b) {
euclidean_distance
# We check that they have the same number of observation
if (length(a) == length(b)) {
sqrt(sum((a-b)^2))
else {
} stop("Vectors must be of the same length")
}
}
euclidean_distance(1:10, 11:20)
## [1] 31.62278
# Manhattan distance
<- function(a, b) {
manhattan_distance # We check that they have the same number of observation
if (length(a) == length(b)) {
sum(abs(a-b))
else {
} stop("Vectors must be of the same length")
}
}
manhattan_distance(1:10, 11:20)
## [1] 100
# Cosine similarity
<- function(a, b) {
cos_similarity if (length(a) == length(b)) {
= sum(a * b, na.rm = T)
num = sqrt(sum(a^2, na.rm = T)) * sqrt(sum(b^2, na.rm = T))
den = num/den
result 1 - result # because cos(0)=1
else {
} stop (1:10, 11:20)
}
}
cos_similarity(1:10, 11:20)
## [1] 0.0440877
# measure the degree of similarity of two vectors
# all values are equal = 1
# all values are different = 0
<- function(a, b) {
jaccard
if (length(a) == length(b)) {
<- length(intersect(a,b))
intersection <- length(a) + length(b) - intersection
union /union
intersectionelse {
} stop("Vectors must be of the same length")
}
}
jaccard(1:10, 11:20)
## [1] 0
<- function(a, b, p) {
minkowski_distance
# p=1, Manhattan distance
# p=2, Euclidean distance
if (p <= 0) {
stop("p must be higher than 0")
}
if (length(a) == length(b)) {
sum(abs(a-b)^p)^(1/p)
else {
} stop("Vectors must be of the same length")
}
}
minkowski_distance(1:10, 11:20, 1)) (
## [1] 100
minkowski_distance(1:10, 11:20, 2)) (
## [1] 31.62278
We need to based on the type of data, the dimensions, and the business objective to decide which method we are going to use. For example, Manhattan is good for the closet route that a taxi must take.
4.2 Find the k nearest neighbors
The process includes: (1) Check the number of observations is the same (2) Calculate distance (3) Find the closest neighbors. In the following, we use Boston house price data to demo KNN:
library(MASS)
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
library(caret)
set.seed(1)
<- train(
model ~ .,
medv data = Boston,
method = "knn"
)
model
## k-Nearest Neighbors
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 6.774213 0.4788519 4.616781
## 7 6.709875 0.4771239 4.635036
## 9 6.746559 0.4654866 4.690258
##
## RMSE was used to select the optimal
## model using the smallest value.
## The final value used for the model was k
## = 7.
plot(model)
Caret provide preprocessing method before we run our data.
set.seed(1)
<- train(
model2 ~ .,
medv data = Boston,
method = "knn",
preProcess = c("center", "scale")
)
model2
## k-Nearest Neighbors
##
## 506 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 4.827696 0.7297751 3.048151
## 7 4.793191 0.7373525 3.043650
## 9 4.788986 0.7410578 3.070081
##
## RMSE was used to select the optimal
## model using the smallest value.
## The final value used for the model was k
## = 9.
Splitting the dataset (prevent overfitting)
set.seed(1)
<- createDataPartition(Boston$medv, p = .80, list = FALSE)
inTraining <- Boston[inTraining, ]
training <- Boston[-inTraining, ]
testing
<- train(
model3 ~.,
medv data = training,
method = "knn",
preProcess = c("center", "scale")
)
model3
## k-Nearest Neighbors
##
## 407 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 407, 407, 407, 407, 407, 407, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 4.948949 0.7111926 3.228177
## 7 5.008726 0.7072326 3.240362
## 9 5.049853 0.7042396 3.281286
##
## RMSE was used to select the optimal
## model using the smallest value.
## The final value used for the model was k
## = 5.
See the performance of our model
<- subset(testing, select=-c(medv))
test.features <- subset(testing, select=medv)[,1]
test.target
= predict(model3, newdata = test.features)
predictions
# RMSE
sqrt(mean((test.target - predictions)^2))) (
## [1] 4.563839
# R squared
cor(test.target, predictions) ^ 2 ) (
## [1] 0.7814887
Cross validation
set.seed(1)
<- trainControl(
ctrl method = "cv",
number = 10
)
<- train(
model4 ~ .,
medv data = training,
method = "knn",
preProcess = c("center", "scale"),
trControl = ctrl
)
(model4)
## k-Nearest Neighbors
##
## 407 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 367, 366, 367, 366, 365, 367, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 4.616138 0.7518673 3.064657
## 7 4.734625 0.7404093 3.151517
## 9 4.677503 0.7508160 3.156651
##
## RMSE was used to select the optimal
## model using the smallest value.
## The final value used for the model was k
## = 5.
plot(model4)
# To see if model4 is better than model3
<- subset(testing, select=-c(medv))
test.features <- subset(testing, select=medv)[,1]
test.target
= predict(model4, newdata = test.features)
predictions
# RMSE
sqrt(mean((test.target - predictions)^2))) (
## [1] 4.563839
# R squared
cor(test.target, predictions) ^ 2 ) (
## [1] 0.7814887
We use lambda
to tune the hyper parameters
set.seed(1)
<- expand.grid(
tuneGrid k = seq(5, 9, by = 1)
)
<- train(
model5 ~.,
medv data = training,
method = "knn",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = tuneGrid
)
(model5)
## k-Nearest Neighbors
##
## 407 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 367, 366, 367, 366, 365, 367, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 4.616138 0.7518673 3.064657
## 6 4.754269 0.7386282 3.162237
## 7 4.734625 0.7404093 3.151517
## 8 4.656271 0.7508317 3.133727
## 9 4.677503 0.7508160 3.156651
##
## RMSE was used to select the optimal
## model using the smallest value.
## The final value used for the model was k
## = 5.
plot(model5)