# Learn Gradient Boosting Model by coding
# good slides:
# http://www.ccs.neu.edu/home/vip/teach/MLcourse/4_boosting/slides/gradient_boosting.pdf
# http://cran.r-project.org/web/packages/gbm/gbm.pdf
# 1. Gradient Boosting for Regression
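# The GBR function below implements the standard squared-error boosting recipe:
#   F_0(x) = mean(y)
#   for m = 1..M:
#     r_i = y_i - F_{m-1}(x_i)   # residuals = negative gradient of the squared loss
#     fit a small tree h_m(x) to the residuals
#     F_m(x) = F_{m-1}(x) + rate * h_m(x)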
# generate data
generate_func = function(n=1000,size=0.5){
  # n: how many data points
  # size: fraction of the data held out as the training set (default 50%)
  set.seed(1)
  x = seq(0,2*pi,length=n)   # generate x
  noise = rnorm(n,0,0.3)
  y = sin(x) + noise         # generate y = sin(x) plus Gaussian noise
  select_index = sample(1:n,size=size*n,replace = F)
  # split data into train and test
  train_x = x[select_index]
  train_y = y[select_index]
  test_x = x[-select_index]
  test_y = y[-select_index]
  data = list('train_x'=train_x,
              'train_y'=train_y,
              'test_x'=test_x,
              'test_y'=test_y)
  return(data)
}
data = generate_func()
objects(data)
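# quick look at the generated data (optional): the training points should trace
# a noisy sine curve around the true function
plot(data$train_x, data$train_y, pch=20, col=rgb(0,0,0,0.4),
     xlab='x', ylab='y', main='generated training data')
lines(seq(0,2*pi,length=200), sin(seq(0,2*pi,length=200)), col='red', lwd=2)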
# train boosting regression tree
GBR = function(x,y,rate=1,iter=100){
  # iter: number of boosting iterations; more trees fit the training data better, but beware of overfitting.
  # rate: learning rate (shrinkage); lower is usually better, but needs more iterations (and more time).
  library(rpart)
  max_depth=1       # tree stump
  mu = mean(y)      # start with an initial model: the mean of y
  dy = y - mu       # residuals, i.e. the part the current model cannot explain
  learner = list()  # holder for the weak learners
  for (i in 1:iter) {
    # fit a weak model to the residuals
    learner[[i]] = rpart(dy~x,  # residuals are the target variable
                         control=rpart.control(maxdepth=max_depth,cp=0))  # cp=0: split regardless of complexity cost
    # update the residuals for the next iteration
    dy = dy - rate*predict(learner[[i]])
  }
  result = list('learner'=learner,
                'rate'=rate,
                'iter'=iter,
                'mu'=mu)   # keep the initial mean so prediction does not depend on the new data's labels
  return(result)
}
model = GBR(data$train_x,data$train_y,iter=1000)
objects(model)
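# optional cross-check, assuming the gbm package is installed: the same
# stump-based model fit with gbm (bag.fraction=1 turns off row subsampling so
# it matches the deterministic loop above); its test MSE should be in the same
# range as the hand-rolled version
library(gbm)
gbm_fit = gbm(y~x, data=data.frame(x=data$train_x, y=data$train_y),
              distribution='gaussian', n.trees=1000, shrinkage=1,
              interaction.depth=1, bag.fraction=1, n.minobsinnode=10)
gbm_pred = predict(gbm_fit, newdata=data.frame(x=data$test_x), n.trees=1000)
round(mean((data$test_y - gbm_pred)^2), 3)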
# predict function
predict_GBR = function(x,y,model){
  predict_y = list()  # holds the staged predictions
  mu = model$mu       # initial model: the training mean stored in the model
  n = length(y)
  iter = model$iter
  rate = model$rate
  predict_y[[1]] = rep(mu,n)  # predict_y[[i]] uses the first i-1 trees; [[1]] is the mean only
  learner = model$learner
  for (i in 2:iter) {
    # add the next tree's prediction to the previous stage
    predict_y[[i]] = predict_y[[i-1]] + rate * predict(learner[[i-1]],newdata=data.frame(x))
  }
  # mean squared error at every stage
  mse = sapply(predict_y,function(pre_y) round(mean((y-pre_y)^2),3))
  result = list('predict_y'=predict_y,
                'mse'= mse)
  return(result)
}
# predict train data
predict_train = predict_GBR(data$train_x,data$train_y,model=model)
objects(predict_train)
# more trees give lower training error
plot(predict_train$mse,type='l')
# plot the effect of the boosting trees
plotfunc = function(x,y,predict,num){
  library(ggplot2)
  pre = predict$predict_y[[num]]
  plotdf = data.frame(x=x,y=y,pre = pre)
  mse = round(mean((y-pre)^2),3)
  p = ggplot(plotdf,aes(x,y)) +
    geom_point(alpha=0.5)
  p = p + geom_line(aes(x,pre)) + xlab(paste0('mse=',mse))
  plot(p)
}
# num=1: predict with the mean of y only
plotfunc(data$train_x,data$train_y,predict_train,num=1)
# num=2: add one tree
plotfunc(data$train_x,data$train_y,predict_train,num=2)
# adding more trees gives a better fit
plotfunc(data$train_x,data$train_y,predict_train,num=10)
plotfunc(data$train_x,data$train_y,predict_train,num=100)
# predict test data
predict_test = predict_GBR(data$test_x,data$test_y,model=model)
plot(predict_test$mse,type='l')
plotfunc(data$test_x,data$test_y,predict_test,10)
plotfunc(data$test_x,data$test_y,predict_test,100)
# compare different parameters
# create 2 models with different learning rates
model_1 = GBR(data$train_x,data$train_y,rate=1,iter=500)
model_2 = GBR(data$train_x,data$train_y,rate=0.1,iter=500)
# using both train and test data, we get 4 results
predict_train_1 = predict_GBR(data$train_x,data$train_y,model=model_1)
predict_train_2 = predict_GBR(data$train_x,data$train_y,model=model_2)
predict_test_1 = predict_GBR(data$test_x,data$test_y,model=model_1)
predict_test_2 = predict_GBR(data$test_x,data$test_y,model=model_2)
# take the mse of these 4 results
train_error_1 = predict_train_1$mse
train_error_2 = predict_train_2$mse
test_error_1 = predict_test_1$mse
test_error_2 = predict_test_2$mse
iter = 1:model_1$iter
# compare the mse curves (blue: train, red: test; solid: rate=1, dashed: rate=0.1)
plotdf = data.frame(iter,train_error_1,test_error_1,train_error_2,test_error_2)
p = ggplot(plotdf)+
  geom_line(aes(x=iter,y=train_error_1),color='blue')+
  geom_line(aes(x=iter,y=test_error_1),color='red') +
  geom_line(aes(x=iter,y=train_error_2),linetype=2,color='blue')+
  geom_line(aes(x=iter,y=test_error_2),linetype=2,color='red')
print(p)
# test error is the better measure of model performance
# a lower rate generalizes better but needs more iterations
which.min(train_error_1)
which.min(train_error_2)
which.min(test_error_1)
which.min(test_error_2)
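# the indices above show when each curve bottoms out; the minimum test errors
# themselves show which learning rate ultimately reaches the lower test error
min(test_error_1)
min(test_error_2)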
# 2. Gradient Boosting for Classification
# prepare data: keep two iris species and code the label as +1 / -1
data = subset(iris,Species!='virginica',select = c(1,2,5))
data$y = ifelse(data$Species == 'setosa',1,-1)
data$Species = NULL
names(data)[1:2] = c('x1','x2')
head(data)
p = ggplot(data,aes(x1,x2,color=factor(y)))+
  geom_point()
print(p)
# train boosting tree for classification
GBC = function(data,rate=0.1,iter=100){
  library(rpart)
  max_depth=1
  learner = list()
  mu = mean(data$y==1)
  # start with an initial model:
  # mu = p(y=1), so the initial score is the log-odds f0 = log(mu/(1-mu))
  f0 = log(mu/(1-mu))
  f = f0
  data$dy = data$y/(1+exp(data$y*f))  # dy is the negative gradient of the log-loss function
  for (i in 1:iter) {
    # fit a weak model to the negative gradient
    learner[[i]] = rpart(dy~x1+x2,data=data,
                         control=rpart.control(maxdepth=max_depth,cp=0))
    # improve the model
    f = f + rate *predict(learner[[i]])
    # update dy
    data$dy = data$y/(1+exp(data$y*f))
  }
  result = list('learner'=learner,
                'rate'=rate,
                'iter'=iter,
                'f0'=f0)   # keep the initial score so prediction does not recompute it from new labels
  return(result)
}
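# Why dy has that form: for the log loss L(y,f) = log(1+exp(-y*f)),
#   dL/df = -y*exp(-y*f)/(1+exp(-y*f)) = -y/(1+exp(y*f)),
# so the negative gradient that each stump is fitted to is y/(1+exp(y*f)),
# which is exactly what data$dy stores inside GBC.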
model = GBC(data,rate=0.1,iter=1000)
objects(model)
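# optional cross-check, assuming the gbm package is installed: gbm's 'bernoulli'
# loss expects a 0/1 response, so the +1/-1 label is recoded before fitting
library(gbm)
data01 = data.frame(x1=data$x1, x2=data$x2, y01=as.numeric(data$y==1))
gbm_cls = gbm(y01~x1+x2, data=data01, distribution='bernoulli',
              n.trees=1000, shrinkage=0.1, interaction.depth=1,
              bag.fraction=1, n.minobsinnode=10)
# predicted probability of class +1 vs the true label
table(data01$y01, predict(gbm_cls, newdata=data01, n.trees=1000, type='response') > 0.5)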
# predict function
predict_GBC = function(data,model){
  predict_y = list()
  f = model$f0   # initial score stored in the model
  n = nrow(data)
  iter = model$iter
  rate = model$rate
  predict_y[[1]] = rep(f,n)
  learner = model$learner
  for (i in 2:iter) {
    predict_y[[i]] = predict_y[[i-1]] + rate *predict(learner[[i-1]],newdata=data)
  }
  # 'mse' here actually holds the total log loss at every stage
  # (the name is kept to match the regression helper)
  mse = sapply(predict_y,function(pre_y) sum(log(1+exp(-data$y*pre_y))))
  result = list('predict_y'=predict_y,
                'mse'= mse)
  return(result)
}
# predict the training data
predict_train = predict_GBC(data,model=model)
objects(predict_train)
plot(predict_train$mse,type='l')
final = predict_train$predict_y[[1000]]
y_p = 1/(1+exp(-final))        # convert the score to a probability
y_pred = ifelse(y_p>0.5,1,-1)
table(data$y, y_pred)
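# overall accuracy implied by the confusion table above
mean(y_pred == data$y)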
# plot the fitted decision boundary as trees are added
plotfunc2 = function(data,num,rate=0.1){
  library(ggplot2)
  model = GBC(data,rate=rate,iter=num)
  predict_train = predict_GBC(data,model=model)
  final = predict_train$predict_y[[num]]
  y_p = 1/(1+exp(-final))
  data$y_pre = ifelse(y_p>0.5,1,-1)
  # pull the split variable and split value out of each stump
  tree_f = sapply(model$learner,function(x){
    temp = x$splits
    return(row.names(temp)[1])
  })
  tree_v = sapply(model$learner,function(x){
    temp = x$splits
    return(temp[1,'index'])
  })
  x1value = tree_v[tree_f=='x1']
  x2value = tree_v[tree_f=='x2']
  # large points: predicted class; small points: true class; lines: stump splits
  p = ggplot(data,aes(x=x1,y=x2))
  p = p + geom_point(aes(color=factor(y_pre)),size=5) +
    geom_point(aes(color=factor(y)),size=3)+
    geom_vline(xintercept = x1value,alpha=0.4) +
    geom_hline(yintercept = x2value,alpha=0.4) +
    scale_colour_brewer(type="qual", palette='Set1') +
    theme_bw()
  print(p)
}
plotfunc2(data,num=10)
plotfunc2(data,num=20)
plotfunc2(data,num=60)
plotfunc2(data,num=100)
plotfunc2(data,num=1000)