Friday, May 22, 2015
Understanding Boosting Through Code
Boosting is among the most predictive techniques in ensemble learning, and both R and Python provide libraries with rich functionality for it. For beginners, though, the method is not easy to understand. This post uses R's decision-tree package (rpart) to implement two basic boosted trees, a regression boosting tree and a classification boosting tree, which helps clarify how boosting works and what each parameter does. For the formula derivations, see this article.
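At its core, gradient boosting repeatedly fits a weak learner to the negative gradient of the loss at the current predictions, then adds a shrunken copy of that learner to the model. A minimal sketch of the generic update (standard notation, summarized here rather than taken from the post itself):

F_0(x)  = constant baseline (e.g. mean(y) for squared loss)
r_i     = -dL(y_i, F(x_i)) / dF(x_i)    # pseudo-residuals
h_m     = weak learner fitted to the r_i # here: a depth-1 rpart stump
F_m(x)  = F_{m-1}(x) + rate * h_m(x)     # shrunken additive update

Both implementations below follow this template: squared loss for regression (where the pseudo-residuals are the ordinary residuals) and logistic loss for classification.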
# Learn Gradient Boosting Model by coding
# good slides:
# http://www.ccs.neu.edu/home/vip/teach/MLcourse/4_boosting/slides/gradient_boosting.pdf
# http://cran.r-project.org/web/packages/gbm/gbm.pdf
# 1. Gradient Boosting for Regression
# generate data
generate_func = function(n=1000,size=0.5){
  # n: how many data points
  # size: fraction of the data used as the training set
  set.seed(1)
  x = seq(0,2*pi,length.out=n) # generate x
  noise = rnorm(n,0,0.3)
  y = sin(x) + noise # generate y
  select_index = sample(1:n,size=size*n,replace = F)
  # split the data into train and test sets
  train_x = x[select_index]
  train_y = y[select_index]
  test_x = x[-select_index]
  test_y = y[-select_index]
  data = list('train_x'=train_x,
              'train_y'=train_y,
              'test_x'=test_x,
              'test_y'=test_y)
  return(data)
}
data = generate_func()
objects(data)
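Before training, it can help to eyeball the generated data. A quick sanity check (an illustrative addition, not part of the original script):

# plot the noisy sine curve we are trying to recover
plot(data$train_x, data$train_y, pch=16, col=rgb(0,0,0,0.3),
     xlab='x', ylab='y', main='training data: y = sin(x) + noise')
lines(sort(data$train_x), sin(sort(data$train_x)), col='red', lwd=2)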
# train a boosting regression tree
GBR = function(x,y,rate=1,iter=100){
  # iter: number of boosting iterations; more is usually better, but beware of overfitting
  # rate: the learning rate (shrinkage); lower generalizes better but needs more iterations
  library(rpart)
  max_depth = 1 # tree stumps
  mu = mean(y) # start from a constant initial model
  dy = y - mu # residuals: the part the current model cannot yet explain
  learner = list() # holder for the weak learners
  for (i in 1:iter) {
    # fit a weak model to the residuals
    learner[[i]] = rpart(dy~x, # the residuals are the target variable
                         control=rpart.control(maxdepth=max_depth,cp=0)) # cp=0: grow the tree regardless of complexity cost
    # update the residuals for the next iteration
    dy = dy - rate*predict(learner[[i]])
  }
  result = list('learner'=learner,
                'rate'=rate,
                'iter'=iter,
                'mu'=mu) # store the baseline so prediction does not recompute it from new labels
  return(result)
}
model = GBR(data$train_x,data$train_y,iter=1000)
objects(model)
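To see what a single weak learner looks like, print the first stump (an illustrative check, not in the original code). For this data its split typically falls near x = pi, where sin(x) crosses zero:

# the first stump: one split on x, two constant leaves
model$learner[[1]]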
# predict function
predict_GBR = function(x,y,model){
  predict_y = list() # holds the prediction after each iteration
  mu = model$mu # baseline stored at training time
  n = length(y)
  iter = model$iter
  rate = model$rate
  predict_y[[1]] = rep(mu,n)
  learner = model$learner
  for (i in 2:iter) {
    # predict_y[[i]] is the model after i-1 trees: add the next learner's shrunken prediction
    predict_y[[i]] = predict_y[[i-1]] + rate * predict(learner[[i-1]],newdata=data.frame(x))
  }
  # mean squared error after each iteration
  mse = sapply(predict_y,function(pre_y) round(mean((y-pre_y)^2),3))
  result = list('predict_y'=predict_y,
                'mse'= mse)
  return(result)
}
# predict the training data
predict_train = predict_GBR(data$train_x,data$train_y,model=model)
objects(predict_train)
# more trees give lower training error
plot(predict_train$mse,type='l')
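A quick look at how far the training error falls from start to finish (an illustrative addition):

# training mse with just the baseline vs. after (almost) all trees
predict_train$mse[1]
tail(predict_train$mse, 1)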
# plot the fit of the boosting tree after a given number of iterations
plotfunc = function(x,y,predict,num){
  library(ggplot2)
  pre = predict$predict_y[[num]]
  plotdf = data.frame(x=x,y=y,pre=pre)
  mse = round(mean((y-pre)^2),3)
  p = ggplot(plotdf,aes(x,y)) +
    geom_point(alpha=0.5)
  p = p + geom_line(aes(x,pre)) + xlab(paste0('mse=',mse))
  plot(p)
}
# num=1: predict with just the mean of y
plotfunc(data$train_x,data$train_y,predict_train,num=1)
# add one tree model
plotfunc(data$train_x,data$train_y,predict_train,num=2)
# adding more trees gives a better fit
plotfunc(data$train_x,data$train_y,predict_train,num=10)
plotfunc(data$train_x,data$train_y,predict_train,num=100)
# predict the test data
predict_test = predict_GBR(data$test_x,data$test_y,model=model)
plot(predict_test$mse,type='l')
plotfunc(data$test_x,data$test_y,predict_test,10)
plotfunc(data$test_x,data$test_y,predict_test,100)
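Comparing the two error curves at the same number of trees makes the train/test gap concrete (an illustrative check, not in the original):

# train vs. test mse after the same number of trees
c(train = predict_train$mse[100], test = predict_test$mse[100])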
# compare different parameters
# create 2 models with different learning rates
model_1 = GBR(data$train_x,data$train_y,rate=1,iter=500)
model_2 = GBR(data$train_x,data$train_y,rate=0.1,iter=500)
# using both train and test data gives 4 sets of predictions
predict_train_1 = predict_GBR(data$train_x,data$train_y,model=model_1)
predict_train_2 = predict_GBR(data$train_x,data$train_y,model=model_2)
predict_test_1 = predict_GBR(data$test_x,data$test_y,model=model_1)
predict_test_2 = predict_GBR(data$test_x,data$test_y,model=model_2)
# extract the mse curves of these 4 results
train_error_1 = predict_train_1$mse
train_error_2 = predict_train_2$mse
test_error_1 = predict_test_1$mse
test_error_2 = predict_test_2$mse
iter = 1:model_1$iter
# compare the mse curves (solid: rate=1, dashed: rate=0.1; blue: train, red: test)
plotdf = data.frame(iter,train_error_1,test_error_1,train_error_2,test_error_2)
p = ggplot(plotdf)+
  geom_line(aes(x=iter,y=train_error_1),color='blue')+
  geom_line(aes(x=iter,y=test_error_1),color='red') +
  geom_line(aes(x=iter,y=train_error_2),linetype=2,color='blue')+
  geom_line(aes(x=iter,y=test_error_2),linetype=2,color='red')
print(p)
# test error is the better measure of model performance
# a lower rate generalizes better but needs more iterations
which.min(train_error_1)
which.min(train_error_2)
which.min(test_error_1)
which.min(test_error_2)
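As a cross-check, essentially the same model can be fit with the gbm package linked above. This is a hedged sketch: parameter names follow gbm's documented interface, and exact errors will differ because gbm subsamples by default unless told otherwise.

library(gbm)
df = data.frame(x = data$train_x, y = data$train_y)
fit = gbm(y ~ x, data = df, distribution = 'gaussian',
          n.trees = 500, shrinkage = 0.1, interaction.depth = 1,
          bag.fraction = 1) # bag.fraction=1 disables subsampling for comparability
pred = predict(fit, newdata = data.frame(x = data$test_x), n.trees = 500)
mean((data$test_y - pred)^2)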
# 2. Gradient Boosting for Classification
# prepare data: two iris species, with the labels coded as +1/-1
data = subset(iris,Species!='virginica',select = c(1,2,5))
data$y = ifelse(data$Species == 'setosa',1,-1)
data$Species = NULL
names(data)[1:2] = c('x1','x2')
head(data)
p = ggplot(data,aes(x1,x2,color=factor(y)))+
  geom_point()
print(p)
# train a boosting tree for classification
GBC = function(data,rate=0.1,iter=100){
  library(rpart)
  max_depth = 1 # tree stumps
  learner = list()
  mu = mean(data$y==1)
  # start from a constant initial model:
  # if mu = P(y=1), the log-odds baseline is f0 = log(mu/(1-mu))
  f0 = log(mu/(1-mu))
  f = f0
  data$dy = data$y/(1+exp(data$y*f)) # dy is the negative gradient of the log-loss function
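  # derivation sketch (assuming log loss L = log(1 + exp(-y*f)) for y in {-1,+1}):
  #   dL/df = -y * exp(-y*f) / (1 + exp(-y*f)) = -y / (1 + exp(y*f))
  #   so the negative gradient is y / (1 + exp(y*f)), which is dy above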
  for (i in 1:iter) {
    # fit a weak model to the negative gradient
    learner[[i]] = rpart(dy~x1+x2,data=data,
                         control=rpart.control(maxdepth=max_depth,cp=0))
    # improve the model additively
    f = f + rate * predict(learner[[i]])
    # update the negative gradient for the next iteration
    data$dy = data$y/(1+exp(data$y*f))
  }
  result = list('learner'=learner,
                'rate'=rate,
                'iter'=iter,
                'f0'=f0) # store the baseline so prediction does not recompute it from new labels
  return(result)
}
model = GBC(data,rate=0.1,iter=1000)
objects(model)
# predict function
predict_GBC = function(data,model){
  predict_y = list()
  f = model$f0 # log-odds baseline stored at training time
  n = nrow(data)
  iter = model$iter
  rate = model$rate
  predict_y[[1]] = rep(f,n)
  learner = model$learner
  for (i in 2:iter) {
    predict_y[[i]] = predict_y[[i-1]] + rate * predict(learner[[i-1]],newdata=data)
  }
  # despite the name, 'mse' here stores the logistic (log) loss, not squared error
  mse = sapply(predict_y,function(pre_y) sum(log(1+exp(-data$y*pre_y))))
  result = list('predict_y'=predict_y,
                'mse'= mse)
  return(result)
}
# predict the training data
predict_train = predict_GBC(data,model=model)
objects(predict_train)
plot(predict_train$mse,type='l')
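As with regression, one can check how far the loss falls from the baseline (an illustrative addition):

# log loss with just the baseline vs. after (almost) all trees
predict_train$mse[1]
tail(predict_train$mse, 1)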
final = predict_train$predict_y[[1000]] # scores from the last stored iteration
y_p = 1/(1+exp(-final)) # convert the log-odds score to P(y=1)
y_pred = ifelse(y_p>0.5,1,-1)
table(data$y, y_pred) # confusion matrix
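A single accuracy number summarizes the confusion matrix (an illustrative addition):

# proportion of correctly classified points
mean(y_pred == data$y)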
# plot the stumps' split lines over the data
plotfunc2 = function(data,num,rate=0.1){
  library(ggplot2)
  model = GBC(data,rate=rate,iter=num)
  predict_train = predict_GBC(data,model=model)
  final = predict_train$predict_y[[num]]
  y_p = 1/(1+exp(-final))
  data$y_pre = ifelse(y_p>0.5,1,-1)
  # which variable each stump splits on
  tree_f = sapply(model$learner,function(x){
    temp = x$splits
    return(row.names(temp)[1])
  })
  # the split point of each stump
  tree_v = sapply(model$learner,function(x){
    temp = x$splits
    return(temp[1,'index'])
  })
  x1value = tree_v[tree_f=='x1']
  x2value = tree_v[tree_f=='x2']
  p = ggplot(data,aes(x=x1,y=x2))
  p = p + geom_point(aes(color=factor(y_pre)),size=5) +
    geom_point(aes(color=factor(y)),size=3)+
    geom_vline(xintercept = x1value,alpha=0.4) +
    geom_hline(yintercept = x2value,alpha=0.4) +
    scale_colour_brewer(type='qual', palette='Set1') + # Set1 is a qualitative palette
    theme_bw()
  print(p)
}
plotfunc2(data,num=10)
plotfunc2(data,num=20)
plotfunc2(data,num=60)
plotfunc2(data,num=100)
plotfunc2(data,num=1000)
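For comparison, the classification fit can also be reproduced with the gbm package. This is a hedged sketch: gbm's 'bernoulli' distribution expects labels in {0,1}, so the +1/-1 coding is converted first.

library(gbm)
df = data.frame(x1 = data$x1, x2 = data$x2, y01 = ifelse(data$y == 1, 1, 0))
fit = gbm(y01 ~ x1 + x2, data = df, distribution = 'bernoulli',
          n.trees = 1000, shrinkage = 0.1, interaction.depth = 1,
          bag.fraction = 1) # disable subsampling to match the hand-rolled version
p_hat = predict(fit, newdata = df, n.trees = 1000, type = 'response')
table(data$y, ifelse(p_hat > 0.5, 1, -1))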