# Generalized smooth monotonic regression by Boosting (GMonBoost)
# written by Florian Leitenstorfer, last update: August 29, 2007
# This R implementation refers
#
# Tutz and Leitenstorfer (2007): Generalized Smooth Monotonic Regression in Additive Modelling,
# Journal of Computational and Graphical Statistics 16, 165-188.
# In function GMonBoost(), the algorithm for fitting a generalized additive
# regression model with monotonicity constraints on some smooth components.
# The current implementation allows for gaussian, binomial and poisson response
# (always using the canonical link).
# Monotonic regression always denotes nondecreasing regression.
# The estimation of a semiparametric linear predictor is not yet implemented.
# In a second file appl_JCGS.r, the code for the application to automobile data is given
# Description of variables of GMonboost:
# y: response vector.
# X: data matrix with n lines (observations) and p columns (covariates).
# spline: type of basis functions. Implemented are "ispline" (I-splines of
# degree 2, knots are placed equidistantly) and "logbas" (logistic basis functions,
# knots are placed at the sample quantiles)
# mvec: a logical vector of length p which indicates whether a monotonicity restriction should be
# set on the s covariate or not (TRUE/FALSE)
# family: error distribution to be used in the model. Implemented are "gaussian", "binomial" and "poisson".
# kvec: a numerical vector of length p assigning the number of basis functions used for
# the smooth components (default: 20 basis functions for each smooth component)
# criterion: stopping criterion for the boosting algorithm. Available are "aicc" or "gMDL" for gaussian response,
# "aic" or "bic" for non-gaussian responses
# lambda: ridge parameter used in the one-step ridge estimates (default=20)
# M: maximum number of iterations for the boosting algorithm
# early: early stopping of the boosting algorithm after greater.bound times in a row increasing criterion (logical, default=F)
# (in the current implementation, the algorithm runs as along as the early stopping condition is met
# for both criteria corresponding to the chosen family)
# greater.bound: number steps with increasing criteria sufficient for early stopping when early=T (default=5)
# print.mesg: should intermediate results of the fitting procedure be printed (logical, default=T)
# Output: The function yields as output a list of length 14, containing
# fit: fitted values on the response scale
# eta: fitted linear predictor
# coef: estimated basis coefficients
# crit: value of the chosen criterion for the resulting fit
# crit.vec: vector of values of the criterion in each boosting iteration
# edf: estimated degrees of freedom for the resulting fit
# its: number of iterations until the criterion is minimized (without initialization step)
#
# The following values are mainly needed for prediction and plotting
# family, spline: see above
# knot.list: list of length p containing the knot locations
# center.list: list of length p containing the means of basis functions
# c.list: list of length p containing the (euclidean) norms of basis functions
# min.vec: vector of length p containing the minimum values of the covariates
# max.vec: vector of length p containing the maximum values of the covariates
# Moreover, this file contains function for prediction (predict.GMonBoost) and plots (plot.GMonBoost):
# Variables of predict.GMonBoost:
# object: a list resulting from a GMonBoost
# newdata: a matrix of values (p columns), where the prediction should be made
# type: type of prediction which should be made ("response"=response scale, "link"=linear predictor,
# "terms"=smooth components)
#Variables of plot.GMonBoost:
# object: a list resulting from a GMonBoost
# lty: line type
# xlab: vector of length p, labels for the x axes
# n: number equdistant points for the grid where the smooth curves should be drawn
# comp.draw: index of smooth components which should be drawn
GMonBoost <- function(y, #vector of responses
X, #matrix of covariates
spline, #type of basis functions
mvec, #monotonicity restriction of coefficients
family="gaussian", #type of exponential family
kvec=rep(20,length(mvec)),#number of basis functions used for estimating the smooth components
criterion="aicc", #stopping criterion
lambda=20, #ridge penalty
M=200, #maximum number of iterations
early=F, #early stopping of boosting iterations
greater.bound=5, #number of increasing AIC/BIC in a row sufficient for early stopping
print.mesg=T #printing of intermediate messages
){
X <- as.matrix(X)
n <- length(y)
p <- length(mvec)
if(dim(X)[1]!=n | dim(X)[2]!=p){
stop("error: dimensions do not agree \n")
}
if(p!=length(kvec)){
stop("error: vector of restrictions does not fit to vector of knots \n")
}
if((family=="gaussian"&(criterion=="aic"|criterion=="bic"))|(family!="gaussian"&(criterion=="aicc"|criterion=="gMDL"))){
stop("error: combination of exponential family and stopping criterion not available \n")
}
min.vec <- apply(X,2,min)
max.vec <- apply(X,2,max)
#computation of basis functions (centered and scaled)
B.list <- list()
B <- 1
knot.list <- list()
center.list <- list()
c.list <- list()
for(i in 1:p){
X.sc.temp <- (X[,i]-min.vec[i])/max(X[,i]-min.vec[i])
B.list[[i]] <- mono.basis(X=X.sc.temp,k=kvec[i],spline=spline,intercept=F)
B <- cbind(B,B.list[[i]]$Bzs.n)
knot.list[[i]] <- B.list[[i]]$knot.vec
center.list[[i]] <- B.list[[i]]$center.mean
c.list[[i]] <- B.list[[i]]$cnorm
}
#Fitting the model by GMonBoost
fit.object <- Genboost.additive(y=y,X=B,family=family,kvec=kvec,mvec=mvec,lambda=lambda,M=M,
early=early,greater.bound=greater.bound,print.mesg=print.mesg)
its <- switch(criterion,aicc=min(fit.object$m.aic),aic=min(fit.object$m.aic),gMDL=fit.object$m.bic,bic=fit.object$m.bic)
crit <- switch(criterion,aicc=min(fit.object$aic),aic=min(fit.object$aic),gMDL=min(fit.object$bic),bic=min(fit.object$bic))
crit.vec <- switch(criterion,aicc=fit.object$aic,aic=fit.object$aic,gMDL=fit.object$bic,bic=fit.object$bic)
fit <- fit.object$Fit[,its]
eta <- fit.object$Eta[,its]
coef <- fit.object$Coef[,its]
edf <- fit.object$edf[its]
res <- list(fit=fit, #fitted values, response scale
eta=eta, #fitted values, linear predictor
coef=coef, #Basis Coefficients
crit=crit, #optimal value of stopping criterion
crit.vec=crit.vec, #development of the
edf=edf, #estimated d.f.
its=its-1, #optimal number of iterations, initialization: l_0=0
## the following values are needed for prediciton and plotting ##
family=family, #type of exponential family
spline=spline, #type of basis functions
knot.list=knot.list, #list of knots
center.list=center.list,#list of means of basis functions
c.list=c.list, #list of the norm of basis functions
min.vec=min.vec, #minima of the covariates
max.vec=max.vec) #maxima of the covariates
}
predict.GMonBoost <- function(object, #GMonBoost object
newdata, #data where predictions have to be made
type="link" #type of prediction (response,link,terms)
){
newdata <- as.matrix(newdata)
n <- dim(newdata)[1]
p <- dim(newdata)[2]
if(p!=length(object$knot.list)){
stop("error: dimensions do not agree \n")
}
if(sum(apply(newdata,2,min)