"gpls"<- function(y, x, m.binomial = NULL, components = "max", r.gamma = NULL, family = "gaussian", link = "default", max.threshold = 0.99, x.predict = NULL, y.predict = NULL) { # Function gpls: partial least squares algorithm for # generalized linear models. # Input: y= response variable (exponential family). # Input: x= matrix of explantory variables, often spectra with p>n. # Input: components= integer no. of latent variables, e.g. components=4. # Default is max. # Input: family='gaussian', 'binomial', 'poisson', 'Gamma' distribution. # family must have quotes. # Input: m.binomial=vector of binomial trials. Default is 1 vector. # Input: r.gamma=vector of gamma shape parameters. Default is 1 vector. # Input: link= link function ('identity', 'log', 'sqrt', 'logit', 'probit', 'cloglog', # 'loglog', recipical). # link must have quotes. # Input: max.threshold= percent of sum of singular values used in 'max'. # Default=.99. # Input: x.predict, y.predict= used for se(pred) of indep CV dataset. # Result: a plot of gpls spectral beta's, indexed 1:p. # Output: A list: including, latent variable coefs, spectral coefs, # glm.summary, loading weight matrix (w), glm diag v, # components, deviance, % correct classification (binomial). # # Reference: Marx, B.D. (1996). Iteratively reweighted partial least squares estimation for # generalized linear regression. Technometrics 38(4): 374-381. # # (c) 1995 Brian D. Marx # Version 1.1 # n <- length(y) one <- rep(1, n) x <- as.matrix(x) p <- ncol(x) if(components == "max") { xscale <- scale(x) xxt <- xscale %*% t(xscale) partial.sum <- ii <- 0 xxt.sv <- svd(xxt)$d^2 tot.sv <- sum(xxt.sv) while(partial.sum < max.threshold) { partial.sum <- sum(xxt.sv[1:(ii + 1)])/tot.sv ii <- ii + 1 } components <- min(n - 1, p - 1, ii + 1) xxt <- NULL } eta.new <- rep(0, n) if(link == "default" && family == "gaussian") { link <- "identity" } if(link == "default" && family == "poisson") { link <- "log" } if(link == "default" && family == "binomial") { link <- "logit" } if(link == "default" && family == "Gamma") { link <- "log" } if(missing(m.binomial)) { m.binomial <- as.vector(rep(1, n)) } if(missing(r.gamma)) { r.gamma <- rep(1, length(y)) } if(family != "binomial" && family != "gaussian" && family != "poisson" && family != "Gamma") { warning(paste("Improper FAMILY option. Choose: gaussian, poisson, binomial or Gamma; use quotes" )) } if((family == "binomial") && (link != "logit" && link != "probit" && link != "cloglog" && link != "loglog")) { warning(paste("Improper LINK option with family=binomial. Choose: logit, probit, loglog, cloglog; use quotes" )) } if((family == "Gamma") && (link != "log" && link != "recipical" && link != "identity")) { warning(paste("Improper LINK option with family=Gamma. Choose: recipical, log, identity; use quotes" )) } if((family == "poisson") && (link != "log" && link != "sqrt" && link != "identity")) { warning(paste("Improper LINK option with family=poisson. Choose: log, sqrt, identity; use quotes" )) } if((family == "gaussian") && (link != "identity")) { warning(paste("Improper LINK option with family=gaussian. Choose: identity; use quotes" )) } q <- it <- wcheck <- 0 w1 <- as.vector(rep(0, p)) q.new <- 1 repeat { it <- it + 1 if(it > 25) break d.q <- max(abs((q.new - q)/q.new)) if(d.q < 0.001) break print(c(it, d.q, wcheck)) q <- q.new w1p <- w1 eta <- eta.new if(link == "identity") { mu <- m.binomial * eta h.prime <- m.binomial * 1 } if(link == "log") { mu <- m.binomial * exp(eta) h.prime <- m.binomial * mu } if(link == "sqrt") { mu <- eta^2 h.prime <- 2 * eta } if(link == "logit") { mu <- m.binomial/(1 + exp( - eta)) h.prime <- mu * (1 - mu/m.binomial) } if(link == "recipical") { mu <- 1/eta h.prime <- - (mu^2) } if(link == "probit") { mu <- m.binomial * pnorm(eta) h.prime <- m.binomial * dnorm(eta) } if(link == "cloglog") { mu <- m.binomial * (1 - exp( - exp(eta))) h.prime <- (m.binomial) * exp(eta) * exp( - exp(eta)) } if(link == "loglog") { mu <- m.binomial * exp( - exp( - eta)) h.prime <- m.binomial * exp( - eta) * exp( - exp( - eta )) } if(family == "gaussian") { v <- rep(1, n) } if(family == "poisson") { v <- h.prime^2/mu } if(family == "binomial") { v <- h.prime^2/(mu * (1 - mu/m.binomial)) } if(family == "Gamma") { v <- (r.gamma * h.prime^2)/mu^2 } f0 <- (y - mu)/h.prime + eta # xs <- (1/sqrt(n - 1) # ) * scale(x, # center = # apply(x, 2, # weighted.mean, # w = v)) xs <- scale(x, center = apply(x, 2, weighted.mean, w = v), scale = F) xcenter <- x - xs ccenter <- xcenter[1, ] xs <- (1/sqrt(n - 1)) * scale(xs, center = F) sscale <- (x[1, ] - ccenter)/xs[1, ] e0 <- as.matrix(xs) f0 <- as.matrix(f0) int.adj <- apply(f0, 2, weighted.mean, w = v) ee <- e0 f <- f0 ww <- matrix(0, p, components) tt <- matrix(0, n, components) qq <- rep(0, components) pp <- matrix(0, components, p) for(i in 1:components) { ww.raw <- as.vector(t(ee) %*% (v * f)) ww[, i] <- ww.raw/sqrt(t(ww.raw) %*% ww.raw) tt[, i] <- as.matrix(ee %*% ww[, i]) tt[, i] <- (1/sqrt(n - 1)) * scale(tt[, i], center = apply(as.matrix(tt[, i]), 2, weighted.mean, w = v)) fit.t.f <- lsfit(tt[, i], f, wt = v, intercept = F) fit.t.ee <- lsfit(tt[, i], ee, wt = v, intercept = F) qq[i] <- fit.t.f$coef pp[i, ] <- fit.t.ee$coef ee <- fit.t.ee$resid f <- fit.t.f$resid } w1 <- ww[, 1] q.new <- as.vector(c(int.adj, qq)) eta.new <- as.matrix(cbind(one, tt)) %*% q.new wcheck <- max(abs((w1p - w1)/w1)) } beta <- ww %*% solve(pp %*% ww) %*% as.vector(qq) bin.percent.correct <- NULL if(family == "binomial") { pcount <- 0 p.hat <- 1/(1 + exp( - eta)) for(ii in 1:n) { if(p.hat[ii] > 0.5) { count <- y[ii] } if(p.hat[ii] <= 0.5) { count <- m.binomial[ii] - y[ii] } count <- pcount + count pcount <- count } bin.percent.correct <- count/sum(m.binomial) } if(family == "binomial" && link == "logit") { glm1 <- glm((y/m.binomial) ~ tt, family = binomial, weights = m.binomial) } if(family == "binomial" && link == "probit") { glm1 <- glm((y/m.binomial) ~ tt, family = binomial(link = probit), weights = m.binomial) } if(family == "binomial" && link == "cloglog") { glm1 <- glm((y/m.binomial) ~ tt, family = binomial(link = cloglog), weights = m.binomial) } if(family == "binomial" && link == "loglog") { glm1 <- glm((y/m.binomial) ~ tt, family = binomial(link = loglog), weights = m.binomial) } if(family == "poisson" && link == "log") { glm1 <- glm(y ~ tt, family = poisson) } if(family == "poisson" && link == "sqrt") { glm1 <- glm(y ~ tt, family = poisson(link = sqrt)) } if(family == "Gamma" && link == "log") { glm1 <- glm(y ~ tt, family = Gamma(link = log)) } if(family == "Gamma" && link == "recipical") { glm1 <- glm(y ~ tt, family = Gamma(link = recipical)) } if(family == "Gamma" && link == "identity") { glm1 <- glm(y ~ tt, family = Gamma(link = identity)) } if(family == "gaussian" && link == "identity") { glm1 <- glm(y ~ tt, family = gaussian) } unscaled.beta <- beta/sscale percent.pred <- eta.pred <- phat.pred <- sep.pred <- NULL if(!missing(x.predict)) { u.x.predict <- t((t(x.predict) - ccenter)/sscale) eta.pred <- u.x.predict %*% beta + int.adj if(!missing(y.predict)) { sep.pred <- sqrt(sum((y.predict - eta.pred)^2)/(length( y.predict))) if(link == "logit") { sep.pred <- NULL phat.pred <- exp(eta.pred)/(1 + exp(eta.pred)) ppcount <- 0 for(ii in 1:length(y.predict)) { if(phat.pred[ii] > 0.5) { count <- y.predict[ii] } if(phat.pred[ii] <= 0.5) { count <- 1 - y.predict[ii] } count <- ppcount + count ppcount <- count } percent.pred <- count/length(y.predict) } } } tsplot(unscaled.beta, xlab = "coefficient index", ylab = "unscaled beta") glist <- list() glist$percent.pred <- percent.pred glist$sep.pred <- sep.pred glist$phat.pred <- phat.pred glist$eta.pred <- eta.pred glist$scale.vec <- sscale glist$center.vec <- ccenter glist$intercept.cs <- int.adj glist$w.loadings <- ww glist$spectra.coef <- beta glist$spectra.coef.unscaled <- unscaled.beta glist$v.diag <- v glist$glm.details <- glm1 glist$lv.coef <- q glist$deviance <- deviance(glm1) glist$bin.percent.correct <- bin.percent.correct glist$components <- components glist$mu <- mu/m.binomial glist }