#-*- S -*- 

# Chapter 11   Modern Regression

library(MASS)
postscript(file="ch11.ps", width=8, height=6, pointsize=9)


# 11.1  Additive models and scatterplot smoothers}

data(mcycle)
attach(mcycle)
par(mfrow = c(3,2))
plot(times, accel, main="Polynomial regression")
lines(times, fitted(lm(accel ~ poly(times, 3))))
lines(times, fitted(lm(accel ~ poly(times, 6))), lty=3)
legend(40, -100, c("degree=3", "degree=6"), lty=c(1,3),bty="n")
library(splines)
plot(times, accel, main="Natural splines")
lines(times, fitted(lm(accel ~ ns(times, df=5))))
lines(times, fitted(lm(accel ~ ns(times, df=10))), lty=3)
lines(times, fitted(lm(accel ~ ns(times, df=20))), lty=4)
legend(40, -100, c("df=5", "df=10", "df=20"), lty=c(1,3,4),
   bty="n")
plot(times, accel, main="Smoothing splines")
#lines(smooth.spline(times, accel))
plot(times, accel, main="Lowess")
lines(lowess(times, accel))
lines(lowess(times, accel, 0.2), lty=3)
legend(40, -100, c("default span", "f = 0.2"), lty=c(1,3),
    bty="n")
plot(times, accel, main ="ksmooth")
#lines(ksmooth(times, accel,"normal", bandwidth=5))
#lines(ksmooth(times, accel,"normal", bandwidth=2), lty=3)
legend(40, -100, c("bandwidth=5", "bandwidth=2"), lty=c(1,3),
   bty="n")
plot(times, accel, main ="supsmu")
#lines(supsmu(times, accel))
#lines(supsmu(times, accel, bass=3), lty=3)
legend(40, -100, c("default", "bass=3"), lty=c(1,3), bty="n")
detach("mcycle")

if(F) { # no gam
attach(rock)
rock.lm <- lm(log(perm) ~ area + peri + shape, data=rock)
summary(rock.lm)
rock.gam <- gam(log(perm) ~ s(area) + s(peri) + s(shape),
   control=gam.control(maxit=50, bf.maxit=50), data=rock)
summary(rock.gam)
anova(rock.lm, rock.gam)
par(mfrow=c(2,2))
plot(rock.gam, se=T)
rock.gam1 <- gam(log(perm) ~ area + peri + s(shape), data=rock)
par(mfrow=c(2,2))
plot(rock.gam1, se=T)
anova(rock.lm, rock.gam1, rock.gam)
rock.ns <- lm(log(perm) ~ area + peri + ns(shape, df=4), rock)
summary(rock.ns)
par(mfrow=c(2,2))
plot.gam(rock.ns, se=T)

if(!exists("bwt")) {
  attach(birthwt)
  race <- factor(race, labels=c("white", "black", "other"))
  ptd <- factor(ptl > 0)
  ftv <- factor(ftv); levels(ftv)[-(1:2)] <- "2+"
  bwt <- data.frame(low=factor(low), age, lwt, race,
	   smoke=(smoke>0), ptd, ht=(ht>0), ui=(ui>0), ftv)
  detach("birthwt"); rm(race, ptd, ftv)
}

attach(bwt)
age1 <- age*(ftv=="1"); age2 <- age*(ftv=="2+")
birthwt.gam <- gam(low ~ s(age) + s(lwt) + smoke + ptd +
     ht + ui + ftv + s(age1) + s(age2) + smoke:ui, binomial,
     bwt, bf.maxit=25)
summary(birthwt.gam)
table(low, predict(birthwt.gam) > 0)
par(mfrow=c(2,2))
plot(birthwt.gam, ask=T, se=T)
detach()
}

# 11.2  Projection-pursuit regression

if(F) { # no ppreg
attach(rock)
x <- cbind(area, peri, shape)
# or  model.matrix(~ -1 + area + peri + shape)
rock.ppr <- ppreg(x, log(perm), 1, 5)
SStot <- var(log(perm))*(length(perm)-1)
SStot*rock.ppr$esq
rock.ppr$allalpha[1,,1] # first alpha for first fit
rock.ppr$allalpha[1,,1]*sqrt(diag(var(x)))
plot(rock.ppr$z, rock.ppr$zhat, type="l")
rock.gam2 <- gam(log(perm) ~ shape+lo(area, peri, span=0.2))
summary(rock.gam2)
anova(rock.lm, rock.gam2)
par(mfrow=c(1,1))
plot(rock.gam2, eye=c(-10000, 50000, 20), pty="s")
rock.ppr2 <- ppreg(x[, 1:2], log(perm), 1, 3)
SStot*rock.ppr2$esq
rock.ppr2$allalpha[1,,1]*sqrt(diag(var(x[, 1:2])))
rock.ppr2$allalpha[1:2,,2]*sqrt(diag(var(x[, 1:2])))
area1 <- area/10000; peri1 <- peri/10000
rock.gam3 <- gam(log(perm) ~ lo(area1, peri1) + 
  lo(area1, shape) + lo(peri1, shape), 
  control=gam.control(maxit=50, bf.maxit=50))
summary(rock.gam3)
anova(rock.lm, rock.gam3)
par(mfrow=c(1,3))
plot(rock.gam3, pty="s")
}

# 11.3  Response transformation models

library(acepack)
data(mammals)
attach(mammals)
a <- ace(body, brain)
o1<- order(body); o2 <- order(brain)
par(mfrow=c(2,2))
plot(body, brain, main="Original Data", log="xy")
plot(body[o1], a$tx[o1], main="Transformation of body wt", 
   type="l", log="x")
plot(brain[o2], a$ty[o2], main="Transformation of brain wt", 
   type="l", log="x")
plot(a$tx, a$ty, main="Transformed y vs  x")

a <- avas(body, brain)
par(mfrow=c(2,2))
plot(body, brain, main="Original Data", log="xy")
plot(body[o1], a$tx[o1], main="Transformation of body wt", 
   type="l", log="x")
plot(brain[o2], a$ty[o2], main="Transformation of brain wt", 
   type="l", log="x")
plot(a$tx, a$ty, main="Transformed y vs x")
detach()

data(rock)
attach(rock)
x <- cbind(area, peri, shape)
o1 <- order(area); o2 <- order(peri); o3 <- order(shape)
a <- avas(x, perm)
par(mfrow=c(2,2))
plot(area[o1], a$tx[o1,1], type="l")
rug(area)
plot(peri[o2], a$tx[o2,2], type="l")
rug(peri)
plot(shape[o3], a$tx[o3,3], type="l")
rug(shape)
plot(perm, a$ty, log="x")

a <- avas(x, log(perm))
plot(area[o1], a$tx[o1,1], type="l")
rug(area)
plot(peri[o2], a$tx[o2,2], type="l")
rug(peri)
plot(shape[o3], a$tx[o3,3], type="l")
rug(shape)
plot(perm, a$ty, log="x")

a <- avas(x, log(perm), lin=0)
plot(area[o1], a$tx[o1,1], type="l")
rug(area)
plot(peri[o2], a$tx[o2,2], type="l")
rug(peri)
plot(shape[o3], a$tx[o3,3], type="l")
rug(shape)
plot(perm, a$ty, log="x")
detach()


# 11.4  Neural networks

library(nnet)
attach(rock)
area1 <- area/10000; peri1 <- peri/10000
rock.x <- cbind(area1, peri1, shape)
#set.seed(5555)
# The answer depends on the seed and also on the machine
rock.nn <- nnet(rock.x, log(perm), size=3, decay=1e-3,
      linout=T,skip=T,  maxit=1000)
summary(rock.nn)
sum((log(perm) - predict(rock.nn, rock.x))^2)
eigen(nnet.Hess(rock.nn, rock.x, log(perm)), T)$values

# From second printing:
rock1 <- data.frame(perm, area=area1, peri=peri1, shape)
detach()
rock.nn1 <- nnet(log(perm) ~ area + peri + shape, data=rock1,
              size=3, decay=1e-3, linout=T, skip=T, maxit=1000)
rock.nn1


# =========================================

# Chapter 11 Complements


# 11.1 Additive models

library(KernSmooth)

attach(mcycle)
plot(times, accel)
lines(locpoly(times, accel, bandwidth=dpill(times,accel)))
lines(locpoly(times, accel, bandwidth=dpill(times,accel), 
              degree=2), lty=3)
detach()
detach("package:KernSmooth")

library(locfit)
fit <- locfit(accel ~ times, alpha = 0.3, data=mcycle)
plot(fit, se.fit=T, get.data=T)
fit2 <- locfit(accel ~ times, ev="data", alpha=0.2, data=mcycle)
y <- resid(fit2)
fit3 <- locfit(log(y^2) ~ times, deg=1, alpha=1, ev="data",
               data=mcycle)
va <- pmax(exp(fitted(fit3)), 20)
fit <- locfit(accel ~ times, alpha=c(0,0,2), weights=1/va,
              ev="grid", mg=200, data=mcycle)
plot(fit, se.fit=T, get.data=T)
detach("package:locfit")

q() #==================
#set.seed(123)
cpus0 <- cpus[, 2:8]  # excludes names, authors' predictions
for(i in 1:3) cpus0[,i] <- log10(cpus0[,i])
samp <- sample(1:209, 100)
cpus.lm <- lm(log10(perf) ~ ., data=cpus0[samp,])
test <- function(fit)
  sqrt(sum((log10(cpus0[-samp, "perf"]) - 
	    predict(fit, cpus0[-samp,]))^2)/109)
test(cpus.lm)

cpus.lm2 <- step(cpus.lm, trace=F)
cpus.lm2$anova
test(cpus.lm2)

Xin <- as.matrix(cpus0[samp,1:6])
library(mda)
test2 <- function(fit) {
  Xp <- as.matrix(cpus0[-samp,1:6])
  sqrt(sum((log10(cpus0[-samp, "perf"]) - 
	    predict(fit, Xp))^2)/109)
}
cpus.bruto <- bruto(Xin, log10(cpus0[samp,7]))
test2(cpus.bruto)

cpus.bruto$type
cpus.bruto$df

# examine the fitted functions
par(mfrow=c(3,2))
Xp <- matrix(sapply(cpus0[samp, 1:6], mean), 100, 6, byrow=T)
for(i in 1:6) {
  xr <- sapply(cpus0, range)
  Xp1 <- Xp; Xp1[,i] <- seq(xr[1,i], xr[2,i], len=100)
  Xf <- predict(cpus.bruto, Xp1)
  plot(Xp1[ ,i], Xf, xlab=names(cpus0)[i], ylab="", type="l")
}


cpus.mars <- mars(Xin, log10(cpus0[samp,7]))
showcuts <- function(obj)
{
  tmp <- obj$cuts[obj$sel, ]
  dimnames(tmp) <- list(NULL, dimnames(Xin)[[2]])
  tmp
}
showcuts(cpus.mars)
test2(cpus.mars)

# examine the fitted functions
Xp <- matrix(sapply(cpus0[samp, 1:6], mean), 100, 6, byrow=T)
for(i in 1:6) {
  xr <- sapply(cpus0, range)
  Xp1 <- Xp; Xp1[,i] <- seq(xr[1,i], xr[2,i], len=100)
  Xf <- predict(cpus.mars, Xp1)
  plot(Xp1[ ,i], Xf, xlab=names(cpus0)[i], ylab="", type="l")
}
par(mfrow=c(1,1))

cpus.mars2 <- mars(Xin, log10(cpus0[samp,7]), degree=2)
showcuts(cpus.mars2)
test2(cpus.mars2)

cpus.mars6 <- mars(Xin, log10(cpus0[samp,7]), degree=6)
showcuts(cpus.mars6)
test2(cpus.mars6)

cpus.gam <- gam(log10(perf) ~ ., data=cpus0[samp, ])
cpus.gam2 <- step.gam(cpus.gam, scope=list(
  "syct"  = ~ 1 + syct + s(syct, 2) + s(syct),
  "mmin"  = ~ 1 + mmin + s(mmin, 2) + s(mmin),
  "mmax"  = ~ 1 + mmax + s(mmax, 2) + s(mmax),
  "cach"  = ~ 1 + cach + s(cach, 2) + s(cach),
  "chmin" = ~ 1 + chmin + s(chmin, 2) + s(chmin),
  "chmax" = ~ 1 + chmax + s(chmax, 2) + s(chmax)
))
print(cpus.gam2$anova, digits=3)
test(cpus.gam2)

library(treefix, first=T)
cpus.ltr <- tree(log10(perf) ~ ., data=cpus0[samp,])
plot(cv.tree(cpus.ltr,, prune.tree))
cpus.ltr1 <- prune.tree(cpus.ltr, best=10)
test(cpus.ltr1)

library(sm)
attach(birthwt)
sm.logit(age, low, h=5, display="se")
detach()

library(locfit, first=T)
bwt.lf <- locfit(low ~ age+lwt, data=birthwt, family="binomial", 
                 deg=1, scale=0, alpha=c(0,0,2))
plot(bwt.lf, get.data=T)

pima.lf <- locfit(I(type=="Yes") ~ glu + bmi, data=Pima.tr,
                  family="binomial", scale=0, alpha=c(0,0,2))
par(mfrow=c(1,2), pty="s")
plot(pima.lf, get.data=T); plot(pima.lf, type="persp")
par(mfrow=c(1,1), pty="m")


# 11.2 Projection-pursuit regression

attach(rock)
rock1 <- data.frame(area=area/10000, peri=peri/10000,
		    shape=shape, perm=perm)
detach()
library(ppr)
rock.ppr <- ppr(log(perm) ~ area + peri + shape, data=rock1,
                nterms=2, max.terms=5)
rock.ppr
summary(rock.ppr)


par(mfrow=c(3,2))
plot(rock.ppr)
plot(update(rock.ppr, bass=5))
plot(update(rock.ppr, sm.method="gcv", gcvpen=2))
par(mfrow=c(1,1))

rock.ppr2 <- update(rock.ppr, sm.method="gcv", gcvpen=2)
summary(rock.ppr2)

summary(rock1) # to find the ranges of the variables
Xp <- expand.grid(area=seq(0.1,1.2,0.05),
                  peri=seq(0,0.5,0.02), shape=0.2)
rock.grid <- cbind(Xp,fit=predict(rock.ppr2, Xp))
wireframe(fit ~ area+peri, rock.grid, screen=list(z=160,x=-60),
          aspect=c(1,0.5), drape=T)

cpus.ppr <- ppr(log10(perf) ~ ., data=cpus0[samp,],
                nterms=2, max.terms=10, bass=5)
cpus.ppr

cpus.ppr <- ppr(log10(perf) ~ ., data=cpus0[samp,],
		nterms=7, max.terms=10, bass=5)
test(cpus.ppr)
ppr(log10(perf) ~ ., data=cpus0[samp,],
    nterms=2, max.terms=10, sm.method="spline")
cpus.ppr2 <- ppr(log10(perf) ~ ., data=cpus0[samp,],
    nterms=5, max.terms=10, sm.method="spline")
test(cpus.ppr2)
cpus.ppr3 <- ppr(log10(perf) ~ ., data=cpus0[samp,],
    nterms=3, max.terms=10, sm.method="spline")
test(cpus.ppr3)


res1 <- log10(cpus0[-samp, "perf"]) - 
              predict(cpus.lm, cpus0[-samp,])
res2 <- log10(cpus0[-samp, "perf"]) - 
              predict(cpus.ppr2, cpus0[-samp,])
wilcox.test(res1^2, res2^2, paired=T, alternative="greater")


# 11.4 Neural networks

library(nnet)
attach(rock)
rock1 <- data.frame(perm, area=area1, peri=peri1, shape)
rock.nn1 <- nnet(log(perm) ~ area + peri + shape, data=rock1,
              size=3, decay=1e-3, linout=T, skip=T, maxit=1000)
summary(rock.nn1)
sum((log(perm) - predict(rock.nn1))^2)

Xp <- expand.grid(area=seq(0.1,1.2,0.05),
                  peri=seq(0,0.5,0.02), shape=0.2)
rock.grid <- cbind(Xp,fit=predict(rock.nn1, Xp))
wireframe(fit ~ area + peri, rock.grid, screen=list(z=160,x=-60),
          aspect=c(1,0.5), drape=T)

rock.nn$nconn
rock.nn$conn
summary(rock.nn)
detach()

attach(cpus0)
cpus1 <- data.frame(syct=syct-2, mmin=mmin-3, mmax=mmax-4, 
cach=cach/256, chmin=chmin/100, chmax=chmax/100, perf=perf)
detach()

test <- function(fit)
  sqrt(sum((log10(cpus1[-samp, "perf"]) - 
           predict(fit, cpus1[-samp,]))^2)/109)
cpus.nn1 <- nnet(log10(perf) ~ ., data=cpus1[samp,], linout=T,
                 skip=T, size=0)
test(cpus.nn1)

cpus.nn2 <- nnet(log10(perf) ~ ., data=cpus1[samp,], linout=T,
                 skip=T, size=4, decay=0.01, maxit=1000)
test(cpus.nn2)

cpus.nn3 <- nnet(log10(perf) ~ ., data=cpus1[samp,], linout=T,
                 skip=T, size=10, decay=0.01, maxit=1000)
test(cpus.nn3)

cpus.nn4 <- nnet(log10(perf) ~ ., data=cpus1[samp,], linout=T,
                 skip=T, size=25, decay=0.01, maxit=1000)
test(cpus.nn4)

CVnn.cpus <- function(formula, data=cpus1[samp, ], 
     size = c(0, 4, 4, 10, 10),
     lambda = c(0, rep(c(0.003, 0.01), 2)),
     nreps = 5, nifold = 10, ...)
{
  CVnn1 <- function(formula, data, nreps=1, ri,  ...)
  {
    truth <- log10(data$perf)
    res <- numeric(length(truth))
    cat("  fold")
    for (i in sort(unique(ri))) {
      cat(" ", i,  sep="")
      for(rep in 1:nreps) {
        learn <- nnet(formula, data[ri !=i,], trace=F, ...)
        res[ri == i] <- res[ri == i] +
                        predict(learn, data[ri == i,])
      }
    }
    cat("\n")
    sum((truth - res/nreps)^2)
  }
  choice <- numeric(length(lambda))
  ri <- sample(nifold, nrow(data), replace=T)
  for(j in seq(along=lambda)) {
    cat("  size =", size[j], "decay =", lambda[j], "\n")
    choice[j] <- CVnn1(formula, data, nreps=nreps, ri=ri,
                       size=size[j], decay=lambda[j], ...)
    }
  cbind(size=size, decay=lambda, fit=sqrt(choice/100))
}
CVnn.cpus(log10(perf) ~ ., data=cpus1[samp,],
          linout=T, skip=T, maxit=1000)

testnn <- function(nreps=1, ...)
{
  res <- numeric(109)
  cat("  rep")
  for (i in 1:nreps) {
    cat(" ", i,  sep="")
    fit <- nnet(log10(perf) ~ ., data=cpus1[samp,],
                trace=F, linout=T, ...)
    res <- res + predict(fit, cpus1[-samp,])
  }
  cat("\n")
  sqrt(sum((log10(cpus1[-samp, "perf"]) - res/nreps)^2)/109)
}
testnn(nreps=5, skip=T, maxit=1000, size=10, decay=0.01)

# End of ch11
