# Density estimation and local regression
# Eric Feigelson
# Harvard-Smithsonian Center for Astrophysics tutorial, January 2014


# Regenerate bivariate dataset from first tutorial

x <- sample(seq(0.01, 3, length.out=500))
y <- 0.5*x + 0.3^(x^2) + rnorm(500, mean=0, sd=(0.05*(1+x^2)))
xy <- cbind(x, y)


# I. Kernel density estimator with visualizations

library(KernSmooth)
dpik(x) ; dpik(y) 
smxy <- bkde2D(xy, bandwidth=c(dpik(x),dpik(y)))
image(smxy$x1, sm_xy$x2, smxy$fhat, col=topo.colors(30), xlab='Xvar', ylab='Yvar', cex.lab=1.2)
contour(smxy$x1, smxy$x2, smxy$fhat, add=T)
persp(smxy$x1,smxy$x2, smxy$fhat, theta=100, phi=40, shade=0.6, col='green1', xlab='Xvar', ylab='Yvar', zlab='Density')


# II. Least-squares polynomial regression 

logx <- log10(x) ; logx2 <- (log10(x))^2 ; logx3 <- (log10(x))^3 # Set up independent variables
logx4 <- (log10(x))^4 ; logx5 <- (log10(x))^5
yfit <- lm(log10(y) ~ logx + logx2 + logx3 + logx4 + logx5) # `lm' is an important function for linear modeling, '~' indicates a formula
str(yfit)									# Note the complicated output from lm
plot(log10(x), log10(y), pch=20, col=grey(0.5),cex=0.3)	# Plot data points
lines(sort(log10(x)), yfit$fit[order(log10(x))], lwd=3)	# Add regression curve. Subtle difference between `sort' and `order' 
dev.copy2eps(file="smooth.eps")


# III. Two spline fits

# A cubic smoothing spline fit
# This function is based on code in the GAMFIT Fortran program by T. Hastie and R. Tibshirani (http://lib.stat.cmu.edu/general/), which makes use of spline code by Finbarr O'Sullivan.  [Chair of Statistics, Univ College Cork IE]

cubsplxy <- smooth.spline(log10(xy))			
plot(log10(xy), pch=20, cex=0.5, xlab='log(Xvar)', ylab='log(Yvar)')		# Plot points
plot(log10(xy), pch=20, cex=0.5, ylim=c(-0.7, 0.4), xlab='log(Xvar)', ylab='log(Yvar)', main='Two spline fits')		
lines(cubsplxy, lwd=2, col='darkgreen')		# Plot the spline fit
predict(cubsplxy, seq(-2.0,0.5, length.out=20))	# Evaluate the spline fit using R's generic `predict' function
knotx <- cubsplxy$fit$knot*cubsplxy$fit$range + cubsplxy$fit$min 	# Find and plot the spline knots
rug(knotx,col='darkgreen')

# COnstrained B-Splines Nonparametric Regression Quantiles
# Bartels, R. and Conn A. (1980) Linearly Constrained Discrete L_1 Problems, ACM Transaction on Mathematical Software 6, 594–608.
# Ng, P. (1996) An Algorithm for Quantile Smoothing Splines, Computational Statistics & Data Analysis 22, 99–118.
# He, X. and Ng, P. (1999) COBS: Qualitatively Constrained Smoothing via Linear Programming; Computational Statistics 14, 315–337.
# Ng, P. and Maechler, M. (2007) A Fast and Efficient Implementation of Qualitatively Constrained Quantile Smoothing Splines, Statistical Modelling 7(4), 315-328.

install.packages("cobs") ; library(cobs)  	
help(cobs)
cobsxy50 <- cobs(log10(x), log10(y), ic='BIC', tau=0.5)	#  Median regression fit
lines(sort(cobsxy50$x),cobsxy50$fitted[order(cobsxy50$x)], lwd=2, col=2)
cobsxy25 <- cobs(log10(x), log10(y), ic='BIC', tau=0.25)
lines(sort(cobsxy25$x),cobsxy25$fitted[order(cobsxy25$x)], lwd=1, col=2)
cobsxy75 <- cobs(log10(x), log10(y), ic='BIC', tau=0.75)
lines(sort(cobsxy75$x),cobsxy75$fitted[order(cobsxy75$x)], lwd=1, col=2)
rug(cobsxy50$knots, lwd=2, col=2)


# III. Three well-established bivariate semi-parametric local regression estimators

# LOESS,  W. S. Cleveland, `Visualizing Data', Hobart Press 1993

sortx <- x[order(x)] ; sorty <- y[order(x)]
local_fit <- loess(sorty ~ sortx, span=0.5, data.frame(x=x,y=y))	
summary(local_fit)
plot(x,y,pch=20, cex=0.5)
lines(sortx, predict(local_fit), lwd=2, col=2)
# Save evenly-spaced LOESS fit to a file 
x_seq <- seq(0.0, 3.0, by=0.03) 
loc_dat <- predict(local_fit, newdata=x_seq)
write(rbind(x_seq,loc_dat), sep=' ', ncol=2, file='localfit.txt')

# Nonlinear quantile regression  (R. Koenker`Quantile Regression', Cambridge Univ Press 2005)

install.packages('quantreg') ; library(quantreg)		
install.packages('MatrixModels') ; library(MatrixModels)
plot(sortx, sorty, pch=20, cex=0.5, xlab='Xvar', ylab='Yvar')
fit_rqss.25 <- rqss(sorty ~ qss(sortx), tau=0.25)
lines(sortx[-1],fit_rqss.25$coef[1]+fit_rqss.25$coef[-1], col='red')
fit_rqss.50 <- rqss(sorty ~ qss(sortx), tau=0.50)
lines(sortx[-1],fit_rqss.50$coef[1]+fit_rqss.50$coef[-1], lwd=2)
fit_rqss.75 <- rqss(sorty ~ qss(sortx), tau=0.75)
lines(sortx[-1],fit_rqss.75$coef[1]+fit_rqss.75$coef[-1], col='red')

# Nonparametric regression with bootstrap errors
# Hayfield, T. & Racine, J. S. Nonparametric Econometrics: The np package, J. Statist. Software, 27(5), 2008
# http://www.jstatsoft.org/v27/i05/

install.packages("np") ; library(np)	
help(npplot)
bw.NW <- npregbw(x, y, regtype='lc', bwtype='fixed') 
npplot(bws=bw.NW, ylim=c(0.0,2.5), plot.errors.method="bootstrap", 
    plot.errors.bar='I', plot.errors.type='quantiles') 
points(x, y, pch=20, cex=0.5)