# Regression # (based on Chpt 7, Modern Statistical Methods for Astronomy) # Eric Feigelson # Harvard-Smithsonian Center for Astrophysics tutorial, January 2014 # I. Construct large and small samples of 77,429 SDSS quasars setwd('/Users/e5f/Desktop/CASt/consulting/ qso <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/SDSS_QSO.dat",head=T) dim(qso) ; names(qso) ; summary(qso) ; attach(qso) sig_u_mag[sig_u_mag<0.02] <- 0.02 # set threshold on magnitude errors # Plot dataset of SDSS quasar i vs. u magnitudes showing heteroscedastic measurement errors plot(i_mag, u_mag, pch=20, cex=0.1, col='grey60', xlim=c(16,21), ylim=c(16.5,23.5), xlab="SDSS i (mag)", ylab="SDSS u (mag)") for(i in 50:150) { lines(c(i_mag[i],i_mag[i]),c((u_mag[i]+sig_u_mag[i]), (u_mag[i]-sig_u_mag[i])), col='deepskyblue') lines(c((i_mag[i]+sig_i_mag[i]),(i_mag[i]-sig_i_mag[i])), c(u_mag[i],u_mag[i]), col='deepskyblue') } smqso <- bkde2D(cbind(i_mag, u_mag), bandwidth=c(0.05, 0.05), gridsize=c(400,400)) contour(smqso$x1, smqso$x2, smqso$fhat, add=T, col='gold', nlevels=6) # II. Ordinary least squares fit fit_ols <- lm(u_mag~i_mag) summary(fit_ols) confint(fit_ols, level=0.997) abline(fit_ols$coef, lty=1, lwd=2) dev.new(3) ; par(mfrow=c(2,2)) # Diagnostic plots useful for identifying outliers plot(fit_ols, which=c(2:5), caption='', sub.caption='' ,pch=20, cex=0.3, cex.lab=1.3, cex.axis=1.3) par(mfrow=c(1,1)) ; dev.set(2) # III. Weighted least squares fit fit_wt <- lm(u_mag~i_mag, x=T, weights=1/(sig_u_mag*sig_u_mag)) summary(fit_wt) abline(fit_wt$coef,lty=2,lwd=2, col='darkgreen') # IV. Robust M-estimator library(MASS) fit_M <- rlm(u_mag~i_mag, method='M') # robust fit with Huber's psi functon summary(fit_M) aM <- fit_M$coef[[1]] ; bM <- fit_M$coef[[2]] lines(c(16,21), c(aM+bM*16, aM+bM*21), lty=3, lwd=3, col='royalblue3') fit_Mwt <- rlm(u_mag~i_mag, method='M', weights=1/(sig_u_mag*sig_u_mag), wt.method='inv.var') # robust fit with measurement error weighting summary(fit_Mwt) aMwt <- fit_Mwt$coef[[1]] ; bMwt <- fit_Mwt$coef[[2]] lines(c(16,21), c(aMwt+bMwt*16, aMwt+bMwt*21), lty=3, lwd=3, col='darkblue') text(19.5, 17, 'u = 0.13 + 1.02*i', cex=1.3, col='darkblue') dev.new() ; hist(fit_Mwt$w, breaks=50, ylim=c(0,1000)) # show down-weighted data points dev.set(2) # manipulate graphics windows dev.copy2eps(file='SDSS_u_vs_i.eps') # V. Fit Sersic function to NGC 4472 elliptical galaxy surface brightness profile NGC4472 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/NGC4472_profile.dat", header=T) attach(NGC4472) NGC4472.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)* ((radius/r.e)^{1/n}-1))) + 26, data=NGC4472, start=list(I.e=20., r.e=120.,n=4.), model=T, trace=T) summary(NGC4472.fit) deviance(NGC4472.fit) logLik(NGC4472.fit) # Plot NGC 4472 data and best-fit model plot(NGC4472.fit$model$radius, NGC4472.fit$model$surf_mag, pch=20, xlab="r (arcsec)", ylab=expression(mu ~~ (mag/sq.arcsec)), ylim=c(16,28), cex.lab=1.5, cex.axis=1.5) lines(NGC4472.fit$model$radius, fitted(NGC4472.fit)) # Fit and plot radial profiles of NGC 4406 and NGC 4451 NGC4406 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/NGC4406_profile.dat", header=T) attach(NGC4406) NGC4406.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)* ((radius/r.e)^{1/n}-1))) + 32, data=NGC4406, start=list(I.e=20., r.e=120.,n=4.), model=T, trace=T) summary(NGC4406.fit) points(NGC4406.fit$model$radius, NGC4406.fit$model$surf_mag, pch=3) lines(NGC4406.fit$model$radius, fitted(NGC4406.fit)) NGC4551 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/NGC4551_profile.dat", header=T) attach(NGC4551) NGC4551.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)* ((radius/r.e)^{1/n}-1))) + 26, data=NGC4551, start=list(I.e=20.,r.e=15.,n=4.), model=T, trace=T) summary(NGC4551.fit) points(NGC4551.fit$model$radius, NGC4551.fit$model$surf_mag, pch=5) lines(NGC4551.fit$model$radius, fitted(NGC4551.fit)) legend(500, 20, c("NGC 4472","NGC 4406", "NGC 4551"), pch=c(20,3,5)) # NGC 4472 analysis # Residual plot plot(NGC4472.fit$model$radius,residuals(NGC4472.fit), xlab="r (arcsec)", ylab="Residuals", pch=20, cex.lab=1.5, cex.axis=1.5) lines(supsmu(NGC4472.fit$model$radius, residuals(NGC4472.fit), span=0.05), lwd=2) # Test for normality of residuals qqnorm(residuals(NGC4472.fit) / summary(NGC4472.fit)$sigma) abline(a=0,b=1) shapiro.test(residuals(NGC4472.fit) / summary(NGC4472.fit)$sigma) # Bootstrap parameter estimates install.packages('nlstools') ; library(nlstools) NGC4472.boot <- nlsBoot(NGC4472.fit) summary(NGC4472.boot) hist(NGC4472.boot$coefboot[,3], breaks=50, xlab='Sersic n', main='Error analysis for Sersic fit of NGC 4472') curve(dnorm(x,m=5.95, sd=0.10)*58/5.95, xlim=c(5.6,6.4), ylim=c(0,50), add=T)