# Multivariate analysis, clustering and classification # (based on Chpt 12, Modern Statistical Methods for Astronomy) # Eric Feigelson # Harvard-Smithsonian Center for Astrophysics tutorial, January 2014 # Read in galaxy redshift dataset shap <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/Shapley_galaxy.dat', header=T, fill=T) attach(shap) ; dim(shap) ; summary(shap) shap.hi <- shap[(R.A. < 205) & (R.A. > 200) & (Dec. > -34) & (Dec. < -29) ,] # High density region shap.lo <- shap[(R.A. < 214) & (R.A. > 209) & (Dec. > -34) & (Dec. < -27) ,] # Low density region shap.clus <- shap[(R.A. <204.5) & (R.A. > 200.4) & (Dec. > -32.5) & (Dec. < -31.0) & (Vel > 11000) & (Vel < 18000),] # Shapley Concentration # Plot in 3-dimensions using rgl, plot and scatterplot3d install.packages('rgl') ; library(rgl) # rgl is an interactive 3D graphical system based on OpenGL rgl.open() rgl.points(scale(shap[,1]), scale(shap[,2]), scale(shap[,4])) rgl.bbox() rgl.snapshot('Shapley.png') rgl.close() plot(shap.lo[,1], shap.lo[,2], cex=(scale(shap.lo[,4])+1.5)/2, xlab='Right Ascension (degrees)', ylab='Declination (degrees)') install.packages('scatterplot3d') ; library(scatterplot3d) scatterplot3d(shap.hi[,c(1,2,4)], pch=20, cex.symbols=0.7, type='p', angl=40, zlim=c(0,50000)) # Preparation of nearest neighbor lists for spdep analysis install.packages('spdep') ; library(spdep) shap.lo.mat <- as.matrix(shap.lo[,1:2]) nn.shap.lo <- knearneigh(shap.lo.mat, k=1) ; str(nn.shap.lo) nb.shap.lo <- knn2nb(nn.shap.lo) ; plot.nb(nb.shap.lo, shap.lo[,1:2]) nb.wt.shap.lo <- nb2listw(nb.shap.lo,style='B') ; summary(nb.wt.shap.lo) # Application of Moran's I and Geary's C statistics moran(shap.lo.mat[,1], nb.wt.shap.lo, n=length(nb.shap.lo), S0=Szero(nb.wt.shap.lo)) moran.test(shap.lo.mat[,1], nb.wt.shap.lo) moran.mc(shap.lo.mat[,1], nb.wt.shap.lo, nsim=10000) geary(shap.lo.mat[,1], nb.wt.shap.lo, n=length(nb.shap.lo), n1=length(nb.shap.lo)-1, S0=Szero(nb.wt.shap.lo)) geary.test(shap.lo.mat[,1], nb.wt.shap.lo) # Variogram analysis: gstat install.packages('gstat') ; library(gstat) shap.variog <- variogram(Vel~1, locations=~R.A.+Dec., data=shap) variog.mod1 <- vgm(7e+07, "Gau", 3.0,2e+07) variog.fit <- fit.variogram(shap.variog,variog.mod1) ; variog.fit plot(shap.variog,model <- variog.fit, col='black', pch=20, xlab='Distance (degree)', ylab="Semivariance (km/s*km/s)", lwd=2) # Preparation for spatstat analysis install.packages('spatstat') ; library(spatstat) shap.lo.win <- owin(range(shap.lo[,1]), range(shap.lo[,2])) centroid.owin(shap.lo.win) ; area.owin(shap.lo.win) shap.lo.ppp <- as.ppp(shap.lo[,c(1,2,4)], shap.lo.win) # planar point pattern summary(shap.lo.ppp) plot(density(shap.lo.ppp,0.3), col=topo.colors(20), main='', xlab='R.A.', ylab='Dec.') plot(shap.lo.ppp, lwd=2, add=T) # K function for the Shapley low density region shap.lo.K <- Kest(shap.lo.ppp, correction='isotropic') shap.lo.K.bias <- Kest(shap.lo.ppp, correction='none') plot.fv(shap.lo.K, lwd=2, col='black', main='', xlab='r (degrees)', legend=F) plot.fv(shap.lo.K.bias, add=T, lty=3, lwd=2, col='black', legend=F) # Draw envelope of 100 simulations of CSR process shap.lo.K.env <- envelope(shap.lo.ppp, fun=Kest, nsim=100, global=T) xx <- c(0, shap.lo.K.env$r, rev(shap.lo.K.env$r), 0) yy <- c(c(0, shap.lo.K.env$lo), rev(c(0,shap.lo.K.env$hi))) polygon(xx, yy, col='gray') plot.fv(shap.lo.K, lwd=2, col='black', main='', add=T, legend=F) plot.fv(shap.lo.K.bias, add=T, lty=3, lwd=2, col='black', legend=F) # Baddeley J function for the Shapley low density region plot(Jest(shap.lo.ppp), lwd=2, col='black', cex.lab=1.3, cex.axis=1.3, main='', xlab='r (degrees)', legend=F) # Two-point correlation function shap.lo.pcf <- pcf(shap.lo.ppp, stoyan=0.05) plot(shap.lo.pcf, xlim=c(0.0,0.2)) plot(log10(shap.lo.pcf$r[2:512]), log10(shap.lo.pcf$trans[2:512]), type='l', lwd=2, xlab='log r (degrees)', ylab='log pair correlation fn') lines(c(-1,0), c(0.78+0.48,0.48), lwd=2, lty=2) lines(c(-2,0), c(0,0), lwd=2, lty=3) # Compute Dirichlet (Voronoi) tessellation shap.lo.dir <- dirichlet(shap.lo.ppp) summary(shap.lo.dir) ; plot(shap.lo.dir, main='') shap.lo.tile <- tiles(shap.lo.dir) ; str(shap.lo.tile) shap.lo.area <- list(lapply(shap.lo.tile, area.owin)) ; str(shap.lo.area) # Select small area tiles as clusters hist(as.numeric(shap.lo.area[[1]]), breaks=30) shap.lo.clus <- cut(as.numeric(shap.lo.area[[1]]), breaks=c(0,0.06,1)) plot(shap.lo.dir, main='') points(shap.lo.ppp, pch=20, cex=0.5) points(shap.lo.ppp[shap.lo.clus=='(0,0.06]'], pch=1,lwd=2) # Ordinary kriging using the geoR package to map mean galaxy velocity across sky # Variogram analysis: geoR install.packages('geoR') ; library(geoR) shap1 <- shap[-c(which(duplicated(shap[,1:2]))),] shap.geo <- as.geodata(shap1,coords.col=1:2, data.col=4) points.geodata(shap.geo,cex.min=0.2, cex.max=1.0, pt.div='quart', col='gray') plot.geodata(shap.geo, breaks=30) shap.vario <- variog(shap.geo, uvec=seq(0, 10, by=0.2)) plot(shap.vario, lwd=2, lty=1) shap.variofit <- variofit(shap.vario, cov.model='gaussian') lines(shap.variofit, lty=2) shap.grid <- pred_grid(c(193,217), c(-38,-29), by=0.3) KC <- krige.control(obj.model=shap.variofit) shap.okrig <- krige.conv(shap.geo,loc=shap.grid, krig=KC) par(mfrow=c(1,2)) image(shap.okrig, xlim=c(195,203), ylim=c(-38,-27)) points(shap.geo, cex.min=0.3, cex.max=1.5, add=T) image(shap.okrig, loc=shap.grid, val=sqrt(shap.okrig$krige.var), xlim=c(195,203), ylim=c(-38,-27), zlim=c(3800,5000)) points(shap.geo,cex.min=0.3, cex.max=1.5, add=T) par(mfrow=c(1,1))