Lab 6 - Logistic Regression
COMPUTER LAB 6: LANDSCAPE AND CLASSIFICATION MODELING -
LOGISTIC REGRESSION FOR SPECIES DISTRIBUTION MODEL
You can download the R Script: AdvSpatial_Lab6.r
This computer lab is designed to provide students with an understanding of the techniques used to model species presence. We will be modeling with data that consist of categorical responses: a species is present (1) OR a species is absent (0). When the number of categories = 2, these are known as binary responses and require different types of statistical techniques for analyses.
PREPARING THE R SESSION
This lab will require several R packages. Please install and load the following R packages:
#Get the necessary packages
require(sp)
require(raster)
require(rgdal)
require(rgeos)
require(AICcmodavg)
require(boot)
require(ROCR)
require(SDMTools)
OBSERVATION DATA AND METRICS
We will be using the same data as in previous labs, the Pariette Cactus in the Uintah Basin, Eastern Utah. Let's add this shapefile to our R session and view the data (remember to change the directory path):
#Read in Sample Location Shapefile
ptsshp<- readOGR(dsn="//wygiscdev/AdvSpatial/salbeke/LandscapeMetrics", layer="ParietteCactus")
#read in the HUC shapefile
huc.sm<- readOGR(dsn="//wygiscdev/AdvSpatial/salbeke/lab2", layer="HUC_SCBR")
#Now filter the points to be a minimum distance from another point
#set the minimum distance points can be from eachother
mindist <- 100 #unit is meters for projected coordinate systems
#Overwrite the PointID column with a new number. I'm doing this to make data.frame manipulation easier to follow
ptsshp@data$PointID<- 1:nrow(ptsshp)
#next we need to limit the points to only ones further than dist from others
#create a complete distance matrix
d<- gDistance(ptsshp, byid=TRUE) #uses rgeos package
#remove the distance of 0 for point to itself
d[row(d) == col(d)] <- NA
#loop through each point and determine which points are within 50m
outDF<- data.frame()
for(i in 1:nrow(d)){
d.match<- unique(which(d[i,]<mindist))
if(length(d.match)>0){
outDF<- rbind(outDF, data.frame(inPt=i, closePt=paste(d.match, collapse=",")))
}
}
#now we need to remove points from the OBS Points
outDF$closePt<- as.character(outDF$closePt)
#now loop through the close points to create a non-spatial match dataset
sdata<- ptsshp
for(i in 1:nrow(outDF)){
#first check to see if current outDF point still exists in the sdata dataset
if(!is.na(match(outDF$inPt[i], sdata$PointID))){
#point still exists in the dataset, so remove the other points that are within the proximity
kill<- which(sdata@data$PointID %in% unlist(strsplit(outDF[i,2], ",")))
if(length(kill)>0){
sdata<- sdata[-kill,]
}#close kill
}#close pt exist
}#close i
#Add a field to indicate Presence
sdata@data$PresAbs<- 1
#now limit the attribute table to this field only
sdata@data<- data.frame(PresAbs=sdata@data$PresAbs)
#how many points remain
nrow(sdata)
#//////////////////////////////////////////////////////////////////////////////
#now create a set of random points that are within the HUC extent and are twice as many as the presence points
#set the beginning number for the PointID
ptID<- 9000 #this is larger than num of rows in input, so unique values
#determine total number of points to create
abspts<- nrow(sdata)*2
#We are going to use the extent of the HUCs to make a new polygon
#let's build a polygon from scratch. To create a true polygon feature class, we need to deal with the spatial hierarchy
xs<- c(bbox(huc.sm)[1,1], bbox(huc.sm)[1,2], bbox(huc.sm)[1,2], bbox(huc.sm)[1,1], bbox(huc.sm)[1,1])
ys<- c(bbox(huc.sm)[2,1], bbox(huc.sm)[2,1], bbox(huc.sm)[2,2], bbox(huc.sm)[2,2], bbox(huc.sm)[2,1])
#create a single polygon
rstdy<- Polygon(matrix(c(xs, ys), ncol=2)) #sp package
#create a list of polygons (yes we only have one, but...)
rstdy<- Polygons(list(rstdy), ID="1")
#now make it spatially aware
rstdy<- SpatialPolygons(list(rstdy), proj4string=CRS(proj4string(huc.sm)))
#now we have to create pseudo-absence points for model fit
rpt<- spsample(rstdy, n=abspts, type="random", iter=100)
#create a distance matrix between pres and abs
d<- gDistance(rpt, sdata, byid=TRUE) #uses rgeos package
#loop through each point and determine which points are within 100m
outDF<- data.frame()
for(j in 1:nrow(d)){
d.match<- unique(which(d[j,]<mindist))
if(length(d.match)>0){
outDF<- rbind(outDF, data.frame(inPt=j, closePt=paste(d.match, collapse=",")))
}
}
#now loop through and create the new absence points for being too close to presence points
if(nrow(outDF)>0){
for(j in 1:nrow(outDF)){
#remove the offending point from the list
rpt<- rpt[-outDF[j,1],]
#keep drawing random sample until distance criteria is met
repeat{
#for simplicity sake, resample a replacement point from polygon
rpt.add<- spsample(rstdy, 1, type="random")
#test if first point meets distance criteria
if(gDistance(rpt.add, sdata)>=mindist){
#add the point to the random samples
rpt<- rbind(rpt, rpt.add)
#break out of the loop
break()
}#close if gDistance
}#close repeat
}#not to next outDF
}#close outDF if
#make random points into a SPDF
rpt<- SpatialPointsDataFrame(rpt, data=data.frame(PresAbs=rep(0, length(rpt))))
#now plot everything
plot(rstdy)
plot(sdata, add=TRUE)
plot(rpt, add=TRUE, col="red")
#/////////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////////
#now create two datasets, one for model fitting (70% of available) and one for model validation (30%)
#first select vector of rows for sdata: model presence points. Notice we round the number of points to an integer
rpts<- sample(1:nrow(sdata), round(nrow(sdata)*0.7))
#get the spatial features, etc.
p.mod<- sdata[rpts,]
#get the remaining presence points for validation
p.val<- sdata[is.na(match(rownames(sdata@data), rpts))==TRUE,]
#second, select vector of rows for rpt: model absence points. Notice we round the number of points to an integer
rpts<- sample(1:nrow(rpt), round(nrow(rpt)*0.7))
#get the spatial features, etc.
a.mod<- rpt[rpts,]
#get the remaining presence points for validation
a.val<- rpt[is.na(match(rownames(rpt@data), rpts))==TRUE,]
#Now bind the rows together to create the model and validation datasets
pts.mod<- rbind(p.mod, a.mod)
pts.val<- rbind(p.val, a.val)
#plot the training and validation presence points
plot(rstdy)
plot(pts.mod, add=TRUE)
plot(pts.val, add=TRUE, col="red")
#///////////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////////////
#Now that we have our Training and Validation data: Next, read in the raster data for attribution
#set the raster directory
setwd("//wygiscdev/AdvSpatial/salbeke/LandscapeMetrics/Derived")
#find all files with img
dlist<- dir(pattern=".img$")
#loop through the list and read in the rasters
rlist<- list()
for(i in 1:length(dlist)){
rlist<- c(rlist, raster(dlist[i]))
}
#now stack the rasters. They must meet the 4 criteria of being in the same extent of the landscape
rs<- stack(rlist)
#add in the DEM to your list if it doesn't exist
rs<- stack(rs, raster("//wygiscdev/AdvSpatial/salbeke/LandscapeMetrics/DEM_30m.img"))
#make sure you see all of the rasters
names(rs)
#Now attribute our training points with the landscapte metrics
e<- extract(rs, pts.mod)
#bind the columns back to the points as attributes
pts.mod@data<- cbind(pts.mod@data, e)
#check out your handy work
View(pts.mod)
names(pts.mod@data)
#Now attribute our validation points with the landscapte metrics
e<- extract(rs, pts.val)
#bind the columns back to the points as attributes
pts.val@data<- cbind(pts.val@data, e)
#check out your handy work
View(pts.val)
names(pts.val@data)
LOGISTIC REGRESSION
Presence and absence responses (0, 1) are called bounded responses because the probability of presence can't be less than 0 nor greater than 1. Linear Regression estimates, however, can take on any value. Often a linear regression model will estimate values that are impossible. To avoid these impossible predicted values, presence and absence data can be modeled using logistic regression. Logistic regression is a type of General Linear Model (GLM). It fits data to a binomial distribution using a logit link and is used to estimate probabilities (i.e. predictions are bounded to a range from 0 to 1).
Predicted probabilities are calculated using the parameter estimates. The fist step involves calculating the log-odds (η, a.k.a. your parameter estimates/model) and using it to estimate the probability of presence (p) via the transformation:
p = 1/(1 + exp(- η))
THE MODELING PROCESS
Now that we have a set of metrics that we hypothesize to effect the realized niche of Pariette Cactus and limit its spatial distribution, we need to develop appropriate models describing the combined (additive) effect upon their distribution.
Step 1 - Test for Collinearity:
As with all General Linear Models, the first step is to test predictor variables for collinearity. All of the metrics are continuous data, so we can use Pearson's r or Spearman's rho correlation coefficients to test for collinearity (the results will be slightly different depending on the chosen method):
#Use Pearson's to test collinearity
cor(cactus[,3:21], method="pearson")
You may notice that the matrix is quite large and might take a while to interpret. Use this command to help focus your attention to the values that matter (i.e. the coefficients with values larger than 0.5):
#the matrix is too hard to read, so use this instead
symnum(cor(cactus[,3:21], method="pearson"), cut=c(0.5), corr=TRUE, symbols=c("0","*"))
How do we decide which of the correlated variables to keep within our model? Is it OK to keep the same type of variable (i.e. Curvature) calculated at different scales? To help assist my decisions, I like to turn to Information Theory and AIC. Let's create a set of Univariate models and assess the relative fit of each variable to the data:
#Create the univariate models
u01<- glm(PresAbs~cti, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u02<- glm(PresAbs~diss03, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u03<- glm(PresAbs~diss11, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u04<- glm(PresAbs~diss33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u05<- glm(PresAbs~procrv03, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u06<- glm(PresAbs~procrv11, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u07<- glm(PresAbs~procrv33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u08<- glm(PresAbs~sar, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u09<- glm(PresAbs~scosa, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u10<- glm(PresAbs~spost03, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u11<- glm(PresAbs~spost11, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u12<- glm(PresAbs~spost33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u13<- glm(PresAbs~srr03, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u14<- glm(PresAbs~srr11, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u15<- glm(PresAbs~srr33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u16<- glm(PresAbs~ssina, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u17<- glm(PresAbs~strmdist, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u18<- glm(PresAbs~trasp, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u19<- glm(PresAbs~tri03, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u20<- glm(PresAbs~tri11, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u21<- glm(PresAbs~tri33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
u22<- glm(PresAbs~DEM_30m, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
#Create object of all the models
uModels<- list(u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15, u16, u17, u18, u19, u20, u21, u22)
#Give each model a name
uNames <- c("cti", "diss03", "diss11", "diss33", "procrv03", "procrv11", "procrv33", "sar", "scosa",
"spost03", "spost11", "spost33", "srr03", "srr11", "srr33", "ssina", "strmdist",
"trasp", "tri03", "tri11", "tri33", "DEM_30m")
# Calculate AICc table
uaicWt<-aictab(cand.set=uModels, modnames=uNames, sort=TRUE, c.hat=1)
uaicWt #view the results
Using the correlations, combined with the AICc values, we are able to create a list of metrics that are not collinear, use the best fitting scale for the variable and hopefully provide the best chance of fitting an accurate model. I have compiled the list of attributes that we can use in a Global Model below:
#now recheck correlations
symnum(cor(pts.mod@data[,c("cti", "diss11", "DEM_30m", "srr33", "tri33", "strmdist",
"sar", "trasp")], method="pearson"), cut=c(0.7), corr=TRUE, symbols=c("0","*"))
Step 2 - GOF of the Global Model:
Now that we have a list of variables that are not collinear we can use each variable as part of our Global Model. We can do this because we have already decided, a priori, that each of these predictor variables (metrics) have an influence upon cactus distribution (i.e. we have developed hypotheses). Let's fit our Global Model:
#create the Global Model
lr01<- glm(PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+cti*trasp+DEM_30m*cti+srr33*tri33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
summary(lr01)
Recall that while using linear regression, we have several tools to diagnostic investigations to test if our data contains outliers, which any GLM is sensitive to, and if the model fits the data. Just like in linear regression, we can test which of our samples are having a disproportionate influence on our model fit (i.e. outliers) using Cook's Distance:
#GLM diagnostics
lrdiag<-glm.diag(lr01) #from (boot)
glm.diag.plots(lr01, lrdiag)
plot(lr01, which=4, cook.levels=cutoff, main="Cook's Distance")
Nothing greater than 1 for Cook's distance and the QQPlot is relatively linear, so I feel that there aren't any outliers
Because the response variable is constrained between 0 and 1, we cannot use the common approach of plotting residuals against fitted values and assessing if the residuals are homoscedastic (randomly around 0). Thus we need a different method to assess Goodness-of-Fit (GOF). The Hosmer-Lemeshow GOF Statistic. This method essentially bins (i.e. maybe 10 groups) the sorted list of predicted values and tests their distributed values against an expected distribution of values and uses a Χ^{2} test for significant differences. If our model is a good fit to the data, the Hosmer-Lemeshow statistic should be ≥ 0.05 (i.e. we accept the null hypothesis of no sig. difference). To perform this test, we first need to load the function into R:
#calculate Hosmer-Lemeshow
#Make sure you change the directory or open this file and submit all lines of code (HosmerLemeshowGOF.r)
source("//miriam/asas$/R_Functions/HosmerLemeshowGOF.r")
Once you have loaded the function, test the Global Model for GOF:
#run Hosmer-Lemshow GOF
hosmerlemGOF(lr01, 10)
If the test statistic value is less than 0.05, our model is NOT considered a good fit.
What do we do? ...One option is to remove any outliers that we previously identified. However, we didn't ID any outliers. In this case, given best professional judgment and knowing that the Hosmer-Lemeshow statistic may not be appropriate for binary response, we are going to ignore this statistic and not worry about over-dispersion (meaning higher than expected variance).
Step 3 - Model Selection using AIC:
By now you should be familiar with AIC model selection. Run the following code to analyze the candidate set of models:
#create set of candidate models
lr01<- glm(PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+cti*trasp+DEM_30m*cti+srr33*tri33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr02<- glm(PresAbs~diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr03<- glm(PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+DEM_30m*cti, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr04<- glm(PresAbs~cti+diss11+DEM_30m+srr33+tri33+strmdist+sar+trasp, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr05<- glm(PresAbs~cti+DEM_30m+I(DEM_30m^2)+srr33+tri33+trasp+cti*trasp+DEM_30m*cti+srr33*tri33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr06<- glm(PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+tri33+strmdist+sar+trasp+cti*trasp+DEM_30m*cti, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr07<- glm(PresAbs~diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+srr33*tri33, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
lr08<- glm(PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+cti*trasp, family=binomial(link="logit"), pts.mod@data, na.action=na.pass)
#Create object of all the models
lrModels<- list(lr01, lr02, lr03, lr04, lr05, lr06, lr07, lr08)
#Give each model a name
lrNames <- c("Global", "Mod2", "Mod3", "Mod4", "Mod5", "Mod6", "Mod7", "Mod8")
# Calculate AICc table
aicWt<-aictab(cand.set=lrModels, modnames=lrNames, sort=TRUE, c.hat=1)
aicWt #view the results
You will notice a couple of interesting facts when you view the AICc table. First, only two models have significant weight to be considered for inclusion in a confidence set of models (i.e. for model averaging). Second, you will recognize just how important the interaction terms appear to be for model performance. The one model that does not include any form of interaction is over 100 AIC units larger than the best fitting model. Finally, I would like to draw your attention to the ΔAICc between the 'Global' and the other models. The difference is less than 2 AIC units. Recall the equation is:
AIC = -2 * ln(likelihood) + 2K
Where K is the number of parameters. Thus, with the only difference between the top two models is the addition of additional interaction terms, we can make a conscious choice to only use the best fitting model because the additional parameters in the Global is just barely adding enough predictive power to carry it's 'AIC weight'. A more parsimonious model is a less complex model, therefore, for the remainder of this lab, we will only test the performance of the best fitting model 'lr03'.
Step 4 - Model Performance and Evaluation:
Internal Validation: AIC also measures the predictive ability of models. Models with the lowest AIC will generally have the most accuracy. The accuracy of logistic regression models is estimated by examining the prediction and classification error rates. A prediction error rate is the proportion (or percentage) of predictions that were wrong and a classification error rate is the proportion (or percentage) of observations that were incorrectly classified. These errors are typically delineated within a table called a 'confusion matrix'. To perform an internal validation of our best fitting model, run the following code:
#calculate the confusion matrix given the best model
confusion.matrix(pts.mod@data$PresAbs, fitted(lr03), threshold=0.5)
Notice the we had to chose a threshold of 0.5 (i.e. the predicted probability of a cactus being present). The resulting confusion matrix should provide the following table:
Observed | |||
Predicted | Absent | Present | |
Absent | 595 | 86 | |
Present | 92 | 258 |
The prediction (commission) error rate for presence - that is the proportion of times the model was WRONG when the species was PREDICTED to be present - would be:
92/350 = 0.263 = 26.3% (the posterior probability that we are wrong)
The corresponding prediction error rate for absent:
86/681 = 12.6%
Now, the classification (omission) error rate for the presence -- that is the proportion of times a KNOWN presence was predicted to be absent -- would be:
86/344 = 0.25 = 25% (Notice that this differs from the prediction error rate)
The corresponding classification error rate for absent 92/687 = 13.4%
The total accuracy of the model is calculated as (595 + 258) / (595 + 86 + 92 + 258) = 82.7% thus total error is 100 - 82.7 = 17.3%
As previously stated, we chose a threshold of 0.5 to create the confusion matrix. What is the correct threshold to choose? This is a very loaded question. We may want to choose a threshold that matches our prevelance. Thus 0.33 would be appropriate since we doubled the number of pseudo-absence points when compared to presence points.
Another possible solution is to look across all possible thresholds to assess the accuracy of the model. Receiver-Operator Characteristic (ROC) curves are one approach to assess overall model performance. The calculate a ROC curve, run the following code:
#get the ROC plot
pred <- prediction(fitted(lr03), pts.mod@data$PresAbs)
perf <- performance(pred, measure="tpr", x.measure="fpr")
plot(perf, col=rainbow(10))
abline(coef=c(0,1))
#Now calculate the Area Under the Curve:
#calculate the AUC
lrAUC<- auc(pts.mod@data$PresAbs, fitted(lr03))
lrAUC
An ROC curve demonstrates several things:
- It shows the tradeoff between sensitivity and specificity (any increase in sensitivity (true positive rate) will be accompanied by a decrease in specificity (false positive rate)).
- The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.
- The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the test.
- The slope of the tangent line at a cutpoint gives the likelihood ratio (LR) for that value of the test.
- The area under the curve (AUC) is a measure of test accuracy.
Accuracy is measured by the area under the ROC curve. An area of 1 represents a perfect test; an area of .5 represents a worthless test. A rough guide for classifying the accuracy of a diagnostic test is the traditional academic point system:
.90-1 = excellent (A)
.80-.90 = good (B)
.70-.80 = fair (C)
.60-.70 = poor (D)
.50-.60 = fail (F)
At this point, you may be wondering what this area number really means and how it is computed. The area measures discrimination, that is, the ability of the test to correctly classify sample locations as occupied or not. The area under the curve is the percentage of randomly drawn pairs for which this is true. Computing the area is more difficult to explain and beyond the scope of this introductory material. Two methods are commonly used: a non-parametric method (Mann-Whitney U Statistic) and a parametric method using a maximum likelihood estimator to fit a smooth curve to the data points. The 'auc' function from the R package 'SDMTools' uses the Mann-Whitney U Statistic.
A final note of historical interest: You may be wondering where the name "Receiver Operating Characteristic" came from. ROC analysis is part of a field called "Signal Detection Theory" developed during World War II for the analysis of radar images. Radar operators had to decide whether a blip on the screen represented an enemy target, a friendly ship, or just noise. Signal detection theory measures the ability of radar receiver operators to make these important distinctions. Their ability to do so was called the Receiver Operating Characteristics. It was not until the 1970's that signal detection theory was recognized as useful for interpreting classification test results.
One final test that you may want to employ is the calculation of the amount of variance explained. In least-squares regression we can easily calculate an R2 value. In a GLM, you can use the amount of deviance to estimate a pseudo-R2 value.
( pr2<- (1 - (lr03$deviance/lr03$null.deviance)) )
Before we move onto the next section, let's add our predicted values to the data frame for later use:
#we can then add our predicted values to the data frame
pts.mod@data<-cbind(pts.mod@data, Pred03=round(fitted(lr03),5))
View(pts.mod)
Out-of-Sample Validation: As previously stated, the model with the lowest AIC will generally have the best out-of-sample accuracy. Out-of-sample accuracy is roughly defined as the accuracy of model predictions made with data that was NOT used to fit the model. Out-of-sample accuracy is important because conservation biologists often use their models to predict the effect of some management action or develop a conservation or monitoring strategy. Hence, they need a measure of how far their estimate is likely to be from the 'truth'. Ideally, a researcher will have enough sample data in which a decision can be made to 'hold out' data, meaning randomly select sample locations to include in model fitting, and then use the fitted model to predict the probability of presence for the hold-out sample locations. Often a threshold of using 70% of occurrence points for model fit, and the remaining 30% for validation is used. That is what we have done. So, let's see how well our model fits the hold-out samples.
#Predict to validation data
vpred<- predict(lr03, newdata=pts.val@data, type="response")
#calculate the AUC
( valAUC<- auc(pts.val@data$PresAbs, vpred) )
#compare to within-sample
lrAUC
When the validation values are also high, we can feel confident that our model is performing well and isn't biased to the Training data. However, what do you do if you have a very small number of occurrences?
One means of estimating out-of-sample accuracy is through k-fold cross-validation. During this procedure, the data are divided randomly into K groups (i.e. k= 5 for 5-fold, k=10 for 10-fold and k= Num of Obs for leave-one-out). For each group the logistic regression model is fit with the observations in k-1 groups, and the probability of presence for the k omitted observations are estimated using the fitted model. This procedure is repeated for all groups of observations. The accuracy of logistic regression models is estimated by examining the prediction and classification error rates.
To perform a leave-one-out cross-validation, the R package 'boot' can be used and within this package the cv.glm function. Run the following lines of code:
#leave-one-out xval stuff
#uses a threshold of 0.5 for total classification error
cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5)
cval<-cv.glm(pts.mod@data,lr03,cost, K=nrow(pts.mod))$delta #package(boot)
cval #[1] 0.1765276 0.1762191
The output is a vector having two values. The first component is the raw cross-validation estimate of total prediction error. The second component is the adjusted cross-validation estimate. The adjustment is designed to compensate for the bias introduced by not using leave-one-out cross-validation. As you can see from the result, using leave-one-out we have a 17.7% classification error. This is quite similar the 17.3% error from our internal validation effort. However, I find that this solution is less than ideal for a couple of reasons. First, the total classification error is dependent upon choosing a threshold. Secondly, upon reviewing the cv.glm function I noticed that if you perform a k-fold cross-validation, the function does not appear to account for equal selection of both presence and absence points (i.e. the same proportion of presence and absence for each group of data). To try and alleviate these concerns of mine, I developed a k-fold cross-validation function that allows for the calculation of the cross-validation AUC. Save the following file to your directory : K_FoldCrossVal.r
As described above, k-fold randomly divides your data into a set number of groups. A model is fit with the majority of the data and the hold-out data are predicted with the previously fit model. The process continues until each group is used as a hold-out dataset. Within the K_FoldCrossVal.r file, I try to create a model that proportionately has the same number of presences and abscences within each hold-out group. The output of this function is a data frame identical to the input data frame, but it contains two additional columns, one for the k-fold grouping value (xGroup), the other a column with the predicted probability (xPred). With the xPred column, we can calculate a k-fold cross-validated AUC value, therefore eliminating the need choose a threshold. To run the k-fold function, first load the file as a source:
#Make sure you change the directory or open this file and submit all lines of code
source("//wygiscdev/AdvSpatial/LabData/lab6/K_FoldCrossVal.r")
Let's now run a 10-fold cross-validation and calculate the AUC of the cross-validated predictions. You will notice there are 4 arguments that must be provided for the function to work and we are over-writing our current data frame with a new data frame that has the 2 new columns of information:
#Calculate the cross-validated AUC, append results to our attribute table
#--- x is a data.frame object
#--- present is a string representing the column name containing the presence/absence
#--- model is a string representing the logistic model specification (i.e. Present~VarA+VarB)
#--- k is the number of groups for cross validation
pts.mod@data<- xvalLogistic(pts.mod@data, present="PresAbs",
model="PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+DEM_30m*cti", k=10)
( xvalAUC<- auc(pts.mod@data$PresAbs, pts.mod@data$xPred) )
Hopefully the value is very close to your Training and Test AUC calculated above
If you run the function several times, you will notice that the AUC values change. How much do they differ? It may be necessary to calculate an average AUC value to test how variable (i.e. sensitive) your model is to changes in data configuration and model fit. If you are satisfied with the results, maybe you can move forward with presenting your model results. However, do you feel that you have accurately represented the entire distribution of presence and pseudo-absence data? Do you feel that the parameter estimates are stable and will perform well when applied to external data not used within the model fitting process? If you feel there is some room for improvement, developing a boot-strapped model may be useful.
#Before moving onto another topic, we can predict our model back onto the raster surface:
rsPred<- predict(rs, lr03, type="response", progress="text")
plot(rsPred) #plot the results
plot(sdata, add=TRUE)
Bootstrapping to Fit a Model
Bootstrapping is a resampling technique used to obtain estimates of summary statistics. In our case, the summary statistics are our parameter estimates. Wikipedia defines bootstrapping as:
the practice of estimating properties of an estimator (such as its variance) by measuring those properties when sampling from an approximating distribution. One standard choice for an approximating distribution is the empirical distribution of the observed data. In the case where a set of observations can be assumed to be from an independent and identically distributed population, this can be implemented by constructing a number of resamples of the observed dataset (and of equal size to the observed dataset), each of which is obtained by random sampling with replacement from the original dataset.
We will use our spatially filtered dataset that we initially created, 'sdata'. From this shapefil, we are going to randomly sample presence points, create absence points and calculate a set of parameter estimates using the best fitting model from above (i.e. lr03). For each iteration, a set of 70% of the occurrence and twice as many random pseudo-absence locations will be drawn and a Logistic model will be fit. The parameter estimates from the model, as well as an AUC value, will be calcuated and stored. After all of the iterations are complete, a data.frame containing the parameter estimates and AUC will be returned.
#reduce the raster stack to only the variables we are using in the GLM to speed things up
##cti+diss11+DEM_30m+srr33+tri33+strmdist+sar+trasp
rs.glm<- stack(rs[[c(1, 3, 22, 15, 21, 17, 8, 18)]])
#Set the number of bootstrap iterations
bsiter<- 10 #should be much higher in practice
#set number of presence points
np<- round(nrow(sdata)*0.7) #= 344
#determine total number of points to create
abspts<- np*2 #688
#create output data.frame
outLR<- data.frame()
for (i in 1:bsiter){
#now we have to create pseudo-absence points for model fit
rpt<- spsample(rstdy, n=nrow(sdata)*2, type="random", iter=100)
#create a distance matrix between pres and abs
d<- gDistance(rpt, sdata, byid=TRUE) #uses rgeos package
#loop through each point and determine which points are within 100m
outDF<- data.frame()
for(j in 1:nrow(d)){
d.match<- unique(which(d[j,]<mindist))
if(length(d.match)>0){
outDF<- rbind(outDF, data.frame(inPt=j, closePt=paste(d.match, collapse=",")))
}
}
#now loop through and create the new absence points for being too close to presence points
if(nrow(outDF)>0){
for(j in 1:nrow(outDF)){
#remove the offending point from the list
rpt<- rpt[-outDF[j,1],]
#keep drawing random sample until distance criteria is met
repeat{
#for simplicity sake, resample a replacement point from polygon
rpt.add<- spsample(rstdy, 1, type="random")
#test if first point meets distance criteria
if(gDistance(rpt.add, sdata)>=mindist){
#add the point to the random samples
rpt<- rbind(rpt, rpt.add)
#break out of the loop
break()
}#close if gDistance
}#close repeat
}#not to next outDF
}#close outDF if
#make random points into a SPDF
rpt<- SpatialPointsDataFrame(rpt, data=data.frame(PresAbs=rep(0, length(rpt))))
#////////////////////////////////////////////////////////////////////////////////
#now create two datasets, one for model fitting (70% of available) and one for model validation (30%)
#first select vector of rows for sdata: model presence points. Notice we round the number of points to an integer
rpts<- sample(1:nrow(sdata), round(nrow(sdata)*0.7))
#get the spatial features, etc.
p.mod<- sdata[rpts,]
#get the remaining presence points for validation
p.val<- sdata[is.na(match(rownames(sdata@data), rpts))==TRUE,]
#second, select vector of rows for rpt: model absence points. Notice we round the number of points to an integer
rpts<- sample(1:nrow(rpt), round(nrow(rpt)*0.7))
#get the spatial features, etc.
a.mod<- rpt[rpts,]
#get the remaining presence points for validation
a.val<- rpt[is.na(match(rownames(rpt@data), rpts))==TRUE,]
#Now bind the rows together to create the model and validation datasets
pts.mod<- rbind(p.mod, a.mod)
pts.val<- rbind(p.val, a.val)
#////////////////////////////////////////////////
#This section is for the Logistic Regression Model
#////////////////////////////////////////////////
#get the predictors for the points
lmdata<- data.frame(extract(rs.glm, pts.mod))
#append the columns to the spatial points
pts.mod@data<- data.frame(pts.mod@data, lmdata)
#push back to dataframe
lmdata<-data.frame(pts.mod)
lmdata[,1]<- as.factor(lmdata[,1])
#get the predictors for the points
lvdata<- data.frame(extract(rs.glm, pts.val))
#append the columns to the spatial points
pts.val@data<- data.frame(pts.val@data, lvdata)
#push back to dataframe
lvdata<-data.frame(pts.val)
lvdata[,1]<- as.factor(lvdata[,1])
#fit the glm to the data
modLR <- glm(PresAbs~cti+diss11+DEM_30m+I(DEM_30m^2)+srr33+tri33+strmdist+sar+trasp+DEM_30m*cti, data=lmdata, family=binomial(link="logit"))
#get the coefficients
cof<-matrix(modLR$coefficients, ncol=length(modLR$coefficients))
#get the SE
se <- matrix(summary(modLR)$coefficients[,2], ncol=length(modLR$coefficients))
#calculate the AUC
lrAUC<- auc(lmdata$PresAbs, fitted(modLR))
#Predict to validation data
vpred<- predict(modLR, newdata=lvdata, type="response")
#calculate the AUC
valAUC<- auc(lvdata$PresAbs, vpred)
#add the AUC to the matrix
c<- data.frame(cbind(i, cof, se, lrAUC, valAUC))
names(c)<- c("BSRun","Intercept", "cti", "diss11", "DEM_30m", "I(DEM_30m^2)", "srr33", "tri33", "strmdist", "sar", "trasp", "DEM_30m*cti",
"SE_Intercept", "SE_cti", "SE_diss11", "SE_DEM_30m", "SE_I(DEM_30m^2)", "SE_srr33", "SE_tri33", "SE_strmdist", "SE_sar",
"SE_trasp", "SE_DEM_30m*cti", "TrainAUC", "TestAUC")
#append to the output
outLR<- rbind(outLR, c)
#////////////////////////////////////////////////
#End Logistic Regression
#////////////////////////////////////////////////
}
#Inspect the output table and look for anomalies. To complete the bootstrap exercise,
#let's create a table containing the mean Parameter Estimates along with 95% Confidence Intervals (or you can use quantiles):
#calculate the bootstrapped parameter estimates by looping through columns
outBeta<- data.frame()
for(i in 2:12){
outBeta<- rbind(outBeta, data.frame(Mean=mean(outLR[,i]), Median=median(outLR[,i]), SD=sd(outLR[,i])))
}
#add in the parameters
outBeta<- data.frame(Parameter=names(outLR)[2:12], outBeta)
#Set our mean estimate to the median. Could choose mean
meanEst<- outBeta[,3]
names(meanEst)<- outBeta[,1]
MAPPING THE RESULTS
#use the calc function to predict to landscape
# x -- a stack of rasters. Choosing the matching raster for the coefficient
names(rs.glm)
calcGLM<- function(x){
round(1/(1+exp(-1*(meanEst[1]+meanEst[2]*x[1]+meanEst[3]*x[2]+meanEst[4]*x[3]+meanEst[5]*(x[3]^2)+
meanEst[6]*x[4]+meanEst[7]*x[5]+meanEst[8]*x[6]+meanEst[9]*x[7]+meanEst[10]*x[8]+meanEst[11]*x[3]*x[1]))),5)
}
predBoot<- calc(rs.glm, fun=calcGLM, progress="text")
plot(predBoot)
plot(sdata, add=TRUE)