Hi all, as with the last three posts, I am summarizing linear modeling and generalized linear models as used in business intelligence. The areas covered are:
- Querying and cleaning data for modeling
- Plotting models with uncertainty
- Quick and dirty model selection
- Regularized regression
- L1 and L2 Norms
This post explores number 3, quick and dirty model selection. Model selection is a topic that has been covered a few times already (1, 2, 3, 4) on this blog. What makes this post different (i.e., better) is that it lacks mathematical rigor and uses base functions (e.g., lm).
Now, before going forward: nothing here is worthwhile if you are happy with stepAIC. Personally I have found it so buggy as to never be useful; if it works for you, more power to you, and I suggest you spend the time you would have spent here on something more productive.
Before model selection can happen, data must be simulated:
# data for modeling
x.index = 1:336
x.month = floor((x.index-1)/28)
x.dom = rep(1:28, 12)
x.week = floor((x.index-1)/7)
x.dow = rep(0:6, 12*4)
x.rand1 = runif(336)
x.rand2 = runif(336)
x.rand3 = runif(336)
x.dom0 = rep(1:28, 12)
x.dom1 = rep(1:28, 12)
x.dom2 = rep(1:28, 12)
x.dom3 = rep(1:28, 12)
x.dom4 = rep(1:28, 12)
x.dom5 = rep(1:28, 12)
x.dom6 = rep(1:28, 12)
x.dom7 = rep(1:28, 12)
x.dom8 = rep(1:28, 12)
x.dom9 = rep(1:28, 12)
x.dom10 = rep(1:28, 12)
x.dom0[x.month != 0] = 0
x.dom1[x.month != 1] = 0
x.dom2[x.month != 2] = 0
x.dom3[x.month != 3] = 0
x.dom4[x.month != 4] = 0
x.dom5[x.month != 5] = 0
x.dom6[x.month != 6] = 0
x.dom7[x.month != 7] = 0
x.dom8[x.month != 8] = 0
x.dom9[x.month != 9] = 0
x.dom10[x.month != 10] = 0
X = data.frame(x.index,x.month,x.dom,x.week ,x.dow , x.rand1, x.rand2, x.rand3,x.dom0 ,x.dom1 ,x.dom2 ,x.dom3 ,x.dom4 ,x.dom5 ,x.dom6 ,x.dom7 ,x.dom8 ,x.dom9 ,x.dom10)
# data for making the simulated observations
x.sun = rep(c(1,0,0,0,0,0,0), 12*4)
x.mon = rep(c(0,1,0,0,0,0,0), 12*4)
x.tue = rep(c(0,0,1,0,0,0,0), 12*4)
x.wed = rep(c(0,0,0,1,0,0,0), 12*4)
x.thu = rep(c(0,0,0,0,1,0,0), 12*4)
x.fri = rep(c(0,0,0,0,0,1,0), 12*4)
x.sat = rep(c(0,0,0,0,0,0,1), 12*4)
x.m0 = c(rep(1, 28), rep(0, 11*28))
x.m1 = c(rep(0, 1*28), rep(1, 28), rep(0, 10*28))
x.m2 = c(rep(0, 2*28), rep(1, 28), rep(0, 9*28))
x.m3 = c(rep(0, 3*28), rep(1, 28), rep(0, 8*28))
x.m4 = c(rep(0, 4*28), rep(1, 28), rep(0, 7*28))
x.m5 = c(rep(0, 5*28), rep(1, 28), rep(0, 6*28))
x.m6 = c(rep(0, 6*28), rep(1, 28), rep(0, 5*28))
x.m7 = c(rep(0, 7*28), rep(1, 28), rep(0, 4*28))
x.m8 = c(rep(0, 8*28), rep(1, 28), rep(0, 3*28))
x.m9 = c(rep(0, 9*28), rep(1, 28), rep(0, 2*28))
x.m10 = c(rep(0, 10*28), rep(1, 28), rep(0, 1*28))
x.m11 = c(rep(0, 11*28), rep(1, 28))
# the simulated observations
n = floor(runif(336, min = 10000, max =20000))
sd = rchisq(n = 336, df = 25, ncp = 0)/sqrt(n)
error = rnorm(336, mean= 0, sd =sd)
y.true = 0.5*x.fri + 0.5*x.sat - 0.25*x.mon - 0.25*x.tue - 0.25*x.wed -0.25*x.thu +
0.15*x.m0 +0.2*x.m1 -0.05*x.m2 -0.65*x.m3 +0.05*x.m4 +0.35*x.m5 +0.25*x.m6 +0.15*x.m7 -0.15*x.m8 +0.15*x.m9 -0.45*x.m10 +
0.02*x.dom5 - 0.03*x.dom9 +0.015*x.dom10 +
1.5
y = y.true +error
# plot data
plot(x.index, y)
lines(x.index, y.true, col="blue", lwd=3)
points(x.index, y)
The simulated data and true model are shown in Fig. 1. The true model has day-of-week (dow) and monthly parameters. In addition, the 6th, 10th, and 11th months have day-of-month slopes. In real life this code would just be a SQL query.

Figure 1: The simulated data with noise (circles) and without noise (blue line).
The model selection procedures described here have two parts. The first part is to build a data frame with all features that will be considered in the selection process. These features can be individual predictors, collections of categorical predictors, interaction terms, or any combination thereof. The only restriction is that they must be prespecified. The second step of the algorithm is to compute the BIC of models using subsets of the features compiled in the first step. Here both an exhaustive method and an annealing method are given.
Returning to the first step:
#make data.frame for model selection
data.MS = data.frame(
x.index = X$x.index , #1
x.month.cat = factor(X$x.month), # 2
x.dom = X$x.dom , # 3
x.week = X$x.week , # 4
x.week.cat = factor(X$x.week) , # 5
x.dow = X$x.dow , # 6
x.dow.cat = factor(X$x.dow), # 7
x.month.dow.cat = factor(X$x.dow):factor(X$x.month), # 8
x.rand1 = X$x.rand1 , # 9
x.rand2 = X$x.rand2 , # 10
x.dom0 = X$x.dom0 , # 11
x.dom1 = X$x.dom1 , # 12
x.dom2 = X$x.dom2 , # 13
x.dom3 = X$x.dom3 , # 14
x.dom4 = X$x.dom4 , # 15
x.dom5 = X$x.dom5 , # 16
x.dom6 = X$x.dom6 , # 17
x.dom7 = X$x.dom7 , # 18
x.dom8 = X$x.dom8 , # 19
x.dom9 = X$x.dom9 , # 20
x.dom10 = X$x.dom10 , # 21
y = y
)
In this example the individual predictors are “x.index”, “x.dom”, “x.week”, “x.dow”, “x.rand1”, and “x.rand2”. The collections of predictors are “x.month.cat”, “x.week.cat”, and “x.dow.cat”. Finally, the interaction terms are “x.month.dow.cat” and “x.dom0” through “x.dom10”. The exhaustive approach calculates the BIC for each subset of these predictors. The code for the exhaustive method is:
# model selection
BIC.list = 1:2047
N = sum(n)
for ( i in 1:2047)
{
print("--------------------------------------------------")
print(i)
cols = intToBits(i)[1:11]
cols.tf = c(cols[1] == 1, cols[2] == 1, cols[3] == 1, cols[4] == 1, cols[5] == 1, cols[6] == 1,cols[7] == 1,cols[8] == 1,cols[9] == 1, cols[10] == 1, rep(cols[11] == 1,11), T )
data.temp = data.MS[,cols.tf]
print(colnames(data.temp))
mod.temp = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
LogLhood = logLik(mod.temp)
K = mod.temp$rank
K.col = sum(cols.tf)
BIC = -2*LogLhood[1] + log(N)*K + 0.01*K.col
print(K)
print(K.col)
print(log(N))
print(BIC)
BIC.list[i] = BIC
}
i = which.min(BIC.list)
cols = intToBits(i)[1:11]
cols.tf = c(cols[1] == 1, cols[2] == 1, cols[3] == 1, cols[4] == 1, cols[5] == 1, cols[6] == 1,cols[7] == 1,cols[8] == 1,cols[9] == 1, cols[10] == 1, rep(cols[11] == 1,11), T )
data.temp = data.MS[,cols.tf]
mod.best = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
P = predict(mod.best, se.fit =T)
EX = P$fit
LB = P$fit - 1.96*P$se.fit
UB = P$fit + 1.96*P$se.fit
plot(x.index, y)
polygon(c(x.index, rev(x.index)), c(UB, rev(LB)), col= "grey", border=F)
lines(x.index, EX, col="red", lwd=3)
lines(x.index, y.true, col="blue", lwd=3, lty =2)
points(x.index, y)
I would like to highlight a few choices. First, the BIC is calculated manually. This is done because the BIC function in R does not always calculate the number of observations correctly when weights are used in the linear model. Second, the strategy of using intToBits and then making rules for cols.tf allows groups of parameters to be linked; here all of the “x.dom” parameters are either on or off together. Third, I have modified the BIC to discourage models with many unused parameters (the “0.01*K.col” term).
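To make those pieces easier to reuse, here is a minimal sketch of the same calculation wrapped in a helper function; bic.subset is an illustrative name of my own, and it assumes the data.MS, sd, and n objects from above are in scope.
# illustrative helper: modified BIC for a logical column mask (the last entry keeps y)
bic.subset = function(cols.tf, data = data.MS, w = sd^(-2), N = sum(n))
{
data.temp = data[, cols.tf, drop = FALSE]
mod = lm(y~., data = data.temp, weights = w)
K = mod$rank # parameters actually estimated
K.col = sum(cols.tf) # columns offered to the model
-2*logLik(mod)[1] + log(N)*K + 0.01*K.col
}
# e.g. the BIC of the full model:
bic.subset(rep(TRUE, ncol(data.MS)))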
Here there are only 11 feature groups that could be on or off. If it is desirable to look through so many features that an exhaustive search is not possible, then an annealing approach seems to be the best plan. The code for the annealing model selection is
# model selection
N.steps = 10000
t = 10
t.step = 0.9995
last.accept = 0
N = sum(n)
cols.tf = c(runif(21) >= 0.5, T) # random starting subset; the last column (y) is always kept
cols.tf.old = cols.tf
cols.tf.best = cols.tf
data.temp = data.MS[,cols.tf]
print(colnames(data.temp))
mod.temp = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
LogLhood = logLik(mod.temp)
K = mod.temp$rank
K.col = sum(cols.tf)
BIC.OLD = -2*LogLhood[1] + log(N)*K + 0.01*K.col
BIC.BEST = BIC.OLD
for ( i in 1:N.steps)
{
print("--------------------------------------------------")
print(i)
cols.tf = cols.tf.old
j = sample(1:21,1) # pick one feature column to toggle
S. = 1
if ( (cols.tf[j] ==T) & (S. == 1) )
{
cols.tf[j] = F
S. = 0
}
if ( (cols.tf[j] ==F) & (S. == 1) )
{
cols.tf[j] = T
S. = 0
}
data.temp = data.MS[,cols.tf]
print(colnames(data.temp))
mod.temp = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
LogLhood = logLik(mod.temp)
K = mod.temp$rank
K.col = sum(cols.tf)
BIC.NEW = -2*LogLhood[1] + log(N)*K + 0.01*K.col
r = runif(1)
A = exp((BIC.OLD-BIC.NEW)/t )
print(paste("A", A))
print(paste("NEW", BIC.NEW))
print(paste("OLD", BIC.OLD))
if(A >= r)
{
print("accept new model")
BIC.OLD = BIC.NEW
cols.tf.old = cols.tf
last.accept = i
}
if(BIC.NEW <= BIC.BEST)
{
BIC.BEST = BIC.NEW
cols.tf.best = cols.tf
}
print(K)
print(K.col)
print(paste("t",t))
print(paste("j", j))
print(log(N))
print(BIC.OLD)
t= t*t.step
}
cols.tf = cols.tf.best
cols.tf.old = cols.tf
cols.tf.best = cols.tf
data.temp = data.MS[,cols.tf]
print(colnames(data.temp))
# second run through for good measure
last.accept = 0
mod.temp = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
LogLhood = logLik(mod.temp)
K = mod.temp$rank
K.col = sum(cols.tf)
BIC.OLD = -2*LogLhood[1] + log(N)*K + 0.01*K.col
BIC.BEST = BIC.OLD
for ( i in 1:N.steps)
{
print("--------------------------------------------------")
print(i)
cols.tf = cols.tf.old
j = sample(1:21,1) # pick one feature column to toggle
S. = 1
if ( (cols.tf[j] ==T) & (S. == 1) )
{
cols.tf[j] = F
S. = 0
}
if ( (cols.tf[j] ==F) & (S. == 1) )
{
cols.tf[j] = T
S. = 0
}
data.temp = data.MS[,cols.tf]
print(colnames(data.temp))
mod.temp = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
LogLhood = logLik(mod.temp)
K = mod.temp$rank
K.col = sum(cols.tf)
BIC.NEW = -2*LogLhood[1] + log(N)*K + 0.01*K.col
r = runif(1)
A = exp((BIC.OLD-BIC.NEW)/t )
print(paste("A", A))
print(paste("NEW", BIC.NEW))
print(paste("OLD", BIC.OLD))
if(A >= r)
{
print("accept new model")
BIC.OLD = BIC.NEW
cols.tf.old = cols.tf
last.accept = i
}
if(BIC.NEW <= BIC.BEST)
{
BIC.BEST = BIC.NEW
cols.tf.best = cols.tf
}
print(K)
print(K.col)
print(paste("t",t))
print(paste("j", j))
print(log(N))
print(BIC.OLD)
t= t*t.step
}
data.temp = data.MS[,cols.tf]
mod.best = lm(y~., data = data.temp, weights = sd^(-2) ) # note that in this case I am assuming that the true SDs are known
P = predict(mod.best, se.fit =T)
EX = P$fit
LB = P$fit - 1.96*P$se.fit
UB = P$fit + 1.96*P$se.fit
plot(x.index, y)
polygon(c(x.index, rev(x.index)), c(UB, rev(LB)), col= "grey", border=F)
lines(x.index, EX, col="red", lwd=3)
lines(x.index, y.true, col="blue", lwd=3, lty =2)
points(x.index, y)
This code is almost exactly the same as the exhaustive search, but now a random walk over feature subsets is taken instead of a systematic iteration. In addition, a proposed model is not automatically accepted; it is accepted with the simulated-annealing probability exp((BIC.OLD - BIC.NEW)/t), so worse models are sometimes accepted while the temperature t is still high. Two cooling runs are made because … why not? It is good to check stability.
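To get a feel for how the cooling schedule changes behaviour, here is a quick illustrative check of that acceptance probability (accept.prob is a throwaway helper of my own, not part of the script above).
# acceptance probability for a proposal that worsens the BIC by delta at temperature t
accept.prob = function(delta, t) exp(-delta/t)
accept.prob(5, t = 10) # ~0.61: early on, worse models are often accepted
accept.prob(5, t = 10*0.9995^10000) # effectively 0: after cooling, only improvements get through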
As a final point, I would say that once you have a best model, cross validation is always a good extra check; a quick sketch is given below.
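This is a minimal 5-fold cross-validation sketch along those lines; the fold count and the plain RMSE summary are my own choices rather than anything prescribed above, and it reuses the data.temp and sd objects from the annealing run.
# 5-fold cross validation of the selected feature set (illustrative)
K.fold = 5
folds = sample(rep(1:K.fold, length.out = nrow(data.temp)))
cv.rmse = rep(NA, K.fold)
for (k in 1:K.fold)
{
train = data.temp[folds != k, ]
test = data.temp[folds == k, ]
# refit the selected model on the training folds only
mod.k = lm(y~., data = train, weights = sd[folds != k]^(-2))
# note: a factor level absent from the training folds will give NA predictions
pred = predict(mod.k, newdata = test)
cv.rmse[k] = sqrt(mean((test$y - pred)^2))
}
mean(cv.rmse)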
Tune in next time for more on linear models.