My DS Coding Bolg: January 2013

Monday, January 21, 2013

Conditional Logistic Regression in R

##################################################################
### Conditional Logistic Regression
##################################################################
# install.packages("clogitL1")
require("clogitL1")

set.seed(145)
# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors
# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))
par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y=y, x=X, strata)
plot(clObj, logX=TRUE)
# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)

Multinomial Logistic Regression in R

The data on customers' choices can be analyzed to determine which features of a product (e.f., package size, brand, or flavor) are most attractive to customers and how they trade of desirable feature against price.

Given the customer makes a choice among several options, each of which has its own set of attributes. To accommodate the unique data structure, marketers have adopted choice models, which are well suited to understanding the relationship between the attributes of products and customers' choices among sets of products.

Choice models are used to understand how product attributes drive customers' choices. The most popular choice model in practice is the multinomial logit model. The model can be estimated using frequentist models with mlogit() or using Bayesian methods with MCMCmnl().

Before analyzing any choice data, it is useful to compute raw choice counts for each attribute.

Estimating a choice model is similar to estimating simpler linear models, The key output of the estimation is a set of parameters that describe how much each attribute is associated with the observed choices.

Conjoint analysis is a survey method where customers are asked to make choices among products with varying features and prices.

##################################################################
### Multinomial Logistic Regression
##################################################################
## http://www.ats.ucla.edu/stat/r/dae/mlogit.htm
library(foreign)
library(nnet)
ml <- read.dta("http://www.ats.ucla.edu/stat/data/hsbdemo.dta")

with(ml, table(ses, prog))
with(ml, do.call(rbind, tapply(write, prog, function(x) c(Mean = mean(x), SD = sd(x)))))

ml$prog2 <- relevel(ml$prog, ref = "academic")
with(ml, table(ses, prog2))
with(ml, table(write, prog2))
test <- multinom(prog2 ~ ses + write, data = ml)
summary(test)
z <- summary(test)$coefficients/summary(test)$standard.errors; z

#2-tailed z test
p <- (1 - pnorm(abs(z), 0, 1))*2; p

## extract the coefficients from the model and exponentiate
exp(coef(test))
head(pp <- fitted(test))
dses <- data.frame(ses = c("low", "middle", "high"), write = mean(ml$write))
predict(test, newdata = dses, "probs")

Wednesday, January 16, 2013

PGM4 - Compute Marginal

require(reshape)
source("PM1.R")

#FactorMarginalization Sums given variables out of a factor.
ComputeJointDistribution <- function(F) {
# Check for empty factor list
if (length(F) == 0){
    cat('Error: empty factor list');
      Joint = list(var = NA, card = NA, val = NA);
      return(Joint);
}

Joint = F[[1]];
for (i in 2:length(F)){
Joint = FactorProduct(Joint, F[[i]]);
}
return(Joint);
}

#ComputeMarginal Computes the marginal over a set of given variables
ComputeMarginal <- function(V, F, E) {
# Check for empty factor list
if (length(F) == 0){
    cat('Error: empty factor list');
      M = list(var = NA, card = NA, val = NA);
      return(M);
}
M = list(var = NA, card = NA, val = NA);
joint = ComputeJointDistribution(F);
joint = list(joint);
evidence = ObserveEvidence(joint, E);
evidence = evidence[[1]];
evidence$val = evidence$val / sum(evidence$val);

var = setdiff(evidence$var, V);
M = FactorMarginalization(evidence, var);
return(M);
}

V <- c(2, 3);
A <- list(var = c(1), card = c(2), val = c(0.11, 0.89));
B <- list(var = c(2, 1), card = c(2, 2), val = c(0.59, 0.41, 0.22, 0.78));
C <- list(var = c(3, 2), card = c(2, 2), val = c(0.39, 0.61, 0.06, 0.94));
F <- list(A, B, C);
E <- matrix(c(1, 2), byrow = T, nrow = 1)
ComputeMarginal(V, F, E)

Sunday, January 13, 2013

PGM3 - Observe Evidence

# ObserveEvidence Modify a vector of factors given some evidence.
ObserveEvidence <- function(F, E) {
for ( i in 1: nrow(E)){
    v <- E[i, 1]; # variable
    x <- E[i, 2]; # value

    #Check validity of evidence
    if (x == 0){
     cat('Evidence not set for variable', v);
     continue;
    }

    for (j in 1:length(F)){
     # Does factor contain variable?
     indx = match(v, F[[j]]$var);
     if (!is.na(indx)){
      # Check validity of evidence
         if (x > F[[j]]$card[indx] || x < 0 ){
           cat('Invalid evidence X_' , v, ' = ', x);
         }

      map <- F[[j]]$var == v;
      assignments <- IndexToAssignment(F[[j]]$card);
      indxx = (assignments[, map] == x);
      assignments$val <- F[[j]]$val;
      assignments$val[!indxx] <- 0;
      F[[j]]$val <- assignments$val;
     }
    }
}
return(F)
}

#case 1;
F.1 <- list(var=c(1), card=c(2), val=c(0.11, 0.89));
F.2 <- list(var=c(2, 1), card=c(2, 2), val=c(0.59, 0.41, 0.22, 0.78));
F.3 <- list(var=c(3, 2), card=c(2, 2), val=c(0, 0.61, 0, 0));
F <- list(F.1, F.2, F.3)
E <- matrix(c(2, 1, 3, 2), byrow = T, nrow = 2)
ObserveEvidence(F, E);

#case 2;
Z = list(var=c(3, 2, 1), card=c(2, 2, 3), val=c(0.25, 0.35, 0.08, 0.16, 0.05, 0.07, 0, 0, 0.15, 0.21, 0.09, 0.18));
Z <- list(Z);
E <- matrix(c(3, 1), byrow = T, nrow = 1)
ObserveEvidence(Z, E);

Saturday, January 12, 2013

PGM2 - Factor Marginalization

#FactorMarginalization Sums given variables out of a factor.
FactorMarginalization <- function(A, V) {

# Check for empty factor or variable list
if ((length(A$var)==0) || length(V) == 0) B <- A; return;

# Construct the output factor over A.var \ V (the variables in A.var that are not in V)
B.var <- sort(setdiff(A$var, V));
mapB <- match(B.var, A$var);

# Check for empty resultant factor
if (length(B.var)==0) cat('Error: Resultant factor has empty scope');

# Initialize B.card and B.val
B.card <- A$card[mapB];
B.val <- rep(0, prod(B.card));

assignments <- IndexToAssignment(A$card);
assignments$val <- A$val;
assignments$indxB <- AssignmentToIndex(assignments[, mapB], B.card);
tmp1 <- ddply(assignments, .(indxB), function(x) sum(x$val));
B.val <- tmp1$V1[sort(tmp1$indxB)];

B = list(B.var, B.card, B.val);
names(B) = c("var", "card", "val");
return(B);
}

# FACTORS.MARGINALIZATION = FactorMarginalization(FACTORS.INPUT(2), [2]);
#testing 1;
#struct('var', [1], 'card', [2], 'val', [0.11, 0.89]);
A <- data.frame(matrix(c(1,2,0.11,0.89), nrow = 2));
#case1
A.var = c(2, 1);
A.card = c(2, 2);
A.val = c(0.59, 0.41, 0.22, 0.78);
A = list(A.var, A.card, A.val);
names(A) = c("var", "card", "val");
V = c(2);
FactorMarginalization(A,V)

#testing 2;
XX = list(c(3, 2, 1), c(2, 2, 3), c(0.25, 0.35, 0.08, 0.16, 0.05, 0.07, 0, 0, 0.15, 0.21, 0.09, 0.18));
names(XX) = c("var", "card", "val");
V = c(2);
FactorMarginalization(XX,V)

Thursday, January 3, 2013

PGM1 - Factor Product

FactorProduct Computes the product of two factors.
C = FactorProduct(A,B) computes the product between two factors, A and B, where each factor is defined over a set of variables with given dimension. The factor data structure has the following fields:
       .var    Vector of variables in the factor, e.g. [1 2 3]
       .card   Vector of cardinalities corresponding to .var, e.g. [2 2 2]
       .val    Value table of size prod(.card)

# assignments = IndexToAssignment(1:prod(C.card), C.card);
IndexToAssignment <- function(D){
# ensure that D is a row vector
D = unlist(D);
tmp = seq(1, D[1]);
if (length(D)==1){
return(tmp);
}else
{
tmp = list(tmp);
for (i in 2:length(D)){
tmp = c(tmp, list(seq(1, D[i])));
}
return(expand.grid(tmp));
}
}

#indxA = AssignmentToIndex(assignments(:, mapA), A.card);
AssignmentToIndex <- function(AA, D) {
if (length(D)==1){
return(AA);
}else
{
tmp<-IndexToAssignment(D);
names(AA) <- names(tmp);
AA$rank <- 1:length(AA[,1]);
tmp$index <- 1:prod(D);
tmp <- merge(AA, tmp);
tmp <- tmp[sort.list(tmp$rank),];
}
return(tmp$index);
}

# FactorProduct Computes the product of two factors.
FactorProduct <- function(A, B) {
# Check for empty factors;
if (length(A$var)==0) C = B; return;
if (length(B$var)==0) C = A; return;

# Check that variables in both A and B have the same cardinality
dummy = intersect(A$var, B$var);
if (length(dummy)>1){
# A and B have at least 1 variable in common
if (A$card[A$var == dummy] != B$card[B$var == dummy]){
cat('Dimensionality mismatch in factors');
return;
}
}

# Set the variables of C
C.var = sort(union(A$var, B$var));

# Set the cardinality of variables in C
C.card = rep(1, length(C.var));
mapA = match(A$var, C.var);
mapB = match(B$var, C.var);
C.card[mapA] = A$card;
C.card[mapB] = B$card;

# Initialize the factor values of C:
#   prod(C.card) is the number of entries in C
C.val = rep(1, prod(C.card));

assignments = IndexToAssignment(C.card);
indxA = AssignmentToIndex(assignments[, mapA], A$card);
indxB = AssignmentToIndex(assignments[, mapB], B$card);
C.val = A$val[indxA] * B$val[indxB];

C = list(C.var, C.card, C.val);
names(C) = c("var", "card", "val");
return(C);
}

#testing 1;
#struct('var', [1], 'card', [2], 'val', [0.11, 0.89]);
A <- data.frame(matrix(c(1,2,0.11,0.89), nrow = 2));
names(A) = c("1", "val");
A.var = 1;
A.card = 2;
A.val = c(0.11, 0.89);
A = list(A.var, A.card, A.val);
names(A) = c("var", "card", "val");

#struct('var', [2, 1], 'card', [2, 2], 'val', [0.59, 0.41, 0.22, 0.78]);
B <- data.frame(matrix(c(1,2,1,2,1,1,2,2,0.59, 0.41, 0.22, 0.78), nrow = 4));
names(B) = c("2", "1", "val");
B.var = c(2,1);
B.card = c(2,2);
B.val = c(0.59, 0.41, 0.22, 0.78);
B = list(B.var, B.card, B.val);
names(B) = c("var", "card", "val");

C = FactorProduct(A,B);
#var: [1 2]
#card: [2 2]
#val: [0.0649 0.1958 0.0451 0.6942]

#testing 2;
XX = list(c(2,1), c(2,3), c(0.5, 0.8, 0.1, 0, 0.3, 0.9));
names(XX) = c("var", "card", "val");

YY = list(c(3, 2), c(2, 2), c(0.5, 0.7, 0.1, 0.2));
names(YY) = c("var", "card", "val");

ZZ = FactorProduct(XX,YY);
#var: [1 2 3]
#card: [3 2 2]
#val: [0.2500 0.0500 0.1500 0.0800 0 0.0900 0.3500 0.0700 0.2100 0.1600 0 0.1800]