Thursday, November 29, 2012

SVM 1 - Using LIBSVM in R

memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

require(RODBC)
require(ggplot2)
require(plyr)
require(e1071)

# http://planatscher.net/svmtut/svmtut.html
# 1) Task: linear separable data
data1 <- seq(1,10,by=2)
classes1 <- c('a','a','a','b','b')
test1 <- seq(1,10,by=2) + 1
model1 <- svm(data1,classes1,type='C',kernel='linear')
print(model1)
summary(model1)
predict(model1,data1)
pred <- fitted(model1)
table(pred, classes1)
predict(model1,test1)

#2) Task: not-linear-separable data
data2 <- seq(1,10)
classes2 <- c('b','b','b','a','a','a','a','b','b','b')
model2 <- svm(data2,classes2,type='C',kernel='linear')
predict(model2,data2)
table(predict(model2,data2), classes2)

#3) Task: Example from the manual (iris-data)
attach(iris)
model <- svm(Species ~ ., data = iris)
x <- subset(iris, select = -Species)
y <- Species
model <- svm(x, y)
print(model)
summary(model)
pred <- predict(model, x)
pred <- fitted(model)
table(pred, y)

pred <- predict(model, x, decision.values = TRUE)
attr(pred, "decision.values")[1:4,]
plot(cmdscale(dist(iris[,-5])),col = as.integer(iris[,5]),pch = c("o","+")[1:150 %in% model$index + 1])

i2 <- iris
levels(i2$Species)[3] <- "versicolor"
summary(i2$Species)
wts <- 100 / table(i2$Species)
wts
m <- svm(Species ~ ., data = i2, class.weights = wts)
detach(iris)

x <- seq(0.1, 5, by = 0.05)
y <- log(x) + rnorm(x, sd = 0.2)
m <- svm(x, y)
new <- predict(m, x)

plot(x, y)
points(x, log(x), col = 2)
points(x, new, col = 4)
X <- data.frame(a = rnorm(1000), b = rnorm(1000))
attach(X)

m <- svm(X, gamma = 0.1)
m <- svm(~., data = X, gamma = 0.1)
newdata <- data.frame(a = c(0, 4), b = c(0, 4))
predict (m, newdata)

plot(X, col = 1:1000 %in% m$index + 1, xlim = c(-5,5), ylim=c(-5,5))
points(newdata, pch = "+", col = 2, cex = 5)

#4) Task: Breast-cancer data
bcdata <- read.csv('SVM\\breast-cancer-wisconsin.data',head=TRUE)
names(bcdata)
databcall <- subset(bcdata,select=c(-Samplecodenumber,-Class))
classesbcall <- subset(bcdata,select=Class)
databctrain <- databcall[1:400,]
classesbctrain <- classesbcall[1:400,]
databctest <- databcall[401:699,]
classesbctest <- classesbcall[401:699,]
model <- svm(databctrain, classesbctrain)
pred <- predict(model, databctest)
table(pred,t(classesbctest))
tune(svm, train.x=databctrain, train.y=classesbctrain, validation.x=databctest, validation.y=classesbctest, ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)), control = tune.control(sampling = "fix"))


# Try binary
iris.part = subset(iris, Species != 'setosa')
iris.part$Species = factor(iris.part$Species)
iris.part = iris.part[, c(1,2,5)]
head(iris.part)

# Fit svm model
fit = svm(Species ~ ., data=iris.part, type='C-classification', kernel='linear')
summary(fit)

# Make a plot of the model
dev.new(width=5, height=5)
plot(fit, iris.part)

# Tabulate actual labels vs. fitted labels
pred = predict(fit, iris.part)
table(Actual=iris.part$Species, Fitted=pred)

# Obtain feature weights
w = t(t(fit$coefs) %*% fit$SV)
coef = data.frame(dimnames(w)[[1]],w)

# Calculate decision values manually
iris.scaled = scale(iris.part[,-3], fit$x.scale[[1]], fit$x.scale[[2]])
t(w) %*% t(as.matrix(iris.scaled)) - fit$rho

tmp1 <- (iris.part[,1] - fit$x.scale[[1]][1])/fit$x.scale[[2]][1];
tmp2 <- (iris.part[,2] - fit$x.scale[[1]][2])/fit$x.scale[[2]][2];
tmp <- data.frame(tmp1, tmp2);
scores <- t(w) %*% t(as.matrix(tmp)) - fit$rho

# Should equal...
fit$decision.values

# tune parameters;
fit = svm(Species ~ ., data=iris.part, type='C-classification', kernel='linear')
obj <- tune(svm, Species~., data=iris.part, tunecontrol = tune.control(sampling = "fix"))
summary(obj)
plot(obj, iris.part)
obj = best.tune(svm, Species ~ ., data=iris.part, type='C-classification', kernel = 'linear')

# Journal of Stat software;
data(iris)
n = length(iris[,1])
msk = sample(1:n, n*.8)
iris_train = iris[msk,]
iris_test = iris[-msk,]

model <- svm(Species ~ ., data = iris_train, method = "C-classification", kernel = "radial", cost = 10, gamma = 0.1)
summary(model)
plot(model, iris_train, Petal.Width ~Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4))
pred <- predict(model, head(iris), decision.values = TRUE)
attr(pred, "decision.values")
prediction <- predict(model, iris_test)
tab <- table(pred = prediction, true = iris_test[,5]); tab

#tune framework;
tuned <- tune.svm(Species ~ ., data = iris_train, gamma = 10^(-6:-1), cost = 10^(-1:1))
summary(tuned)
model  <- svm(Species ~ ., data = iris_train, kernel="radial", gamma=0.01, cost=10)
summary(model)
plot(model, iris_train, Petal.Width ~Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4))
prediction <- predict(model, iris_test)
tab <- table(pred = prediction, true = iris_test[,5]); tab

#tune binary classifier;
iris.part = subset(iris, Species != 'setosa')
iris.part$Species = factor(iris.part$Species)
iris.part = iris.part[, c(1,2,5)]
head(iris.part)

n = length(iris.part[,1])
msk = sample(1:n, n*.8)
iris_train = iris.part[msk,]
iris_test = iris.part[-msk,]
tobj <- tune.svm(Species ~ ., data = iris_train, gamma = 10^(-6:-3), cost = 10^(-1:1))
summary(tobj)

plot(tobj, transform.x = log10, xlab = expression(log[10](gamma)), ylab = "C")
bestGamma <- tobj$best.parameters[[1]]
bestC <- tobj$best.parameters[[2]]
model <- svm(Species ~ ., data = iris_train, cost = bestC, gamma = bestGamma, cross = 10)
summary(model)

pred <- predict(model, iris_test)
(acc <- table(pred, iris_test$Species))
classAgreement(acc)

Tuesday, November 6, 2012

Generate Mulitpage pdf

fnames <- c('filename')

#data contains two objests(icr, icr.new)
multipage <- function(fnames, result.table, outputfnames){
  pdf(outputfnames)
  for (i in 1:length(fnames)){
#    for (i in 1:10){
    data <- load(file = paste(fnames[i], ".RData", sep = ""))
    if (data[1] != "icr") { icr <- rate1;
                            icr.new <- rate9;}
 
    icr$rank <- icr$rank/1000000
    icr.new$rank <- icr.new$rank/1000000
    rate10$rank <- rate10$rank/1000000;
 
    points <- result.table[result.table$campaign == fnames[i],3:6]
    points$size <- points$size/1000000
    p <- ggplot(data=icr, aes(y=rank.1, x=rank)) +
      scale_x_continuous(limits=c(0,25)) +
      geom_line(aes(x=rank, y=rank.1),size = 1.5, color = "black") +
      geom_line(data=icr.new, aes(x=rank, y=rank.1),size = 1.5, color = "red") +
      geom_line(data=rate10, aes(x=rank, y=rank.1),size = 1.5, color = "darkgreen") +
      geom_point(data=points, aes(x=size, y=pred.lift), size = 3, color = "blue") +
      geom_point(data=points, aes(x=size, y=nb.lift), size = 3, color = "purple") +
      geom_point(data=points, aes(x=size, y=em.lift), size = 3, color = "green") +
      xlab("Audience Size (X 1,000,000)") + ylab("Est. Lift (%)") +
      opts(axis.text.x = theme_text(face = "bold", size = 12)) +
      opts(axis.text.y = theme_text(face = "bold", size = 12)) +
      opts(axis.title.x  = theme_text(face = "bold", size = 12)) +
      opts(axis.title.y  = theme_text(face = "bold", size = 12, angle = 90)) +
      opts(title = paste("Lift Plot of ", fnames[i],sep="")) +
      opts(plot.title = theme_text(face = "bold", size=14))
 
    print(p)
  }
  dev.off()
}

multipage (fnames, result.table, "results.pdf")