Monday, September 15, 2014

Classification Models in R

## 70/30 train/test split
n <- nrow(all)
ntrain <- round(n * 0.7)
set.seed(333)
tindex <- sample(n, ntrain)
all$y <- all$y == 1   # recode the target as logical (TRUE = positive class)
train <- all[tindex, ]
test <- all[-tindex, ]


##################################################################
## Naive Bayes
##################################################################
require(klaR)   # NaiveBayes()
require(ROCR)   # prediction(), performance()
train$y <- factor(train$y, levels = c(TRUE, FALSE))
newdata <- data.frame(y=test$y,
                      is_verified=test$is_verified,
                      pageviews1=test$pageviews1,
                      follows1=test$follows1,
                      likes1=test$likes1,
                      reblogs1=test$reblogs1,
                      original_posts1=test$original_posts1,
                      searches1=test$searches1,
                      unfollows1=test$unfollows1,
                      received_engagments1=test$received_engagments1,
                      time1=test$time1,
                      regi_geo1=test$regi_geo1)
nb1 <- NaiveBayes(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                  + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data=train)
tmp <- predict(nb1, newdata = newdata)
table(tmp$class, test$y)
test$score <- tmp$posterior[, "TRUE"]   # posterior probability of the positive class
# calculate AUC
pred <- prediction(test$score, test$y)
auc.tmp <- performance(pred, "auc")
auc <- as.numeric(auc.tmp@y.values); auc
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1, lty = 2)
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf, 'y.values')[[1]] - attr(perf, 'x.values')[[1]])
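
The AUC/KS block above is repeated for every model in this post, and the SVM and neural network sections below call AUC() and KS() helpers that are not shown here. A minimal sketch of what such helpers could look like, assuming they simply wrap the ROCR steps above (these definitions are an assumption, not part of the original script):

## Assumed helper functions: wrap the ROCR-based AUC and KS calculations above
AUC <- function(score, y) {
  pred <- prediction(score, y)
  as.numeric(performance(pred, "auc")@y.values)
}
KS <- function(score, y) {
  perf <- performance(prediction(score, y), "tpr", "fpr")
  max(attr(perf, "y.values")[[1]] - attr(perf, "x.values")[[1]])
}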


##################################################################
## Decision Tree
##################################################################
require(rpart)
## quick generic example: a tree on a data set that has a 'gear' column (e.g. mtcars);
## the model on the actual signup data follows below
ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3)
summary(ct)
plot(ct)
text(ct)
pred <- predict(ct, newdata = test, type = 'class')
prop.table(table(pred, test$gear))

ct <- rpart(as.factor(y) ~ is_verified + pageviews1 + follows1 + likes1
            + reblogs1 + original_posts1 + searches1 + unfollows1 + received_engagments1,
            data = train, minsplit = 5)
summary(ct)
plot(ct)
text(ct)

test$score <- predict(ct, newdata = test, type = "prob")[, "TRUE"]   # P(y = TRUE)
# calculate AUC
pred <- prediction(test$score, test$y)
auc.tmp <- performance(pred, "auc")
auc <- as.numeric(auc.tmp@y.values); auc
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1, lty = 2)
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf, 'y.values')[[1]] - attr(perf, 'x.values')[[1]])


##################################################################
## KNN
##################################################################
require(class)   # knn()
train_x <- data.frame(is_verified=as.numeric(train$is_verified),
                      pageviews1=as.numeric(train$pageviews1),
                      follows1=as.numeric(train$follows1),
                      likes1=as.numeric(train$likes1),
                      reblogs1=as.numeric(train$reblogs1),
                      original_posts1=as.numeric(train$original_posts1),
                      searches1=as.numeric(train$searches1),
                      unfollows1=as.numeric(train$unfollows1),
                      received_engagments1=as.numeric(train$received_engagments1),
                      time1=as.numeric(train$time1),
                      regi_geo1=as.numeric(train$regi_geo1))
train_y=train$y
test_x <- data.frame(is_verified=as.numeric(test$is_verified),
                     pageviews1=as.numeric(test$pageviews1),
                     follows1=as.numeric(test$follows1),
                     likes1=as.numeric(test$likes1),
                     reblogs1=as.numeric(test$reblogs1),
                     original_posts1=as.numeric(test$original_posts1),
                     searches1=as.numeric(test$searches1),
                     unfollows1=as.numeric(test$unfollows1),
                     received_engagments1=as.numeric(test$received_engagments1),
                     time1=as.numeric(test$time1),
                     regi_geo1=as.numeric(test$regi_geo1))

knn_pred <- knn(train_x, test_x, train_y, k = 5)   # avoid overwriting ROCR::prediction
table(knn_pred, test$y)

test$score <- as.numeric(knn_pred == "TRUE")   # 0/1 score from the predicted class
# calculate AUC
pred <- prediction(test$score, test$y)
auc.tmp <- performance(pred, "auc")
auc <- as.numeric(auc.tmp@y.values); auc
perf <- performance(pred, "tpr", "fpr")
plot(perf)
abline(0, 1, lty = 2)
# KS is the maximum difference between the cumulative true positive and cumulative false positive rate.
max(attr(perf, 'y.values')[[1]] - attr(perf, 'x.values')[[1]])
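
Because knn is distance-based, features with large ranges (e.g. pageviews) dominate the distance when the inputs are left on their raw scales. A common refinement, not part of the original post, is to standardize the features with the training-set statistics before fitting:

## Sketch: standardize with training-set means/SDs, then refit knn
train_xs <- scale(train_x)
test_xs <- scale(test_x,
                 center = attr(train_xs, "scaled:center"),
                 scale = attr(train_xs, "scaled:scale"))
knn_pred_scaled <- knn(train_xs, test_xs, train_y, k = 5)
table(knn_pred_scaled, test$y)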

require(ggplot2)
p <- ggplot(test, aes(pageviews1, follows1))
p + geom_point(aes(color = y))


##################################################################
## Support Vector Machines
##################################################################
require(e1071)   # svm()

## 0.6148956 0.2297911
svm_fit <- svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
               + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data = train,
               type = "C-classification", kernel = "radial", cost = 100, gamma = 1)
test$score <- as.numeric(predict(svm_fit, test) == "TRUE")   # coerce the predicted class to a 0/1 score
AUC(test$score, test$y)
KS(test$score, test$y)

## 0.621205 0.2424101
svm_fit <- svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
               + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1, data = train,
               type = "C-classification", kernel = "radial", cost = 1, gamma = 1)
test$score <- as.numeric(predict(svm_fit, test) == "TRUE")
AUC(test$score, test$y)
KS(test$score, test$y)
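
The two fits above compare cost = 100 and cost = 1 by hand. e1071 also ships tune.svm() for a small cross-validated grid search over cost and gamma; a sketch (the grid values are arbitrary, not from the original run):

## Sketch: 10-fold CV grid search over gamma and cost
tuned <- tune.svm(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                  + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1,
                  data = train, gamma = 10^(-2:1), cost = 10^(0:2))
summary(tuned)
tuned$best.parameters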


##################################################################
## Artificial Neural Networks
##################################################################
require(neuralnet)
## neuralnet needs numeric inputs, so recode the target and the factor predictors
train$y <- as.integer(train$y == "TRUE")   # back to a 0/1 target
train$y1 <- 1 - train$y
train$is_verified <- as.integer(train$is_verified)
train$time1 <- as.integer(train$time1)
train$regi_geo1 <- as.integer(train$regi_geo1)

nn1 <- neuralnet(y ~ is_verified + pageviews1 + follows1 + likes1 + reblogs1 + original_posts1
                 + searches1 + unfollows1 + received_engagments1 + time1 + regi_geo1,
                 data = train, hidden = c(4), linear.output = FALSE)   # logistic output for a 0/1 target
plot(nn1)

test_x <- data.frame(is_verified=as.integer(test$is_verified),
                     pageviews1=as.integer(test$pageviews1),
                     follows1=as.integer(test$follows1),
                     likes1=as.integer(test$likes1),
                     reblogs1=as.integer(test$reblogs1),
                     original_posts1=as.integer(test$original_posts1),
                     searches1=as.integer(test$searches1),
                     unfollows1=as.integer(test$unfollows1),
                     received_engagments1=as.integer(test$received_engagments1),
                     time1=as.integer(test$time1),
                     regi_geo1=as.integer(test$regi_geo1))
nn_pred <- compute(nn1, test_x)$net.result   # avoid overwriting ROCR::prediction

test$score <- nn_pred[, 1]
AUC(test$score, test$y)
KS(test$score, test$y)


##################################################################
## Random Forest
##################################################################
A random forest consists of a number of decision trees. Every node in a tree is a condition on a single feature, chosen to split the data so that similar response values end up in the same subset. The measure by which the (locally) optimal condition is chosen is called impurity: for classification it is typically Gini impurity or information gain/entropy, and for regression trees it is variance. When training a tree we can therefore compute how much each feature decreases the weighted impurity; for a forest, this impurity decrease is averaged across trees and the features are ranked by it.

In particular, trees that are grown very deep tend to learn highly irregular patterns: they overfit their training sets, i.e. they have low bias but very high variance. Random forests average multiple deep decision trees, each trained on a different part of the same training set, with the goal of reducing that variance. This comes at the expense of a small increase in bias and some loss of interpretability, but generally boosts the performance of the final model considerably.

install.packages('randomForest')
require(randomForest)
## generic example again, on a data set that has a 'gear' column (e.g. mtcars)
(rf <- randomForest(factor(gear) ~ ., data = train, ntree = 10))
plot(rf)
pred <- predict(rf, newdata = test, type = 'class')
prop.table(table(pred, test$gear))

## importance of variables
rf <- randomForest(factor(gear) ~ ., data = train, ntree = 100, importance = TRUE)
varImpPlot(rf, main = "Importance of Variables")
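
To see the impurity-based ranking described above as numbers rather than a plot, the importance matrix of the fitted forest can be inspected directly:

## Per-feature mean decrease in accuracy (permutation) and in Gini impurity
imp <- importance(rf)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), ]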


##################################################################
## Gradient Boosted Decision Trees
##################################################################
require("gbm")
gbm_perf <- gbm.perf(gbm_model, method = "cv")
predictions_gbm <- predict(gbm_model, newdata = testdf[, -response_column],
                           n.trees = gbm_perf, type = "response")
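
For parity with the other models, the response-scale predictions can be scored with the same AUC/KS helpers sketched in the Naive Bayes section (again assuming those helper definitions):

test$score <- predictions_gbm
AUC(test$score, test$y)
KS(test$score, test$y)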

