My DS Coding Bolg: Part 1: Arules in R

Association Rules for Market Basket Analysis

Need to examine a strategy to extract insights from transactions and cooccurrence data sets by association rule mining. Association rule analysis attempts to find sets of informative patterns from large, sparse data sets.

The idea of association rule mining is this: when events occur together more often than one would expect form their individual rates of occurrence, such cooccurrence is an interesting pattern.

An association is simple the co-occurrence of two or more things.

The support for a set of items is the proportion of all transactions that contain the set.

The confidence is the support for the co-occurrence of all items in a rule, conditional on the support for the left-hand set along. Support (X and Y) | Support (X)

The lift is the support of a set conditional on the joint support of each element, Support (X and Y) | Support(X) Support(Y). Say lift = 50, which indicates the combination occurs 50 times more often than we would expect if the two items were independent.

#Package arules provides both data structures for efficient handling of sparse binary data as well as interfaces to implementations of Apriori and Eclat for mining frequent itemsets, maximal frequent itemsets, closed frequent itemsets and association rules.

require(arules)

#example 1: creating transactions form a list;

a_list <- list(

c("a","b","c"),

c("a","b"),

c("a","b","d"),

c("c","e"),

c("a","b","d","e")

)

names(a_list) <- paste("Tr",c(1:5), sep = "")

a_list

trans <- as(a_list, "transactions")

summary(trans)

image(trans)

## example 2: creating transactions from a matrix

a_matrix <- matrix(

c(1,1,1,0,0,

1,1,0,0,0,

1,1,0,1,0,

0,0,1,0,1,

1,1,0,1,1), ncol = 5)

dimnames(a_matrix) <- list(

c("a","b","c","d","e"),

paste("Tr",c(1:5), sep = ""))

a_matrix

trans2 <- as(a_matrix, "transactions")

trans2

inspect(trans2)

## example 3: creating transactions from data.frame

a_df <- data.frame(

age = as.factor(c(6,8,7,6,9,5)),

grade = as.factor(c(1,3,1,1,4,1)))

## note: all attributes have to be factors

a_df

## coerce

trans3 <- as(a_df, "transactions")

image(trans3)

#5.1. Example 1: Analyzing and preparing a transaction data set;

data("Epub")

summary(Epub)

year <- strftime(as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]]), "%Y")

table(year)

Epub2003 <- Epub[year == "2003"]

length(Epub2003)

image(Epub2003)

transactionInfo(Epub2003[size(Epub2003) > 20])

inspect(Epub2003[1:5])

as(Epub2003[1:5], "list")

EpubTidLists <- as(Epub, "tidLists")

EpubTidLists

as(EpubTidLists[1:3], "list")

#5.2. Example 2: Preparing and mining a questionnaire data set;

data("AdultUCI")

dim(AdultUCI)

AdultUCI[["fnlwgt"]] <- NULL

AdultUCI[["education-num"]] <- NULL

AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),labels = c("Young", "Middle-aged", "Senior", "Old"))

AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],c(0,25,40,60,168)),labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))

AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),Inf)),labels = c("None", "Low", "High"))

AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],c(-Inf,0,median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),Inf)),labels = c("none", "low", "high"))

Adult <- as(AdultUCI, "transactions")

Adult

summary(Adult)

itemFrequencyPlot(Adult, support = 0.1, cex.names=0.8)

rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6))

rules

summary(rules)

rulesIncomeSmall <- subset(rules, subset = rhs %in% "income=small" & lift > 1.2)

rulesIncomeLarge <- subset(rules, subset = rhs %in% "income=large" & lift > 1.2)

inspect(head(SORT(rulesIncomeSmall, by = "confidence"), n = 3))

inspect(head(SORT(rulesIncomeLarge, by = "confidence"), n = 3))

WRITE(rulesIncomeSmall, file = "data.csv", sep = ",", col.names = NA)

#5.3. Example 3: Extending arules with a new interest measure;

data("Adult")

fsets <- eclat(Adult, parameter = list(support = 0.05), control = list(verbose=FALSE))

singleItems <- fsets[size(items(fsets)) == 1]

singleSupport <- quality(singleItems)$support

names(singleSupport) <- unlist(LIST(items(singleItems), decode = FALSE))

head(singleSupport, n = 5)

itemsetList <- LIST(items(fsets), decode = FALSE)

allConfidence <- quality(fsets)$support / sapply(itemsetList, function(x) max(singleSupport[as.character(x)]))

quality(fsets) <- cbind(quality(fsets), allConfidence)

fsetsEducation <- subset(fsets, subset = items %pin% "education")

inspect(SORT(fsetsEducation[size(fsetsEducation)>1], by = "allConfidence")[1 : 3])

#5.4. Example 4: Sampling

data("Adult")

Adult

supp <- 0.05

epsilon <- 0.1

c <- 0.1

n <- -2 * log(c)/ (supp * epsilon^2)

AdultSample <- sample(Adult, n, replace = TRUE)

itemFrequencyPlot(AdultSample, population = Adult, support = supp, cex.names = 0.7)

itemFrequencyPlot(AdultSample, population = Adult, support = supp, lift = TRUE, cex.names = 0.9)

time <- system.time(itemsets <- eclat(Adult, parameter = list(support = supp), control = list(verbose = FALSE)))

## Application;
install.packages("chron")
install.packages("ggplot2")
install.packages("plyr")
install.packages("reshape2")
install.packages("arules")
install.packages("arulesViz")

#Analytical packages;
require(chron)
require(ggplot2)
require(plyr)
require(reshape2)
require(arules)
require(arulesViz)
require(reshape2)

#Association Rules;
load(file = "result5.rda")
result4$user_cumcnt <- as.integer(result4$user_cumcnt)
result4$user_cnt <- as.integer(result4$user_cnt)

#dropped robot;
#one user one basket without duplicate categories;
basic <- data.frame(result4[result4$user_cnt<=60, c(1,15)])
#dedupe due to merge of conv. category
basic <- unique(basic)
visit_cmatrix <- dcast(basic, userid ~ category)
cmatrix <- visit_cmatrix[,-1]

for (i in 1: length(cmatrix[1,])){
cmatrix[,i] <- ifelse(is.na(cmatrix[,i]),NA,1)
cmatrix[,i] <- as.factor(cmatrix[,i])
}

tmp <- as(cmatrix, "transactions")
tmp
summary(tmp)
inspect(tmp[1:10])
itemFrequencyPlot(tmp, support = 0.0001, cex.names=0.9)
#junk<-sort(itemFrequency(tmp))
#txt <- rep("",length(junk))
#for (i in 1:length(junk)) txt[i] <- strsplit(names(junk[i]), split="=")[[1]][1]
#plot1 <- barplot(junk, names = txt, col='blue', cex.names=0.5)
#text(plot1, (1:length(junk)) + par("cxy")[2], junk, cex = .8)

rules <- apriori(tmp, parameter = list(support = 0.001, confidence = 0.005))
rules
summary(rules)
#support --> P(A&B)
summary(quality(rules)$support)
#confidence --> P(A&B)/P(A)
summary(quality(rules)$confidence)
#lift --> P(A&B)/P(A)P(B)
summary(quality(rules)$lift)

inspect(head(SORT(rules, by = "confidence"), n = 10))
inspect(head(sort(rules, by = "lift"), n = 10))
plot(rules, method = NULL, measure = "support", shading = "lift", interactive = FALSE, data=tmp)

plot(rules, measure=c("support", "lift"), shading="confidence", interactive=TRUE, data = tmp)

#Scatter plot
plot(rules)
head(quality(rules))
plot(rules, measure=c("support", "lift"), shading="confidence")

#Matrix-based visualizations
subrules <- rules[quality(rules)$confidence > 0.5]
plot(subrules, method="matrix", measure="lift")
plot(subrules, method="matrix", measure="lift", control=list(reorder=TRUE))
plot(subrules, method="matrix3D", measure="lift")

plot(rules, method="grouped")
plot(rules, method="grouped", control=list(k=50))

rulesRev <- new("rules", lhs=lhs(subrules), rhs=rhs(subrules))
m <- match(subrules, rulesRev, nomatch=0)
result <- data.frame(rules = labels(subrules), quality(subrules))
result$rules <- as.character(result$rules)
lhs <- do.call(rbind,strsplit(result[,1], " => "))
result <- cbind(lhs,result[,2:4])

plot <- data.frame(result[,c(1,2,5)])
names(plot) <- c("start", "end", "weight")
g <- graph.data.frame(plot, directed = T)
g
V(g)$name
E(g)$weight
ideg <- degree(g, mode = "in", loops = F)
#col=rainbow(12) # For edge colors

l <-layout.reingold.tilford(g)
l[,1] <- 0:15
l[,2] <- 0:15
plot.igraph(g,
vertex.label = V(g)$name,
vertex.label.color = "gray20",
vertex.size = ideg*5 + 10,
vertex.size2 = 10,
edge.arrow.size=0.5,
edge.width = 2,
edge.curved = T,
layout = l)

My DS Coding Bolg

Wednesday, October 28, 2015

Part 1: Arules in R

No comments:

Post a Comment

Blog Archive