# Association Rules for Market Basket Analysis ----
# We examine a strategy to extract insights from transaction and
# co-occurrence data sets by association rule mining. Association rule
# analysis attempts to find sets of informative patterns in large, sparse
# data sets.
# The idea of association rule mining is this: when events occur together
# more often than one would expect from their individual rates of
# occurrence, such co-occurrence is an interesting pattern.
# An association is simply the co-occurrence of two or more things.
# The support for a set of items is the proportion of all transactions that
# contain the set.
# The confidence of a rule is the support for the co-occurrence of all items
# in the rule, conditional on the support for the left-hand set alone:
# Support(X and Y) / Support(X).
# The lift is the support of a set relative to the product of the supports of
# its elements: Support(X and Y) / (Support(X) * Support(Y)). A lift of 50
# indicates the combination occurs 50 times more often than we would expect
# if the two items were independent.
# Package arules provides both data structures for efficient handling of
# sparse binary data as well as interfaces to implementations of Apriori and
# Eclat for mining frequent itemsets, maximal frequent itemsets, closed
# frequent itemsets and association rules.
# (Repeated overview.)
# We examine a strategy to extract insights from transaction and
# co-occurrence data sets by association rule mining. Association rule
# analysis attempts to find sets of informative patterns in large, sparse
# data sets.
# The idea of association rule mining is this: when events occur together
# more often than one would expect from their individual rates of
# occurrence, such co-occurrence is an interesting pattern.
# An association is simply the co-occurrence of two or more things.
# The support for a set of items is the proportion of all transactions that
# contain the set.
# The confidence of a rule is the support for the co-occurrence of all items
# in the rule, conditional on the support for the left-hand set alone:
# Support(X and Y) / Support(X).
# The lift is the support of a set relative to the product of the supports of
# its elements: Support(X and Y) / (Support(X) * Support(Y)). A lift of 50
# indicates the combination occurs 50 times more often than we would expect
# if the two items were independent.
# Package arules provides both data structures for efficient handling of
# sparse binary data as well as interfaces to implementations of Apriori and
# Eclat for mining frequent itemsets, maximal frequent itemsets, closed
# frequent itemsets and association rules.
# Load arules with library(), which errors immediately if the package is
# missing; require() only returns FALSE and lets the script limp on.
library(arules)

# Example 1: creating transactions from a list of item vectors.
a_list <- list(
  c("a", "b", "c"),
  c("a", "b"),
  c("a", "b", "d"),
  c("c", "e"),
  c("a", "b", "d", "e")
)
# Name the baskets Tr1..TrN; seq_along() avoids hard-coding the count.
names(a_list) <- paste0("Tr", seq_along(a_list))
a_list
trans <- as(a_list, "transactions")
summary(trans)
image(trans)
## Example 2: creating transactions from a 0/1 incidence matrix.
# The data vector is filled column-wise, so each run of five values below
# becomes one column of the matrix.
incidence <- c(
  1, 1, 1, 0, 0,
  1, 1, 0, 0, 0,
  1, 1, 0, 1, 0,
  0, 0, 1, 0, 1,
  1, 1, 0, 1, 1
)
a_matrix <- matrix(incidence, ncol = 5)
dimnames(a_matrix) <- list(
  c("a", "b", "c", "d", "e"),
  paste0("Tr", 1:5)
)
a_matrix
trans2 <- as(a_matrix, "transactions")
trans2
inspect(trans2)
## Example 3: creating transactions from a data.frame.
## Note: every column must be a factor before coercion to transactions.
ages <- factor(c(6, 8, 7, 6, 9, 5))
grades <- factor(c(1, 3, 1, 1, 4, 1))
a_df <- data.frame(age = ages, grade = grades)
a_df
## Coerce to transactions and visualise the sparse item matrix.
trans3 <- as(a_df, "transactions")
image(trans3)
# 5.1 Example 1: analysing and preparing a transaction data set.
data("Epub")
summary(Epub)

# Extract the publication year of each session from its timestamp.
session_times <- as.POSIXlt(transactionInfo(Epub)[["TimeStamp"]])
year <- strftime(session_times, "%Y")
table(year)

# Restrict to the 2003 sessions and look at a few of them.
Epub2003 <- Epub[year == "2003"]
length(Epub2003)
image(Epub2003)
# Sessions with more than 20 downloads.
transactionInfo(Epub2003[size(Epub2003) > 20])
inspect(Epub2003[1:5])
as(Epub2003[1:5], "list")

# Transaction-ID lists: for each item, the transactions containing it.
EpubTidLists <- as(Epub, "tidLists")
EpubTidLists
as(EpubTidLists[1:3], "list")
# 5.2 Example 2: preparing and mining a questionnaire data set.
data("AdultUCI")
dim(AdultUCI)

# Drop attributes that carry no itemset meaning: fnlwgt is a sampling
# weight, education-num duplicates education.
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL

# Discretise the remaining numeric attributes into ordered factors so the
# data frame can be coerced to transactions.
AdultUCI[["age"]] <- ordered(
  cut(AdultUCI[["age"]], c(15, 25, 45, 65, 100)),
  labels = c("Young", "Middle-aged", "Senior", "Old"))
AdultUCI[["hours-per-week"]] <- ordered(
  cut(AdultUCI[["hours-per-week"]], c(0, 25, 40, 60, 168)),
  labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))
# Capital gain/loss: split at 0 and at the median of the positive values.
AdultUCI[["capital-gain"]] <- ordered(
  cut(AdultUCI[["capital-gain"]],
      c(-Inf, 0,
        median(AdultUCI[["capital-gain"]][AdultUCI[["capital-gain"]] > 0]),
        Inf)),
  labels = c("None", "Low", "High"))
AdultUCI[["capital-loss"]] <- ordered(
  cut(AdultUCI[["capital-loss"]],
      c(-Inf, 0,
        median(AdultUCI[["capital-loss"]][AdultUCI[["capital-loss"]] > 0]),
        Inf)),
  labels = c("none", "low", "high"))

Adult <- as(AdultUCI, "transactions")
Adult
summary(Adult)
itemFrequencyPlot(Adult, support = 0.1, cex.names = 0.8)

# Mine association rules with minimum support 1% and confidence 60%.
rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6))
rules
summary(rules)

# Rules predicting the income item, keeping only lift clearly above 1.
rulesIncomeSmall <- subset(rules, subset = rhs %in% "income=small" & lift > 1.2)
rulesIncomeLarge <- subset(rules, subset = rhs %in% "income=large" & lift > 1.2)

# SORT()/WRITE() were deprecated and removed from arules; the supported
# spellings are sort() and write().
inspect(head(sort(rulesIncomeSmall, by = "confidence"), n = 3))
inspect(head(sort(rulesIncomeLarge, by = "confidence"), n = 3))
write(rulesIncomeSmall, file = "data.csv", sep = ",", col.names = NA)
# 5.3 Example 3: extending arules with a new interest measure
# (all-confidence).
data("Adult")
fsets <- eclat(Adult, parameter = list(support = 0.05),
               control = list(verbose = FALSE))

# Support of every single item, named by its (undecoded) item id.
singleItems <- fsets[size(items(fsets)) == 1]
singleSupport <- quality(singleItems)$support
names(singleSupport) <- unlist(LIST(items(singleItems), decode = FALSE))
head(singleSupport, n = 5)

# all-confidence(X) = support(X) / max over x in X of support({x}).
itemsetList <- LIST(items(fsets), decode = FALSE)
allConfidence <- quality(fsets)$support /
  sapply(itemsetList, function(x) max(singleSupport[as.character(x)]))
quality(fsets) <- cbind(quality(fsets), allConfidence)

# Itemsets mentioning education, ranked by the new measure. SORT() was
# removed from arules; sort() is the supported spelling.
fsetsEducation <- subset(fsets, subset = items %pin% "education")
inspect(sort(fsetsEducation[size(fsetsEducation) > 1],
             by = "allConfidence")[1:3])
# 5.4 Example 4: sampling.
data("Adult")
Adult

# Required sample size so that support `supp` is estimated within relative
# error `epsilon` with failure probability `cert`. Renamed from `c`, which
# shadowed base::c().
supp <- 0.05
epsilon <- 0.1
cert <- 0.1
n <- -2 * log(cert) / (supp * epsilon^2)
n
AdultSample <- sample(Adult, n, replace = TRUE)
# Compare item frequencies in the sample against the full population.
itemFrequencyPlot(AdultSample, population = Adult, support = supp,
                  cex.names = 0.7)
itemFrequencyPlot(AdultSample, population = Adult, support = supp,
                  lift = TRUE, cex.names = 0.9)
# Time the mining run on the full data set for reference.
time <- system.time(
  itemsets <- eclat(Adult, parameter = list(support = supp),
                    control = list(verbose = FALSE)))
## Application ----
# Analytical packages: install only what is missing, then load with
# library() (require() only returns FALSE on failure). igraph is added
# because the graph section below calls graph.data.frame(), V(), E(),
# degree(), layout.reingold.tilford() and plot.igraph(), none of which are
# attached by arules or arulesViz. The duplicated require(reshape2) from
# the original list is also dropped.
pkgs <- c("chron", "ggplot2", "plyr", "reshape2", "arules", "arulesViz",
          "igraph")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Association rules ----
load(file = "result5.rda")
result4$user_cumcnt <- as.integer(result4$user_cumcnt)
result4$user_cnt <- as.integer(result4$user_cnt)

# Dropped robots: users with more than 60 events are excluded.
# One user = one basket, without duplicate categories.
# NOTE(review): columns 1 and 15 are assumed to be userid and category —
# confirm against the structure of result4.
basic <- data.frame(result4[result4$user_cnt <= 60, c(1, 15)])

# De-duplicate rows (needed after the merge of the conversion category).
basic <- unique(basic)

# User x category incidence table; drop the userid column.
visit_cmatrix <- dcast(basic, userid ~ category)
cmatrix <- visit_cmatrix[, -1]

# Recode every column as a factor: 1 if the user visited the category, NA
# otherwise. seq_len(ncol(...)) replaces 1:length(cmatrix[1, ]), which
# misbehaves on a zero-column frame.
for (i in seq_len(ncol(cmatrix))) {
  cmatrix[, i] <- ifelse(is.na(cmatrix[, i]), NA, 1)
  cmatrix[, i] <- as.factor(cmatrix[, i])
}

tmp <- as(cmatrix, "transactions")
tmp
summary(tmp)
inspect(tmp[1:10])
itemFrequencyPlot(tmp, support = 0.0001, cex.names = 0.9)
#junk<-sort(itemFrequency(tmp))
#txt <- rep("",length(junk))
#for (i in 1:length(junk)) txt[i] <- strsplit(names(junk[i]), split="=")[[1]][1]
#plot1 <- barplot(junk, names = txt, col='blue', cex.names=0.5)
#text(plot1, (1:length(junk)) + par("cxy")[2], junk, cex = .8)
# Mine rules with very low thresholds to get a broad candidate set.
rules <- apriori(tmp, parameter = list(support = 0.001, confidence = 0.005))
rules
summary(rules)

# support    --> P(A & B)
summary(quality(rules)$support)
# confidence --> P(A & B) / P(A)
summary(quality(rules)$confidence)
# lift       --> P(A & B) / (P(A) * P(B))
summary(quality(rules)$lift)

# SORT() was removed from arules; sort() is the supported spelling.
inspect(head(sort(rules, by = "confidence"), n = 10))
inspect(head(sort(rules, by = "lift"), n = 10))

# method = NULL lets arulesViz pick its default (scatter) plot; the
# interactive variant allows selecting rules in the device.
plot(rules, method = NULL, measure = "support", shading = "lift",
     interactive = FALSE, data = tmp)
plot(rules, measure = c("support", "lift"), shading = "confidence",
     interactive = TRUE, data = tmp)
#Scatter plot
# Default arulesViz scatter plot: support vs confidence, shaded by lift.
plot(rules)
head(quality(rules))
# Same rules, plotting support vs lift and shading by confidence instead.
plot(rules, measure=c("support", "lift"), shading="confidence")
#Matrix-based visualizations
# Keep only the high-confidence rules for the matrix displays.
subrules <- rules[quality(rules)$confidence > 0.5]
plot(subrules, method="matrix", measure="lift")
# reorder = TRUE clusters similar rules together in the matrix.
plot(subrules, method="matrix", measure="lift", control=list(reorder=TRUE))
plot(subrules, method="matrix3D", measure="lift")
# Grouped matrix: LHS item groups vs RHS items; k sets the group count.
plot(rules, method="grouped")
plot(rules, method="grouped", control=list(k=50))
# Build a directed graph from the high-confidence rules: LHS -> RHS edges
# weighted by lift. (The original rulesRev/match() pair was computed but
# never used, so it is dropped.)
result <- data.frame(rules = labels(subrules), quality(subrules))
result$rules <- as.character(result$rules)
# Split the "lhs => rhs" labels into two columns.
lhs <- do.call(rbind, strsplit(result[, 1], " => "))
result <- cbind(lhs, result[, 2:4])
# Columns 1, 2, 5 of the bound frame: LHS label, RHS label, lift.
# Renamed from `plot`, which shadowed base::plot().
edges <- data.frame(result[, c(1, 2, 5)])
names(edges) <- c("start", "end", "weight")
g <- graph.data.frame(edges, directed = TRUE)
g
V(g)$name
E(g)$weight
# In-degree drives the vertex size below.
ideg <- degree(g, mode = "in", loops = FALSE)
l <- layout.reingold.tilford(g)
# Spread the vertices along a diagonal. 0:(nrow(l) - 1) generalises the
# hard-coded 0:15, which errored unless the graph had exactly 16 vertices.
l[, 1] <- 0:(nrow(l) - 1)
l[, 2] <- 0:(nrow(l) - 1)
plot.igraph(g,
            vertex.label = V(g)$name,
            vertex.label.color = "gray20",
            vertex.size = ideg * 5 + 10,
            vertex.size2 = 10,
            edge.arrow.size = 0.5,
            edge.width = 2,
            edge.curved = TRUE,
            layout = l)
# Analytical packages (repeated section): install only what is missing,
# then load with library() (require() only returns FALSE on failure).
# igraph is added because the graph section below calls
# graph.data.frame(), V(), E(), degree(), layout.reingold.tilford() and
# plot.igraph(), none of which are attached by arules or arulesViz. The
# duplicated require(reshape2) from the original list is also dropped.
pkgs <- c("chron", "ggplot2", "plyr", "reshape2", "arules", "arulesViz",
          "igraph")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Association rules ----
load(file = "result5.rda")
result4$user_cumcnt <- as.integer(result4$user_cumcnt)
result4$user_cnt <- as.integer(result4$user_cnt)

# Dropped robots: users with more than 60 events are excluded.
# One user = one basket, without duplicate categories.
# NOTE(review): columns 1 and 15 are assumed to be userid and category —
# confirm against the structure of result4.
basic <- data.frame(result4[result4$user_cnt <= 60, c(1, 15)])

# De-duplicate rows (needed after the merge of the conversion category).
basic <- unique(basic)

# User x category incidence table; drop the userid column.
visit_cmatrix <- dcast(basic, userid ~ category)
cmatrix <- visit_cmatrix[, -1]

# Recode every column as a factor: 1 if the user visited the category, NA
# otherwise. seq_len(ncol(...)) replaces 1:length(cmatrix[1, ]), which
# misbehaves on a zero-column frame.
for (i in seq_len(ncol(cmatrix))) {
  cmatrix[, i] <- ifelse(is.na(cmatrix[, i]), NA, 1)
  cmatrix[, i] <- as.factor(cmatrix[, i])
}

tmp <- as(cmatrix, "transactions")
tmp
summary(tmp)
inspect(tmp[1:10])
itemFrequencyPlot(tmp, support = 0.0001, cex.names = 0.9)
#junk<-sort(itemFrequency(tmp))
#txt <- rep("",length(junk))
#for (i in 1:length(junk)) txt[i] <- strsplit(names(junk[i]), split="=")[[1]][1]
#plot1 <- barplot(junk, names = txt, col='blue', cex.names=0.5)
#text(plot1, (1:length(junk)) + par("cxy")[2], junk, cex = .8)
# Mine rules with very low thresholds to get a broad candidate set.
rules <- apriori(tmp, parameter = list(support = 0.001, confidence = 0.005))
rules
summary(rules)

# support    --> P(A & B)
summary(quality(rules)$support)
# confidence --> P(A & B) / P(A)
summary(quality(rules)$confidence)
# lift       --> P(A & B) / (P(A) * P(B))
summary(quality(rules)$lift)

# SORT() was removed from arules; sort() is the supported spelling.
inspect(head(sort(rules, by = "confidence"), n = 10))
inspect(head(sort(rules, by = "lift"), n = 10))

# method = NULL lets arulesViz pick its default (scatter) plot; the
# interactive variant allows selecting rules in the device.
plot(rules, method = NULL, measure = "support", shading = "lift",
     interactive = FALSE, data = tmp)
plot(rules, measure = c("support", "lift"), shading = "confidence",
     interactive = TRUE, data = tmp)
#Scatter plot
# Default arulesViz scatter plot: support vs confidence, shaded by lift.
plot(rules)
head(quality(rules))
# Same rules, plotting support vs lift and shading by confidence instead.
plot(rules, measure=c("support", "lift"), shading="confidence")
#Matrix-based visualizations
# Keep only the high-confidence rules for the matrix displays.
subrules <- rules[quality(rules)$confidence > 0.5]
plot(subrules, method="matrix", measure="lift")
# reorder = TRUE clusters similar rules together in the matrix.
plot(subrules, method="matrix", measure="lift", control=list(reorder=TRUE))
plot(subrules, method="matrix3D", measure="lift")
# Grouped matrix: LHS item groups vs RHS items; k sets the group count.
plot(rules, method="grouped")
plot(rules, method="grouped", control=list(k=50))
# Build a directed graph from the high-confidence rules: LHS -> RHS edges
# weighted by lift. (The original rulesRev/match() pair was computed but
# never used, so it is dropped.)
result <- data.frame(rules = labels(subrules), quality(subrules))
result$rules <- as.character(result$rules)
# Split the "lhs => rhs" labels into two columns.
lhs <- do.call(rbind, strsplit(result[, 1], " => "))
result <- cbind(lhs, result[, 2:4])
# Columns 1, 2, 5 of the bound frame: LHS label, RHS label, lift.
# Renamed from `plot`, which shadowed base::plot().
edges <- data.frame(result[, c(1, 2, 5)])
names(edges) <- c("start", "end", "weight")
g <- graph.data.frame(edges, directed = TRUE)
g
V(g)$name
E(g)$weight
# In-degree drives the vertex size below.
ideg <- degree(g, mode = "in", loops = FALSE)
l <- layout.reingold.tilford(g)
# Spread the vertices along a diagonal. 0:(nrow(l) - 1) generalises the
# hard-coded 0:15, which errored unless the graph had exactly 16 vertices.
l[, 1] <- 0:(nrow(l) - 1)
l[, 2] <- 0:(nrow(l) - 1)
plot.igraph(g,
            vertex.label = V(g)$name,
            vertex.label.color = "gray20",
            vertex.size = ideg * 5 + 10,
            vertex.size2 = 10,
            edge.arrow.size = 0.5,
            edge.width = 2,
            edge.curved = TRUE,
            layout = l)