Friday, October 17, 2014

NLS model in R

# name the columns, then derive per-impression click-through and conversion rates
names(data) <- c("freq", "imp", "uuser", "click", "action")
data$CTR <- data$click/data$imp
data$CVR <- data$action/data$imp

# keep the first 41 frequency buckets
data <- data[1:41, ]
# starting values: overall CVR for the asymptote, peak CVR as a rough
# amplitude, and a scale at which the decay term halves every 5 exposures
beta0 <- sum(data$action)/sum(data$imp)
beta1 <- max(data$action/data$imp)
beta2 <- -5/log(1/2)

decay <- nls(formula = CVR ~ b0 + b1 * exp(-freq/b2),
             data = data,
             start = c(b0 = beta0, b1 = beta1, b2 = beta2),
             trace = TRUE)
summary(decay)
##Residual standard error: 5.08e-06 on 137648 degrees of freedom
## the coefficients only:
coef(decay)
## including their SE, etc:
coef(summary(decay))
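## the fitted decay scale has a direct reading: the decay term halves every
## b2*log(2) units of frequency, since exp(-h/b2) = 1/2 gives h = b2*log(2)
log(2) * coef(decay)["b2"]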
plot(data$freq, data$CVR, main = "nls fit: CVR by frequency")
#curve(beta0 + beta1 * exp(-x / beta2), col = 4, add = TRUE)
lines(data$freq, predict(decay), col = 2)

result <- data.frame(freq = data$freq, CVR = data$CVR, fitted = predict(decay))

ggplot(data, aes(x = freq, y = CVR)) +
  geom_point(colour = "red", size = 3) +
#  stat_smooth() +
  geom_line(aes(y = predict(decay)), colour = "red", size = 1, linetype = 2) +
  ggtitle("CVR by Frequency") +
  ylab("CVR") + xlab("Frequency in one day") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank()) +
  scale_x_continuous(breaks = seq(0, 30, by = 2))

##starting values via a local quadratic fit: for f(x) = b0 + b1*exp(-x/b2),
##f'(x)/f''(x) = -b2, so minus the ratio of the linear to the quadratic
##coefficient of a centered quadratic fit estimates b2; b0 and b1 then
##come from a linear fit with b2 held fixed
negexp.sv <- function(x, y){
  mx <- mean(x)
  x1 <- x - mx
  x2 <- ((x - mx)^2)/2
  b <- as.vector(lm(y ~ x1 + x2)$coef)
  b2 <- -b[2]/b[3]
  b <- as.vector(lm(y ~ exp(-x/b2))$coef)
  parms <- cbind(b[1], b[2], b2)
  parms
}
b.start <- negexp.sv(data$freq, data$CVR)
##model function with analytic gradient (speeds up and stabilizes nls)
expn <- function(b0, b1, b2, x){
  temp <- exp(-x/b2)
  model.func <- b0 + b1*temp
  #partial derivatives with respect to b0, b1 and b2
  D <- cbind(1, temp, (b1*x*temp)/b2^2)
  dimnames(D) <- list(NULL, c("b0", "b1", "b2"))
  attr(model.func, "gradient") <- D
  model.func
}
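## sanity-check the analytic gradient against a finite difference
## (toy parameter values, for illustration only)
xg <- 1:10
eps <- 1e-6
g.analytic <- attr(expn(0.001, 0.01, 7, xg), "gradient")[, "b2"]
g.numeric <- (as.vector(expn(0.001, 0.01, 7 + eps, xg)) -
              as.vector(expn(0.001, 0.01, 7, xg))) / eps
max(abs(g.analytic - g.numeric))  ## should be near zero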

decay <- nls(formula = CVR ~ expn(b0, b1, b2, freq),
             data = data,
##           start = c(b0 = b.start[1], b1 = b.start[2], b2 = b.start[3]),
             start = c(b0 = beta0, b1 = beta1, b2 = beta2),
             trace = TRUE)
summary(decay)
##Residual standard error: 5.08e-06 on 137648 degrees of freedom


# refit on successively larger windows to check the stability of the estimates
for (i in seq(300, 1000, 100)){
  print(i)
  data1 <- data[1:i, ]
  decay <- nls(formula = CVR ~ expn(b0, b1, b2, freq),
               data = data1,
##             start = c(b0 = b.start[1], b1 = b.start[2], b2 = b.start[3]),
               start = c(b0 = beta0, b1 = beta1, b2 = beta2),
               trace = FALSE)
  print(coef(summary(decay)))
}
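## a variation on the loop above: collect the per-window coefficients into one
## matrix for side-by-side comparison (assumes the full, unsubset data)
windows <- seq(300, 1000, 100)
coefs <- t(sapply(windows, function(i)
  coef(nls(CVR ~ expn(b0, b1, b2, freq), data = data[1:i, ],
           start = c(b0 = beta0, b1 = beta1, b2 = beta2)))))
rownames(coefs) <- windows
coefs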

Thursday, October 9, 2014

Text Mining - ngram in R

##Install the needed packages
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", "cluster", "igraph", "fpc")
install.packages(Needed, dependencies = TRUE)

install.packages("Rcampdf", repos = "http://datacube.wu.ac.at/", type = "source")
#slam (a tm dependency) needs to be installed, here from a local archive
install.packages("/Users/tkmahll/Downloads/slam_0.1-37.tgz", repos = NULL, type = "source")


##load the library
library(xlsx)
library(tm)
library(SnowballC)
library(RWeka)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)


#read in the dataset
sellouts <- read.xlsx("sellouts.xlsx", sheetName = "Products")

head(sellouts$Name)
names(sellouts)
sellouts$Name[1:100]

##select only the Product Name column for analysis
sellout.prod <- as.list(sellouts[,1])


##Preprocess the dataset
#remove punctuation and numbers, lower-case, drop stop words, collapse whitespace
product <- Corpus(VectorSource(sellout.prod))
product <- tm_map(product, removePunctuation)
product <- tm_map(product, removeNumbers)
#tolower is a base function, so wrap it in content_transformer() for tm_map
product <- tm_map(product, content_transformer(tolower))
product <- tm_map(product, removeWords, stopwords("english"))
#product <- tm_map(product, stemDocument)
product <- tm_map(product, stripWhitespace)

writeLines(as.character(product[[333]])) #333, 1227, 1228, 1229, 337

#store as PlainTextDocument
product <- tm_map(product, PlainTextDocument)
#convert back to character
product.1 <- lapply(product, as.character)


#create the object
prod.docs <- Corpus(VectorSource(product.1))
prod.docs <- tm_map(prod.docs, PlainTextDocument)

#bigram tokenizer for the term-document matrix (ngrams() and words() come from NLP, loaded with tm)
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
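#quick toy check: ngrams() just slides a window over a token vector
#(hand-made tokens, for illustration only)
unlist(lapply(ngrams(c("toyota", "camry", "hybrid"), 2), paste, collapse = " "))
## "toyota camry" "camry hybrid"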

tdm <- TermDocumentMatrix(prod.docs, control = list(tokenize = BigramTokenizer))
class(tdm)
dtm <- DocumentTermMatrix(prod.docs, control = list(tokenize = BigramTokenizer))


##Create a WordCloud to Visualize the Text Data
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)

wf <- data.frame(word=names(freq), freq=freq)
head(wf)

# Create the word cloud
set.seed(123)
pal = brewer.pal(9,"BuPu")
wordcloud(words = wf$word,
          freq = wf$freq,
          scale = c(3,.8),
          random.order = F,
          colors = pal,
          max.words = 30)



##Draw the plot of the frequency
library(ggplot2)
p <- ggplot(subset(wf, freq>10), aes(word, freq))  
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
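#variation: sort the bars by descending frequency instead of alphabetically
p2 <- ggplot(subset(wf, freq>10), aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  xlab("word")
p2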


####################################
##Tri-gram analysis
#trigram tokenizer for the term-document matrix
TrigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

tdm.tri <- TermDocumentMatrix(prod.docs, control = list(tokenize = TrigramTokenizer))
class(tdm.tri)
dtm.tri <- DocumentTermMatrix(prod.docs, control = list(tokenize = TrigramTokenizer))


##Create a WordCloud to Visualize the Text Data
freq.tri <- sort(colSums(as.matrix(dtm.tri)), decreasing=TRUE)
head(freq.tri, 20)

wf.tri <- data.frame(word=names(freq.tri), freq=freq.tri)
head(wf.tri)


##Draw the plot of the frequency
p.tri <- ggplot(subset(wf.tri, freq>3), aes(word, freq))  
p.tri <- p.tri + geom_bar(stat="identity")
p.tri <- p.tri + theme(axis.text.x=element_text(angle=45, hjust=1))
p.tri

Friday, October 3, 2014

Text Mining - word cloud in R

##### Read-in File

```
memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))
sessionInfo()

library(data.table)
library(NLP)
library(tm) # Framework for text mining.
library(SnowballC) # Provides wordStem() for stemming.
library(RColorBrewer) # Generate palette of colours for plots.
library(ggplot2) # Plot word frequencies.
library(Rgraphviz) # Correlation plots.

setwd("seach index");

search = read.csv("search201408.txt", row.names = NULL, header = F, sep = "\t", quote = "", stringsAsFactors = FALSE)
search = data.table(search)
search <- search[,V2:=NULL]
setnames(search, c("query", "count"))

> dim(search)
[1] 669546      2
> head(search)
               query  count
1:            toyota 303077
2:     toyota trucks 223888
3:        toyota.com 115706
4:     toyota tacoma  62982
5: toyota highlander  48905
6: 2015 toyota camry  46997

# keep only queries searched more than 1,000 times
search1 <- search[count > 1000, ]
dim(search1)

# reshape so each row (document) is one query and each column a distinct query string
search1$idx <- 1:nrow(search1)
tmp <- dcast.data.table(search1, idx ~ query, value.var = "count", fun.aggregate = sum)
tmp <- tmp[, idx := NULL]
dtm <- tm::as.DocumentTermMatrix(tmp, weighting = weightTf)
dtm

library(wordcloud)
set.seed(123)
# term frequencies must be computed from the dtm before plotting
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
wordcloud(names(freq), freq, min.freq = 5000, colors = brewer.pal(6, "Dark2"))

```

##### Loading a Corpus
```
doc.frame <- DataframeSource(search)
doc.corpus <- Corpus(doc.frame)
class(doc.corpus)
class(doc.corpus[[1]])

```
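Note: newer versions of tm expect a data frame with `doc_id` and `text` columns for `DataframeSource`; since only the query strings are mined here, a `VectorSource` on that column is a simple alternative (a sketch):

```
# treat each query string as its own document, ignoring the count column
doc.corpus <- Corpus(VectorSource(search$query))
```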

##### Exploring a Corpus

```

inspect(doc.corpus[1])

```

#### Preparing the Corpus
```
# conversion to lower case (tolower is a base function, so wrap it)
doc.corpus <- tm_map(doc.corpus, content_transformer(tolower))

# remove numbers
doc.corpus <- tm_map(doc.corpus, removeNumbers)
inspect(doc.corpus[1])

# remove punctuation
doc.corpus <- tm_map(doc.corpus, removePunctuation)
inspect(doc.corpus[11])

# remove English stop words
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
inspect(doc.corpus[1111])

# remove your own stop words here if needed, e.g.
# doc.corpus <- tm_map(doc.corpus, removeWords, c("own", "stopword"))

# strip extra whitespace
doc.corpus <- tm_map(doc.corpus, stripWhitespace)

# specific transformations: replace tabs, slashes and @ with spaces,
# via content_transformer() so the corpus structure is preserved
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
doc.corpus <- tm_map(doc.corpus, toSpace, "\t\t")
doc.corpus <- tm_map(doc.corpus, toSpace, "/")
doc.corpus <- tm_map(doc.corpus, toSpace, "@")
inspect(doc.corpus[16])

doc.corpus <- tm_map(doc.corpus, PlainTextDocument)
```

#### Stemming
```
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)
```

#### Creating a Document Term Matrix
```
dtm <- DocumentTermMatrix(doc.corpus)
inspect(dtm[1:5, 1000:1005])
inspect(dtm[1:5, 100:105])

tdm <- TermDocumentMatrix(doc.corpus)
tdm
```
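A `DocumentTermMatrix` and a `TermDocumentMatrix` hold the same counts, only transposed, so their dimensions should mirror each other:

```
dim(dtm)  # documents x terms
dim(tdm)  # terms x documents
```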

#### Exploring the Document Term Matrix
```
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)
tail(freq, 20)

```

#### Distribution of Term Frequencies

```
dim(dtm)
dtms <- removeSparseTerms(dtm, 0.5)
dim(dtms)
inspect(dtms)

freq <- colSums(as.matrix(dtms))
freq
table(freq)

```
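`removeSparseTerms(dtm, 0.5)` keeps only terms that appear in at least half of all documents, which is very aggressive when every short query is its own document. A threshold closer to 1 retains more of the vocabulary (0.999 is an illustrative value):

```
# sparsity threshold closer to 1 keeps more terms
dtms2 <- removeSparseTerms(dtm, 0.999)
dim(dtms2)
```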

#### Identifying Frequent Items and Associations

```
findFreqTerms(dtm, lowfreq=1000)

findFreqTerms(dtm, lowfreq=100)

findAssocs(dtm, "think", corlimit=0.6)

```
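`findAssocs()` reports terms whose occurrence across documents correlates with the probe term at or above `corlimit`. For this search-query corpus a brand or model word is probably a more informative probe than "think" (the probe below is illustrative, assuming the term survives preprocessing):

```
# terms whose counts correlate with an illustrative probe term
findAssocs(dtm, "toyota", corlimit = 0.3)
```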

#### Correlation Plots

```
# requires Rgraphviz; draws correlations between frequent terms
# plot(dtm, terms = findFreqTerms(dtm, lowfreq = 100)[1:50], corThreshold = 0.5)
```

#### Plotting Word Frequencies
```
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 14)

wf <- data.frame(word=names(freq), freq=freq)
head(wf)

p <- ggplot(subset(wf, freq>500), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
```

#### Plotting Word Cloud

```
library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq=500)
```