Thursday, October 9, 2014

Text Mining - n-grams in R

##Install the needed packages
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", "cluster", "igraph", "fpc", "xlsx", "RWeka")
install.packages(Needed, dependencies=TRUE)

install.packages("Rcampdf", repos = "http://datacube.wu.ac.at/", type = "source")
#slam needs to be installed
install.packages("/Users/tkmahll/Downloads/slam_0.1-37.tgz", repos = NULL, type="source")


##Load the libraries
library(xlsx)
library(tm)
library(SnowballC)
library(RWeka)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)


#read in the dataset
sellouts <- read.xlsx("sellouts.xlsx", sheetName = "Products")

head(sellouts$Name)
names(sellouts)
sellouts$Name[1:100]

##Select only the product name column for analysis
sellout.prod <- as.list(sellouts[,1])
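#An equivalent selection by column name, in case the product names are not in column 1 (the column is called Name above):
#sellout.prod <- as.list(sellouts$Name)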


##Preprocess the dataset
#Build the corpus, then remove punctuation and numbers, lower-case, drop English stopwords, and collapse extra whitespace
product <- Corpus(VectorSource(sellout.prod))
product <- tm_map(product, removePunctuation)
product <- tm_map(product, removeNumbers)
product <- tm_map(product, tolower) # with newer tm versions, wrap base functions: content_transformer(tolower)
product <- tm_map(product, removeWords, stopwords("english"))
#product <- tm_map(product, stemDocument) # optional stemming, skipped here
product <- tm_map(product, stripWhitespace)

writeLines(as.character(product[[333]])) # spot-check a few documents, e.g. 333, 1227, 1228, 1229, 337

#Re-wrap the cleaned text as PlainTextDocuments (tolower above returns plain character strings rather than documents)
product <- tm_map(product, PlainTextDocument)
#Convert each document back to a character string
product.1 <- lapply(product, as.character)


#Rebuild a corpus from the cleaned character strings
prod.docs <- Corpus(VectorSource(product.1))
prod.docs <- tm_map(prod.docs, PlainTextDocument)
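#Quick spot check that the rebuilt corpus still holds the cleaned text (same idea as the writeLines() call above)
writeLines(as.character(prod.docs[[1]]))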

#Bigram tokenizer used when building the term-document matrix (ngrams() and words() come from NLP, which loads with tm)
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
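#To see what the tokenizer produces, here is ngrams() on a made-up token vector (a hypothetical product name, not from the data):
unlist(lapply(ngrams(c("stainless", "steel", "water", "bottle"), 2), paste, collapse = " "))
# -> "stainless steel" "steel water" "water bottle"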

tdm <- TermDocumentMatrix(prod.docs, control = list(tokenize = BigramTokenizer))
class(tdm)
dtm <- DocumentTermMatrix(prod.docs, control = list(tokenize = BigramTokenizer))
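#These matrices can get large, so it is worth checking their size before the as.matrix() conversion below; dim() and inspect() both come with tm
dim(dtm)
inspect(dtm[1:5, 1:5])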


##Create a WordCloud to Visualize the Text Data
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)

wf <- data.frame(word=names(freq), freq=freq)
head(wf)
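#findFreqTerms() from tm gives a similar overview without building the full frequency vector (the threshold of 10 is arbitrary)
findFreqTerms(dtm, lowfreq = 10)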

# Create the word cloud
set.seed(123)
pal <- brewer.pal(9, "BuPu")
wordcloud(words = wf$word,
          freq = wf$freq,
          scale = c(3, .8),
          random.order = FALSE,
          colors = pal,
          max.words = 30)
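#To save the cloud to a file rather than the plot window, wrap the call in a graphics device (file name and size here are just placeholders)
png("bigram_wordcloud.png", width = 800, height = 800)
wordcloud(words = wf$word, freq = wf$freq, scale = c(3, .8),
          random.order = FALSE, colors = pal, max.words = 30)
dev.off()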



##Plot the bigram frequencies (terms appearing more than 10 times)
library(ggplot2)
p <- ggplot(subset(wf, freq>10), aes(word, freq))  
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
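#The same plot with the bars ordered by frequency is usually easier to read; reorder() is base R, everything else is unchanged
p.ord <- ggplot(subset(wf, freq>10), aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  xlab("word")
p.ord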


####################################
##Tri-gram analysis
#Trigram tokenizer used when building the term-document matrix
TrigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)

tdm.tri <- TermDocumentMatrix(prod.docs, control = list(tokenize = TrigramTokenizer))
class(tdm.tri)
dtm.tri <- DocumentTermMatrix(prod.docs, control = list(tokenize = TrigramTokenizer))
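#Tri-gram matrices are usually much sparser than bigram ones; if the as.matrix() call below runs out of memory, removeSparseTerms() from tm can drop rarely used terms first (0.999 is an arbitrary threshold)
#dtm.tri <- removeSparseTerms(dtm.tri, 0.999)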


##Create a WordCloud to Visualize the Tri-gram Data
freq.tri <- sort(colSums(as.matrix(dtm.tri)), decreasing=TRUE)
head(freq.tri, 20)

wf.tri <- data.frame(word=names(freq.tri), freq=freq.tri)
head(wf.tri)
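# Tri-gram word cloud, reusing the palette and settings from the bigram cloud above
set.seed(123)
wordcloud(words = wf.tri$word,
          freq = wf.tri$freq,
          scale = c(3, .8),
          random.order = FALSE,
          colors = pal,
          max.words = 30)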


##Plot the tri-gram frequencies (terms appearing more than 3 times)
p.tri <- ggplot(subset(wf.tri, freq>3), aes(word, freq))  
p.tri <- p.tri + geom_bar(stat="identity")
p.tri <- p.tri + theme(axis.text.x=element_text(angle=45, hjust=1))
p.tri
