Friday, October 3, 2014

Text Mining - word cloud in R

##### Read-in File

```
# Session housekeeping: check memory and clear the workspace.
# (rm(list = ls()) in a script is generally discouraged; kept from original.)
memory.limit()
memory.size(max = TRUE)
rm(list = ls(all = TRUE))
sessionInfo()

library(data.table)
library(NLP)
library(tm) # Framework for text mining.
library(SnowballC) # Provides wordStem() for stemming.
library(RColorBrewer) # Generate palette of colours for plots.
library(ggplot2) # Plot word frequencies.
library(Rgraphviz) # Correlation plots.

setwd("seach index")

# Tab-delimited export with no header: V1 = query text, V2 = (unused), V3 = count.
search <- read.csv("search201408.txt", row.names = NULL, header = FALSE,
                   sep = "\t", quote = "", stringsAsFactors = FALSE)
search <- data.table(search)
search <- search[, V2 := NULL]
setnames(search, c("query", "count"))

# Example console output, kept for reference.
# BUG FIX: these transcript lines were pasted un-commented and broke the chunk.
# > dim(search)
# [1] 669546      2
# > head(search)
#                query  count
# 1:            toyota 303077
# 2:     toyota trucks 223888
# 3:        toyota.com 115706
# 4:     toyota tacoma  62982
# 5: toyota highlander  48905
# 6: 2015 toyota camry  46997

# Keep only queries seen more than 1000 times.
search1 <- search[count > 1000, ]
dim(search1)

# One row ("document") per query, one column per query string,
# so the cast table can be coerced to a document-term matrix.
search1$idx <- seq_len(nrow(search1))
tmp <- dcast.data.table(search1, idx ~ query, value.var = "count",
                        fun.aggregate = sum)
tmp <- tmp[, idx := NULL]
dtm <- tm::as.DocumentTermMatrix(tmp, weighting = weightTf)
dtm

library(wordcloud)
set.seed(123)
# BUG FIX: `freq` was used here before being defined anywhere above;
# derive it from the document-term matrix first.
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
wordcloud(names(freq), freq, min.freq = 5000, colors = brewer.pal(6, "Dark2"))

```

##### Loading a Corpus
```
# Wrap the search data frame as a tm source and build a corpus from it.
doc.source <- DataframeSource(search)
doc.corpus <- Corpus(doc.source)
# Confirm what we got: a corpus, whose elements are individual documents.
class(doc.corpus)
class(doc.corpus[[1]])

```

##### Exploring a Corpus

```

inspect(doc.corpus[1])

```

#### Preparing the Corpus
```
# Normalise the corpus text before building term matrices.
# BUG FIX: the original used "---" prefixes, which are not R comments and
# would not parse; real "#" comments are used here.

# Conversion to lower case.
doc.corpus <- tm_map(doc.corpus, tolower)

# Remove numbers.
doc.corpus <- tm_map(doc.corpus, removeNumbers)
inspect(doc.corpus[1])

# Remove punctuation.
doc.corpus <- tm_map(doc.corpus, removePunctuation)
inspect(doc.corpus[11])

# Remove English stop words.
# BUG FIX: this step originally repeated removeNumbers by copy-paste mistake.
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
inspect(doc.corpus[1111])

# Remove own (domain-specific) stop words -- supply your own vector here,
# e.g.: doc.corpus <- tm_map(doc.corpus, removeWords, c("www", "com"))
inspect(doc.corpus[1111])

# Strip extra whitespace.
doc.corpus <- tm_map(doc.corpus, stripWhitespace)

# Specific character-level transformations not covered by tm's helpers.
for (j in seq_along(doc.corpus)) {
  doc.corpus[[j]] <- gsub("\t\t", " ", doc.corpus[[j]])
  doc.corpus[[j]] <- gsub("/", " ", doc.corpus[[j]])
  doc.corpus[[j]] <- gsub("@", " ", doc.corpus[[j]])
}
inspect(doc.corpus[16])

# Recast as PlainTextDocument so downstream tm functions (DTM/TDM builders)
# work after the raw-character edits above.
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)
```

#### Stemming
```
# Reduce each word to its stem (e.g. "running" -> "run") using SnowballC's
# Porter stemmer via tm's stemDocument wrapper.
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)
```

#### Creating a Document term Matrix
```
# Build both orientations of the term matrix from the prepared corpus:
# documents x terms (dtm) and its transpose, terms x documents (tdm).
dtm <- DocumentTermMatrix(doc.corpus)
tdm <- TermDocumentMatrix(doc.corpus)

# Spot-check a few cells of the document-term matrix.
inspect(dtm[1:5, 1000:1005])
inspect(dtm[1:5, 100:105])

tdm
```

#### Exploring the Document Term Matrix
```
# Total occurrences of every term across the corpus, most frequent first.
term.totals <- colSums(as.matrix(dtm))
freq <- sort(term.totals, decreasing = TRUE)
head(freq, 20)
tail(freq, 20)

```

#### Distribution of Term Frequencies

```
# Dimensions before pruning.
dim(dtm)
# Drop terms missing from more than 50% of documents (sparsity > 0.5).
dtms <- removeSparseTerms(dtm, 0.5)
dim(dtms)
inspect(dtms)

# Total count of each surviving term, and the distribution of those counts.
freq <- colSums(as.matrix(dtms))
freq
table(freq)

```

#### Identifying Frequent Items and Associations

```
# Terms appearing at least 1000 times across the corpus.
findFreqTerms(dtm, lowfreq=1000)

# Terms appearing at least 100 times.
findFreqTerms(dtm, lowfreq=100)

# Terms whose occurrence correlates with "think" at r >= 0.6.
findAssocs(dtm, "think", corlimit=0.6)

```

#### Correlations Plots

```
--plot(dtm,terms=findFreqTerms(dtm, lowfreq=100)[1:50], corThreshold=0.5)
```

#### Plotting Word Frequencies
```
# Bar chart of every term occurring more than 500 times.
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
head(freq, 14)

# Frequency table: one row per term.
wf <- data.frame(word = names(freq), freq = freq)
head(wf)

# Build the plot in a single chained expression; tilt the x labels so
# long query terms stay readable.
p <- ggplot(subset(wf, freq > 500), aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
```

#### Plotting Word Cloud

```
# Reproducible word cloud of all terms with at least 500 occurrences;
# the seed fixes the (random) word placement.
library(wordcloud)
set.seed(123)
cloud.terms <- names(freq)
wordcloud(cloud.terms, freq, min.freq = 500)
```

No comments:

Post a Comment