```
memory.limit()              # Windows-only: current memory limit
memory.size(max = TRUE)     # Windows-only: maximum memory used so far
rm(list = ls(all = TRUE))   # clear the workspace
sessionInfo()
library(data.table) # Fast data manipulation.
library(NLP) # Required by tm.
library(tm) # Framework for text mining.
library(SnowballC) # Provides wordStem() for stemming.
library(RColorBrewer) # Generate palette of colours for plots.
library(ggplot2) # Plot word frequencies.
library(Rgraphviz) # Correlation plots.
setwd("seach index");
search = read.csv("search201408.txt", row.names = NULL, header = F, sep = "\t", quote = "", stringsAsFactors = FALSE)
search = data.table(search)
search[, V2 := NULL]   # drop the unused second column
setnames(search, c("query", "count"))
dim(search)
## [1] 669546      2
head(search)
##                 query  count
## 1:             toyota 303077
## 2:      toyota trucks 223888
## 3:         toyota.com 115706
## 4:      toyota tacoma  62982
## 5:  toyota highlander  48905
## 6:  2015 toyota camry  46997
search1 <- search[count > 1000, ]   # keep queries searched more than 1000 times
dim(search1)
search1[, idx := .I]                # row index for the cast below
# One column per query string; the cell value is the summed count.
tmp = dcast.data.table(search1, idx ~ query, value.var = "count", fun.aggregate = sum)
tmp[, idx := NULL]
dtm <- tm::as.DocumentTermMatrix(tmp, weighting = weightTf)
dtm
library(wordcloud)
set.seed(123)
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)   # term frequencies for the cloud
wordcloud(names(freq), freq, min.freq=5000, colors=brewer.pal(6, "Dark2"))
```
#### Loading a Corpus
```
doc.frame <- DataframeSource(as.data.frame(search))   # one document per row
doc.corpus <- Corpus(doc.frame)
class(doc.corpus)
class(doc.corpus[[1]])
```
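Note that DataframeSource changed in tm 0.7: it now expects the data frame to carry a `doc_id` and a `text` column. A minimal sketch of the newer variant, treating each query as its own document (the column names below are the ones tm requires, not columns from the data above):
```
## tm >= 0.7: DataframeSource needs doc_id and text columns.
df <- data.frame(doc_id = seq_len(nrow(search)),
                 text   = search$query,
                 stringsAsFactors = FALSE)
doc.corpus <- VCorpus(DataframeSource(df))
```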
#### Exploring a Corpus
```
inspect(doc.corpus[1])
```
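inspect() prints documents together with their metadata. To pull out just the text or the metadata of individual documents, the usual tm accessors are as.character() and meta(); a quick sketch:
```
## Raw text of the first three documents.
lapply(doc.corpus[1:3], as.character)
## Metadata of a single document.
meta(doc.corpus[[1]])
```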
#### Preparing the Corpus
```
# Convert to lower case.
doc.corpus <- tm_map(doc.corpus, content_transformer(tolower))
# Remove numbers.
doc.corpus <- tm_map(doc.corpus, removeNumbers)
inspect(doc.corpus[1])
# Remove punctuation.
doc.corpus <- tm_map(doc.corpus, removePunctuation)
inspect(doc.corpus[11])
# Remove English stop words.
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
inspect(doc.corpus[1111])
# Remove your own stop words ("www" and "com" are example placeholders).
doc.corpus <- tm_map(doc.corpus, removeWords, c("www", "com"))
inspect(doc.corpus[1111])
# Strip extra whitespace.
doc.corpus <- tm_map(doc.corpus, stripWhitespace)
# Specific transformations: replace tabs, slashes and @ with spaces.
for (j in seq(doc.corpus)) {
  doc.corpus[[j]] <- gsub("\t\t", " ", doc.corpus[[j]])
  doc.corpus[[j]] <- gsub("/", " ", doc.corpus[[j]])
  doc.corpus[[j]] <- gsub("@", " ", doc.corpus[[j]])
}
inspect(doc.corpus[16])
# The raw gsub() calls above drop the document class; restore it.
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)
```
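The gsub() loop above edits the documents as raw character vectors, which is why the corpus has to be converted back with PlainTextDocument afterwards. On tm >= 0.6 the cleaner route is to wrap the substitution in content_transformer(), which keeps the document class intact; a sketch of the equivalent transformations:
```
## Same substitutions via content_transformer (tm >= 0.6);
## no PlainTextDocument repair step needed.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x, fixed = TRUE))
doc.corpus <- tm_map(doc.corpus, toSpace, "\t\t")
doc.corpus <- tm_map(doc.corpus, toSpace, "/")
doc.corpus <- tm_map(doc.corpus, toSpace, "@")
```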
#### Stemming
```
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)
```
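Stemming chops words down to their root, so "trucks" and "truck" collapse into one term. If readable output is needed later, tm also offers stemCompletion(), which maps stems back to their most prevalent full form; a sketch of that workflow, assuming an unstemmed copy of the corpus is saved before stemDocument runs:
```
## Keep an unstemmed copy first, then use it as the completion dictionary.
doc.copy <- doc.corpus                        # take this copy BEFORE stemming
doc.corpus <- tm_map(doc.corpus, stemDocument)
stemCompletion(c("truck", "camri"),           # example stems; results depend on the corpus
               dictionary = doc.copy)
```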
#### Creating a Document Term Matrix
```
dtm <- DocumentTermMatrix(doc.corpus)
inspect(dtm[1:5, 1000:1005])
inspect(dtm[1:5, 100:105])
tdm <- TermDocumentMatrix(doc.corpus)
tdm
```
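DocumentTermMatrix defaults to raw term frequency. Other weightings can be passed through the control list; for instance tf-idf, which down-weights terms that appear in most documents:
```
## Tf-idf weighted variant of the same matrix.
dtm.tfidf <- DocumentTermMatrix(doc.corpus,
                                control = list(weighting = weightTfIdf))
inspect(dtm.tfidf[1:5, 100:105])
```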
#### Exploring the Document Term Matrix
```
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 20)
tail(freq, 20)
```
#### Distribution of Term Frequencies
```
dim(dtm)
dtms <- removeSparseTerms(dtm, 0.5)   # drop terms absent from 50% or more of the documents
dim(dtms)
inspect(dtms)
freq <- colSums(as.matrix(dtms))
freq
table(freq)
```
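table(freq) prints the distribution numerically; a histogram gives a quicker read. A sketch with ggplot2 (loaded above), recomputing frequencies over the full dtm since freq now refers to the reduced dtms; the log scale is an assumption that suits heavy-tailed counts:
```
## Histogram of term frequencies over the full matrix.
freq.all <- colSums(as.matrix(dtm))
ggplot(data.frame(freq = freq.all), aes(x = freq)) +
  geom_histogram(binwidth = 0.2) +   # binwidth in log10 units
  scale_x_log10()
```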
#### Identifying Frequent Items and Associations
```
findFreqTerms(dtm, lowfreq=1000)
findFreqTerms(dtm, lowfreq=100)
findAssocs(dtm, "think", corlimit=0.6)
```
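findAssocs() also accepts a vector of terms, with one correlation limit per term, which is handy for checking several candidates at once; a sketch using terms seen in the sample rows above:
```
## Associations for several query terms at once.
findAssocs(dtm, c("tacoma", "camry"), corlimit = c(0.5, 0.5))
```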
#### Correlation Plots
```
# Requires the Rgraphviz package loaded above.
plot(dtm, terms=findFreqTerms(dtm, lowfreq=100)[1:50], corThreshold=0.5)
```
#### Plotting Word Frequencies
```
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 14)
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
p <- ggplot(subset(wf, freq>500), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
```
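ggplot orders the bars alphabetically by default; sorting them by frequency usually reads better. The same plot with the factor reordered:
```
## Same plot, bars sorted by descending frequency.
p <- ggplot(subset(wf, freq > 500), aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("word")
p
```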
#### Plotting Word Cloud
```
library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq=500)
```
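wordcloud() takes a few more arguments worth knowing: max.words caps the cloud size, rot.per controls the share of rotated words, and colors accepts a brewer palette (RColorBrewer is loaded above). A variant of the same cloud:
```
## A capped, coloured variant of the same cloud.
wordcloud(names(freq), freq, min.freq = 500,
          max.words = 100, rot.per = 0.2,
          colors = brewer.pal(6, "Dark2"))
```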