Wednesday, February 3, 2016

Master R 7 - Unstructured Data

library(tm)
getSources()
getReaders()

# Importing the corpus
# note: XML::readHTMLTable cannot fetch https pages directly, hence the http URL
res <- XML::readHTMLTable(paste0('http://cran.r-project.org/',
  'web/packages/available_packages_by_name.html'), which = 1)
head(res)
v <- Corpus(VectorSource(res$V2)); v
inspect(head(v, 3))
meta(v[[1]])
writeLines(as.character(v[[1]]))
lapply(v[1:5], as.character)

# Cleaning the corpus
getTransformations()
stopwords("english")
removeWords('to be or not to be', stopwords("english"))
v <- tm_map(v, removeWords, stopwords("english"))
inspect(head(v, 3))

v <- tm_map(v, content_transformer(tolower))
v <- tm_map(v, removePunctuation)
v <- tm_map(v, stripWhitespace)
inspect(head(v, 3))

# Visualizing the most frequent words in the corpus
wordcloud::wordcloud(v)

# Further cleanup
v <- tm_map(v, removeNumbers)
tdm <- TermDocumentMatrix(v)
inspect(tdm[1:5, 1:20])
findFreqTerms(tdm, lowfreq = 100)

myStopwords <- c('package', 'based', 'using')  # frequent but uninformative terms in package descriptions
v <- tm_map(v, removeWords, myStopwords)

# Stemming words
library(SnowballC)
wordStem(c('cats', 'mastering', 'modelling', 'models', 'model'))
wordStem(c('are', 'analyst', 'analyze', 'analysis'))
d <- v  # keep a copy of the unstemmed corpus as the dictionary for stem completion
v <- tm_map(v, stemDocument, language = "english")
v <- tm_map(v, content_transformer(function(x, d) {
  paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]], d), collapse = ' ')
}), d)

tdm <- TermDocumentMatrix(v)
findFreqTerms(tdm, lowfreq = 100)


# Lemmatisation
Stemming removes characters from the end of words in the hope of finding the stem, which is a heuristic process that often results in non-existent words. We tried to overcome this issue by completing the stems to the shortest meaningful words with the help of a dictionary, which might shift the meaning of the term.
Another way to reduce the number of inflectional forms of different terms, instead of deconstructing and then trying to rebuild the words, is morphological analysis with the help of a dictionary. This process is called lemmatisation, which looks for the lemma (the canonical form of a word) instead of the stem.
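A minimal lemmatisation sketch in R, assuming the CRAN package textstem is installed (it is not used in the original example); it looks up lemmas in a bundled dictionary rather than running a full morphological analyser:

# install.packages('textstem')  # assumption: not part of the original workflow
library(textstem)
lemmatize_words(c('are', 'analyst', 'analyze', 'analysis'))
lemmatize_strings('The cats were modelling data models')

Unlike stemDocument followed by stemCompletion, this returns real dictionary words directly, for example 'are' becomes 'be'.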

# Analyzing the associations among terms
findAssocs(tdm, 'data', 0.1)

# Some other metrics
vnchar <- sapply(v, function(x) nchar(x$content))
summary(vnchar)
(vm <- which.min(vnchar))
v[[vm]]
hist(vnchar, main = 'Length of R package descriptions', xlab = 'Number of characters')

# The segmentation of documents
hadleyverse <- c('ggplot2', 'dplyr', 'reshape2', 'lubridate','stringr', 'devtools', 'roxygen2', 'tidyr')
(w <- which(res$V1 %in% hadleyverse))
plot(hclust(dist(DocumentTermMatrix(v[w]))), xlab = 'Hadleyverse packages')
sapply(v[w], function(x) structure(content(x), .Names = meta(x, 'id')))
