library(tm)
getSources()
getReaders()

# Importing the corpus
res <- XML::readHTMLTable(paste0('http://cran.r-project.org/', 'web/packages/available_packages_by_name.html'), which = 1)
head(res)
v <- Corpus(VectorSource(res$V2)); v
inspect(head(v, 3))
meta(v[[1]])
writeLines(as.character(v[[1]]))
lapply(v[1:5], as.character)

# Cleaning the corpus
getTransformations()
stopwords("english")
removeWords('to be or not to be', stopwords("english"))
v <- tm_map(v, removeWords, stopwords("english"))
inspect(head(v, 3))
v <- tm_map(v, content_transformer(tolower))
v <- tm_map(v, removePunctuation)
v <- tm_map(v, stripWhitespace)
inspect(head(v, 3))

# Visualizing the most frequent words in the corpus
wordcloud::wordcloud(v)

# Further cleanup
v <- tm_map(v, removeNumbers)
tdm <- TermDocumentMatrix(v)
inspect(tdm[1:5, 1:20])
findFreqTerms(tdm, lowfreq = 100)
myStopwords <- c('package', 'based', 'using')
v <- tm_map(v, removeWords, myStopwords)

# Stemming words
library(SnowballC)
wordStem(c('cats', 'mastering', 'modelling', 'models', 'model'))
wordStem(c('are', 'analyst', 'analyze', 'analysis'))
d <- v  # keep an unstemmed copy for stem completion
v <- tm_map(v, stemDocument, language = "english")
v <- tm_map(v, content_transformer(function(x, d) {
    paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]], d),
        collapse = ' ')
}), d)
tdm <- TermDocumentMatrix(v)
findFreqTerms(tdm, lowfreq = 100)

# Lemmatisation
Stemming removes characters from the end of words in the hope of finding the stem, which is a heuristic process that often produces non-existent words. We tried to overcome this issue by completing the stems to the shortest meaningful words with the help of a dictionary, which might distort the meaning of the term. Another way to reduce the number of inflectional forms of different terms, instead of deconstructing and then trying to rebuild the words, is morphological analysis with the help of a dictionary. This process is called lemmatisation, which looks for the lemma (the canonical form of a word) instead of the stem; see the hedged sketch after this listing.

# Analyzing the associations among terms
findAssocs(tdm, 'data', 0.1)

# Some other metrics
vnchar <- sapply(v, function(x) nchar(x$content))
summary(vnchar)
(vm <- which.min(vnchar))
v[[vm]]
hist(vnchar, main = 'Length of R package descriptions',
    xlab = 'Number of characters')

# The segmentation of documents
hadleyverse <- c('ggplot2', 'dplyr', 'reshape2', 'lubridate',
    'stringr', 'devtools', 'roxygen2', 'tidyr')
(w <- which(res$V1 %in% hadleyverse))
plot(hclust(dist(DocumentTermMatrix(v[w]))), xlab = 'Hadleyverse packages')
sapply(v[w], function(x) structure(content(x), .Names = meta(x, 'id')))