Friday, February 12, 2016

Master R 14 - Analyzing the R community

R Foundation members

library(XML)
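# scrape the list of supporting members (name and country) from the R Foundation donors page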
page <- htmlParse('http://r-project.org/foundation/donors.html')
list <- unlist(xpathApply(page, "//h3[@id='supporting-members']/following-sibling::ul[1]/li", xmlValue))
str(list)

supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list)
countrylist   <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1)
tail(sort(prop.table(table(countrylist)) * 100), 5)

Visualizing supporting members around the world

countries <- as.data.frame(table(countrylist))
library(rworldmap)
joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist', verbose = TRUE)

library(ggmap)
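# standardize a few non-standard country names via ggmap's geocoding service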
for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) {
   countrylist[which(countrylist == fix)] <-
       geocode(fix, output = 'more')$country
}

countries <- as.data.frame(table(countrylist))
countries <- joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist')
mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth', mapTitle = 'Number of R Foundation supporting members')


R package maintainers

packages <- readHTMLTable(paste0('http://cran.r-project.org', '/web/checks/check_summary.html'), which = 2)
# drop the e-mail addresses, keeping only the maintainer names
maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer')
# replace non-breaking spaces with regular spaces so that name variants match
maintainers <- gsub('\u00a0', ' ', maintainers)
str(maintainers)
tail(sort(table(maintainers)), 8)


The number of packages per maintainer

N <- as.numeric(table(maintainers))
library(fitdistrplus)
plotdist(N)
descdist(N, boot = 1e3)
(gparams <- fitdist(N, 'gamma'))
gshape <- gparams$estimate[['shape']]
grate  <- gparams$estimate[['rate']]
sum(rgamma(1e5, shape = gshape, rate = grate))
hist(rgamma(1e5, shape = gshape, rate = grate))
pgamma(2, shape = gshape, rate = grate)
prop.table(table(N <= 2))
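# closed-form maximum-likelihood estimates for a Pareto (type I) distribution:
# location = sample minimum, shape = n / sum(log(N) - log(min(N)))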
ploc <- min(N)
pshp <- length(N) / sum(log(N) - log(ploc))

library(actuar)
ppareto(2, pshp, ploc)
fg <- fitdist(N, 'gamma')
fw <- fitdist(N, 'weibull')
fl <- fitdist(N, 'lnorm')
fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1))
par(mfrow = c(1, 2))
denscomp(list(fg, fw, fl, fp), addlegend = FALSE)
qqcomp(list(fg, fw, fl, fp),  legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto')) 
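
Besides the visual comparison, the fit of the four candidate distributions can also be compared numerically; a minimal sketch using fitdistrplus' gofstat on the fitted objects from above:

gofstat(list(fg, fw, fl, fp),
        fitnames = c('gamma', 'Weibull', 'Lognormal', 'Pareto'))
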
length(unique(maintainers))


The R-help mailing list

library(RCurl)
url  <- 'https://stat.ethz.ch/pipermail/r-help/'
page <- getURL(url)
R.help.toc <- htmlParse(page)
R.help.archives <- unlist(xpathApply(R.help.toc, "//table//td[3]/a", xmlAttrs), use.names = FALSE)
dir.create('r-help')
# download all monthly (gzipped) archive files of the mailing list
for (f in R.help.archives)
     download.file(url = paste0(url, f), file.path('r-help', f), method = 'curl')
# extract the From header of every e-mail
lines <- system("zgrep -E '^From: .* at .*' ./r-help/*.txt.gz", intern = TRUE)
length(lines)
length(unique(lines))
lines[26]
lines    <- sub('.*From: ', '', lines)
Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines)
tail(sort(table(Rhelpers)), 6)
grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE)

Volume of the R-help mailing list

# extract the Date header of every e-mail
lines <- system(paste0(
    "zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} ",
    "[0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ",
    "./r-help/*.txt.gz"), intern = TRUE)
length(lines)
sub('.*Date: ', '', lines[1])
times <- strptime(sub('.*Date: ', '', lines), format = '%a, %d %b %Y %H:%M:%S %z')
plot(table(format(times, '%Y')), type = 'l')

library(data.table)
# convert the POSIXlt values to POSIXct for use as a data.table column
Rhelp <- data.table(time = as.POSIXct(times))
Rhelp[, H := hour(time)]
Rhelp[, D := wday(time)]
library(ggplot2)
ggplot(na.omit(Rhelp[, .N, by = .(H, D)]),
     aes(x = factor(H), y = factor(D), size = N)) + geom_point() +
     ylab('Day of the week') + xlab('Hour of the day') +
     ggtitle('Number of mails posted on [R-help]') +
     theme_bw() + theme(legend.position = 'top')
tail(sort(table(sub('.*([+-][0-9]{4}).*', '\\1', lines))), 22)

Forecasting the e-mail volume in the future

Rhelp[, date := as.Date(time)]
Rdaily <- na.omit(Rhelp[, .N, by = date])
library(zoo)
Rdaily <- zoo(Rdaily$N, Rdaily$date)
plot(Rdaily)

library(forecast)
fit <- ets(Rdaily)
predict(fit, 1)
plot(forecast(fit, 30), include = 365)


Analyzing overlaps between our lists of R users

lists <- rbindlist(list(
data.frame(name = unique(supporterlist), list = 'supporter'),
data.frame(name = unique(maintainers),   list = 'maintainer'),
data.frame(name = unique(Rhelpers),      list = 'R-help')))

t <- table(lists$name, lists$list)
table(rowSums(t))
library(Rcapture)
descriptive(t)
closedp(t)

Further ideas on extending the capture-recapture models
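
One possible extension, sketched here under the assumption that the incidence table t built above is a valid capture history matrix: profile-likelihood confidence intervals for the estimated total population size, e.g. via Rcapture's closedpCI.t with the Chao heterogeneity model.

closedpCI.t(t, m = 'Mth', h = 'Chao')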


The number of R users in social media

library(fbRads)
fbacc <- fbad_init(FB account ID, FB API token)
fbad_get_search(q = 'rstats', type = 'adinterest')
fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest')
res <- fbad_get_search(fbacc = fbacc, q = 'programming language', type = 'adinterest')
res <- res[order(res$audience_size, decreasing = TRUE), ]
res[1:10, 1:3]


R-related posts in social media

library(twitteR)
setup_twitter_oauth(...)
str(searchTwitter("#rstats", n = 1, resultType = 'recent'))
tweets <- Rtweets(n = 500)
length(strip_retweets(tweets))
tweets <- twListToDF(tweets)

library(tm)
corpus <- Corpus(VectorSource(tweets$text))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, 'rstats')
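
Before plotting a word cloud, the most frequent terms of the cleaned corpus can be inspected directly; a quick sketch with tm's TermDocumentMatrix and findFreqTerms (the lowfreq threshold of 10 is an arbitrary choice):

tdm <- TermDocumentMatrix(corpus)
findFreqTerms(tdm, lowfreq = 10)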


library(wordcloud)
wordcloud(corpus)
