R Foundation members
library(XML)
page <- htmlParse('http://r-project.org/foundation/donors.html')
list <- unlist(xpathApply(page, "//h3[@id='supporting-members']/following-sibling::ul[1]/li", xmlValue))
str(list)
supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list)
countrylist <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1)
tail(sort(prop.table(table(countrylist)) * 100), 5)
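Each list item on the donors page has the form 'Name (Country)', so the sub() call above drops the trailing country in parentheses and substr() picks the country out by position. A quick check on a made-up entry (the name below is only an illustration, not taken from the page):
x <- 'Jane Doe (New Zealand)'
sub(' \\([a-zA-Z ]*\\)$', '', x)                # "Jane Doe"
substr(x, nchar('Jane Doe') + 3, nchar(x) - 1)  # "New Zealand"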
Visualizing supporting members around the world
countries <- as.data.frame(table(countrylist))
library(rworldmap)
joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist', verbose = TRUE)
library(ggmap)
for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) {
    countrylist[which(countrylist == fix)] <-
        geocode(fix, output = 'more')$country
}
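If you would rather not call the geocoding service for just these four values (geocode() may also require a registered Google API key in recent ggmap versions), a manual recoding achieves the same effect; the mapping below is my assumption based on the names reported by the verbose join above:
# assumed standard country names accepted by joinCountryData2Map
fixes <- c(Brasil = 'Brazil', CZ = 'Czech Republic', Danmark = 'Denmark', NL = 'Netherlands')
countrylist[countrylist %in% names(fixes)] <- fixes[countrylist[countrylist %in% names(fixes)]]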
countries <- as.data.frame(table(countrylist))
countries <- joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist')
mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth', mapTitle = 'Number of R Foundation supporting members')
R package maintainers
packages <- readHTMLTable(paste0('http://cran.r-project.org', '/web/checks/check_summary.html'), which = 2)
maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer')
maintainers <- gsub('\u00a0', ' ', maintainers)  # replace non-breaking spaces from the HTML table
str(maintainers)
tail(sort(table(maintainers)), 8)
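The Maintainer column of the check summary table stores entries like 'Name <email>', which the capture group above strips down to the name; for example, with a made-up address:
sub('(.*) <(.*)>', '\\1', 'Jane Doe <jane.doe@example.org>')  # "Jane Doe"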
The number of packages per maintainer
N <- as.numeric(table(maintainers))
library(fitdistrplus)
plotdist(N)
descdist(N, boot = 1e3)
(gparams <- fitdist(N, 'gamma'))
gshape <- gparams$estimate[['shape']]
grate <- gparams$estimate[['rate']]
sum(rgamma(1e5, shape = gshape, rate = grate))
hist(rgamma(1e5, shape = gshape, rate = grate))
pgamma(2, shape = gshape, rate = grate)
prop.table(table(N <= 2))
ploc <- min(N)
pshp <- length(N) / sum(log(N) - log(ploc))
library(actuar)
ppareto(2, pshp, ploc)
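One thing worth adding here: actuar's ppareto() implements the Pareto II (Lomax) distribution, with CDF 1 - (scale / (x + scale))^shape, while the closed-form maximum-likelihood shape computed above assumes the single-parameter Pareto with lower bound ploc, which actuar provides as ppareto1(). The two are easy to compare:
ppareto1(2, pshp, ploc)  # single-parameter Pareto CDF at 2
1 - (ploc / 2) ^ pshp    # the same value computed by hand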
fg <- fitdist(N, 'gamma')
fw <- fitdist(N, 'weibull')
fl <- fitdist(N, 'lnorm')
fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1))
par(mfrow = c(1, 2))
denscomp(list(fg, fw, fl, fp), addlegend = FALSE)
qqcomp(list(fg, fw, fl, fp), legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto'))
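Besides the plots, fitdistrplus can also summarize the goodness of fit numerically (Kolmogorov-Smirnov, Anderson-Darling, AIC and BIC), which makes the comparison of the four candidates easier to read:
gofstat(list(fg, fw, fl, fp),
    fitnames = c('gamma', 'Weibull', 'Lognormal', 'Pareto'))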
length(unique(maintainers))
The R-help mailing list
library(RCurl)
url <- getURL('https://stat.ethz.ch/pipermail/r-help/')
R.help.toc <- htmlParse(url)
R.help.archives <- unlist(xpathApply(R.help.toc, "//table//td[3]/a", xmlAttrs), use.names = FALSE)
dir.create('help-r')
for (f in R.help.archives)
    download.file(url = paste0(url, f), file.path('help-r', f), method = 'curl')
lines <- system(paste0("zgrep -E '^From: .* at .*' ./help-r/*.txt.gz"), intern = TRUE)
length(lines)
length(unique(lines))
lines[26]
lines <- sub('.*From: ', '', lines)
Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines)
tail(sort(table(Rhelpers)), 6)
grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE)
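The zgrep calls used in this section assume a Unix-like shell; if that is not available, the same From lines can be collected in pure R by reading the gzipped archives with gzfile() (a sketch, assuming the archives were downloaded into the help-r folder as above):
files <- list.files('help-r', pattern = '\\.txt\\.gz$', full.names = TRUE)
lines <- unlist(lapply(files, function(f)
    grep('^From: .* at .*', readLines(gzfile(f)), value = TRUE)))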
Volume of the R-help mailing list
lines <- system(paste0(
    "zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} ",
    "[0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ",
    "./help-r/*.txt.gz"), intern = TRUE)
length(lines)
head(sub('.*Date: ', '', lines))
times <- strptime(sub('.*Date: ', '', lines), format = '%a, %d %b %Y %H:%M:%S %z')
plot(table(format(times, '%Y')), type = 'l')
library(data.table)
Rhelp <- data.table(time = times)
Rhelp[, H := hour(time)]
Rhelp[, D := wday(time)]
library(ggplot2)
ggplot(na.omit(Rhelp[, .N, by = .(H, D)]),
    aes(x = factor(H), y = factor(D), size = N)) + geom_point() +
    ylab('Day of the week') + xlab('Hour of the day') +
    ggtitle('Number of mails posted on [R-help]') +
    theme_bw() + theme(legend.position = 'top')
tail(sort(table(sub('.*([+-][0-9]{4}).*', '\\1', lines))), 22)
Forecasting the e-mail volume in the future
Rhelp[, date := as.Date(time)]
Rdaily <- na.omit(Rhelp[, .N, by = date])
library(zoo)
Rdaily <- zoo(Rdaily$N, Rdaily$date)
plot(Rdaily)
library(forecast)
fit <- ets(Rdaily)
predict(fit, 1)
plot(forecast(fit, 30), include = 365)
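Before trusting the 30-day forecast too much, it is worth glancing at the in-sample error measures of the fitted exponential smoothing model (a small addition to the original code):
accuracy(fit)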
Analyzing overlaps between our lists of R users
lists <- rbindlist(list(
    data.frame(name = unique(supporterlist), list = 'supporter'),
    data.frame(name = unique(maintainers), list = 'maintainer'),
    data.frame(name = unique(Rhelpers), list = 'R-help')))
t <- table(lists$name, lists$list)
table(rowSums(t))
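As a sanity check, the names that show up on all three lists can be read straight off the incidence table:
names(which(rowSums(t) == 3))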
library(Rcapture)
descriptive(t)
closedp(t)
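If only the estimated total population size is needed rather than the full printed summary, the abundance estimates and information criteria per model are stored in the results element of the returned object (a sketch relying on Rcapture's return value):
closedp(t)$results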
Further ideas on extending the capture-recapture models
The number of R users in social media
library(fbRads)
fbacc <- fbad_init('<FB account ID>', '<FB API token>')  # fill in your own account ID and token
fbad_get_search(q = 'rstats', type = 'adinterest')
fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest')
res <- fbad_get_search(fbacc = fbacc, q = 'programming language', type = 'adinterest')
res <- res[order(res$audience_size, decreasing = TRUE), ]
res[1:10, 1:3]
R-related posts in social media
library(twitteR)
setup_twitter_oauth(...)
str(searchTwitter("#rstats", n = 1, resultType = 'recent'))
tweets <- Rtweets(n = 500)
length(strip_retweets(tweets))
tweets <- twListToDF(tweets)
library(tm)
corpus <- Corpus(VectorSource(tweets$text))
corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase first so stopword removal also catches capitalized words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, 'rstats')
library(wordcloud)
wordcloud(corpus)
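wordcloud() accepts the corpus directly, but computing term frequencies explicitly from a term-document matrix makes the plot easier to tune, for example by capping the number of words; a minimal sketch:
tdm <- TermDocumentMatrix(corpus)
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(names(freq), freq, max.words = 50, random.order = FALSE)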