Master R 14 - Analyzing the R community
Friday, February 12, 2016

R Foundation members

library(XML)
page <- htmlParse('http://r-project.org/foundation/donors.html')
list <- unlist(xpathApply(page,
    "//h3[@id='supporting-members']/following-sibling::ul[1]/li",
    xmlValue))
str(list)
supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list)
countrylist   <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1)
tail(sort(prop.table(table(countrylist)) * 100), 5)

Visualizing supporting members around the world

countries <- as.data.frame(table(countrylist))
library(rworldmap)
joinCountryData2Map(countries, joinCode = 'NAME',
    nameJoinColumn = 'countrylist', verbose = TRUE)
library(ggmap)
## standardize the country names that rworldmap could not match
for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) {
    countrylist[which(countrylist == fix)] <-
        geocode(fix, output = 'more')$country
}
countries <- as.data.frame(table(countrylist))
countries <- joinCountryData2Map(countries, joinCode = 'NAME',
    nameJoinColumn = 'countrylist')
mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth',
    mapTitle = 'Number of R Foundation supporting members')

R package maintainers

packages <- readHTMLTable(paste0('http://cran.r-project.org',
    '/web/checks/check_summary.html'), which = 2)
## note: the Maintainer column name on the CRAN check page starts with a stray space
maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer')
maintainers <- gsub('\u00a0', ' ', maintainers)  # replace non-breaking spaces
str(maintainers)
tail(sort(table(maintainers)), 8)

The number of packages per maintainer

N <- as.numeric(table(maintainers))
library(fitdistrplus)
plotdist(N)
descdist(N, boot = 1e3)
(gparams <- fitdist(N, 'gamma'))
gshape <- gparams$estimate[['shape']]
grate  <- gparams$estimate[['rate']]
sum(rgamma(1e5, shape = gshape, rate = grate))
hist(rgamma(1e5, shape = gshape, rate = grate))
pgamma(2, shape = gshape, rate = grate)
prop.table(table(N <= 2))
## Pareto: location = sample minimum, shape by maximum likelihood
ploc <- min(N)
pshp <- length(N) / sum(log(N) - log(ploc))
library(actuar)
ppareto(2, pshp, ploc)
## compare four candidate distributions
fg <- fitdist(N, 'gamma')
fw <- fitdist(N, 'weibull')
fl <- fitdist(N, 'lnorm')
fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1))
par(mfrow = c(1, 2))
denscomp(list(fg, fw, fl, fp), addlegend = FALSE)
qqcomp(list(fg, fw, fl, fp),
    legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto'))
length(unique(maintainers))

The R-help mailing list

library(RCurl)
url <- 'https://stat.ethz.ch/pipermail/r-help/'
R.help.toc <- htmlParse(getURL(url))
R.help.archives <- unlist(xpathApply(R.help.toc,
    "//table//td[3]/a", xmlAttrs), use.names = FALSE)
dir.create('r-help')
for (f in R.help.archives)
    download.file(url = paste0(url, f),
        destfile = file.path('r-help', f), method = 'curl')
## extract the sender line of each archived mail
lines <- system("zgrep -E '^From: .* at .*' ./r-help/*.txt.gz",
    intern = TRUE)
length(lines)
length(unique(lines))
lines[26]
lines    <- sub('.*From: ', '', lines)
Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines)
tail(sort(table(Rhelpers)), 6)
grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE)
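As a numeric complement to the denscomp() and qqcomp() plots in the packages-per-maintainer section above, fitdistrplus also provides gofstat() for goodness-of-fit statistics. A minimal sketch, assuming the fg, fw, fl and fp fits created above are still in the workspace:

## goodness-of-fit statistics and information criteria for the four fitted models
library(fitdistrplus)
gof <- gofstat(list(fg, fw, fl, fp),
    fitnames = c('gamma', 'Weibull', 'Lognormal', 'Pareto'))
gof        # Kolmogorov-Smirnov, Cramer-von Mises and Anderson-Darling statistics
gof$aic    # information criteria: lower is better
gof$bic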
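The zgrep calls in the R-help section assume a Unix-like shell with zgrep on the PATH. As a rough, platform-independent alternative (a sketch only, assuming the archives were downloaded into the r-help directory created above), the gzipped files can be read directly from R:

## read the gzipped archives from R instead of shelling out to zgrep
files <- list.files('r-help', pattern = '\\.txt\\.gz$', full.names = TRUE)
lines <- unlist(lapply(files, function(f) {
    con <- gzfile(f, open = 'r')
    on.exit(close(con))
    grep('^From: .* at .*', readLines(con, warn = FALSE), value = TRUE)
}))
length(lines)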
Volume of the R-help mailing list

lines <- system(paste0(
    "zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} ",
    "[0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ",
    "./r-help/*.txt.gz"), intern = TRUE)
length(lines)
head(sub('.*Date: ', '', lines[1]))
times <- strptime(sub('.*Date: ', '', lines),
    format = '%a, %d %b %Y %H:%M:%S %z')
plot(table(format(times, '%Y')), type = 'l')
library(data.table)
Rhelp <- data.table(time = as.POSIXct(times))  # data.table cannot store POSIXlt columns
Rhelp[, H := hour(time)]
Rhelp[, D := wday(time)]
library(ggplot2)
ggplot(na.omit(Rhelp[, .N, by = .(H, D)]),
       aes(x = factor(H), y = factor(D), size = N)) +
    geom_point() +
    ylab('Day of the week') + xlab('Hour of the day') +
    ggtitle('Number of mails posted on [R-help]') +
    theme_bw() + theme(legend.position = 'top')
tail(sort(table(sub('.*([+-][0-9]{4}).*', '\\1', lines))), 22)

Forecasting the e-mail volume in the future

Rhelp[, date := as.Date(time)]
Rdaily <- na.omit(Rhelp[, .N, by = date])
library(zoo)
Rdaily <- zoo(Rdaily$N, Rdaily$date)
plot(Rdaily)
library(forecast)
fit <- ets(Rdaily)
predict(fit, 1)
plot(forecast(fit, 30), include = 365)

Analyzing overlaps between our lists of R users

lists <- rbindlist(list(
    data.frame(name = unique(supporterlist), list = 'supporter'),
    data.frame(name = unique(maintainers),   list = 'maintainer'),
    data.frame(name = unique(Rhelpers),      list = 'R-help')))
t <- table(lists$name, lists$list)
table(rowSums(t))
library(Rcapture)
descriptive(t)
closedp(t)

Further ideas on extending the capture-recapture models

The number of R users in social media

library(fbRads)
fbacc <- fbad_init('<FB account ID>', '<FB API token>')  # use your own credentials here
fbad_get_search(q = 'rstats', type = 'adinterest')
fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest')
res <- fbad_get_search(fbacc = fbacc, q = 'programming language',
    type = 'adinterest')
res <- res[order(res$audience_size, decreasing = TRUE), ]
res[1:10, 1:3]

R-related posts in social media

library(twitteR)
setup_twitter_oauth(...)  # supply your own consumer key/secret and access token/secret
str(searchTwitter('#rstats', n = 1, resultType = 'recent'))
tweets <- Rtweets(n = 500)
length(strip_retweets(tweets))
tweets <- twListToDF(tweets)
library(tm)
corpus <- Corpus(VectorSource(tweets$text))
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, 'rstats')
library(wordcloud)
wordcloud(corpus)
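To turn the closedp() output from the overlap section into an actual estimate of how many R users are covered by at least one of the three lists, the fitted abundances can be pulled out of the returned object. A minimal sketch, assuming the contingency table t built above:

## estimated population size (abundance) under each closed-population model
library(Rcapture)
cp <- closedp(t)
cp$results                        # the abundance column is the estimated total population size
sort(cp$results[, 'abundance'])   # point estimates only, from smallest to largest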
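The ets() forecast in the e-mail volume section is easier to judge with a quick out-of-sample check. A minimal sketch, assuming the Rdaily zoo series built above and holding out the last 30 days as a test set:

## refit ets() on all but the last 30 days and score the forecast on the holdout
library(zoo)
library(forecast)
n         <- length(Rdaily)
train     <- Rdaily[seq_len(n - 30)]
test      <- Rdaily[seq(n - 29, n)]
fit_train <- ets(train)
fc        <- forecast(fit_train, h = 30)
accuracy(fc, coredata(test))      # ME, RMSE, MAE, ... on the training and test sets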