rm(list=ls())
# Transposing matrices
m <- matrix(1:9, 3)
t(m)
# Filtering data by string matching
library(plyr)
library(dplyr)
library(hflights)
str(select(hflights, ends_with("delay")))
str(select(hflights, contains("T", ignore.case = FALSE)))
str(select(hflights, matches("^[[:alpha:]]{5,6}$")))
table(nchar(names(hflights)))
str(select(hflights, -matches("^[[:alpha:]]{7,8}$")))
# Rearranging data
str(arrange(hflights, ActualElapsedTime))
install.packages("magrittr")
library(magrittr)
hflights %>% arrange(ActualElapsedTime) %>% str
hflights %>% arrange(ActualElapsedTime) %>%
select(ActualElapsedTime, Dest) %>%
subset(Dest != 'AUS') %>%
head %>%
str
library(data.table)
str(head(data.table(hflights, key='ActualElapsedTime')[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE]))
str(head(na.omit(data.table(hflights, key='ActualElapsedTime'))[Dest != 'AUS', list('ActualElapsedTime', 'Dest')]))
# dplyr vs data.table
# Computing new varibles
hflights_dt <- data.table(hflights)
hflights_dt[, DistanceKMs := Distance / 0.62137]
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
# Memory profiling
install.packages("pryr")
library(pryr)
hflights_dt <- data.table(hflights)
address(hflights_dt)
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
hflights_dt <- data.table(hflights)
address(hflights_dt)
hflights_dt[, DistanceKMs := Distance / 0.62137]
address(hflights_dt)
system.time(within(hflights_dt, DistanceKMs <- Distance / 0.62137))
# Creating multiple variables at a time
hflights_dt[, c('DistanceKMs', 'DistanceFeets') := list(Distance/0.62137, Distance * 5280)];
carriers <- unique(hflights_dt$UniqueCarrier)
hflights_dt[, paste('carrier', carriers, sep = '_') := lapply(carriers, function(x) as.numeric(UniqueCarrier == x))]
str(hflights_dt[, grep('^carrier', names(hflights_dt)), with = FALSE])
str(hflights_dt)
# Computing new variables with dplyr
hflights <- hflights %>% mutate(DistanceKMs = Distance / 0.62137)
hflights <- mutate(hflights, DistanceKMs = Distance/0.62137)
# Merging datasets
wdays <- data.frame(DayOfWeek = 1: 7,DayOfWeekString = c("Sunday", "Monday", "Tuesday","Wednesday", "Thursday", "Friday", "Saturday"))
system.time(merge(hflights, wdays))
system.time(merge(hflights_dt, wdays, by = 'DayOfWeek'))
hflights$wdays <- weekdays(as.Date(with(hflights, paste(Year, Month, DayofMonth, sep = '-'))))
# One of the most often used function along with the base commands, rbind and cbind is do.call, which can execute the rbind or cbind call on all elemetns of a list, thus enabling us, to join a list of data frames.
# Similarly, rbindlist can be called to merge a list of data.table objects ina much faster way.
# Reshaping data in a flexible way
# Converting wide tables to the long table format
library(reshape2)
head(melt(hflights))
hflights_melted <- melt(hflights, id.vars=0, measure.vars = c('ActualElapsedTime', 'AirTime'))
str(hflights_melted)
library(ggplot2)
ggplot(hflights_melted, aes(variable, y=value)) + geom_boxplot()
# Converting long tables to the wide table format
hflights_melted <- melt(hflights, id.vars='Month', measure.vars = c('ActualElapsedTime', 'AirTime'))
df <- dcast(hflights_melted, Month ~ variable, fun.aggregate = mean, na.rm = TRUE)
ggplot(melt(df, id.vars = 'Month')) +
geom_line(aes(x=Month, y=value, color=variable)) +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
theme(legend.position = 'top')
hflights_melted <- melt(add_margins(hflights, 'Month'), id.vars = 'Month', measure.vars = c('ActualElapsedTime', 'AirTime'))
df <- dcast(hflights_melted, Month ~ variable, fun.aggregate = mean, na.rm = TRUE)
Thursday, December 17, 2015
Thursday, December 10, 2015
Master R 3 - Filter and Summarize Data
# Drop needless data
library(sqldf)
sqldf("select * from mtcars where am=1 and vs=1")
sqldf("select * from mtcars limit 10")
subset(mtcars, am==1 & vs ==1)
identical(
sqldf("select * from mtcars where am=1 and vs=1", row.names = TRUE),
subset(mtcars, am==1 & vs==1)
)
subset(mtcars, am ==1 & vs ==1, select = hp:wt)
subset(mtcars, am ==1 & vs ==1, select = -c(hp,wt))
# Drop needless data in an efficient way
library(hflights)
system.time(sqldf("select * from hflights where Dest=='BNA'", row.names = TRUE))
system.time(subset(hflights, Dest == 'BNA'))
install.packages("dplyr")
library(dplyr)
system.time(filter(hflights, Dest == 'BNA'))
# Drop needless data in another efficient way
library(data.table)
hflights_dt <- data.table(hflights)
hflights_dt[, rownames := rownames(hflights)]
system.time(hflights_dt[Dest == 'BNA'])
/*
DT[i, j, ..., drop = TRUE]
DT[where, select | update, group by][having][order by][]...[]
*/
str(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
tail(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
hflights_dt[Dest=='BNA', .(DepTime, ArrTime)]
hflights_dt[Dest=='BNA', c('DepTime', 'ArrTime'), with= FALSE]
## Aggregation
aggregate(hflights$Diverted, by=list(hflights$DayOfWeek), FUN=mean)
aggregate(Diverted ~ DayOfWeek, data=hflights, FUN=mean)
with(hflights, aggregate(Diverted, by=list(DayOfWeek), FUN=mean))
## Quicker aggregation with Base R commands
# tapply return array
# d stands for data.frame
# s stands for array
# l stands for list
# m is a special input type, which means that we provide multiple arguments in a tabular format for the function
# r input type expects an integer, which specifies the number of times replicated
#_ is a special output type that does not return anything for the function
tapply(hflights$Diverted, hflights$DayOfWeek, mean)
## Convenient helper functions
library(plyr)
ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted))
ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))
library(dplyr)
hflights_DayOfWeek <- group_by(hflights, DayOfWeek)
dplyr::summarise(hflights_DayOfWeek, mean(Diverted))
## Aggregate with data.table
setkey(hflights_dt, 'DayOfWeek')
hflights_dt[, mean(Diverted), by = DayOfWeek]
#hflights_dt[, list('mean(Diverted') = mean(Diverted)), by=DayOfWeek]
# Summary functions
ddply(hflights, .(DayOfWeek), summarise, n=length(Diverted))
ddply(hflights, .(DayOfWeek), nrow)
table(hflights$DayOfWeek)
count(hflights, 'DayOfWeek')
attr(hflights_DayOfWeek, 'group_sizes')
hflights_dt[, .N, by = list(DayOfWeek)]
library(sqldf)
sqldf("select * from mtcars where am=1 and vs=1")
sqldf("select * from mtcars limit 10")
subset(mtcars, am==1 & vs ==1)
identical(
sqldf("select * from mtcars where am=1 and vs=1", row.names = TRUE),
subset(mtcars, am==1 & vs==1)
)
subset(mtcars, am ==1 & vs ==1, select = hp:wt)
subset(mtcars, am ==1 & vs ==1, select = -c(hp,wt))
# Drop needless data in an efficient way
library(hflights)
system.time(sqldf("select * from hflights where Dest=='BNA'", row.names = TRUE))
system.time(subset(hflights, Dest == 'BNA'))
install.packages("dplyr")
library(dplyr)
system.time(filter(hflights, Dest == 'BNA'))
# Drop needless data in another efficient way
library(data.table)
hflights_dt <- data.table(hflights)
hflights_dt[, rownames := rownames(hflights)]
system.time(hflights_dt[Dest == 'BNA'])
/*
DT[i, j, ..., drop = TRUE]
DT[where, select | update, group by][having][order by][]...[]
*/
str(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
tail(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])
hflights_dt[Dest=='BNA', .(DepTime, ArrTime)]
hflights_dt[Dest=='BNA', c('DepTime', 'ArrTime'), with= FALSE]
## Aggregation
aggregate(hflights$Diverted, by=list(hflights$DayOfWeek), FUN=mean)
aggregate(Diverted ~ DayOfWeek, data=hflights, FUN=mean)
with(hflights, aggregate(Diverted, by=list(DayOfWeek), FUN=mean))
## Quicker aggregation with Base R commands
# tapply return array
# d stands for data.frame
# s stands for array
# l stands for list
# m is a special input type, which means that we provide multiple arguments in a tabular format for the function
# r input type expects an integer, which specifies the number of times replicated
#_ is a special output type that does not return anything for the function
tapply(hflights$Diverted, hflights$DayOfWeek, mean)
## Convenient helper functions
library(plyr)
ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted))
ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))
library(dplyr)
hflights_DayOfWeek <- group_by(hflights, DayOfWeek)
dplyr::summarise(hflights_DayOfWeek, mean(Diverted))
## Aggregate with data.table
setkey(hflights_dt, 'DayOfWeek')
hflights_dt[, mean(Diverted), by = DayOfWeek]
#hflights_dt[, list('mean(Diverted') = mean(Diverted)), by=DayOfWeek]
# Summary functions
ddply(hflights, .(DayOfWeek), summarise, n=length(Diverted))
ddply(hflights, .(DayOfWeek), nrow)
table(hflights$DayOfWeek)
count(hflights, 'DayOfWeek')
attr(hflights_DayOfWeek, 'group_sizes')
hflights_dt[, .N, by = list(DayOfWeek)]
Master R 2 - Fetch Web Data
## Loading datasets from the Internet
str(read.csv('http://opengeocode.org/download/CCurls.txt'))
install.packages('RCurl')
library(RCurl)
url <- 'https://data.consumerfinance.gov/api/views/x94z-ydhh/rows.csv?accessType=DOWNLOAD'
df <- read.csv(text = getURL(url))
str(df)
sort(table(df$Product))
## Other popular online data formats
# JSON
install.packages(rjson)
library(rjson)
u <- 'http://data.consumerfinance.gov/api/views'
fromJSON(file=u)
res <- fromJSON(file=paste0(u, '/25ei-6bcr/rows.json?max_rows=5'))
names(res)
# drop meta data
res <- res$data
class(res)
df <- as.data.frame(t(sapply(res, function(x) unlist(x[-13]))))
str(df)
library(plyr)
df <- ldply(res, function(x) unlist(x[-13]))
#extract the name filed by `[`
names(df) <- sapply(res$meta$view$columns, `[`, 'name')[-13]
# XML
install.packages('XML')
require('XML')
doc <- xmlParse(paste0(u, '/25ei-6bcr/rows.xml?max_rows=5'))
df <- xmlToDataFrame(nodes = getNodeSet(doc, "//response/row/row"))
str(df)
# factor -> character -> numeric
is.number <- function(x)
all(!is.na(suppressWarnings(as.numeric(as.character(x)))))
for (n in names(df))
if(is.number(df[,n]))
df[,n] <- as.numeric(as.character(df[,n]))
# HTML tables
# use RCurl function
doc <- getURL(paste0(u, '/25ei-6bcr/rows?max_rows=5'), httpheader = c(Accept = "text/html"))
# use XML function
res <- readHTMLTable(doc)
# get first table
df <- res[[1]]
df <- readHTMLTable(doc, which = 1)
## Reading tabular from static Web pages
res <- readHTMLTable('https://cran.r-project.org/web/packages/available_packages_by_name.html')
library(wordcloud)
wordcloud(res[[1]][,2])
## Socrata Open Data API
install.packages('RSocrata')
library(RSocrata)
df <- read.socrata(paste0(u, '/25ei-6bcr'))
str(df)
# Finance APIs
install.packages('quantmod')
library(quantmod)
head(getSymbols('BBBY', env = NULL))
getFX('USD/RMB')
tail(USDRMB)
methods(getSymbols)
str(stockSymbols())
# Fetching time series with Quandl
install.packages('Quandl')
library(Quandl)
Quandl('SEC/DIV_A')
attr(Quandl('SEC/DIV_A', meta = TRUE), 'meta')$frequency
# Fetching Google Doc and analytics
RGoogleDocs
googlesheets
GTrendsR
# Historical weathre data
install.packages('weatherData')
library(weatherData)
getWeatherForDate('NewYork', start_date=as.Date('2015/12/1'), end_date=as.Date('2015/12/10'))
str(read.csv('http://opengeocode.org/download/CCurls.txt'))
install.packages('RCurl')
library(RCurl)
url <- 'https://data.consumerfinance.gov/api/views/x94z-ydhh/rows.csv?accessType=DOWNLOAD'
df <- read.csv(text = getURL(url))
str(df)
sort(table(df$Product))
## Other popular online data formats
# JSON
install.packages(rjson)
library(rjson)
u <- 'http://data.consumerfinance.gov/api/views'
fromJSON(file=u)
res <- fromJSON(file=paste0(u, '/25ei-6bcr/rows.json?max_rows=5'))
names(res)
# drop meta data
res <- res$data
class(res)
df <- as.data.frame(t(sapply(res, function(x) unlist(x[-13]))))
str(df)
library(plyr)
df <- ldply(res, function(x) unlist(x[-13]))
#extract the name filed by `[`
names(df) <- sapply(res$meta$view$columns, `[`, 'name')[-13]
# XML
install.packages('XML')
require('XML')
doc <- xmlParse(paste0(u, '/25ei-6bcr/rows.xml?max_rows=5'))
df <- xmlToDataFrame(nodes = getNodeSet(doc, "//response/row/row"))
str(df)
# factor -> character -> numeric
is.number <- function(x)
all(!is.na(suppressWarnings(as.numeric(as.character(x)))))
for (n in names(df))
if(is.number(df[,n]))
df[,n] <- as.numeric(as.character(df[,n]))
# HTML tables
# use RCurl function
doc <- getURL(paste0(u, '/25ei-6bcr/rows?max_rows=5'), httpheader = c(Accept = "text/html"))
# use XML function
res <- readHTMLTable(doc)
# get first table
df <- res[[1]]
df <- readHTMLTable(doc, which = 1)
## Reading tabular from static Web pages
res <- readHTMLTable('https://cran.r-project.org/web/packages/available_packages_by_name.html')
library(wordcloud)
wordcloud(res[[1]][,2])
## Socrata Open Data API
install.packages('RSocrata')
library(RSocrata)
df <- read.socrata(paste0(u, '/25ei-6bcr'))
str(df)
# Finance APIs
install.packages('quantmod')
library(quantmod)
head(getSymbols('BBBY', env = NULL))
getFX('USD/RMB')
tail(USDRMB)
methods(getSymbols)
str(stockSymbols())
# Fetching time series with Quandl
install.packages('Quandl')
library(Quandl)
Quandl('SEC/DIV_A')
attr(Quandl('SEC/DIV_A', meta = TRUE), 'meta')$frequency
# Fetching Google Doc and analytics
RGoogleDocs
googlesheets
GTrendsR
# Historical weathre data
install.packages('weatherData')
library(weatherData)
getWeatherForDate('NewYork', start_date=as.Date('2015/12/1'), end_date=as.Date('2015/12/10'))
Monday, December 7, 2015
Master R 1 - Readin Data
# Readin txt
rate1 <- read.table(text = "
rank rank.1 numberc lift
1 0.000028 0.000008 1 2284.907818
2 0.000036 0.000012 2 4483.611567
3 0.000278 0.000081 3 5073.756150
4 0.000636 0.000155 4 5354.705083
5 0.000706 0.000168 5 6457.000354
6 0.000942 0.000203 6 7075.458949
7 0.054982 0.003164 7 988.025500
8 0.055152 0.003198 8 1117.996648
9 0.075488 0.006454 9 641.864357
10 6.618558 0.239828 10 18.801391
11 10.047460 0.366477 11 13.258208
12 11.604464 0.443113 12 11.865461
13 13.818272 0.560422 13 10.021142
14 14.624862 0.658144 14 9.107123
15 18.287068 0.824674 15 7.642814
16 20.369262 1.046954 16 6.262066
17 21.784700 1.195478 17 5.757480
18 23.421508 1.453525 18 4.884909
19 23.512264 1.472164 19 5.133203
20 23.751196 1.509041 20 5.298250",
header = TRUE,sep = "",row.names = 1)
write.csv(rate1, "tmp.csv")
rate9 <- read.table(text = "
rank rank.1 numberc lift
1 0.002650 0.001006 1 393.25276
2 0.014656 0.005053 2 179.98946
3 0.023782 0.007999 3 172.92444
4 0.045300 0.014959 4 124.42221
5 0.060070 0.019574 5 119.18530
6 0.121250 0.037631 6 74.38297
7 0.121330 0.037635 7 86.93661
8 0.126386 0.039033 8 95.91715
9 0.250570 0.059789 9 70.30673
10 0.482642 0.119722 10 38.63316
11 0.710086 0.176313 11 28.61911
12 0.790464 0.198090 12 27.76305
13 1.886340 0.365539 13 15.89373
14 1.894954 0.367702 14 17.08624
15 2.930882 0.558398 15 11.76272
16 3.221640 0.619175 16 11.27770
17 3.428724 0.654649 17 11.33837
18 4.072348 0.752404 18 10.36725
19 9.085542 1.513426 19 4.96601
20 13.597818 2.305283 20 3.12303",
header = TRUE,sep = "",row.names = 1)
write.csv(rate9, "tmp.csv")
rate1 <- read.table(text = "
rank rank.1 numberc lift
1 0.000028 0.000008 1 2284.907818
2 0.000036 0.000012 2 4483.611567
3 0.000278 0.000081 3 5073.756150
4 0.000636 0.000155 4 5354.705083
5 0.000706 0.000168 5 6457.000354
6 0.000942 0.000203 6 7075.458949
7 0.054982 0.003164 7 988.025500
8 0.055152 0.003198 8 1117.996648
9 0.075488 0.006454 9 641.864357
10 6.618558 0.239828 10 18.801391
11 10.047460 0.366477 11 13.258208
12 11.604464 0.443113 12 11.865461
13 13.818272 0.560422 13 10.021142
14 14.624862 0.658144 14 9.107123
15 18.287068 0.824674 15 7.642814
16 20.369262 1.046954 16 6.262066
17 21.784700 1.195478 17 5.757480
18 23.421508 1.453525 18 4.884909
19 23.512264 1.472164 19 5.133203
20 23.751196 1.509041 20 5.298250",
header = TRUE,sep = "",row.names = 1)
write.csv(rate1, "tmp.csv")
rate9 <- read.table(text = "
rank rank.1 numberc lift
1 0.002650 0.001006 1 393.25276
2 0.014656 0.005053 2 179.98946
3 0.023782 0.007999 3 172.92444
4 0.045300 0.014959 4 124.42221
5 0.060070 0.019574 5 119.18530
6 0.121250 0.037631 6 74.38297
7 0.121330 0.037635 7 86.93661
8 0.126386 0.039033 8 95.91715
9 0.250570 0.059789 9 70.30673
10 0.482642 0.119722 10 38.63316
11 0.710086 0.176313 11 28.61911
12 0.790464 0.198090 12 27.76305
13 1.886340 0.365539 13 15.89373
14 1.894954 0.367702 14 17.08624
15 2.930882 0.558398 15 11.76272
16 3.221640 0.619175 16 11.27770
17 3.428724 0.654649 17 11.33837
18 4.072348 0.752404 18 10.36725
19 9.085542 1.513426 19 4.96601
20 13.597818 2.305283 20 3.12303",
header = TRUE,sep = "",row.names = 1)
write.csv(rate9, "tmp.csv")
## Create a formula for a model with a large number of variables:
xname <- paste("x", 1:25, sep=" ")
fmla <- as.formula(paste("y ~ ", paste(xname, collapse= "+")))
## 1 Loading text files of a reasonable size
library('hflights')
write.csv(hflights, 'hflights.csv', row.names = FALSE)
str(hflights)
system.time(read.csv('hflights.csv'))
colClasses <- sapply(hflights, class)
system.time(read.csv('hflights.csv', colClasses = colClasses))
system.time(read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ') )
install.packages('microbenchmark')
library(microbenchmark)
f <- function() read.csv('hflights.csv')
g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ')
res <- microbenchmark(f(), g())
res
boxplot(res, xlab = '', main = expression(paste('Benchmarking'), italic('read.table')))
## Data files larger than the physical memory
install.packages('sqldf')
library(sqldf)
system.time(read.csv.sql('hflights.csv'))
install.packages('ff')
library(ff)
system.time(read.csv.ffdf(file='hflights.csv'))
install.packages('bigmemory')
library(bigmemory)
system.time(read.big.matrix('hflights.csv', header = TRUE))
## Benchmarking text file parsers
install.packages('data.table')
library(data.table)
system.time(dt <- fread('hflights.csv'))
df <- as.data.frame(dt)
is.data.frame(dt)
## Loading a subset of text files
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = '\"BNA\"'")
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ')
## Filtering flat files before loading to R
system.time(system('cat hflights.csv | grep BNA ', intern = TRUE))
sqldf("attach 'hflights_db' as new" )
read.csv.sql('hflights.csv', sql='create table hflights1 as select * from file', dbname = 'hflights_db')
system.time(df <- sqldf(sql = "select * from hflights1 where Dest = '\"BNA\"' ", dbname = "hflights_db"))
xname <- paste("x", 1:25, sep=" ")
fmla <- as.formula(paste("y ~ ", paste(xname, collapse= "+")))
## 1 Loading text files of a reasonable size
library('hflights')
write.csv(hflights, 'hflights.csv', row.names = FALSE)
str(hflights)
system.time(read.csv('hflights.csv'))
colClasses <- sapply(hflights, class)
system.time(read.csv('hflights.csv', colClasses = colClasses))
system.time(read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ') )
install.packages('microbenchmark')
library(microbenchmark)
f <- function() read.csv('hflights.csv')
g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows=227496, comment.char = ' ')
res <- microbenchmark(f(), g())
res
boxplot(res, xlab = '', main = expression(paste('Benchmarking'), italic('read.table')))
## Data files larger than the physical memory
install.packages('sqldf')
library(sqldf)
system.time(read.csv.sql('hflights.csv'))
install.packages('ff')
library(ff)
system.time(read.csv.ffdf(file='hflights.csv'))
install.packages('bigmemory')
library(bigmemory)
system.time(read.big.matrix('hflights.csv', header = TRUE))
## Benchmarking text file parsers
install.packages('data.table')
library(data.table)
system.time(dt <- fread('hflights.csv'))
df <- as.data.frame(dt)
is.data.frame(dt)
## Loading a subset of text files
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = '\"BNA\"'")
df <- read.csv.sql('hflights.csv', sql = "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ')
## Filtering flat files before loading to R
system.time(system('cat hflights.csv | grep BNA ', intern = TRUE))
sqldf("attach 'hflights_db' as new" )
read.csv.sql('hflights.csv', sql='create table hflights1 as select * from file', dbname = 'hflights_db')
system.time(df <- sqldf(sql = "select * from hflights1 where Dest = '\"BNA\"' ", dbname = "hflights_db"))
Thursday, December 3, 2015
Visual 10 - Bipartite Network Plot in R
require(igraph)
require(NetIndices)
require(ggplot2)
ggbigraph <- function(g) {
g_ <- get.edgelist(g)
g_df <- as.data.frame(g_)
g_df$id <- 1:length(g_df[,1])
g_df <- melt(g_df, id=3)
xy_s <- data.frame(value = unique(g_df$value),
x = c(rep(2, length(unique(g_df$value))/2), rep(4, length(unique(g_df$value))/2)),
y = rep(seq(1, length(unique(g_df$value))/2, 1), 2))
g_df2 <- merge(g_df, xy_s, by = "value")
p <- ggplot(g_df2, aes(x, y)) +
geom_point() +
geom_line(size = 0.3, aes(group = id, col = id)) +
geom_text(size = 3, hjust = 2, aes(label = value)) +
theme_bw() +
opts(panel.grid.major=theme_blank(),
panel.grid.minor=theme_blank(),
axis.text.x=theme_blank(),
axis.text.y=theme_blank(),
axis.title.x=theme_blank(),
axis.title.y=theme_blank(),
axis.ticks=theme_blank(),
panel.border=theme_blank(),
legend.position="none")
p
}
i <- 2
cool <- ddply(visit_cmatrix, .(visit_cmatrix[,i], visit_cmatrix[,i+1]), nrow)
level <- levels(result2$category)
x<-as.numeric(factor(cool[,1], level, exclude=NULL))
y<-as.numeric(factor(cool[,2], level, exclude=NULL)) + 24
tmp <- cbind(x,y)
tmp <- rbind(c(NA,NA),tmp)
for (i in 1:ncol(tmp)){
x <- tmp[,i]
x[is.na(x)] <- 0 + (i-1) *24
tmp[,i] <- x
}
g <- graph.data.frame(tmp, directed=FALSE)
ggbigraph(g)
require(NetIndices)
require(ggplot2)
ggbigraph <- function(g) {
g_ <- get.edgelist(g)
g_df <- as.data.frame(g_)
g_df$id <- 1:length(g_df[,1])
g_df <- melt(g_df, id=3)
xy_s <- data.frame(value = unique(g_df$value),
x = c(rep(2, length(unique(g_df$value))/2), rep(4, length(unique(g_df$value))/2)),
y = rep(seq(1, length(unique(g_df$value))/2, 1), 2))
g_df2 <- merge(g_df, xy_s, by = "value")
p <- ggplot(g_df2, aes(x, y)) +
geom_point() +
geom_line(size = 0.3, aes(group = id, col = id)) +
geom_text(size = 3, hjust = 2, aes(label = value)) +
theme_bw() +
opts(panel.grid.major=theme_blank(),
panel.grid.minor=theme_blank(),
axis.text.x=theme_blank(),
axis.text.y=theme_blank(),
axis.title.x=theme_blank(),
axis.title.y=theme_blank(),
axis.ticks=theme_blank(),
panel.border=theme_blank(),
legend.position="none")
p
}
i <- 2
cool <- ddply(visit_cmatrix, .(visit_cmatrix[,i], visit_cmatrix[,i+1]), nrow)
level <- levels(result2$category)
x<-as.numeric(factor(cool[,1], level, exclude=NULL))
y<-as.numeric(factor(cool[,2], level, exclude=NULL)) + 24
tmp <- cbind(x,y)
tmp <- rbind(c(NA,NA),tmp)
for (i in 1:ncol(tmp)){
x <- tmp[,i]
x[is.na(x)] <- 0 + (i-1) *24
tmp[,i] <- x
}
g <- graph.data.frame(tmp, directed=FALSE)
ggbigraph(g)
Visual 9 - Heat Map in R
tmp1 <- table(visit_cmatrix[,2])
tmp1 <- data.frame(names(tmp1), cbind(tmp1))
colnames(tmp1) <- c('WebsiteCategory','1')
for ( i in 3:21) {
tmp <- table(visit_cmatrix[,i])
tmp <- data.frame(names(tmp), cbind(tmp))
colnames(tmp) <- c('WebsiteCategory', paste("", i-1, sep = ""))
tmp1 <- merge(tmp1, tmp, by = 'WebsiteCategory')
}
tmp1 <- tmp1[sort.list(tmp1[,2], decreasing = F),]
tmp1[,1] <- factor(tmp1[,1], levels = tmp1[,1])
tmp2 <- melt(tmp1, id ='WebsiteCategory')
tmp3 <- ddply(tmp2, .(variable), transform, rescale = rescale(value))
ggplot(tmp3, aes(variable, WebsiteCategory)) +
geom_tile(aes(fill = rescale, order = rescale), colour = "white") +
scale_fill_gradient(name = "Rescale of \n # users", low = "lightblue", high = "red") +
xlab("Visits") + ylab("Website Category") +
opts(axis.text.x = theme_text(face = "bold", size = 12)) +
opts(axis.text.y = theme_text(face = "bold", size = 12)) +
opts(axis.title.x = theme_text(face = "bold", size = 12)) +
opts(axis.title.y = theme_text(face = "bold", size = 12, angle = 90)) +
opts(title = "HeatMap of Multiple Visit Categories") +
opts(plot.title = theme_text(face = "bold", size=14))
tmp1 <- data.frame(names(tmp1), cbind(tmp1))
colnames(tmp1) <- c('WebsiteCategory','1')
for ( i in 3:21) {
tmp <- table(visit_cmatrix[,i])
tmp <- data.frame(names(tmp), cbind(tmp))
colnames(tmp) <- c('WebsiteCategory', paste("", i-1, sep = ""))
tmp1 <- merge(tmp1, tmp, by = 'WebsiteCategory')
}
tmp1 <- tmp1[sort.list(tmp1[,2], decreasing = F),]
tmp1[,1] <- factor(tmp1[,1], levels = tmp1[,1])
tmp2 <- melt(tmp1, id ='WebsiteCategory')
tmp3 <- ddply(tmp2, .(variable), transform, rescale = rescale(value))
ggplot(tmp3, aes(variable, WebsiteCategory)) +
geom_tile(aes(fill = rescale, order = rescale), colour = "white") +
scale_fill_gradient(name = "Rescale of \n # users", low = "lightblue", high = "red") +
xlab("Visits") + ylab("Website Category") +
opts(axis.text.x = theme_text(face = "bold", size = 12)) +
opts(axis.text.y = theme_text(face = "bold", size = 12)) +
opts(axis.title.x = theme_text(face = "bold", size = 12)) +
opts(axis.title.y = theme_text(face = "bold", size = 12, angle = 90)) +
opts(title = "HeatMap of Multiple Visit Categories") +
opts(plot.title = theme_text(face = "bold", size=14))
Wednesday, December 2, 2015
Visual 8 - Dot Plot and Bubble Chart
sql <- paste(" select a.week, a.fact_cat, a.fact_cnts/b.fact_cnts as perc from (
select week, fact_cat, count(*)*1.0 as fact_cnts from lh_userfact_weekhist
group by 1,2) a,
(select week, count(*)*1.0 as fact_cnts from lh_userfact_weekhist group by 1 ) b
where a.week=b.week
order by 1,3 desc;", sep = '');
data <- sqlQuery(con, sql, as.is = T, max =0 )
data[,3] <- as.numeric(data[ ,3])
data2<- data
data2$name <- factor(data2$fact_cat, levels=levels)
data2$nperc <- paste(round(data2$perc*100,2), '%', sep = '')
ggplot(data2, aes(y=name, x=week, size=perc, label=nperc)) +
geom_point(color='red', shape=16) + scale_size_continuous(range = c(1, 50)) +
geom_text(face = "bold", size = 6, color = "black") +
ggtitle("Distribution of Events by Category") +
xlab("Week") + ylab("Category") +
theme(plot.title = element_text(face = "bold", size = 20)) +
theme(axis.text.x = element_text(face = "bold", size = 16)) +
theme(axis.text.y = element_text(face = "bold", size = 16)) +
theme(axis.title.x = element_text(face = "bold", size = 16)) +
theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
theme(legend.position = "top") +
theme(legend.key = element_rect(colour = NA)) +
theme(legend.title = element_blank())
#getting visit category matrix for the first 20 visits;
#156712;
levels(result2$category)
result2$user_cumcnt <- as.numeric(result2$user_cumcnt)
sample <- data.frame(result2[result2$user_cumcnt<=20, c(1,4,15)])
#37427 21;
visit_cmatrix <- dcast(sample, userid ~ user_cumcnt)
dim(visit_cmatrix)
#dot plot;
sample50 <- visit_cmatrix[sort(sample(1:nrow(visit_cmatrix), 50)),]
cnts <- apply(sample50[,-1], 1, fun <- function(x) {sum(!is.na(x))})
sample50 <- cbind(sample50, cnts)
sample50 <- sample50[sort.list(sample50$cnts),]
sample50 <- cbind(sample50, 1:50)
plot_sample <- melt(sample50, id = c("userid","cnts", "1:50"), na.rm = T)
names(plot_sample) <- c("userid", "cnts", "user_seq_no", "visits", "category" )
qplot(visits, user_seq_no, data = plot_sample) +
geom_point(aes(color = category), size = 4.5) +
geom_point(aes(shape = category)) +
scale_shape_manual(value=1:length(plot_sample$category)) +
opts(title = "First 20 Visits for 50 Users in the Sample") +
opts(plot.title = theme_text(face = "bold", size=14)) +
xlab("Visits") + ylab("User Seq No.") +
opts(axis.text.x = theme_text(family = "sans", face = "bold", size = 12)) +
opts(axis.text.y = theme_text(family = "sans", face = "bold", size = 12))
select week, fact_cat, count(*)*1.0 as fact_cnts from lh_userfact_weekhist
group by 1,2) a,
(select week, count(*)*1.0 as fact_cnts from lh_userfact_weekhist group by 1 ) b
where a.week=b.week
order by 1,3 desc;", sep = '');
data <- sqlQuery(con, sql, as.is = T, max =0 )
data[,3] <- as.numeric(data[ ,3])
data2<- data
data2$name <- factor(data2$fact_cat, levels=levels)
data2$nperc <- paste(round(data2$perc*100,2), '%', sep = '')
ggplot(data2, aes(y=name, x=week, size=perc, label=nperc)) +
geom_point(color='red', shape=16) + scale_size_continuous(range = c(1, 50)) +
geom_text(face = "bold", size = 6, color = "black") +
ggtitle("Distribution of Events by Category") +
xlab("Week") + ylab("Category") +
theme(plot.title = element_text(face = "bold", size = 20)) +
theme(axis.text.x = element_text(face = "bold", size = 16)) +
theme(axis.text.y = element_text(face = "bold", size = 16)) +
theme(axis.title.x = element_text(face = "bold", size = 16)) +
theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
theme(legend.position = "top") +
theme(legend.key = element_rect(colour = NA)) +
theme(legend.title = element_blank())
#getting visit category matrix for the first 20 visits;
#156712;
levels(result2$category)
result2$user_cumcnt <- as.numeric(result2$user_cumcnt)
sample <- data.frame(result2[result2$user_cumcnt<=20, c(1,4,15)])
#37427 21;
visit_cmatrix <- dcast(sample, userid ~ user_cumcnt)
dim(visit_cmatrix)
#dot plot;
sample50 <- visit_cmatrix[sort(sample(1:nrow(visit_cmatrix), 50)),]
cnts <- apply(sample50[,-1], 1, fun <- function(x) {sum(!is.na(x))})
sample50 <- cbind(sample50, cnts)
sample50 <- sample50[sort.list(sample50$cnts),]
sample50 <- cbind(sample50, 1:50)
plot_sample <- melt(sample50, id = c("userid","cnts", "1:50"), na.rm = T)
names(plot_sample) <- c("userid", "cnts", "user_seq_no", "visits", "category" )
qplot(visits, user_seq_no, data = plot_sample) +
geom_point(aes(color = category), size = 4.5) +
geom_point(aes(shape = category)) +
scale_shape_manual(value=1:length(plot_sample$category)) +
opts(title = "First 20 Visits for 50 Users in the Sample") +
opts(plot.title = theme_text(face = "bold", size=14)) +
xlab("Visits") + ylab("User Seq No.") +
opts(axis.text.x = theme_text(family = "sans", face = "bold", size = 12)) +
opts(axis.text.y = theme_text(family = "sans", face = "bold", size = 12))
Visual 7 - Network Graph
conv <- sqlQuery(con, "select * from lh_tmp2 where random() < .05;", as.is = T);
tmp <- ddply(conv, .(userid), nrow);
conv <- merge(conv, tmp, by = 'userid');
conv <- conv[sort.list(conv[,3]),]
nconv <- length(unique(conv$userid));
nfact <- length(unique(conv$factid));
# if user --> positive else --> negative;
conv$userno <- as.numeric(as.factor(conv$userid));
conv$factno <- -as.numeric(as.factor(conv$factid));
m <- dim(conv)[2];
conv.list <- unique(conv$userno);
g <- graph.data.frame(conv[, (m-1):m], directed=FALSE)
indx <- as.numeric(V(g)$name)
colorlist <- c(rep("red", length(indx)));
for (j in 1: length(indx)){
if (indx[j]<0) {
cat1 <- unique(conv$cat1[conv$factno == indx[j]]);
if (cat1 == "Behavioral") colorlist[j] <- "purple";
if (cat1 == "Search") colorlist[j] <- "green";
if (cat1 == "Secondary Sources") colorlist[j] <- "cyan";
if (cat1 == "Site Visitation") colorlist[j] <- "cornflowerblue";
if (cat1 == "Social") colorlist[j] <- "orange";
}
}
V(g)$color <- colorlist
plot(g, vertex.size=2.5, vertex.label = NA, vertex.label.dist=0.25, edge.width=1, layout= layout.fruchterman.reingold)
plot(g, vertex.size=2.5, vertex.label = NA, vertex.label.dist=0.25, edge.width=1, layout= layout.graphopt)
plot(g, vertex.size=2.5, vertex.label = NA, vertex.label.dist=0.25, edge.width=1, layout= layout.kamada.kawai)
plot(g, layout=layout.circle)
# Force directed layouts
plot(g, layout=layout.fruchterman.reingold)
plot(g, layout=layout.graphopt)
plot(g, layout=layout.kamada.kawai)
tmp <- ddply(conv, .(userid), nrow);
conv <- merge(conv, tmp, by = 'userid');
conv <- conv[sort.list(conv[,3]),]
nconv <- length(unique(conv$userid));
nfact <- length(unique(conv$factid));
# if user --> positive else --> negative;
conv$userno <- as.numeric(as.factor(conv$userid));
conv$factno <- -as.numeric(as.factor(conv$factid));
m <- dim(conv)[2];
conv.list <- unique(conv$userno);
g <- graph.data.frame(conv[, (m-1):m], directed=FALSE)
indx <- as.numeric(V(g)$name)
colorlist <- c(rep("red", length(indx)));
for (j in 1: length(indx)){
if (indx[j]<0) {
cat1 <- unique(conv$cat1[conv$factno == indx[j]]);
if (cat1 == "Behavioral") colorlist[j] <- "purple";
if (cat1 == "Search") colorlist[j] <- "green";
if (cat1 == "Secondary Sources") colorlist[j] <- "cyan";
if (cat1 == "Site Visitation") colorlist[j] <- "cornflowerblue";
if (cat1 == "Social") colorlist[j] <- "orange";
}
}
V(g)$color <- colorlist
plot(g, vertex.size=2.5, vertex.label = NA, vertex.label.dist=0.25, edge.width=1, layout= layout.fruchterman.reingold)
plot(g, vertex.size=2.5, vertex.label = NA, vertex.label.dist=0.25, edge.width=1, layout= layout.graphopt)
plot(g, vertex.size=2.5, vertex.label = NA, vertex.label.dist=0.25, edge.width=1, layout= layout.kamada.kawai)
plot(g, layout=layout.circle)
# Force directed layouts
plot(g, layout=layout.fruchterman.reingold)
plot(g, layout=layout.graphopt)
plot(g, layout=layout.kamada.kawai)
Tuesday, December 1, 2015
Visual 6 - Density
# Plot 1
op <- par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5, font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
yhigh <- 1
xlow <- -3
xhigh <- 3
postmean <- 0.5
postsd <- 0.8
priormean <- 0
priorsd <- 1
plot(function(x) dnorm(x, mean = postmean, sd = postsd), xlow, xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2, lty = 1, ylab = "", xlab = "", main = "Inference for Mu", axes = FALSE)
lines(c(0, 0), c(0, 1.25), lwd = 2, col = "grey")
par(new = TRUE)
plot(function(x) dnorm(x, mean = priormean, sd = priorsd), xlow, xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2, lty = 2, ylab = "", xlab = "", axes = FALSE)
axis(1)
axis(2)
par(las = 0)
mtext("Mu", side = 1, line = 2.5, cex = 1.5)
mtext("Density", side = 2, line = 3, cex = 1.8)
par(op)
## Plot2: With Marked Text
NormBF10 <- function(dat, mu = 0, m = 1, priordat = NULL, plot = F, xwide = 3) {
# dat ~ N(theta,1); theta ~ N(mu, 1/m); mu is prior mean, m is prior precision
if (is.null(priordat)) {
# no prior data
priormean <- mu
priorprec <- m
}
if (!is.null(priordat)) {
# prior data
n <- length(priordat)
priormean <- (m * mu + n * mean(priordat))/(m + n)
priorprec <- m + n
}
n <- length(dat)
posteriormean <- (priorprec * priormean + n * mean(dat))/(priorprec + n)
posteriorprec <- priorprec + n
prior.height <- dnorm(0, mean = priormean, sd = priorprec^(-0.5))
posterior.height <- dnorm(0, mean = posteriormean, sd = posteriorprec^(-0.5))
BF10 <- prior.height/posterior.height
if (plot == TRUE) {
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
yhigh <- 1.5
xlow <- -3
xhigh <- 3
plot(function(x) dnorm(x, mean = posteriormean, sd = posteriorprec^(-0.5)),
xlow, xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2,
lty = 1, ylab = "", xlab = "", axes = FALSE)
lines(c(0, 0), c(0, 1.25), lwd = 2, col = "grey")
par(new = TRUE)
plot(function(x) dnorm(x, mean = priormean, sd = priorprec^(-0.5)), xlow,
xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2, lty = 2,
ylab = "", xlab = "", axes = FALSE)
axis(1)
axis(2)
par(las = 0)
mtext("Mu", side = 1, line = 2.5, cex = 1.5)
mtext("Density", side = 2, line = 3, cex = 1.8)
# Show Savage-Dickey density ratio:
points(0, prior.height, cex = 2, pch = 21, bg = "grey")
points(0, posterior.height, cex = 2, pch = 21, bg = "grey")
}
invisible(BF10)
}
dat <- c(0, 1, -1)
# dat <- c(-1,1,0)
#### simultaneous #### 1/NormBF10(dat, plot = TRUE) #2 text(-3, 1.4,
#### expression(BF[0][1](y[1],y[2],y[3]) == 2), cex = 1.5, pos = 4)
##### y1 #### 1/NormBF10(dat = dat[1], plot = TRUE) #sqrt(2) text(-3, 1.4,
##### expression(BF[0][1](y[1]) == sqrt(2)), cex = 1.5, pos = 4)
##### y2, given y1 #### 1/NormBF10(dat = dat[2], plot = TRUE, priordat = dat[1]) #1.04
##### composite.expression <- expression(paste(BF[0][1], '(', y[2], ' | ', y[1],
##### ')' %~~% 1.04)) text(-3, 1.4, composite.expression, cex = 1.5, pos = 4)
##### y3, given y1 and y2 ####
BF01 <- 1/NormBF10(dat = dat[3], plot = TRUE, priordat = dat[1:2]) #1.36
composite.expression <- expression(paste(BF[0][1], "(", y[3], " | ", y[1], ",", y[2], ")" %~~% 1.36))
text(-3, 1.4, composite.expression, cex = 1.5, pos = 4)
## Plot3
xbar.therapy <- 92
s.therapy <- 8.5
xbar.placebo <- 85
s.placebo <- 9.1
n <- 15
xdiff <- xbar.therapy - xbar.placebo
sdiff <- sqrt((s.therapy^2 + s.placebo^2)/2) * sqrt(2/n)
sdiff <- sqrt(s.therapy^2 + s.placebo^2)/sqrt(n)
muH0 <- 0
muH1 <- 8
t0 <- (xdiff - muH0)/sdiff
# H0 distribution with p-value shaded:
par(cex.main = 1.5, mar = c(4, 4.5, 4.5, 1), mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.8, bty = "n", las = 1)
par(mar = c(4, 4.5, 4.5, 1))
x <- seq(-15, 15, by = 0.001)
y <- dt(x/sdiff, df = 28)
plot(x, y, type = "l", axes = FALSE, xlab = NA, ylab = NA, xlim = c(-15, 20),
lwd = 2)
axis(side = 1, at = seq(-15, 15, by = 5), pos = 0, lwd = 2)
axis(side = 1, at = 7, pos = 0, col = "red4", col.axis = "red4", lwd = 2)
# shade area to right of obtained test statistic:
t0 <- xdiff/sdiff
cord.x <- c(t0, seq(t0, 4, 0.001), 4) * sdiff
cord.y <- c(0, dt(seq(t0, 4, 0.001), df = 28), 0)
polygon(cord.x, cord.y, col = "grey")
cord.x <- c(-4, seq(-4, -t0, 0.001), -t0) * sdiff
cord.y <- c(0, dt(seq(-4, -t0, 0.001), df = 24), 0)
polygon(cord.x, cord.y, col = "grey")
# add lines and text:
abline(v = xdiff, col = "red4", lwd = 2)
text(-15, 0.25, expression(paste(H[0], " : ", mu[diff], " = 0", sep = "")), adj = 0, cex = 1.8)
text(10, 0.08, paste("p = .04"), adj = 0, col = "red4", cex = 1.8)
lines(c(10, 8), c(0.05, 0.01), col = "red4", lwd = 2)
lines(c(10, -8), c(0.05, 0.01), col = "red4", lwd = 2)
mtext(expression(bar(x)[diff]), side = 1, line = 2, at = 6.5, adj = 0, col = "red4", cex = 1.8)
## Plot 4
xbar.therapy <- 92
s.therapy <- 8.5
xbar.placebo <- 85
s.placebo <- 9.1
n <- 15
xdiff <- xbar.therapy - xbar.placebo
sdiff <- sqrt((s.therapy^2 + s.placebo^2)/2) * sqrt(2/n)
sdiff <- sqrt(s.therapy^2 + s.placebo^2)/sqrt(n)
muH0 <- 0
muH1 <- 8
t0 <- (xdiff - muH0)/sdiff
par(cex.main = 1.5, mar = c(4, 4.5, 4.5, 1), mgp = c(3.5, 1, 0), cex.lab = 1.5, font.lab = 2, cex.axis = 1.8, bty = "n", las = 1)
x <- seq(-15, 30, by = 0.001)
y <- dt(x/sdiff, df = 28)
y3 <- dt((x - 9)/sdiff, df = 28)
plot(x, y, type = "l", axes = FALSE, xlab = NA, ylab = NA, xlim = c(-15, 30), lwd = 2)
lines(x, y3, lwd = 2)
axis(side = 1, at = seq(-15, 30, by = 5), labels = seq(-15, 30, by = 5), cex.axis = 1.6, lwd = 2)
axis(side = 1, at = 7, pos = 0, col = "red4", col.axis = "red4", lwd = 2, cex.axis = 1.6, padj = 0.6)
# shade critical regions:
tcrit <- qt(0.975, df = 28)
cord.x <- c(tcrit, seq(tcrit, 4, 0.001), 4) * sdiff
cord.y <- c(0, dt(seq(tcrit, 4, 0.001), df = 28), 0)
polygon(cord.x, cord.y, col = "grey")
cord.x <- c(-4, seq(-4, -tcrit, 0.001), -tcrit) * sdiff
cord.y <- c(0, dt(seq(-4, -tcrit, 0.001), df = 24), 0)
polygon(cord.x, cord.y, col = "grey")
# shade type-II error region
xcrit <- tcrit * sdiff
cord.x <- c(-5, seq(-5, xcrit, 0.001), xcrit)
cord.y <- c(0, dt(((seq(-5, xcrit, 0.001) - 9)/sdiff), df = 28), 0)
polygon(cord.x, cord.y, col = "grey90")
# add lines and text:
abline(v = xdiff, col = "red4", lwd = 2)
text(-16.3, 0.3, expression(paste(H[0], " : ", mu[diff], " = 0", sep = "")), adj = 0, cex = 1.8)
text(13, 0.3, expression(paste(H[1], " : ", mu[diff], "" >= 9, , sep = "")), adj = 0, cex = 1.8)
text(10, 0.08, expression(paste(alpha)), adj = 0, col = "red4", cex = 1.8)
text(-11, 0.08, expression(paste(alpha)), adj = 0, col = "red4", cex = 1.8)
text(1, 0.08, expression(paste(beta)), adj = 0, col = "red4", cex = 1.8)
mtext(expression(bar(x)[diff]), side = 1, line = 2, at = 6.5, adj = 0, col = "red4", cex = 1.8, padj = 0.4)
lines(c(10, 8), c(0.05, 0.01), col = "red4", lwd = 2)
lines(c(-10, -8), c(0.05, 0.01), col = "red4", lwd = 2)
lines(c(2, 4), c(0.05, 0.01), col = "red4", lwd = 2)
op <- par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5, font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
yhigh <- 1
xlow <- -3
xhigh <- 3
postmean <- 0.5
postsd <- 0.8
priormean <- 0
priorsd <- 1
plot(function(x) dnorm(x, mean = postmean, sd = postsd), xlow, xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2, lty = 1, ylab = "", xlab = "", main = "Inference for Mu", axes = FALSE)
lines(c(0, 0), c(0, 1.25), lwd = 2, col = "grey")
par(new = TRUE)
plot(function(x) dnorm(x, mean = priormean, sd = priorsd), xlow, xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2, lty = 2, ylab = "", xlab = "", axes = FALSE)
axis(1)
axis(2)
par(las = 0)
mtext("Mu", side = 1, line = 2.5, cex = 1.5)
mtext("Density", side = 2, line = 3, cex = 1.8)
par(op)
## Plot2: With Marked Text
NormBF10 <- function(dat, mu = 0, m = 1, priordat = NULL, plot = F, xwide = 3) {
# dat ~ N(theta,1); theta ~ N(mu, 1/m); mu is prior mean, m is prior precision
if (is.null(priordat)) {
# no prior data
priormean <- mu
priorprec <- m
}
if (!is.null(priordat)) {
# prior data
n <- length(priordat)
priormean <- (m * mu + n * mean(priordat))/(m + n)
priorprec <- m + n
}
n <- length(dat)
posteriormean <- (priorprec * priormean + n * mean(dat))/(priorprec + n)
posteriorprec <- priorprec + n
prior.height <- dnorm(0, mean = priormean, sd = priorprec^(-0.5))
posterior.height <- dnorm(0, mean = posteriormean, sd = posteriorprec^(-0.5))
BF10 <- prior.height/posterior.height
if (plot == TRUE) {
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
yhigh <- 1.5
xlow <- -3
xhigh <- 3
plot(function(x) dnorm(x, mean = posteriormean, sd = posteriorprec^(-0.5)),
xlow, xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2,
lty = 1, ylab = "", xlab = "", axes = FALSE)
lines(c(0, 0), c(0, 1.25), lwd = 2, col = "grey")
par(new = TRUE)
plot(function(x) dnorm(x, mean = priormean, sd = priorprec^(-0.5)), xlow,
xhigh, ylim = c(0, yhigh), xlim = c(xlow, xhigh), lwd = 2, lty = 2,
ylab = "", xlab = "", axes = FALSE)
axis(1)
axis(2)
par(las = 0)
mtext("Mu", side = 1, line = 2.5, cex = 1.5)
mtext("Density", side = 2, line = 3, cex = 1.8)
# Show Savage-Dickey density ratio:
points(0, prior.height, cex = 2, pch = 21, bg = "grey")
points(0, posterior.height, cex = 2, pch = 21, bg = "grey")
}
invisible(BF10)
}
dat <- c(0, 1, -1)
# dat <- c(-1,1,0)
#### simultaneous #### 1/NormBF10(dat, plot = TRUE) #2 text(-3, 1.4,
#### expression(BF[0][1](y[1],y[2],y[3]) == 2), cex = 1.5, pos = 4)
##### y1 #### 1/NormBF10(dat = dat[1], plot = TRUE) #sqrt(2) text(-3, 1.4,
##### expression(BF[0][1](y[1]) == sqrt(2)), cex = 1.5, pos = 4)
##### y2, given y1 #### 1/NormBF10(dat = dat[2], plot = TRUE, priordat = dat[1]) #1.04
##### composite.expression <- expression(paste(BF[0][1], '(', y[2], ' | ', y[1],
##### ')' %~~% 1.04)) text(-3, 1.4, composite.expression, cex = 1.5, pos = 4)
##### y3, given y1 and y2 ####
BF01 <- 1/NormBF10(dat = dat[3], plot = TRUE, priordat = dat[1:2]) #1.36
composite.expression <- expression(paste(BF[0][1], "(", y[3], " | ", y[1], ",", y[2], ")" %~~% 1.36))
text(-3, 1.4, composite.expression, cex = 1.5, pos = 4)
## Plot3
xbar.therapy <- 92
s.therapy <- 8.5
xbar.placebo <- 85
s.placebo <- 9.1
n <- 15
xdiff <- xbar.therapy - xbar.placebo
sdiff <- sqrt((s.therapy^2 + s.placebo^2)/2) * sqrt(2/n)
sdiff <- sqrt(s.therapy^2 + s.placebo^2)/sqrt(n)
muH0 <- 0
muH1 <- 8
t0 <- (xdiff - muH0)/sdiff
# H0 distribution with p-value shaded:
par(cex.main = 1.5, mar = c(4, 4.5, 4.5, 1), mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.8, bty = "n", las = 1)
par(mar = c(4, 4.5, 4.5, 1))
x <- seq(-15, 15, by = 0.001)
y <- dt(x/sdiff, df = 28)
plot(x, y, type = "l", axes = FALSE, xlab = NA, ylab = NA, xlim = c(-15, 20),
lwd = 2)
axis(side = 1, at = seq(-15, 15, by = 5), pos = 0, lwd = 2)
axis(side = 1, at = 7, pos = 0, col = "red4", col.axis = "red4", lwd = 2)
# shade area to right of obtained test statistic:
t0 <- xdiff/sdiff
cord.x <- c(t0, seq(t0, 4, 0.001), 4) * sdiff
cord.y <- c(0, dt(seq(t0, 4, 0.001), df = 28), 0)
polygon(cord.x, cord.y, col = "grey")
cord.x <- c(-4, seq(-4, -t0, 0.001), -t0) * sdiff
cord.y <- c(0, dt(seq(-4, -t0, 0.001), df = 24), 0)
polygon(cord.x, cord.y, col = "grey")
# add lines and text:
abline(v = xdiff, col = "red4", lwd = 2)
text(-15, 0.25, expression(paste(H[0], " : ", mu[diff], " = 0", sep = "")), adj = 0, cex = 1.8)
text(10, 0.08, paste("p = .04"), adj = 0, col = "red4", cex = 1.8)
lines(c(10, 8), c(0.05, 0.01), col = "red4", lwd = 2)
lines(c(10, -8), c(0.05, 0.01), col = "red4", lwd = 2)
mtext(expression(bar(x)[diff]), side = 1, line = 2, at = 6.5, adj = 0, col = "red4", cex = 1.8)
## Plot 4
xbar.therapy <- 92
s.therapy <- 8.5
xbar.placebo <- 85
s.placebo <- 9.1
n <- 15
xdiff <- xbar.therapy - xbar.placebo
sdiff <- sqrt((s.therapy^2 + s.placebo^2)/2) * sqrt(2/n)
sdiff <- sqrt(s.therapy^2 + s.placebo^2)/sqrt(n)
muH0 <- 0
muH1 <- 8
t0 <- (xdiff - muH0)/sdiff
par(cex.main = 1.5, mar = c(4, 4.5, 4.5, 1), mgp = c(3.5, 1, 0), cex.lab = 1.5, font.lab = 2, cex.axis = 1.8, bty = "n", las = 1)
x <- seq(-15, 30, by = 0.001)
y <- dt(x/sdiff, df = 28)
y3 <- dt((x - 9)/sdiff, df = 28)
plot(x, y, type = "l", axes = FALSE, xlab = NA, ylab = NA, xlim = c(-15, 30), lwd = 2)
lines(x, y3, lwd = 2)
axis(side = 1, at = seq(-15, 30, by = 5), labels = seq(-15, 30, by = 5), cex.axis = 1.6, lwd = 2)
axis(side = 1, at = 7, pos = 0, col = "red4", col.axis = "red4", lwd = 2, cex.axis = 1.6, padj = 0.6)
# shade critical regions:
tcrit <- qt(0.975, df = 28)
cord.x <- c(tcrit, seq(tcrit, 4, 0.001), 4) * sdiff
cord.y <- c(0, dt(seq(tcrit, 4, 0.001), df = 28), 0)
polygon(cord.x, cord.y, col = "grey")
cord.x <- c(-4, seq(-4, -tcrit, 0.001), -tcrit) * sdiff
cord.y <- c(0, dt(seq(-4, -tcrit, 0.001), df = 24), 0)
polygon(cord.x, cord.y, col = "grey")
# shade type-II error region
xcrit <- tcrit * sdiff
cord.x <- c(-5, seq(-5, xcrit, 0.001), xcrit)
cord.y <- c(0, dt(((seq(-5, xcrit, 0.001) - 9)/sdiff), df = 28), 0)
polygon(cord.x, cord.y, col = "grey90")
# add lines and text:
abline(v = xdiff, col = "red4", lwd = 2)
text(-16.3, 0.3, expression(paste(H[0], " : ", mu[diff], " = 0", sep = "")), adj = 0, cex = 1.8)
text(13, 0.3, expression(paste(H[1], " : ", mu[diff], "" >= 9, , sep = "")), adj = 0, cex = 1.8)
text(10, 0.08, expression(paste(alpha)), adj = 0, col = "red4", cex = 1.8)
text(-11, 0.08, expression(paste(alpha)), adj = 0, col = "red4", cex = 1.8)
text(1, 0.08, expression(paste(beta)), adj = 0, col = "red4", cex = 1.8)
mtext(expression(bar(x)[diff]), side = 1, line = 2, at = 6.5, adj = 0, col = "red4", cex = 1.8, padj = 0.4)
lines(c(10, 8), c(0.05, 0.01), col = "red4", lwd = 2)
lines(c(-10, -8), c(0.05, 0.01), col = "red4", lwd = 2)
lines(c(2, 4), c(0.05, 0.01), col = "red4", lwd = 2)
Visual 5 - Bar Chart and Stacked Bar Chart
library(plyr)
data3 <- data2[index,];
data3 <- data3[sort.list(data3$clickperc),];
barplot(data3$clickperc, name = round(data3$userperc,2), col = 'blue', main = 'Distribution of Clicks for Comcast 59693',
xlab = "% Clickers", ylab="% of Clicks", ylim = c(0,1.1));
abline(h = data2$clickperc[6953], col= 'red', lty=3);
#Use ggplot;
ggplot(data3, aes(userperc)) +
geom_bar(aes(y = clickperc),stat = "identity", fill = 'blue') +
geom_abline(intercept = 0, slope =1, color = 'red', size = 1, lty = 3) +
ggtitle("Distribution of Random Clicks") +
ylab("% of Clicks") + xlab("% of Clickers") +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(axis.title.x = element_text(face = "bold", size = 12)) +
theme(axis.title.y = element_text(face = "bold", size = 12, angle = 90))
#Use ggplot for barchart;
grp <- rep('1 Large: 10K+', length(conv));
grp[conv <= 10000] <- '2 Medium: 1K - 10K';
grp[conv <= 1000] <- '3 Small: 1K -';
ggplot(data1, aes(y=AUC, x=fnames,fill = grp)) +
geom_bar() + coord_flip() +
geom_text(aes(y=AUC, x=fnames, label=round(AUC,2)), hjust=-.2, face = "bold", size = 5, color = "black") +
ylab("Campaigns") + xlab("AUC") +
opts(axis.text.x = theme_text(face = "bold", size = 12)) +
opts(axis.text.y = theme_text(face = "bold", size = 12)) +
opts(axis.title.x = theme_text(face = "bold", size = 12)) +
opts(axis.title.y = theme_text(face = "bold", size = 12, angle = 90)) +
opts(title = "AUCs by Bucket Campaigns") +
opts(plot.title = theme_text(face = "bold", size=14)) +
opts(legend.position="top") +
opts(legend.key = theme_rect(colour = NA)) +
opts(legend.title = theme_blank()) +
opts(legend.text = theme_text(family = "sans", face = "bold", size
data1=data[, list(army_green_dress=sum(army_green_dress),
hunter_green_dress=sum(hunter_green_dress),
coral_dress=sum(coral_dress),
sage_dress=sum(sage_dress),
tan_dress=sum(tan_dress)
)]
data2=data.table(color=colnames(data1),cnt=t(data1))[order(cnt.V1)]
data2$color=factor(data2$color, levels=data2$color)
ggplot(data2, aes(y=cnt.V1, x=color)) +
geom_bar(stat="identity", fill="blue") +
ggtitle("Total Searches from 3/27/2016 to 7/16/2016 by Sampled Color") +
ylab("Total Searches") +
xlab("Color") +
geom_text(data=data2, aes(y=cnt.V1, x=color, label=cnt.V1), vjust=-.5, hjust=0, size = 6, color = "black") +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(axis.title.x = element_text(face = "bold", size = 12)) +
theme(axis.title.y = element_text(face = "bold", size = 12, angle=90))
## Stack Bar chart
ggplot(data, aes(y=blankimp, x=datekey, fill = targetgrp)) +
geom_bar(stat = "identity", position = "stack")+
geom_abline(intercept = mean(data2[,2]), color = 'red', size = 1, lty = 3) +
annotate("text", x=data2$datekey, y=data2$tot * 1.02, label=paste(round(data2$tot/1000000,2),'M'), size = 6) +
ggtitle("Daily Impressions by Targeted Groups for Comcast 59693") +
ylab("Impressions") + xlab("Date") +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(axis.title.x = element_text(face = "bold", size = 12)) +
theme(axis.title.y = element_text(face = "bold", size = 12, angle = 90)) +
theme(legend.position = "top") +
theme(legend.key = element_rect(colour = NA)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(family = "sans", face = "bold", size = 12)) +
scale_x_date(labels = date_format("%m/%d"), breaks = date_breaks("da
# Use barplot
mean.prop.sw <- c(0.7, 0.6, 0.67, 0.5, 0.45, 0.48, 0.41, 0.34, 0.5, 0.33)
sd.prop.sw <- c(0.3, 0.4, 0.2, 0.35, 0.28, 0.31, 0.29, 0.26, 0.21, 0.23)
N <- 100
b <- barplot(mean.prop.sw, las = 1, xlab = " ", ylab = " ", col = "grey", cex.lab = 1.7,
cex.main = 1.5, axes = FALSE, ylim = c(0, 1))
axis(1, c(0.8, 2, 3.2, 4.4, 5.6, 6.8, 8, 9.2, 10.4, 11.6), 1:10, cex.axis = 1.3)
axis(2, seq(0, 0.8, by = 0.2), cex.axis = 1.3, las = 1)
mtext("Block", side = 1, line = 2.5, cex = 1.5, font = 2)
mtext("Proportion of Switches", side = 2, line = 3, cex = 1.5, font = 2)
l_ply(seq_along(b), function(x) arrows(x0 = b[x], y0 = mean.prop.sw[x], x1 = b[x], y1 = mean.prop.sw[x] + 1.96 * sd.prop.sw[x]/sqrt(N), code = 2, length = 0.1, angle = 90, lwd = 1.5))
# Use barplot
index <- seq(1, length(data2[,1]), by = round(length(data2[,1])/100));data3 <- data2[index,];
data3 <- data3[sort.list(data3$clickperc),];
barplot(data3$clickperc, name = round(data3$userperc,2), col = 'blue', main = 'Distribution of Clicks for Comcast 59693',
xlab = "% Clickers", ylab="% of Clicks", ylim = c(0,1.1));
abline(h = data2$clickperc[6953], col= 'red', lty=3);
#Use ggplot;
ggplot(data3, aes(userperc)) +
geom_bar(aes(y = clickperc),stat = "identity", fill = 'blue') +
geom_abline(intercept = 0, slope =1, color = 'red', size = 1, lty = 3) +
ggtitle("Distribution of Random Clicks") +
ylab("% of Clicks") + xlab("% of Clickers") +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(axis.title.x = element_text(face = "bold", size = 12)) +
theme(axis.title.y = element_text(face = "bold", size = 12, angle = 90))
#Use ggplot for barchart;
grp <- rep('1 Large: 10K+', length(conv));
grp[conv <= 10000] <- '2 Medium: 1K - 10K';
grp[conv <= 1000] <- '3 Small: 1K -';
ggplot(data1, aes(y=AUC, x=fnames,fill = grp)) +
geom_bar() + coord_flip() +
geom_text(aes(y=AUC, x=fnames, label=round(AUC,2)), hjust=-.2, face = "bold", size = 5, color = "black") +
ylab("Campaigns") + xlab("AUC") +
opts(axis.text.x = theme_text(face = "bold", size = 12)) +
opts(axis.text.y = theme_text(face = "bold", size = 12)) +
opts(axis.title.x = theme_text(face = "bold", size = 12)) +
opts(axis.title.y = theme_text(face = "bold", size = 12, angle = 90)) +
opts(title = "AUCs by Bucket Campaigns") +
opts(plot.title = theme_text(face = "bold", size=14)) +
opts(legend.position="top") +
opts(legend.key = theme_rect(colour = NA)) +
opts(legend.title = theme_blank()) +
opts(legend.text = theme_text(family = "sans", face = "bold", size
data1=data[, list(army_green_dress=sum(army_green_dress),
hunter_green_dress=sum(hunter_green_dress),
coral_dress=sum(coral_dress),
sage_dress=sum(sage_dress),
tan_dress=sum(tan_dress)
)]
data2=data.table(color=colnames(data1),cnt=t(data1))[order(cnt.V1)]
data2$color=factor(data2$color, levels=data2$color)
ggplot(data2, aes(y=cnt.V1, x=color)) +
geom_bar(stat="identity", fill="blue") +
ggtitle("Total Searches from 3/27/2016 to 7/16/2016 by Sampled Color") +
ylab("Total Searches") +
xlab("Color") +
geom_text(data=data2, aes(y=cnt.V1, x=color, label=cnt.V1), vjust=-.5, hjust=0, size = 6, color = "black") +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(axis.title.x = element_text(face = "bold", size = 12)) +
theme(axis.title.y = element_text(face = "bold", size = 12, angle=90))
## Stack Bar chart
ggplot(data, aes(y=blankimp, x=datekey, fill = targetgrp)) +
geom_bar(stat = "identity", position = "stack")+
geom_abline(intercept = mean(data2[,2]), color = 'red', size = 1, lty = 3) +
annotate("text", x=data2$datekey, y=data2$tot * 1.02, label=paste(round(data2$tot/1000000,2),'M'), size = 6) +
ggtitle("Daily Impressions by Targeted Groups for Comcast 59693") +
ylab("Impressions") + xlab("Date") +
theme(plot.title = element_text(face = "bold", size = 16)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(axis.title.x = element_text(face = "bold", size = 12)) +
theme(axis.title.y = element_text(face = "bold", size = 12, angle = 90)) +
theme(legend.position = "top") +
theme(legend.key = element_rect(colour = NA)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(family = "sans", face = "bold", size = 12)) +
scale_x_date(labels = date_format("%m/%d"), breaks = date_breaks("da
Visual 4 - Bloxplot and Donut Chart
# Use rect and points to plot boxplot
boxplot.ej <- function(y, xloc = 1, width.box = 0.25, lwd.box = 2, width.hor = 0.25,
lwd.hor = 2, range.wisk = 1.5, lwd.wisk = 2, pch.box = 16, cex.boxpoint = 2,
plot.outliers = FALSE, pch.out = 1, cex.out = 1, color = "black") {
# makes boxplot with dot as median and solid whisker Interquartile range =
# (.75 quantile) - (.25 quantile). Note: Wiskers are not always symmetrical;
# top wisker extends up to max(y) constrained by y <= (.75 quantile) +
# range.wisk*Interquartile range bottom whisker is determined by min(y)
# constrained by y >= (.25 quantile) - range.wisk*Interquartile range
Q <- quantile(y, c(0.25, 0.5, 0.75))
names(Q) <- NULL # gets rid of percentages
IQ.range <- Q[3] - Q[1]
low <- Q[1] - range.wisk * IQ.range
high <- Q[3] + range.wisk * IQ.range
index <- which((y >= low) & (y <= high))
wisk.low <- min(y[index])
wisk.high <- max(y[index])
outliers <- y[which((y < low) | (y > high))]
# plot median:
points(xloc, Q[2], pch = pch.box, cex = cex.boxpoint, col = color)
# plot box:
xleft <- xloc - width.box/2
xright <- xloc + width.box/2
ybottom <- Q[1]
ytop <- Q[3]
rect(xleft, ybottom, xright, ytop, lwd = lwd.box, border = color)
# plot whiskers:
segments(xloc, wisk.low, xloc, Q[1], lwd = lwd.wisk, col = color)
segments(xloc, Q[3], xloc, wisk.high, lwd = lwd.wisk, col = color)
# plot horizontal segments:
x0 <- xloc - width.hor/2
x1 <- xloc + width.hor/2
segments(x0, wisk.low, x1, wisk.low, lwd = lwd.hor, col = color)
segments(x0, wisk.high, x1, wisk.high, lwd = lwd.hor, col = color)
# plot outliers:
if (plot.outliers == TRUE) {
xloc.p <- rep(xloc, length(outliers))
points(xloc.p, outliers, pch = pch.out, cex = cex.out, col = color)
}
}
RT.hf.sp <- rnorm(1000, mean = 0.41, sd = 0.008)
RT.lf.sp <- rnorm(1000, mean = 0.43, sd = 0.01)
RT.vlf.sp <- rnorm(1000, mean = 0.425, sd = 0.012)
RT.hf.ac <- rnorm(1000, mean = 0.46, sd = 0.008)
RT.lf.ac <- rnorm(1000, mean = 0.51, sd = 0.01)
RT.vlf.ac <- rnorm(1000, mean = 0.52, sd = 0.012)
ps <- 1 # size of boxpoint
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
x <- c(1, 2, 3, 4)
plot(x, c(-10, -10, -10, -10), type = "p", ylab = " ", xlab = " ", cex = 1.5,
ylim = c(0.3, 0.6), xlim = c(1, 4), lwd = 2, pch = 5, axes = FALSE, main = " ")
axis(1, at = c(1.5, 2.5, 3.5), labels = c("HF", "LF", "VLF"))
mtext("Word Frequency", side = 1, line = 3, cex = 1.5, font = 2)
axis(2, pos = 1.1)
par(las = 0)
mtext("Group Mean M", side = 2, line = 2.9, cex = 1.5, font = 2)
x <- c(1.5, 2.5, 3.5)
boxplot.ej(RT.hf.sp, xloc = 1.5, cex.boxpoint = ps)
boxplot.ej(RT.hf.ac, xloc = 1.5, cex.boxpoint = ps, color = "grey")
boxplot.ej(RT.lf.sp, xloc = 2.5, cex.boxpoint = ps)
boxplot.ej(RT.lf.ac, xloc = 2.5, cex.boxpoint = ps, color = "grey")
boxplot.ej(RT.vlf.sp, xloc = 3.5, cex.boxpoint = ps)
boxplot.ej(RT.vlf.ac, xloc = 3.5, cex.boxpoint = ps, color = "grey")
text(2.5, 0.35, "Speed", cex = 1.4, font = 1, adj = 0.5)
text(2.5, 0.57, "Accuracy", cex = 1.4, font = 1, col = "grey", adj = 0.5)
# Use qplot to create bloxplot
users <- ddply(result3, .(category, user_cumcnt), function(x) data.frame(users = length(unique(x$userid))))
boxplot.ej <- function(y, xloc = 1, width.box = 0.25, lwd.box = 2, width.hor = 0.25,
lwd.hor = 2, range.wisk = 1.5, lwd.wisk = 2, pch.box = 16, cex.boxpoint = 2,
plot.outliers = FALSE, pch.out = 1, cex.out = 1, color = "black") {
# makes boxplot with dot as median and solid whisker Interquartile range =
# (.75 quantile) - (.25 quantile). Note: Wiskers are not always symmetrical;
# top wisker extends up to max(y) constrained by y <= (.75 quantile) +
# range.wisk*Interquartile range bottom whisker is determined by min(y)
# constrained by y >= (.25 quantile) - range.wisk*Interquartile range
Q <- quantile(y, c(0.25, 0.5, 0.75))
names(Q) <- NULL # gets rid of percentages
IQ.range <- Q[3] - Q[1]
low <- Q[1] - range.wisk * IQ.range
high <- Q[3] + range.wisk * IQ.range
index <- which((y >= low) & (y <= high))
wisk.low <- min(y[index])
wisk.high <- max(y[index])
outliers <- y[which((y < low) | (y > high))]
# plot median:
points(xloc, Q[2], pch = pch.box, cex = cex.boxpoint, col = color)
# plot box:
xleft <- xloc - width.box/2
xright <- xloc + width.box/2
ybottom <- Q[1]
ytop <- Q[3]
rect(xleft, ybottom, xright, ytop, lwd = lwd.box, border = color)
# plot whiskers:
segments(xloc, wisk.low, xloc, Q[1], lwd = lwd.wisk, col = color)
segments(xloc, Q[3], xloc, wisk.high, lwd = lwd.wisk, col = color)
# plot horizontal segments:
x0 <- xloc - width.hor/2
x1 <- xloc + width.hor/2
segments(x0, wisk.low, x1, wisk.low, lwd = lwd.hor, col = color)
segments(x0, wisk.high, x1, wisk.high, lwd = lwd.hor, col = color)
# plot outliers:
if (plot.outliers == TRUE) {
xloc.p <- rep(xloc, length(outliers))
points(xloc.p, outliers, pch = pch.out, cex = cex.out, col = color)
}
}
RT.hf.sp <- rnorm(1000, mean = 0.41, sd = 0.008)
RT.lf.sp <- rnorm(1000, mean = 0.43, sd = 0.01)
RT.vlf.sp <- rnorm(1000, mean = 0.425, sd = 0.012)
RT.hf.ac <- rnorm(1000, mean = 0.46, sd = 0.008)
RT.lf.ac <- rnorm(1000, mean = 0.51, sd = 0.01)
RT.vlf.ac <- rnorm(1000, mean = 0.52, sd = 0.012)
ps <- 1 # size of boxpoint
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5,
font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
x <- c(1, 2, 3, 4)
plot(x, c(-10, -10, -10, -10), type = "p", ylab = " ", xlab = " ", cex = 1.5,
ylim = c(0.3, 0.6), xlim = c(1, 4), lwd = 2, pch = 5, axes = FALSE, main = " ")
axis(1, at = c(1.5, 2.5, 3.5), labels = c("HF", "LF", "VLF"))
mtext("Word Frequency", side = 1, line = 3, cex = 1.5, font = 2)
axis(2, pos = 1.1)
par(las = 0)
mtext("Group Mean M", side = 2, line = 2.9, cex = 1.5, font = 2)
x <- c(1.5, 2.5, 3.5)
boxplot.ej(RT.hf.sp, xloc = 1.5, cex.boxpoint = ps)
boxplot.ej(RT.hf.ac, xloc = 1.5, cex.boxpoint = ps, color = "grey")
boxplot.ej(RT.lf.sp, xloc = 2.5, cex.boxpoint = ps)
boxplot.ej(RT.lf.ac, xloc = 2.5, cex.boxpoint = ps, color = "grey")
boxplot.ej(RT.vlf.sp, xloc = 3.5, cex.boxpoint = ps)
boxplot.ej(RT.vlf.ac, xloc = 3.5, cex.boxpoint = ps, color = "grey")
text(2.5, 0.35, "Speed", cex = 1.4, font = 1, adj = 0.5)
text(2.5, 0.57, "Accuracy", cex = 1.4, font = 1, col = "grey", adj = 0.5)
# Use qplot to create bloxplot
users <- ddply(result3, .(category, user_cumcnt), function(x) data.frame(users = length(unique(x$userid))))
convs <- ddply(result3, .(category, user_cumcnt), function(x) data.frame(convs = sum(x$convs)))
tmp <- merge(users, convs, by=c("category", "user_cumcnt"))
imps <- rep(0, length(tmp[,1]))
cum.convs <- rep(0, length(tmp[,1]))
tmp <- data.frame(tmp, cbind(imps, cum.convs))
tmp <- tmp[order(tmp$category, tmp$user_cumcnt), ]
cat.visit <- tmp
web.level <- levels(cat.visit[,1])
for (i in 1: length(web.level)){
cat1 <- cat.visit[cat.visit[,1] == web.level[i],3]
cat.visit[cat.visit[,1] == web.level[i],5] <- cumsum(cat1)
cat2 <- cat.visit[cat.visit[,1] == web.level[i],4]
cat.visit[cat.visit[,1] == web.level[i],6] <- cumsum(cat2)
}
write.csv(cat.visit, file= "cat_visit.csv")
cat.visit1 <- ddply(cat.visit, .(category, user_cumcnt), transform, resp.rate = convs/users, cum.conv = cum.convs/imps )
qplot(category, cum.conv, data=cat.visit1, geom=c("boxplot", "jitter"), color =category) +
geom_point(aes(color = category), size = 2) +
opts(title = "Conversion Rates by Website Category") +
opts(plot.title = theme_text(face = "bold", size=14)) +
xlab("Category") + ylab("Conversion Rates") +
opts(axis.text.x = theme_text(family = "sans", face = "bold", size = 8)) +
opts(axis.text.y = theme_text(family = "sans", face = "bold", size = 12))
## Donut Charts
list.of.data.frames = list(dat1, dat2, dat3, dat4, dat5, dat6, dat7)
data=merged.data.frame = Reduce(function(...) merge(..., all=T), list.of.data.frames)
# ggplot(data, aes(y=data[,2], x="", fill=data$age_grp)) +
# geom_bar(width = 1, stat = "identity") +
# coord_polar("y", start=0) +
# scale_fill_brewer(palette="Dark2") +
# ggtitle("Total Customers by Age Group") +
# ylab("Total Customers") +
# xlab("") +
# theme(plot.title = element_text(face = "bold", size = 20)) +
# theme(axis.text.x = element_text(face = "bold", size = 14)) +
# theme(axis.text.y = element_text(face = "bold", size = 14)) +
# theme(axis.title.x = element_text(face = "bold", size = 16)) +
# theme(strip.text.x = element_text(face = "bold", size = 16)) +
# theme(axis.title.y = element_text(face = "bold", size = 16, angle=90))
data2=colPerc(data[,-1])
data2=as.data.frame(data2[-nrow(data2),])
data2$age_grp=data$age_grp
data2$ymax=cumsum(data2[,1])
data2$ymin = c(0, head(data2$ymax, n = -1))
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
ggplot(data2, aes(fill = age_grp, ymax = ymax, ymin = ymin, xmax = 100, xmin = 80)) +
geom_rect(colour = "black") +
coord_polar(theta = "y") +
xlim(c(0, 100)) +
geom_label(aes(label=paste(data2[,1],"%"),x=100,y=(ymin+ymax)/2),inherit.aes = TRUE, show.legend = FALSE) +
scale_fill_brewer("Age Group", palette = "Dark2") + blank_theme +
theme(axis.text.x=element_blank()) + theme(legend.position=c(.5, .5)) +
ggtitle("Total Customers") +
theme(panel.grid=element_blank()) +
theme(axis.text=element_blank()) +
theme(axis.ticks=element_blank()) +
theme(legend.title = element_text(size=16, face="bold")) +
theme(legend.text = element_text(size = 14, face = "bold"))
opts(axis.text.y = theme_text(family = "sans", face = "bold", size = 12))
## Donut Charts
list.of.data.frames = list(dat1, dat2, dat3, dat4, dat5, dat6, dat7)
data=merged.data.frame = Reduce(function(...) merge(..., all=T), list.of.data.frames)
# ggplot(data, aes(y=data[,2], x="", fill=data$age_grp)) +
# geom_bar(width = 1, stat = "identity") +
# coord_polar("y", start=0) +
# scale_fill_brewer(palette="Dark2") +
# ggtitle("Total Customers by Age Group") +
# ylab("Total Customers") +
# xlab("") +
# theme(plot.title = element_text(face = "bold", size = 20)) +
# theme(axis.text.x = element_text(face = "bold", size = 14)) +
# theme(axis.text.y = element_text(face = "bold", size = 14)) +
# theme(axis.title.x = element_text(face = "bold", size = 16)) +
# theme(strip.text.x = element_text(face = "bold", size = 16)) +
# theme(axis.title.y = element_text(face = "bold", size = 16, angle=90))
data2=colPerc(data[,-1])
data2=as.data.frame(data2[-nrow(data2),])
data2$age_grp=data$age_grp
data2$ymax=cumsum(data2[,1])
data2$ymin = c(0, head(data2$ymax, n = -1))
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
ggplot(data2, aes(fill = age_grp, ymax = ymax, ymin = ymin, xmax = 100, xmin = 80)) +
geom_rect(colour = "black") +
coord_polar(theta = "y") +
xlim(c(0, 100)) +
geom_label(aes(label=paste(data2[,1],"%"),x=100,y=(ymin+ymax)/2),inherit.aes = TRUE, show.legend = FALSE) +
scale_fill_brewer("Age Group", palette = "Dark2") + blank_theme +
theme(axis.text.x=element_blank()) + theme(legend.position=c(.5, .5)) +
ggtitle("Total Customers") +
theme(panel.grid=element_blank()) +
theme(axis.text=element_blank()) +
theme(axis.ticks=element_blank()) +
theme(legend.title = element_text(size=16, face="bold")) +
theme(legend.text = element_text(size = 14, face = "bold"))
Visual 3 - Line Plot
plotsegraph <- function(loc, value, sterr, wiskwidth, color = "grey", linewidth = 2) {
w <- wiskwidth/2
segments(x0 = loc, x1 = loc, y0 = value - sterr, y1 = value + sterr, col = color, lwd = linewidth)
segments(x0 = loc - w, x1 = loc + w, y0 = value + sterr, y1 = value + sterr, col = color, lwd = linewidth) # upper whiskers
segments(x0 = loc - w, x1 = loc + w, y0 = value - sterr, y1 = value - sterr, col = color, lwd = linewidth) # lower whiskers
}
RT.hf.sp <- 0.41
RT.lf.sp <- 0.43
RT.vlf.sp <- 0.425
se.RT.hf.sp <- 0.01
se.RT.lf.sp <- 0.015
se.RT.vlf.sp <- 0.02
RT.hf.ac <- 0.46
RT.lf.ac <- 0.51
RT.vlf.ac <- 0.52
se.RT.hf.ac <- 0.01
se.RT.lf.ac <- 0.015
se.RT.vlf.ac <- 0.02
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5, font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
x <- c(1, 2, 3, 4)
plot(x, c(-10, -10, -10, -10), type = "p", ylab = "", xlab = " ", cex = 1.5, ylim = c(0.3, 0.6), xlim = c(1, 4), lwd = 2, pch = 5, axes = F, main = " ")
axis(1, at = c(1.5, 2.5, 3.5), labels = c("HF", "LF", "VLF"))
mtext("Word Frequency", side = 1, line = 3, cex = 1.5, font = 2)
axis(2, pos = 1.2, )
par(las = 0)
mtext(expression(paste("Mean ", mu)), side = 2, line = 2, cex = 1.5, font = 2)
x <- c(1.5, 2.5, 3.5)
points(x, c(RT.hf.sp, RT.lf.sp, RT.vlf.sp), cex = 1.5, lwd = 2, pch = 19)
plot.errbars <- plotsegraph(x, c(RT.hf.sp, RT.lf.sp, RT.vlf.sp), c(se.RT.hf.sp, se.RT.lf.sp, se.RT.vlf.sp), 0.1, color = "black") #0.1 = wiskwidth
lines(c(1.5, 2.5, 3.5), c(RT.hf.sp, RT.lf.sp, RT.vlf.sp), lwd = 2, type = "c")
points(x, c(RT.hf.ac, RT.lf.ac, RT.vlf.ac), cex = 1.5, lwd = 2, pch = 21)
plot.errbars <- plotsegraph(x, c(RT.hf.ac, RT.lf.ac, RT.vlf.ac), c(se.RT.hf.ac, se.RT.lf.ac, se.RT.vlf.ac), 0.1, color = "black") #0.1 = wiskwidth
lines(c(1.5, 2.5, 3.5), c(RT.hf.ac, RT.lf.ac, RT.vlf.ac), lwd = 2, type = "c")
points(1.5, 0.6, pch = 21, lwd = 2, cex = 1.5)
text(1.7, 0.6, "Accuracy", cex = 1.2, font = 1, adj = 0)
points(1.5, 0.57, pch = 19, lwd = 2, cex = 1.5)
text(1.7, 0.57, "Speed", cex = 1.2, font = 1, adj = 0)
top10thisyear=top10data[(n-15):n]
tmp=matrix(unlist(strsplit(top10thisyear$Week, " - ")), ncol=2, byrow=TRUE)[,1]
tmp=as.Date(tmp, format = "%Y-%m-%d")
top10thisyear[,start_dt:=tmp]
top10thisyear[order(start_dt)]
#cast data from wide to long
top10thisyearlong=melt(top10thisyear, id.vars=c("Week", "start_dt"), measure.vars=names(top10thisyear)[c(-1,-length(top10thisyear))])
ggplot(data=top10thisyearlong, aes(x=start_dt, y=value, group=factor(variable))) +
aes(color=factor(variable)) +
geom_line(linetype=1 , size = 1.25) +
geom_point(size = 1.8) +
ggtitle("Top Total Seach Index from 4/10/2016 to 7/24/2016 ") +
ylab("Search Volume Index") + xlab("Week") +
scale_x_date(date_labels = "%b %d", date_breaks = "1 week") +
theme(plot.title = element_text(face = "bold", size = 20)) +
theme(axis.text.x = element_text(face = "bold", size = 14)) +
theme(axis.text.y = element_text(face = "bold", size = 14)) +
theme(axis.title.x = element_text(face = "bold", size = 16)) +
theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
theme(legend.position = 'bottom') +
theme(legend.text = element_text(family = "sans", face = "bold", size = 12)) +
theme(legend.title = element_text(family = "sans", face = "bold", size = 12)) +
guides(color=guide_legend("Styles"))
w <- wiskwidth/2
segments(x0 = loc, x1 = loc, y0 = value - sterr, y1 = value + sterr, col = color, lwd = linewidth)
segments(x0 = loc - w, x1 = loc + w, y0 = value + sterr, y1 = value + sterr, col = color, lwd = linewidth) # upper whiskers
segments(x0 = loc - w, x1 = loc + w, y0 = value - sterr, y1 = value - sterr, col = color, lwd = linewidth) # lower whiskers
}
RT.hf.sp <- 0.41
RT.lf.sp <- 0.43
RT.vlf.sp <- 0.425
se.RT.hf.sp <- 0.01
se.RT.lf.sp <- 0.015
se.RT.vlf.sp <- 0.02
RT.hf.ac <- 0.46
RT.lf.ac <- 0.51
RT.vlf.ac <- 0.52
se.RT.hf.ac <- 0.01
se.RT.lf.ac <- 0.015
se.RT.vlf.ac <- 0.02
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5, font.lab = 2, cex.axis = 1.3, bty = "n", las = 1)
x <- c(1, 2, 3, 4)
plot(x, c(-10, -10, -10, -10), type = "p", ylab = "", xlab = " ", cex = 1.5, ylim = c(0.3, 0.6), xlim = c(1, 4), lwd = 2, pch = 5, axes = F, main = " ")
axis(1, at = c(1.5, 2.5, 3.5), labels = c("HF", "LF", "VLF"))
mtext("Word Frequency", side = 1, line = 3, cex = 1.5, font = 2)
axis(2, pos = 1.2, )
par(las = 0)
mtext(expression(paste("Mean ", mu)), side = 2, line = 2, cex = 1.5, font = 2)
x <- c(1.5, 2.5, 3.5)
points(x, c(RT.hf.sp, RT.lf.sp, RT.vlf.sp), cex = 1.5, lwd = 2, pch = 19)
plot.errbars <- plotsegraph(x, c(RT.hf.sp, RT.lf.sp, RT.vlf.sp), c(se.RT.hf.sp, se.RT.lf.sp, se.RT.vlf.sp), 0.1, color = "black") #0.1 = wiskwidth
lines(c(1.5, 2.5, 3.5), c(RT.hf.sp, RT.lf.sp, RT.vlf.sp), lwd = 2, type = "c")
points(x, c(RT.hf.ac, RT.lf.ac, RT.vlf.ac), cex = 1.5, lwd = 2, pch = 21)
plot.errbars <- plotsegraph(x, c(RT.hf.ac, RT.lf.ac, RT.vlf.ac), c(se.RT.hf.ac, se.RT.lf.ac, se.RT.vlf.ac), 0.1, color = "black") #0.1 = wiskwidth
lines(c(1.5, 2.5, 3.5), c(RT.hf.ac, RT.lf.ac, RT.vlf.ac), lwd = 2, type = "c")
points(1.5, 0.6, pch = 21, lwd = 2, cex = 1.5)
text(1.7, 0.6, "Accuracy", cex = 1.2, font = 1, adj = 0)
points(1.5, 0.57, pch = 19, lwd = 2, cex = 1.5)
text(1.7, 0.57, "Speed", cex = 1.2, font = 1, adj = 0)
top10thisyear=top10data[(n-15):n]
tmp=matrix(unlist(strsplit(top10thisyear$Week, " - ")), ncol=2, byrow=TRUE)[,1]
tmp=as.Date(tmp, format = "%Y-%m-%d")
top10thisyear[,start_dt:=tmp]
top10thisyear[order(start_dt)]
#cast data from wide to long
top10thisyearlong=melt(top10thisyear, id.vars=c("Week", "start_dt"), measure.vars=names(top10thisyear)[c(-1,-length(top10thisyear))])
ggplot(data=top10thisyearlong, aes(x=start_dt, y=value, group=factor(variable))) +
aes(color=factor(variable)) +
geom_line(linetype=1 , size = 1.25) +
geom_point(size = 1.8) +
ggtitle("Top Total Seach Index from 4/10/2016 to 7/24/2016 ") +
ylab("Search Volume Index") + xlab("Week") +
scale_x_date(date_labels = "%b %d", date_breaks = "1 week") +
theme(plot.title = element_text(face = "bold", size = 20)) +
theme(axis.text.x = element_text(face = "bold", size = 14)) +
theme(axis.text.y = element_text(face = "bold", size = 14)) +
theme(axis.title.x = element_text(face = "bold", size = 16)) +
theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
theme(legend.position = 'bottom') +
theme(legend.text = element_text(family = "sans", face = "bold", size = 12)) +
theme(legend.title = element_text(family = "sans", face = "bold", size = 12)) +
guides(color=guide_legend("Styles"))
Visual 2 - Histogram
# Basic Histogram
good.choices <- c(.43, .47, .47, .48, .50, .52, .53, .53, .54, .54, .54, .54, .55, .55, .55, .56, .56, .57, .57, .57, .57, .58, .58, .58, .59, .59, .60, .62, .63, .63, .64, .64, .66, .66, .67, .67, .68, .70, .70)
par(cex.main = 1.5, mar = c(5, 6, 4, 5) + 0.1, mgp = c(3.5, 1, 0), cex.lab = 1.5 , font.lab = 2, cex.axis = 1.3, bty = "n", las=1)
hist(good.choices, main = "", xlab = "", ylab = " ", ylim = c(0, 13), xlim = c(.30, .80), axes = FALSE, col = "grey")
axis(1, seq(.30, .80, by = .1))
axis(2, seq(.00, 12, by = 2))
rug(jitter(good.choices))
mtext("Prop. Choices from Good Decks", side = 1, line = 2.5, cex = 1.5, font = 2)
mtext("Number of Studies", side = 2, line = 3, cex = 1.5, font = 2, las = 0)
# Include a density estimator
lines(density(good.choices), lwd = 2)
# Include Numbers on Top
yhigh <- 8
h <- hist(good.choices, freq = FALSE, main = "", xlab = "", ylab = " ", ylim = c(0, yhigh), xlim = c(0.3, 0.8), axes = FALSE, col = "grey")
l_ply(seq_along(h$density), function(x) text(h$mids[x], h$density[x] + 0.32, round(h$density[x], 2), cex = 1.2))
axis(1, seq(0.3, 0.8, by = 0.1))
axis(2, labels = FALSE, lwd.ticks = 0)
rug(jitter(good.choices))
mtext("Prop. Choices from Good Decks", side = 1, line = 2.5, cex = 1.5, font = 2)
mtext("Density of Studies", side = 2, line = 1, cex = 1.5, font = 2, las = 0)
## Select most recent 16 week data;
colsToSum <- names(data)[-1]
thisyear <- data[, lapply(.SD, sum, na.rm=TRUE), .SDcols=colsToSum]
## Column sum
data2=data.table(style=colnames(thisyear), cnt=unlist(thisyear))[order(cnt)]
data2$rowid=1:nrow(data2)
data2$style=str_replace_all(data2$style, "_", " ")
data2$style=factor(data2$style, levels=data2$style)
data2$scalecnt=round(100*data2$cnt/max(data2$cnt))
ggplot(data2, aes(y=scalecnt, x=style)) +
geom_bar(stat="identity", fill="blue") +
coord_flip() +
ggtitle("Total Search Index from 4/10/2016 to 7/24/2016 ") +
ylab("Search Volume Index") +
xlab("Style") +
theme(plot.title = element_text(face = "bold", size = 10)) +
theme(axis.text.x = element_text(face = "bold", size = 10)) +
theme(axis.text.y = element_text(face = "bold", size = 10)) +
theme(axis.title.x = element_text(face = "bold", size = 10)) +
theme(axis.title.y = element_text(face = "bold", size = 10, angle=90))
colsToSum <- names(data)[-1]
thisyear <- data[, lapply(.SD, sum, na.rm=TRUE), .SDcols=colsToSum]
## Column sum
data2=data.table(style=colnames(thisyear), cnt=unlist(thisyear))[order(cnt)]
data2$rowid=1:nrow(data2)
data2$style=str_replace_all(data2$style, "_", " ")
data2$style=factor(data2$style, levels=data2$style)
data2$scalecnt=round(100*data2$cnt/max(data2$cnt))
ggplot(data2, aes(y=scalecnt, x=style)) +
geom_bar(stat="identity", fill="blue") +
coord_flip() +
ggtitle("Total Search Index from 4/10/2016 to 7/24/2016 ") +
ylab("Search Volume Index") +
xlab("Style") +
theme(plot.title = element_text(face = "bold", size = 10)) +
theme(axis.text.x = element_text(face = "bold", size = 10)) +
theme(axis.text.y = element_text(face = "bold", size = 10)) +
theme(axis.title.x = element_text(face = "bold", size = 10)) +
theme(axis.title.y = element_text(face = "bold", size = 10, angle=90))
Visual 1 - Corrlation
> height.ratio <- c(0.924324324, 1.081871345, 1, 0.971098266, 1.029761905,
+ 0.935135135, 0.994252874, 0.908163265, 1.045714286, 1.18404908,
+ 1.115606936, 0.971910112, 0.97752809, 0.978609626, 1,
+ 0.933333333, 1.071428571, 0.944444444, 0.944444444, 1.017142857,
+ 1.011111111, 1.011235955, 1.011235955, 1.089285714, 0.988888889,
+ 1.011111111, 1.032967033, 1.044444444, 1, 1.086705202,
+ 1.011560694, 1.005617978, 1.005617978, 1.005494505, 1.072222222,
+ 1.011111111, 0.983783784, 0.967213115, 1.04519774, 1.027777778,
+ 1.086705202, 1, 1.005347594, 0.983783784, 0.943005181, 1.057142857)
>
> pop.vote <- c(0.427780852, 0.56148981, 0.597141922, 0.581254292, 0.530344067,
+ 0.507425996, 0.526679292, 0.536690951, 0.577825976, 0.573225387,
+ 0.550410082, 0.559380032, 0.484823958, 0.500466176, 0.502934212,
+ 0.49569636, 0.516904414, 0.522050547, 0.531494442, 0.60014892,
+ 0.545079801, 0.604274986, 0.51635906, 0.63850958, 0.652184407,
+ 0.587920412, 0.5914898, 0.624614752, 0.550040193, 0.537771958,
+ 0.523673642, 0.554517134, 0.577511576, 0.500856251, 0.613444534,
+ 0.504063153, 0.617883695, 0.51049949, 0.553073235, 0.59166415,
+ 0.538982024, 0.53455133, 0.547304058, 0.497350649, 0.512424242,
+ 0.536914796)
>
> cor.test(height.ratio,pop.vote)
Pearson's product-moment correlation
data: height.ratio and pop.vote
t = 2.8358, df = 44, p-value = 0.006883
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.1160356 0.6133936
sample estimates:
cor
0.3930924
Subscribe to:
Posts (Atom)
Blog Archive
-
▼
2015
(43)
-
▼
December
(14)
- Master R 4 - Restruct Data
- Master R 3 - Filter and Summarize Data
- Master R 2 - Fetch Web Data
- Master R 1 - Readin Data
- Visual 10 - Bipartite Network Plot in R
- Visual 9 - Heat Map in R
- Visual 8 - Dot Plot and Bubble Chart
- Visual 7 - Network Graph
- Visual 6 - Density
- Visual 5 - Bar Chart and Stacked Bar Chart
- Visual 4 - Bloxplot and Donut Chart
- Visual 3 - Line Plot
- Visual 2 - Histogram
- Visual 1 - Corrlation
-
▼
December
(14)