Saturday, November 19, 2016

Check Univariate Distribution in R

# Session diagnostics and workspace reset.
# memory.limit() and memory.size() only report meaningful values on Windows,
# and from R 4.2.0 onwards they are stubs that return Inf with a warning, so
# they are called only where they still work. print() is needed because
# values inside an if-block are not auto-printed one by one.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  print(memory.limit())
  print(memory.size(max = TRUE))
}
# Remove every object from the calling environment, including dot-prefixed
# ones. `all.names` is spelled out in full rather than relying on partial
# argument matching of `all`.
# NOTE(review): wiping the workspace from inside a script is generally
# discouraged; kept so the script starts from the same clean state as before.
rm(list = ls(all.names = TRUE))
# Report the R version, platform and attached packages.
sessionInfo()

require(data.table)
require(stringr)
require(lubridate)
require(scales)
require(tigerstats)
require(ggplot2)
require(gridExtra)
require(ggthemes)

##################################################################
## Check Distribution
##################################################################
## Change data type
# series - actual TV series such as American Idol or Glee (names are masked
# for this exercise).
data$series <- as.character(data$series)
# Ten most frequent series. head() stops at the number of distinct values,
# whereas indexing with [1:10] pads the result with NA entries when there are
# fewer than ten. TRUE is spelled out because T is a reassignable variable.
head(sort(xtabs(~series, data), decreasing = TRUE), 10L)

# network - networks such as ABC, HBO, FOX, etc. Names are masked.
data$network <- as.character(data$network)
# Ten most frequent networks. head() stops at the number of distinct values,
# whereas indexing with [1:10] pads the result with NA entries when there are
# fewer than ten. TRUE is spelled out because T is a reassignable variable.
head(sort(xtabs(~network, data), decreasing = TRUE), 10L)

# Type - type of TV network (broadcast or cable).
data$Type <- as.character(data$Type)
# Row counts per network type, most frequent first. TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned.
sort(xtabs(~Type, data), decreasing = TRUE)
# The same one-way table expressed as row percentages (rowPerc comes from
# the tigerstats package).
rowPerc(xtabs(~Type, data))

# Eps (example value: 4) - number of episodes in the given timeframe
# (assume a broadcast month = 4 weeks).
# Convert through character first: as.integer() applied directly to a factor
# returns the internal level codes rather than the values the levels spell.
# For integer or character input the result is unchanged.
data$X..Eps <- as.integer(as.character(data$X..Eps))
summary(data$X..Eps)

# Air Day - day(s) of episode airing, coded with the letters
# M, T, W, R, F, S, U.
# One logical flag column per day letter: TRUE when that letter occurs
# anywhere in Air.Day. grepl() is vectorised, so it is applied to the whole
# column at once (this also returns a zero-length logical, not NULL, for an
# empty table); fixed = TRUE matches the letter literally.
data$Air.Day_M <- grepl("M", data$Air.Day, fixed = TRUE)
data$Air.Day_T <- grepl("T", data$Air.Day, fixed = TRUE)
data$Air.Day_W <- grepl("W", data$Air.Day, fixed = TRUE)
data$Air.Day_R <- grepl("R", data$Air.Day, fixed = TRUE)
data$Air.Day_F <- grepl("F", data$Air.Day, fixed = TRUE)
data$Air.Day_S <- grepl("S", data$Air.Day, fixed = TRUE)
data$Air.Day_U <- grepl("U", data$Air.Day, fixed = TRUE)
# Percentage of rows with each flag FALSE / TRUE (rowPerc comes from the
# tigerstats package). The columns are named bare in the formula because
# they are looked up in `data`; this also labels each table with the column
# name instead of "data$...".
rowPerc(xtabs(~Air.Day_M, data))
rowPerc(xtabs(~Air.Day_T, data))
rowPerc(xtabs(~Air.Day_W, data))
rowPerc(xtabs(~Air.Day_R, data))
rowPerc(xtabs(~Air.Day_F, data))
rowPerc(xtabs(~Air.Day_S, data))
rowPerc(xtabs(~Air.Day_U, data))

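# National Time - airing start time (example value: 9:00 PM)
# NOTE: disabled. A 12-hour clock value such as "9:00 PM" parses with
# "%I:%M %p"; "%H:%M %r" does not match it.
# tmp=as.POSIXct(as.character(data$National.Time), format="%I:%M %p")
# class(data$National.Time)
# rowPerc(xtabs(~data$National.Time,data))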

# daypart (example value: prime) - industry-standard time block
# (see side panel for details).
# Store the column as plain character strings.
data$daypart <- as.character(data$daypart)
# One-way frequency table of daypart: raw counts first ...
xtabs(formula = ~daypart, data = data)
# ... then the same table as row percentages (tigerstats::rowPerc).
rowPerc(xtabs(formula = ~daypart, data = data))

# Run_time (min) (example value: 60) - series run time in minutes.
summary(data$Run_time..min.)
# Reorder the rows from longest to shortest run time. TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned.
data <- data[order(data$Run_time..min., decreasing = TRUE), ]

# Unique HHs (example value: 2,636,448) - number of unique households tuned
# in to a given series within the given time interval.
# The example value shows thousands separators, which as.integer() cannot
# parse: it yields NA for such strings and internal level codes for factors.
# Drop the commas from the text form before converting; input that is
# already numeric passes through unchanged.
data$Unique.HHs <- as.integer(
  gsub(",", "", as.character(data$Unique.HHs), fixed = TRUE)
)
summary(data$Unique.HHs)

# Total Hrs Viewed (example value: 1,534,543) - sum of hours 'logged in' to
# the given program by all viewers in the given time frame.
# The example value shows thousands separators, which as.integer() cannot
# parse: it yields NA for such strings and internal level codes for factors.
# Drop the commas from the text form before converting; input that is
# already numeric passes through unchanged.
data$Total.Hrs.Viewed <- as.integer(
  gsub(",", "", as.character(data$Total.Hrs.Viewed), fixed = TRUE)
)
summary(data$Total.Hrs.Viewed)

# Avg % Viewed (example value: 53.6%) - average % of the program viewed.
# Strip every literal percent sign from the text ...
tmp <- gsub("%", "", data$Avg...Viewed, fixed = TRUE)
# ... then turn the remaining number into a proportion (53.6 -> 0.536).
data$Avg...Viewed <- as.numeric(tmp) / 100
summary(data$Avg...Viewed)

##################################################################
## Check Time Series Distribution
##################################################################

# Parse the Date column (text in YYYY-MM-DD form) into Date values.
train$Date <- as.Date(train$Date, format = "%Y-%m-%d")
# embroidered_top against Date: small blue points plus a red smoother.
# Both layers inherit the x/y mapping declared once in ggplot(); the x axis
# is labelled with abbreviated month and two-digit year.
ggplot(train, aes(x = Date, y = embroidered_top)) +
  geom_point(colour = "blue", size = 1) +
  scale_x_date(labels = date_format("%b %y")) +
  stat_smooth(colour = "red")

No comments:

Post a Comment

Blog Archive