Thursday, March 5, 2015

OLS Model in R

Content Weekly Trend
========================================================

### Read-in content raw data - weekly aggregated data

```{r echo=FALSE, results='hide',message=FALSE}
memory.limit()
memory.size(max = TRUE)
rm(list=ls(all=T))

require(data.table)
require(ggplot2)
require(plyr)

data <- read.table("content_data.csv", head=F, sep=',');
setnames(data, c("pv","mon","wk", "week"));
head(data)
```

```{r echo=FALSE, results='hide',message=FALSE, fig.width=20, fig.height=8}
ggplot(data, aes(x=wk, y=pv)) +
  geom_bar(stat = "identity", fill = "blue")+
  geom_abline(intercept = mean(data$pv), color = 'red', size = 1, lty = 3) +
  ggtitle("Weekly Pageview Counts") +
  ylab("Page Views") + xlab("Week") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16, angle=-90)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "none")

```

Drop first 11 month pageviews.
```{r}
data <- data[14:77,]
head(data)
```

### OLS

#### Stata code
```
import excel "mytest.xlsx",
sheet("Sheet1") firstrow tsset mycount
generate mylog = log(wvolume)
generate cc = mycount*mycount
regress mylog  mycount i.mymonth
```

### Regress on continuous month
```{r}
data$mylog = log(data$pv)
data$mycount = (1:length(data$pv))
data$cc = data$mycount^2
ols1 <- lm(data$mylog ~ data$mycount + data$mon, data = data)
summary(ols1)
exp(ols1$coefficients[3])
AIC(ols1)
BIC(ols1)
```

### Regress on discrete month
```{r}
data$mon = as.factor(data$mon)
ols2 <- lm(data$mylog ~ data$mycount + data$cc + data$mon, data = data)
summary(ols2)
AIC(ols2)
BIC(ols2)
par(mfrow=c(2, 2))
plot(ols2)

anova(ols1, ols2)

```

### trending factor
```{r}
exp(ols2$coefficients[2]+ols2$coefficients[3])
```

### monthly adjustment factor
```{r}
exp(ols2$coefficients[4:14])
```

### Aggregated weekly data to generate monthly data
```{r}
data2 <- data.table(data)
data3<-data2[,list(pv=sum(pv),mon=min(as.integer(mon))), by=week][order(week)]
```


```{r echo=FALSE, results='hide',message=FALSE, fig.width=20, fig.height=8}
ggplot(data3, aes(x=week, y=pv)) +
  geom_bar(stat = "identity", fill = "darkblue")+
  geom_abline(intercept = mean(data3$pv), color = 'red', size = 1, lty = 3) +
  ggtitle("Monthly Pageview Counts") +
  ylab("PageViews") + xlab("Month") +
  theme(plot.title = element_text(face = "bold", size = 20)) +
  theme(axis.text.x = element_text(face = "bold", size = 16, angle=-90)) +
  theme(axis.text.y = element_text(face = "bold", size = 16)) +
  theme(axis.title.x = element_text(face = "bold", size = 16)) +
  theme(axis.title.y = element_text(face = "bold", size = 16, angle = 90)) +
  theme(legend.position = "top") +
  theme(legend.key = element_rect(colour = NA)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "none")
```

```{r}
data3$mylog = log(data3$pv)
data3$mycount = (1:length(data3$pv))
data3$cc = data3$mycount^2
data3$mon = as.factor(data3$mon)

ols3 <- lm(data3$mylog ~ data3$mycount + data3$cc + data3$mon, data = data3)
summary(ols3)
AIC(ols3)
BIC(ols3)
par(mfrow=c(2, 2))
plot(ols3)
```

### trending factor
```{r}
exp(ols3$coefficients[2]+ols3$coefficients[3])
```

### monthly adjustment factor
```{r}
exp(ols3$coefficients[4:14])
```