#------------------------------------------------------------------------------------ #----------------Part Ia: Descriptive statistics-------------------------------------- #------------------------------------------------------------------------------------ ##install packages automatically #background info for ggplot2: see http://docs.ggplot2.org/current/ x <- c("dplyr","ggplot2","plotly","lubridate") ipak <- function(pkg){ new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] print(new.pkg) if (length(new.pkg)>0){ install.packages(new.pkg, dependencies = TRUE) sapply(pkg, require, character.only = TRUE) } } ipak(pkg=x) lapply(x, require, character.only = TRUE) Sys.setlocale("LC_TIME","English") #------------------------------------------------------------ #--------------Step 1: data loading and some preparations --- #------------------------------------------------------------ #Download ATM.txt and import via RStudio or use direct link A<-read.table("http://www.trutschnig.net/ATM.txt",head=TRUE) summary(A) head(A) str(A) A$ymd<-ymd(A$ymd) A$year<-year(A$ymd) #------------------------------------------------------------ #--------------Step 2: plot the timeseries ---------------- #------------------------------------------------------------ # a first look at the data p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + theme_bw() p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p #show more capabilities of ggplot2 (no need to understand all of it right now) http://docs.ggplot2.org/current/ #improvement: better grid, holidays in colour, smoother A$monthday<-as.numeric(substr(A$ymd,9,10)) #day of month a<-max(A$sum_out,na.rm=TRUE) B<-subset(A,A$monthday==1) #first days per month A$month<-as.numeric(substr(A$ymd,6,7)) H<-subset(A,A$holiday==1) #holidays V<-subset(A,A$holiday==0.5) p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + stat_smooth(se=FALSE,colour="gray") p <- p + geom_line() p <- p + geom_point() p <- p + geom_point(data=H,colour="red",size=2) p <- p + geom_point(data=V,colour="blue",size=2) p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="withdrawn") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p <- p + scale_x_date(breaks=B$ymd) p p01 <- p #plotly version for better inspection: p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + stat_smooth(se=FALSE,colour="gray") p <- p + geom_line() p <- p + geom_point(aes(text=paste("weekday: ",weekday,sep=""))) p <- p + geom_point(data=H,colour="red",size=2,aes(text=paste("weekday: ",weekday,sep=""))) p <- p + geom_point(data=V,colour="blue",size=2,aes(text=paste("weekday: ",weekday,sep=""))) p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="withdrawn") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p <- p + scale_x_date(breaks=B$ymd) ggplotly(p) #------------------------------------------------------------ #--------------Step 3: yearly histo and ecdf -------- #------------------------------------------------------------ p <- ggplot(data=A,aes(x=sum_out)) p <- p + geom_histogram(fill="gray30",colour="gray") p <- p + facet_wrap(~year,nrow=3) p <- p + theme_bw() p p02 <- p #yearly violin plot (adjust = 0.5) B <- A B$year <- as.factor(B$year) p <- ggplot(data=B,aes(x=year,y=sum_out,fill=year)) p <- p + geom_violin(adjust=0.5) p <- p + stat_summary(fun.y = mean,fun.ymin = mean, fun.ymax = mean) p <- p + scale_y_continuous(limits=c(0,a),name="sum_out") p <- p + theme_bw() p p03 <- p p <- ggplot(data=B,aes(x=sum_out,colour=year)) p <- p + stat_ecdf(geom = "step") p <- p + theme_bw() p <- p + scale_color_manual(values=c("magenta","green","gray30")) p <- p + scale_y_continuous("relative frequency") p <- p + theme(legend.position="none") p p04 <- p #------------------------------------------------------------ #--------------Step 4: Quantiles and boxplots -------- #------------------------------------------------------------ #yearly boxplots p <- ggplot(data=B,aes(x=year,y=sum_out)) p <- p + geom_boxplot(fill="lightblue") p <- p + theme_bw() p #yearly boxplots plotly version p <- ggplot(data=B,aes(x=year,y=sum_out)) p <- p + geom_boxplot(fill="lightblue",outlier.size=0) p <- p + geom_jitter(aes(text=paste("ymd: ",ymd,sep=""))) p <- p + theme_bw() ggplotly(p) #weekday B <- A B$nr_weekday <- as.factor(B$nr_weekday) B$year <- as.factor(B$year) p <- ggplot(data=B,aes(x=nr_weekday,y=sum_out)) p <- p + geom_boxplot(,fill="lightblue") p <- p + theme_bw() p p05 <- p #weekday + year p <- ggplot(data=B,aes(x=nr_weekday,y=sum_out,fill=year)) p <- p + geom_boxplot() p <- p + theme_bw() p <- p + scale_fill_manual(values=c("magenta","green","gray30")) p p06 <- p #weekday reordered p <- ggplot(data=B,aes(x=nr_weekday,y=sum_out,fill=nr_weekday)) p <- p + geom_boxplot() p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + theme_bw() p p07 <- p #weekday heatmap A$yw<-format(A$ymd,"%Y-%W") farbe<-rainbow(100,start=.40,end=.17) p <- ggplot(A, aes(x=yw,y=nr_weekday)) p <- p + geom_tile(aes(fill = sum_out),colour = "white") p <- p + scale_fill_gradientn(colours=farbe) p <- p + theme_bw() p <- p + theme(axis.text.x=element_text(angle=90)) p <- p + labs(title ="Sum_out per day and week") p <- p + facet_wrap(~year,nrow=3,scales="free") p p08 <- p #monthday boxplot: A$year <- as.factor(A$year) p <- ggplot(data=A,aes(x=monthday,y=sum_out,group=monthday,fill=year)) p <- p + geom_boxplot() p <- p + scale_fill_manual(values=c("magenta","green","gray30")) p <- p + theme_bw() p <- p + facet_wrap(~year) p #ggplotly version ggplotly(p)