##install packages ggplot2, doBy, gridExtr library(ggplot2) #background info see http://docs.ggplot2.org/current/ library(doBy) library(gridExtra) Sys.setlocale("LC_TIME","English") #------------------------------------------------------------ #--------------Step I: data loading and first look -------- #------------------------------------------------------------ #Download ATM.txt and import via RStudio or use direct link A<-read.table("http://www.trutschnig.net/Datensatz.txt",head=TRUE) summary(A) head(A) str(A) A$ymd<-as.Date(A$ymd) plot(A$ymd,A$sum_out,type="l") #------------------------------------------------------------ #--------------Step 2: plot the timeseries ---------------- #------------------------------------------------------------ # a look at the first year A$year<-as.numeric(substr(A$ymd,1,4)) A7<-subset(A,A$year==2007) p <- ggplot(data=A7,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p # a look at the first year p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p #show more capabilities of ggplot2 (no need to understand all of it right now) http://docs.ggplot2.org/current/ #improvement: better grid, holidays in colour, smoother A$monthday<-as.numeric(substr(A$ymd,9,10)) #day of month a<-max(A$sum_out,na.rm=TRUE) B<-subset(A,A$monthday==1) #first days per month A$month<-as.numeric(substr(A$ymd,6,7)) H<-subset(A,A$holiday==1) #holidays V<-subset(A,A$holiday==0.5) p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + geom_point(data=H,colour="red",size=3) p <- p + geom_point(data=V,colour="blue",size=3) p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="withdrawn") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p <- p + scale_x_date(breaks=B$ymd) p <- p + stat_smooth(se=FALSE) p #weekdays in different colours p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point(aes(x=ymd,y=sum_out,color=factor(nr_weekday)),size=3) #p <- p + scale_colour_manual(values=c(rep("black",6),"red"),name="Wochentag") #p <- p + scale_colour_manual(values=cols(7),name="Wochentag") p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="Geldmenge") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p <- p + scale_x_date(breaks=B$ymd) p <- p + stat_smooth(se=FALSE) p #------------------------------------------------------------ #--------------Step 3: yearly histo and boxplots -------- #------------------------------------------------------------ #produce a yearly histogram - go to http://docs.ggplot2.org/current/follows and find out how to do it #produce a boxplot per year - go to http://docs.ggplot2.org/current/follows and find out how to do it #find out what geom_jitter does and add jittered points to the boxplots #------------------------------------------------------------ #--------------Step 4: daily and monthly boxplots -------- #------------------------------------------------------------ #produce a boxplot per month #produce a boxplot per nr_weekday #------------------------------------------------------------ #--------------Step 5: include holiday info -------- #------------------------------------------------------------ #weekday reordered - omit outliers and plot holiday info instead V<-subset(A,A$holiday==0.5) p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_boxplot(outlier.size=0) p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + xlab("weekday") p <- p + labs(title = "Boxplot per weekday and year") p <- p + scale_fill_discrete(name = "Weekday") p <- p + geom_point(data=H,colour="red",size=3) p <- p + geom_point(data=V,colour="blue",size=3) p <- p + theme_bw() p #weekday reordered violin plot (adjust = 0.5) p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_violin(adjust=0.5) p <- p + stat_summary(fun.y = median,fun.ymin = median, fun.ymax = median) p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a),name="Weekday") p <- p + theme_bw() p #monthday p <- ggplot(data=A,aes(x=factor(monthday),y=sum_out)) p <- p + geom_boxplot(fill="green") p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + theme_bw() p #Step 5b: #Create a heatmap showing sum_out per weekday (y-coordinate) and week #use A$yw<-format(A$ymd,"%Y-%W") and geom_tile on http://docs.ggplot2.org/current/ #repeat for monthday #use A$monthday<-as.numeric(substr(A$ymd,9,10)) #------------------------------------------------------------ #--------------Step 6: save plots -------- #------------------------------------------------------------ #last two plots in one pdf p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_violin(adjust=0.5) p <- p + theme(legend.position="none") p <- p + stat_summary(fun.y = median,fun.ymin = median, fun.ymax = median) p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a),name="outgoing") p <- p + scale_x_discrete(name="weekday") p <- p + theme_bw() p1 <- p #monthday p <- ggplot(data=A,aes(x=factor(monthday),y=sum_out)) p <- p + geom_boxplot(fill="green") p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + scale_x_discrete(name="monthday") p <- p + theme_bw() p2 <- p library(gridExtra) pdf(file="plots.pdf",width=20,height=12) grid.arrange(p1, p2, nrow=2) dev.off() #------------------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------------------ #second part starts here #------------------------------------------------------------------------------------------------------ #next step: calculate the summary statistics plotted before -> need loops and subsets #generate data and use if loop to calculate if positive or negative #Schritt 1 - loop basics x<-rnorm(1000,0,1) y<-x for(i in 1:1000){ if(x[i]<0){y[i]<-"N"} else {y[i]<-"P"} } E<-data.frame(x=x,y=y) summary(E) #Step 2 #use ifelse instead (quicker version - vectorial brother of classical if, see help) #Step 3 #calculate yearly mean of sum_out #Step 4 #don't calculate the levels of the variable of interest manually - let R do it - use levels A$year<-as.factor(A$year) years<-levels(A$year) n<-length(years) erg<-rep(0,n) for(i in 1:n){ A1<-subset(A,A$year==years[i]) erg[i]<-mean(A1$sum_out,na.rm=TRUE) } erg #Step 5 #same for weekdays + boxplot #Step 6 #what if we want median of sum_out per weekday and year (as in the boxplots) - use the summaryBy function in the doBy package ! B<-subset(A,is.na(A$sum_out)==0) YM<-summaryBy(data=B,sum_out ~ year + weekday, FUN=c(median)) YM<-YM[order(YM$weekday,YM$year),] #Step 7 #mean, median and length per year and weekday #Step 8 #also calculate upper and lower quartile -> use own little function q25<-function(x){ r<-quantile(x,probs=0.25,na.rm=TRUE) return(as.numeric(r)) } #Step 9 #correct order of weekdays #Step 10 #mean, median, etc. of sum_out per weekday, year AND holiday