##install packages ggplot2, doBy, gridExtr library(ggplot2) #background info see http://docs.ggplot2.org/current/ library(doBy) library(gridExtra) Sys.setlocale("LC_TIME","English") #------------------------------------------------------------ #--------------Step I: data loading and first look -------- #------------------------------------------------------------ #Download ATM.txt and import via RStudio or use direct link A<-read.table("http://www.trutschnig.net/Datensatz.txt",head=TRUE) summary(A) head(A) str(A) A$ymd<-as.Date(A$ymd) plot(A$ymd,A$sum_out,type="l") #------------------------------------------------------------ #--------------Step 2: plot the timeseries ---------------- #------------------------------------------------------------ # a look at the first year A$year<-as.numeric(substr(A$ymd,1,4)) A7<-subset(A,A$year==2007) p <- ggplot(data=A7,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p # a look at the first year p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p #show more capabilities of ggplot2 (no need to understand all of it right now) http://docs.ggplot2.org/current/ #improvement: better grid, holidays in colour, smoother A$monthday<-as.numeric(substr(A$ymd,9,10)) #day of month a<-max(A$sum_out,na.rm=TRUE) B<-subset(A,A$monthday==1) #first days per month A$month<-as.numeric(substr(A$ymd,6,7)) H<-subset(A,A$holiday==1) #holidays V<-subset(A,A$holiday==0.5) p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point() p <- p + geom_point(data=H,colour="red",size=3) p <- p + geom_point(data=V,colour="blue",size=3) p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="withdrawn") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p <- p + scale_x_date(breaks=B$ymd) p <- p + stat_smooth(se=FALSE) p #weekdays in different colours p <- ggplot(data=A,aes(x=ymd,y=sum_out)) p <- p + geom_line() p <- p + geom_point(aes(x=ymd,y=sum_out,color=factor(nr_weekday)),size=3) #p <- p + scale_colour_manual(values=c(rep("black",6),"red"),name="Wochentag") #p <- p + scale_colour_manual(values=cols(7),name="Wochentag") p <- p + facet_wrap(~year,scales = "free_x",nrow=3) p <- p + scale_y_continuous(limits=c(0,a),name="Geldmenge") p <- p + theme(panel.grid.minor.x = element_blank()) p <- p + theme_bw() p <- p + scale_x_date(breaks=B$ymd) p <- p + stat_smooth(se=FALSE) p #------------------------------------------------------------ #--------------Step 3: yearly histo and boxplots -------- #------------------------------------------------------------ #histo p <- ggplot(data=A,aes(x=sum_out)) p <- p + geom_histogram() p <- p + facet_wrap(~year,nrow=3) p <- p + theme_bw() p #to do: find out how to get a nicer colour for the histogram, see, e.g. http://docs.ggplot2.org/current/ p <- ggplot(data=A,aes(x=sum_out)) p <- p + geom_histogram(fill="gray30",colour="gray") p <- p + facet_wrap(~year,nrow=3) p <- p + theme_bw() p #boxplots p <- ggplot(data=A,aes(x=factor(year),y=sum_out)) p <- p + geom_boxplot() p <- p + theme_bw() p #to do: add all samples per year in the boxplot - use geom_jitter p <- ggplot(data=A,aes(x=factor(year),y=sum_out)) p <- p + geom_boxplot() p <- p + geom_jitter() p <- p + theme_bw() p #------------------------------------------------------------ #--------------Step 4: daily and monthly boxplots -------- #------------------------------------------------------------ #monthly boxplot p <- ggplot(data=A,aes(x=factor(month),y=sum_out)) p <- p + geom_boxplot(fill="green") p <- p + facet_wrap(~year) p <- p + theme_bw() p #reorder p <- ggplot(data=A,aes(x=factor(month),y=sum_out,fill=factor(year))) p <- p + geom_boxplot() p <- p + theme_bw() p #weekday p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(year))) p <- p + geom_boxplot() p <- p + theme_bw() p #weekday reordered p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_boxplot() p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + theme_bw() p #------------------------------------------------------------ #--------------Step 5: include holiday info -------- #------------------------------------------------------------ #weekday reordered - omit outliers and plot holiday info instead V<-subset(A,A$holiday==0.5) p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_boxplot(outlier.size=0) p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + xlab("weekday") p <- p + labs(title = "Boxplot per weekday and year") p <- p + scale_fill_discrete(name = "Weekday") p <- p + geom_point(data=H,colour="red",size=3) p <- p + geom_point(data=V,colour="blue",size=3) p <- p + theme_bw() p #weekday reordered violin plot (adjust = 0.5) p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_violin(adjust=0.5) p <- p + stat_summary(fun.y = median,fun.ymin = median, fun.ymax = median) p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a),name="Weekday") p <- p + theme_bw() p #monthday p <- ggplot(data=A,aes(x=factor(monthday),y=sum_out)) p <- p + geom_boxplot(fill="green") p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + theme_bw() p #heatmap for weekdays A$yw<-format(A$ymd,"%Y-%W") farbe<-rainbow(100,start=.40,end=.17) p <- ggplot(A, aes(yw,nr_weekday)) p <- p + geom_tile(aes(fill = sum_out),colour = "white") p <- p + scale_fill_gradientn(colours=farbe,name="count") #p <- p + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) p <- p + theme_bw() p <- p + theme(axis.text.x=element_text(size=9,angle=30)) p <- p + labs(title = paste("Sum_out per day and week","\n","",sep="")) p <- p + facet_wrap(~year,nrow=3,scales="free") print(p) A$monthday<-as.numeric(substr(A$ymd,9,10)) A$ym<-substr(A$ymd,1,7) p <- ggplot(A, aes(ym,monthday)) p <- p + geom_tile(aes(fill = sum_out),colour = "white") p <- p + scale_fill_gradientn(colours=farbe,name="count") #p <- p + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) p <- p + theme_bw() p <- p + theme(axis.text.x=element_text(size=9,angle=30)) p <- p + labs(title = paste("Sum_out per day and week","\n","",sep="")) print(p) #------------------------------------------------------------ #--------------Step 6: save plots -------- #------------------------------------------------------------ #last two plots in one pdf p <- ggplot(data=A,aes(x=factor(nr_weekday),y=sum_out,fill=factor(nr_weekday))) p <- p + geom_violin(adjust=0.5) p <- p + theme(legend.position="none") p <- p + stat_summary(fun.y = median,fun.ymin = median, fun.ymax = median) p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a),name="outgoing") p <- p + scale_x_discrete(name="weekday") p <- p + theme_bw() p1 <- p #monthday p <- ggplot(data=A,aes(x=factor(monthday),y=sum_out)) p <- p + geom_boxplot(fill="green") p <- p + facet_wrap(~year) p <- p + scale_y_continuous(limits=c(0,a)) p <- p + scale_x_discrete(name="monthday") p <- p + theme_bw() p2 <- p library(gridExtra) pdf(file="plots.pdf",width=20,height=12) grid.arrange(p1, p2, nrow=2) dev.off() #------------------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------------------ #next step: calculate the summary statistics plotted before -> need loops and subsets #generate data and use if loop to calculate if positive or negative #Step 1: calculate day with maximal and minimal withdrawn amount #Step 2: loop basics: #let's assume we add a new column 'HL' to A only having two values: If sum_out>10000 then HL="H", otherwise HL="L" #count how many times HL=="H" A1<-subset(A,is.na(A$sum_out)==FALSE) A1$HL<-"H" for(i in 1:nrow(A1)){ if(A1$sum_out[i]<=10000){A1$HL[i]<-"L"} } #Step 2b - use ifelse #quicker version - vectorial brother of classical if A2<-subset(A,is.na(A$sum_out)==FALSE) A2$HL<-ifelse(A2$sum_out<=10000,"L","H") #Step 3 #calculate yearly mean of sum_out erg<-rep(0,3) for(i in 2007:2009){ A1<-subset(A,A$year==i) erg<-mean(A1$sum_out,na.rm=TRUE) print(erg) } erg #Step 4 #don't calculate the levels of the variable of interest manually - let R do it A$year<-as.factor(A$year) years<-levels(A$year) n<-length(years) erg<-rep(0,n) for(i in 1:n){ A1<-subset(A,A$year==years[i]) erg[i]<-mean(A1$sum_out,na.rm=TRUE) } erg #Step 5 #same for weekdays + boxplot weekdays<-levels(A$weekday) n<-length(weekdays) erg<-rep(0,n) for(i in 1:n){ A1<-subset(A,A$weekday==weekdays[i]) erg[i]<-median(A1$sum_out,na.rm=TRUE) } E<-data.frame(weekday=weekdays,median=erg) E #boxplots p <- ggplot(data=A,aes(x=weekday,y=sum_out)) p <- p + geom_boxplot() p <- p + geom_jitter() p #Step 6 #what if we want median of sum_out per weekday and year (as in the boxplots) - doubly loop ? use doBy package instead library(doBy) B<-subset(A,is.na(A$sum_out)==0) YM<-summaryBy(data=B,sum_out ~ year + weekday, FUN=c(median)) YM<-YM[order(YM$weekday,YM$year),] #Step 7 #mean, median and length per year and weekday YM<-summaryBy(data=B,sum_out ~ year + weekday, FUN=c(mean,median,length)) YM<-YM[order(YM$weekday,YM$year),] #Step 8 #also calculate upper and lower quartile -> use own little function, compare values with boxplots q25<-function(x){ r<-quantile(x,probs=0.25,na.rm=TRUE) return(as.numeric(r)) } q75<-function(x){ r<-quantile(x,probs=0.75,na.rm=TRUE) return(as.numeric(r)) } YM<-summaryBy(data=B,sum_out ~ year + weekday, FUN=c(q25,mean,median,q75,length)) YM<-YM[order(YM$weekday,YM$year),] #Step 9 #correct order of weekdays Woche<-A[1:7,2:3] YM<-merge(YM,Woche) YM<-YM[order(YM$nr_weekday,YM$year),] #Step 10 #mean, median, etc. of sum_out per weekday, year AND holiday YM<-summaryBy(data=B,sum_out ~ year + weekday + holiday, FUN=c(q25,mean,median,q75,length)) #Step 11: add related data - example add German weekdays Bridge<-data.frame(weekday=A$weekday[1:7],weekday_german=c("Mo","Di","Mi","Do","Fr","Sa","So")) A2<-merge(A,Bridge)