# Analysis of (anonymized and cleaned) mobile network measurements from
# https://www.netztest.at/de/
#
# Part (I) compares the operators (op_name), part (II) compares devices.
# The script is organised as numbered exercise steps; intermediate results
# (A0, A1, B, ...) are left in the workspace for interactive inspection.

library(doBy)
library(ggplot2)
# library(scales)   # not attached; scales helpers are called via scales::
library(gridExtra)
# library(rgdal)

#------------------------------------------
# NOTE(review): locale names are platform-specific; confirm that "English"
# is a valid LC_TIME name on the system this runs on.
Sys.setlocale("LC_TIME", "English")

#-----------------------------------------------------------------------------
# Background info
# we analyze (anonymized and cleaned) mobile data from
# https://www.netztest.at/de/
#-----------------------------------------------------------------------------

# Step 0: load data and geo_data of Austria
# Each connection gets its own name and is closed right after loading, so no
# connection is left open (and base::dir is no longer shadowed).
# rtr_con <- url("http://www.trutschnig.net/RTR_data.RData")
rtr_con <- url("http://www.trutschnig.net/RTR.RData")
load(rtr_con)
close(rtr_con)

at_con <- url("http://www.trutschnig.net/AT.RData")
load(at_con)
close(at_con)

ls()
names(AT) <- c("long", "lat")
A <- RTR
summary(RTR)

# Fill/line colours used for the operators in all per-operator plots.
op_colours <- c("green", "magenta", "gray40")

# Step 1: plot the measurement coords on top of the outline stored in AT
p <- ggplot(data = AT, aes(x = long, y = lat)) +
  geom_path() +
  geom_point(data = A) +
  theme_bw()
print(p)

# Step 1b: Produce the same plot for each op_name
p <- p + facet_wrap(~op_name, nrow = 2)
print(p)

# Step 2: produce a 2-dim histogram of the data
xbin <- ybin <- 0.02
farbe <- rainbow(100, start = .40, end = .17)
hist_title <- paste0(
  "Histogram: Data with gps coords, time window: ",
  min(A$mymd), " till ", max(A$mymd), "\n", ""
)

p <- ggplot(data = AT, aes(x = long, y = lat)) +
  geom_path() +
  stat_binhex(data = A, binwidth = c(xbin, ybin)) +
  theme_bw() +
  labs(title = hist_title) +
  scale_fill_gradientn(colours = farbe, name = "count", trans = "log10")
print(p)

# Step 2b: Same plot for each of the three op_names; use facet_wrap
p <- p + facet_wrap(~op_name, nrow = 2)
print(p)

#-----------------------------------------------------------------------------
#----------- (I) analyse performance of the three operators ------------------
#-----------------------------------------------------------------------------

# Step 3a: which was the measurement with highest download speed
#          (variable rtr_speed_dl)?
#          when was this measurement done?
#          in which network was the measurement done?
#          which device was used for this measurement?
#          in which city was the measurement done?
# na.rm = TRUE keeps the comparison meaningful if some speeds are missing.
A0 <- subset(A, rtr_speed_dl == max(rtr_speed_dl, na.rm = TRUE))

# Step 3b: Which was the iso_adm2 (= district) in which most measurements
#          were done?
A1 <- summaryBy(id ~ iso_adm2, data = A, FUN = c(length))
A1 <- A1[order(-A1$id.length), ]

# Step 3c: Which was the day with the most measurements?
A1 <- summaryBy(id ~ mymd, data = A, FUN = c(length))
A1 <- A1[order(-A1$id.length), ]

# Step 3d: calculate number of samples per op_name and month (mym) using doBy
#          and produce a (dodged) barplot
AA <- summaryBy(id ~ op_name + mym, data = A, FUN = c(length))
names(AA)[3] <- "count"

p <- ggplot(data = AA, aes(x = mym, y = count, fill = op_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = paste0("Samples per op", "\n")) +
  theme_bw() +
  scale_fill_manual(values = op_colours)
# p <- p + theme(axis.text.x = element_text(size = 8, angle = 90))
# p <- p + theme(axis.text.y = element_text(size = 8))
print(p)

# Step 4: Calculate the sample size per operator and mymdh and produce a
#         heatmap with mymd as x-coordinate, hour as y-coordinate and colour
#         according to sample size per hour. Distinguish the 3 operators
#         using facet_wrap.
# mymdh = first 13 characters of mtime; characters 1-10 are parsed as the
# date and characters 12-13 are taken as the hour below.
A$mymdh <- substr(A$mtime, 1, 13)
AA <- summaryBy(id ~ mymdh + op_name, data = A, FUN = c(length))
names(AA)[3] <- "count"
AA$hour <- substr(AA$mymdh, 12, 13)
AA$mymd <- as.Date(substr(AA$mymdh, 1, 10))

farbe <- rainbow(100, start = .40, end = .17)
p <- ggplot(AA, aes(mymd, hour)) +
  geom_tile(aes(fill = count), colour = "white") +
  scale_fill_gradientn(colours = farbe) +
  theme_bw() +
  labs(title = paste0("Cleaned Data: Sample sizes per hour and op", "\n", "")) +
  # date_breaks() lives in scales, which is not attached in this script.
  scale_x_date(breaks = scales::date_breaks("months")) +
  facet_wrap(~op_name, nrow = 3)
print(p)

# Step 5: produce boxplots of rtr_speed_dl per month, colour according to
#         op_name - how is the speed development over the last months?
p <- ggplot(data = A, aes(x = op_name, y = rtr_speed_dl, fill = op_name)) +
  geom_boxplot() +
  scale_fill_manual(values = op_colours) +
  theme_bw() +
  facet_wrap(~mym, nrow = 1) +
  labs(title = paste0("boxplot downlink speed (2G+3G+4G)", "\n", ""))
print(p)

# Step 5b: produce boxplots of rtr_speed_dl per month, colour according to
#          op_name, use logarithmic y-scale
p <- ggplot(data = A, aes(x = op_name, y = rtr_speed_dl, fill = op_name)) +
  geom_boxplot() +
  scale_fill_manual(values = op_colours) +
  theme_bw() +
  facet_wrap(~mym, nrow = 1) +
  scale_y_log10() +
  labs(title = paste0("boxplot downlink speed (2G+3G+4G)", "\n", ""))
print(p)

# Step 6: ecdf per month, choose colour according to op_name
p <- ggplot(data = A, aes(x = rtr_speed_dl, color = op_name)) +
  stat_ecdf() +
  # The legend shows operators, so it is titled accordingly.
  scale_colour_manual(values = op_colours, name = "op_name") +
  theme_bw() +
  # percent lives in scales, which is not attached in this script.
  scale_y_continuous(labels = scales::percent) +
  labs(title = paste0("cdf downlink speed (2G+3G+4G)", "\n", "")) +
  facet_wrap(~mym, nrow = 1) +
  theme(axis.text = element_text(size = 7)) +
  theme(axis.text.x = element_text(size = 7, angle = 30))
print(p)

# Step 7: repeat Step 5 and 6 for uplink
# (only the boxplot of Step 5 is repeated here)
p <- ggplot(data = A, aes(x = op_name, y = rtr_speed_ul, fill = op_name)) +
  geom_boxplot() +
  scale_fill_manual(values = op_colours) +
  theme_bw() +
  facet_wrap(~mym, nrow = 1) +
  labs(title = paste0("boxplot uplink speed (2G+3G+4G)", "\n", ""))
print(p)

# Step 8a: Which is the iso_adm2 with the highest median downlink speed?
# The formula uses the bare column name (resolved inside `data`), matching
# the other summaryBy calls in this script.
B <- summaryBy(rtr_speed_dl ~ iso_adm2, data = A, FUN = c(median))
names(B)[2] <- "median"
B <- B[order(-B$median), ]

# Step 8b: Which is the iso_adm2 with the highest median downlink speed in
#          January 2015?
B0 <- subset(A, mym == "2015-01")
B <- summaryBy(rtr_speed_dl ~ iso_adm2, data = B0, FUN = c(median))
names(B)[2] <- "median"
B <- B[order(-B$median), ]

# Step 8c: Calculate the median download speed for each operator in the month
#          2014-01 and in the month 2015-01.
#          How many percent did the speed of each operator increase?
G <- subset(A, mym %in% c("2014-01", "2015-01"))
GG <- summaryBy(rtr_speed_dl ~ mym + op_name, data = G, FUN = c(median))

#-----------------------------------------------------------------------------
#----------- (II) analyse device performance since 2014-09-01 ----------------
#-----------------------------------------------------------------------------
# Only 3G/4G measurements from 2014-09-01 onwards are kept.
B <- subset(A, nw_cat %in% c("3G", "4G") & mymd >= as.Date("2014-09-01"))
# Rest of code only uses B

# Step 9a: Which device was used for the measurement with the highest
#          downlink speed?
A0 <- subset(B, rtr_speed_dl == max(rtr_speed_dl, na.rm = TRUE))

# Step 9b: Which device has the highest number of measurements with
#          nw_cat 4G?
A4 <- subset(B, nw_cat == "4G")
AA4 <- summaryBy(id ~ device, data = A4, FUN = c(length))
AA4 <- AA4[order(-AA4$id.length), ]

# Step 9c: Which platform (Android or iOS) had more measurements in 4G?
A4 <- subset(B, nw_cat == "4G")
AA4 <- summaryBy(id ~ device_platform, data = A4, FUN = c(length))
print(AA4)

# Step 10: Calculate number of trials per device and filter to the 40 most
#          used devices
GG <- summaryBy(id ~ device + device_platform, data = B, FUN = c(length))
names(GG)[3] <- "count"
GG <- GG[order(-GG$count), ]
GG$nr <- seq_len(nrow(GG))
# head() returns fewer rows instead of NA-filled rows when there are fewer
# than 40 device/platform combinations.
G1 <- head(GG, 40)
freqdev <- as.character(G1$device)
FD <- data.frame(device = freqdev, nr = seq_along(freqdev))

# The rank is mapped as a factor so that the discrete x scale (one tick per
# bar, labelled with the device name) applies to discrete data.
p <- ggplot(
  data = G1,
  aes(x = factor(nr), y = count, fill = device_platform)
) +
  geom_bar(stat = "identity") +
  labs(title = paste0("Samples per top 40 devices", "\n")) +
  theme_bw() +
  scale_fill_manual(values = c("green", "magenta")) +
  scale_x_discrete(
    name = "device",
    breaks = as.character(G1$nr),
    labels = as.character(G1$device)
  ) +
  theme(axis.text.x = element_text(size = 8, angle = 90)) +
  theme(axis.text.y = element_text(size = 8))
print(p)

# Step 11: Calculate maximum and 95%-quantile of downlink speed for each of
#          the 40 devices; distinguish nw_cat 2G/3G/4G, and prepare a plot
#          with the results.
# B was restricted to 3G/4G above, so only those categories can appear here.
B1 <- subset(B, device %in% freqdev)

# 95%-quantile of x as a plain (unnamed) number; summaryBy is called with
# this function and max, and the resulting columns are referenced below as
# rtr_speed_dl.q95 and rtr_speed_dl.max.
q95 <- function(x) {
  as.numeric(quantile(x, probs = 0.95))
}

MS <- summaryBy(
  rtr_speed_dl ~ device + nw_cat + device_platform,
  data = B1,
  FUN = c(q95, max)
)
MS <- MS[order(-MS$rtr_speed_dl.max), ]

# Only the 95%-quantile is drawn; the maximum is available in MS.
p <- ggplot(
  data = MS,
  aes(x = device, y = rtr_speed_dl.q95, fill = device_platform)
) +
  geom_bar(stat = "identity") +
  labs(title = paste0("Max and 95%-quantile per top 40 device", "\n")) +
  theme_bw() +
  scale_fill_manual(values = c("green", "magenta")) +
  theme(axis.text.x = element_text(size = 8, angle = 90)) +
  theme(axis.text.y = element_text(size = 8)) +
  facet_wrap(~nw_cat, nrow = 1)
print(p)