#----------------------------------------------------------------------
# Baseball examples for R and RStudio demo
#----------------------------------------------------------------------
#
# Walk-through script: loads a table of per-team baseball statistics,
# then demonstrates data access, control structures, a small function,
# base-R statistics/plots, linear regression, and ggplot2 graphics.
# Intended to be stepped through interactively, one statement at a time.

## Useful option (if working in the command line on a wide terminal)
## This will change the number of columns before R wraps the line.
## NOTE: This & some other code below might contain items surrounded
##       by angle brackets (<...>). These should be replaced with actual
##       values before running.
#options(width = <# of columns>)

## Set working directory (uncomment if needed)
#setwd("")

## Read data into R
## (TRUE is spelled out: `T` is an ordinary variable and can be reassigned.)
Data <- read.table("hw1/RunsPerGame.txt", header = TRUE)
str(Data)      # Get info about column names & types
summary(Data)  # Summary stats for each column


### ACCESSING DATA

## Rows and columns can be accessed by number...
print(Data[1, ])
print(Data[, 1])
print(Data[1, 1:6])

## Columns can be accessed by name
print(Data$League)
print(Data$League[1:10])
print(Data[, "League"])
print(Data[, c("Team", "League")])

## Can combine to slice data as you'd like
print(Data[c(1:10, 15:20), c("Team", "League", "AVG")])

## Can select using conditionals
print(Data[Data$League == "American", ])
print(subset(Data, League == "American"))
summary(Data$RPG[Data$League == "National"])


### DATA MANIPULATION

## Order the Data
Data <- Data[order(Data$Team), ]


### COMMON CONTROL STRUCTURES

for (team in Data$Team) {
  print(team)
}

## seq_len() is used instead of 1:nrow(Data): with zero rows, 1:0 would
## iterate over c(1, 0) instead of skipping the loop.
for (i in seq_len(nrow(Data))) {
  if (Data$League[i] == "American") {
    print(paste(Data$Team[i], ":", Data$RPG[i]))
  } else {
    print(paste("SENIOR CIRCUIT", ":", Data$Team[i]))
  }
}

## Create R Function
## Calculate.RunsPerAB: element-wise ratio of column `R` to column `AB`
## (presumably runs and at-bats, going by the function name).
##   x       : data frame (or list) with numeric elements `R` and `AB`
##   returns : numeric vector, x$R / x$AB
Calculate.RunsPerAB <- function(x) {
  # [[ ]] matches names exactly; `$` allows partial matching and could
  # silently pick up another column (e.g. "RPG") if "R" were missing.
  x[["R"]] / x[["AB"]]
}

print(Calculate.RunsPerAB(Data))


## Use attach() to avoid having to explicitly access by column name
##
## WARNING: Read the docs about "read-only" behavior. Also, attach()
##          can clutter your namespace/environment (may create problem if
##          you're dealing with multiple data sets)
attach(Data)
summary(RPG)
summary(RPG[League == "National"])

## Basic statistics
mean(RPG)
median(RPG)
sd(RPG)
var(RPG)
cov(OBP, RPG)
IQR(RPG)  # interquartile range -> useful for outlier detection


#### BASIC PLOTTING

## Basic plotting
boxplot(RPG)

## Multiple plots on the same "page"
par(mfrow = c(1, 2))
boxplot(RPG, main = "Runs per game")
boxplot(OBP, main = "On base percentage")

## More complicated boxplots
par(mfrow = c(1, 1))  # back to only one plot per page
boxplot(RPG ~ League)

## Scatter plot
## TIP: Read the docs for "cex" and "pch" options (change point types)
plot(AVG, RPG, cex = 0.5, pch = 19)

## Scatter plots with multiple variables
## matplot(...) might be useful in other cases
pairs(cbind(AVG, SLG, OBP, RPG), pch = 19, cex = 0.5)

## Basic linear regression
## `data = Data` makes the model independent of the attach() above.
modelH <- lm(RPG ~ H, data = Data)

## Get info about regression model & plot
summary(modelH)
plot(H, RPG, pch = 19, cex = 1.2, xlab = "Hits", ylab = "Runs per Game")
## coef() is the accessor for fitted coefficients; `modelH$coef` only
## worked through partial matching of the `coefficients` component.
abline(coef(modelH)[1], coef(modelH)[2], col = 2, lwd = 2)

## Model evaluation
predict(modelH)                    # Predicted values w/ training data
sqrt(mean(residuals(modelH)^2))    # Training RMSE
summary(modelH)$sigma              # Residual Standard Error - uses degrees of freedom

# Q: How do we calculate RMSE for testing data?
#    Hint: Can't use residuals(), but predict() does have a "newdata" param.

## Multi-parameter (predictor) models
model.H.SO <- lm(RPG ~ H + SO, data = Data)
summary(model.H.SO)

## Getting help on function parameters
?lm
# -- or --
help(lm)

## More principled way using stepwise regression
help.search("stepwise")
# -- or --
?? "stepwise"

## Done with bare column names: undo the attach() so that re-running the
## script does not stack another copy of Data on the search path.
## Everything below passes the data frame explicitly.
detach(Data)


## GGPLOT - Slightly more elegant plotting
## library() stops immediately if the package is missing; require() would
## only return FALSE and let the script fail later with a vaguer error.
library(ggplot2)

## Histogram
ggplot(Data, aes(x = RPG)) +
  geom_histogram(binwidth = 0.25)

## Different ways to view the effect of the League
ggplot(Data, aes(x = RPG, fill = League)) +
  geom_histogram(binwidth = 0.25)

ggplot(Data, aes(x = RPG, fill = League)) +
  geom_histogram(binwidth = 0.25, position = "dodge")

ggplot(Data, aes(x = RPG, fill = League)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(League ~ .)

## Scatter plot
ggplot(Data, aes(x = AVG, y = RPG)) +
  geom_point()

ggplot(Data, aes(x = AVG, y = RPG, colour = League)) +
  theme_bw() +
  scale_colour_brewer(palette = "Set1") +
  geom_point(size = 4)

## Simple box plots - you may need to 'reshape' the data for ggplot use
library(reshape)
## melt() stacks the measured columns into `variable` / `value` pairs
## (the two columns the plots below map to x and y).
M.Data <- melt(Data)

ggplot(M.Data, aes(x = variable, y = value)) +
  theme_bw() +
  geom_boxplot()

ggplot(subset(M.Data, variable %in% c("AVG", "OBP", "SLG")),
       aes(x = variable, y = value)) +
  theme_bw() +
  geom_boxplot()

## Box Plot with multiple variables
ggplot(Data, aes(x = factor(League), y = RPG, colour = League)) +
  theme_bw() +
  scale_colour_brewer(palette = "Set1") +
  geom_boxplot()

ggplot(Data, aes(x = factor(League), y = RPG, colour = League)) +
  theme_bw() +
  scale_colour_brewer(palette = "Set1") +
  geom_boxplot() +
  geom_point()

ggplot(Data, aes(x = factor(League), y = RPG, colour = League)) +
  theme_bw() +
  scale_colour_brewer(palette = "Set1") +
  geom_boxplot() +
  geom_jitter()

## Box plot + kernel density
ggplot(Data, aes(x = factor(League), y = RPG, colour = League)) +
  theme_bw() +
  scale_colour_brewer(palette = "Set1") +
  geom_violin()

## Plot points + model
ggplot(Data, aes(x = H, y = RPG)) +
  theme_bw() +
  geom_point() +
  geom_abline(intercept = coef(modelH)[[1]],
              slope = coef(modelH)[[2]],
              colour = "red") +
  xlab("H") +
  ylab("Runs per Game")