# Server survey cheat sheet
# ---- Load the survey and summarise its numeric columns ----

# Read the raw survey straight from the course website (needs network access).
ss <- read.csv("http://had.co.nz/stat480/data/server-survey.csv")

# Row identifier, used as the id variable when melting below.
# seq_len() gives an empty sequence for a zero-row data frame, whereas
# 1:nrow(ss) would give c(1, 0).
ss$id <- seq_len(nrow(ss))

# Logical mask of the numeric columns. vapply() always returns a logical
# vector, even when the data frame has no columns.
cont <- vapply(ss, is.numeric, logical(1))

# drop = FALSE keeps a data frame even if only one column is numeric.
scont <- ss[, cont, drop = FALSE]

library(reshape)

# Long form: one row per (id, variable) pair, missing values dropped.
# Argument names are spelled out instead of relying on partial matching.
contm <- melt(scont, id.vars = "id", preserve.na = FALSE)

# Summary statistics for every numeric variable.
cast(contm, variable ~ ., c(length, min, max, mean, sd))

# Penalise scientific notation so the summary prints in fixed notation.
# NOTE: this changes a global option for the rest of the session.
options(scipen = 9)

# Same summary without its first row.
# NOTE(review): the first row is presumably the `id` variable - confirm.
cast(contm, variable ~ ., c(length, min, max, mean, sd))[-1, ]

# Open the data in GGobi for interactive exploration.
library(rggobi)
ggobi(ss)
# ---- Clean up `sex` ----

# Frequency of each code, with missing values shown.
table(ss$sex, exclude = NULL)

# Code 2 is recoded to missing (only 0 and 1 receive labels below).
ss$sex[which(ss$sex == 2)] <- NA
table(ss$sex, exclude = NULL)

# Turn the remaining numeric codes into a labelled factor.
ss$sex <- factor(
  ss$sex,
  levels = c(0, 1),
  labels = c("male", "female")
)
# ---- Clean up `state` ----

# Inspect the raw values, then see how much lower-casing alone collapses them.
table(ss$state)
table(tolower(ss$state))
ss$state <- tolower(ss$state)

# One-off fixes for individual entries.
ss$state[ss$state == "(province) ontario, canada"] <- NA
# Codes are kept lower-case so that they match the values of `states`,
# which are used both for the validity filter and as factor levels below.
ss$state[ss$state == "alaska"] <- "ak"

# Lookup table: lower-case full name -> lower-case two-letter code.
states <- c(
  "alabama" = "al", "alaska" = "ak", "american samoa" = "as",
  "arizona" = "az", "arkansas" = "ar", "california" = "ca",
  "colorado" = "co", "connecticut" = "ct", "delaware" = "de",
  "district of columbia" = "dc", "federated states of micronesia" = "fm",
  "florida" = "fl", "georgia" = "ga", "guam" = "gu", "hawaii" = "hi",
  "idaho" = "id", "illinois" = "il", "indiana" = "in", "iowa" = "ia",
  "kansas" = "ks", "kentucky" = "ky", "louisiana" = "la", "maine" = "me",
  "marshall islands" = "mh", "maryland" = "md", "massachusetts" = "ma",
  "michigan" = "mi", "minnesota" = "mn", "mississippi" = "ms",
  "missouri" = "mo", "montana" = "mt", "nebraska" = "ne", "nevada" = "nv",
  "new hampshire" = "nh", "new jersey" = "nj", "new mexico" = "nm",
  "new york" = "ny", "north carolina" = "nc", "north dakota" = "nd",
  "northern mariana islands" = "mp", "ohio" = "oh", "oklahoma" = "ok",
  "oregon" = "or", "palau" = "pw", "pennsylvania" = "pa",
  "puerto rico" = "pr", "rhode island" = "ri", "south carolina" = "sc",
  "south dakota" = "sd", "tennessee" = "tn", "texas" = "tx", "utah" = "ut",
  "vermont" = "vt", "virgin islands" = "vi", "virginia" = "va",
  "washington" = "wa", "west virginia" = "wv", "wisconsin" = "wi",
  "wyoming" = "wy"
)

# Entries written as a full state name (%in% is FALSE for NA, so missing
# values are never selected).
longstates <- ss$state %in% names(states)
ss$state[longstates]
states[ss$state[longstates]]

# Replace the full names with their two-letter codes.
ss$state[longstates] <- states[ss$state[longstates]]
table(ss$state)

# Anything that is still not a known code becomes missing.
ss$state[!ss$state %in% states] <- NA
ss$state <- factor(ss$state, levels = states)
# ---- Clean up `pcttip` ----

table(ss$pcttip)

# Values below 1 are rescaled by 100.
# NOTE(review): presumably these were entered as proportions rather than
# percentages - confirm against the survey.
# The comparison must stay the same length as the column: wrapping it in
# na.omit() would shorten it and misalign the mask through recycling.
props <- !is.na(ss$pcttip) & ss$pcttip < 1
ss$pcttip[props] <- ss$pcttip[props] * 100
table(ss$pcttip)
table(round_any(ss$pcttip, 1), exclude = NULL)

# Look at the rows with a tip percentage above 70 (display only; assigning
# this subset back to `ss` would throw away every other row) ...
ss[!is.na(ss$pcttip) & ss$pcttip > 70, ]
# ... then drop them, keeping rows whose pcttip is missing.
ss <- ss[!(!is.na(ss$pcttip) & ss$pcttip > 70), ]
# ---- Clean up `asian_prop` ----

# Distribution of the raw values, with missing values shown.
table(ss$asian_prop, exclude = NULL)

# Non-missing values below 1 are multiplied by 100.
props <- ss$asian_prop < 1 & !is.na(ss$asian_prop)
ss$asian_prop[props] <- 100 * ss$asian_prop[props]

# Show the rows whose value exceeds 100, then blank those values out.
ss[which(ss$asian_prop > 100), ]
ss$asian_prop[which(ss$asian_prop > 100)] <- NA

# Histograms at two bin widths: 5 units, then 1 unit.
qplot(ss$asian_prop, type = "histogram", breaks = seq(0, 100, by = 5))
qplot(ss$asian_prop, type = "histogram", breaks = seq(0, 100, by = 1))
# ---- Per-state summaries of pcttip and asian_prop ----

# Long form keyed by id and state; rows with a missing value are dropped.
ssstatem <- melt(
  ss,
  id.vars = c("id", "state"),
  measure.vars = c("pcttip", "asian_prop"),
  preserve.na = FALSE
)

# Number of observations per state and variable.
cast(ssstatem, state ~ variable, fun.aggregate = length)

# Count and mean side by side.
cast(ssstatem, state ~ variable, fun.aggregate = c(length, mean))

# Per-state means, kept for plotting.
means <- cast(ssstatem, state ~ variable, fun.aggregate = mean)

# The same relationship at two levels: state means, then the raw rows.
qplot(asian_prop, pcttip, data = means, main = "State means")
qplot(asian_prop, pcttip, data = ss, main = "Individual restaurants")