1. setting things up

1.1 set global chunk options and load packages

# attach the packages used in this document, one per line
# (same load order as before)
library(dplyr)
library(reshape2)
library(ggplot2)
library(knitr)

# swap the default ggplot2 theme for theme_minimal() with a larger base font
theme_set(theme_minimal(base_size = 20))

1.2 read in dataset and inspect

# read in a dataset

# print the first few rows

1.3 clean up dataset as desired

# do some cleaning operations here -- e.g. 
#   - change column names;
#   - recode a column;
#   - reshape the data to long-format with melt(); 
#   - convert factor columns to character or vice versa; 
#   - ...

2. summarize and visualize columns of interest

2.1 display some tables and summaries

# make a table by calling `table()` on a column of interest
# add some stuff to the `table` command above to make a formatted version:
#   - wrap it in `as.data.frame()`, and then wrap that in `kable()`
#   - then give `kable()` a vector of `col.names=` that you want displayed
#   - you can also specify `digits=` in `kable()`, for rounding in the output
# worked example: summarise with `aggregate()`, then format with `kable()`

# mean mpg for each gear/cylinder combination found in mtcars
cars_mpg_tab <- aggregate(mpg ~ gear + cyl, data = mtcars, FUN = mean)

# header labels to display in place of the raw column names (gear, cyl, mpg)
my_colnames <- c(
  "number of gears",
  "number of cylinders",
  "gas mileage (mpg)"
)

# render the table with kable() from the knitr package: first column
# centered, the other two left-aligned, values rounded to one decimal place
kable(
  cars_mpg_tab,
  col.names = my_colnames,
  align = c("c", "l", "l"),
  digits = 1
)
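| number of gears | number of cylinders | gas mileage (mpg) |
|:---------------:|:--------------------|:------------------|
|        3        | 4                   | 21.5              |
|        4        | 4                   | 26.9              |
|        5        | 4                   | 28.2              |
|        3        | 6                   | 19.8              |
|        4        | 6                   | 19.8              |
|        5        | 6                   | 19.7              |
|        3        | 8                   | 15.1              |
|        5        | 8                   | 15.4              |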

2.2 display some plots

# boxplots of gas mileage (mpg) for each number of cylinders in mtcars,
# with an italic title plus a subtitle and a caption
ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_boxplot() +
  labs(
    title = "plot title",
    subtitle = "subtitle",
    caption = "a caption with some info or a statement"
  ) +
  theme(plot.title = element_text(face = "italic"))

# boxplots of gas mileage (mpg) for each number of gears in mtcars,
# with custom axis labels and blue x-axis tick text
ggplot(data = mtcars, mapping = aes(x = factor(gear), y = mpg)) +
  geom_boxplot() +
  labs(
    x = "x axis label",
    y = "y axis label"
  ) +
  theme(axis.text.x = element_text(color = "blue"))

# base-graphics versions of the two ggplot2 boxplots above
# (not displayed because of the chunk option eval=FALSE)

# mpg by number of cylinders
boxplot(mpg ~ factor(cyl), data = mtcars)

# mpg by number of gears
boxplot(mpg ~ factor(gear), data = mtcars)

3. perform some analysis (optional – only if it’d be useful for you!)

3.1 fit a model to the data

# fit a linear regression model with `lm()` 
# the outcome is before the tilde; and 
# the predictors are after the tilde, separated by "+"

# open the help page with `?lm` and scroll to the bottom to see worked examples

3.2 examine the model summary and evaluate relationships

# look at the full model summary with `summary(your_model)`

3.3 evaluate the overall quality of the model fit

# extract the r-squared value from the model summary
# summary(your_model)$r.squared

# or compute it directly as the squared Pearson's r (see the week 1 notes)
# cor(your_data$outcome_var, fitted(your_model))^2

4. summarize the findings