19 Exercise solutions

19.1 Getting started

19.1.1 Install the tools

No solution for this one.

19.2 The toolbox

19.2.1 Customize RStudio

No solution for this one.

19.2.2 Résumé

No solution for this one.

19.3 Basic R

19.3.1 Math in the console

31 + 11
66 - 24
126 / 3
12^2 
256**0.5
(3 * (4 + 8^0.5))/(5^3)
## [1] 42
## [1] 42
## [1] 42
## [1] 144
## [1] 16
## [1] 0.164

19.3.2 Functions (I)

A
Answer: paste() and paste0(). The difference lies in the separator, which is an empty string in paste0() and one space in paste(). Moreover, the separator can be configured in paste() using the sep = parameter.

paste("welcome ", "to ", "R", sep = "")
paste0("welcome ", "to ", "R")

B
Answer: abs() returns the absolute value. Simply put, a number with the minus sign removed if present.

abs(-20)
## [1] 20
abs(20)
## [1] 20

C
Answer: it combines (concatenates) its arguments into a single vector. The first example creates a “character” (text data) and the second a “numeric” (numeric data).

c(1, 2, "a")
## [1] "1" "2" "a"
class(c(1, 2, "a"))
## [1] "character"
c(1, 2, 3)
## [1] 1 2 3
class(c(1, 2, 3))
## [1] "numeric"

D

#install it. Note the quotes
install.packages("RColorBrewer")
#load it into your session. Note the absence of quotes
library(RColorBrewer)

E

ceiling(numberz)
trunc(numberz) # or signif(numberz, 3)
round(numberz, 3)
floor(numberz)

F

strsplit(names, " ")

G

unique(birds)

19.3.3 Variables

x <- 20
y <- 10
z <- 3

x + y
x^z
#OR
#x**z
q <- x * y * z
sqrt(q)
q/pi
log10(x * y)

19.3.4 Vectors

Circles

The circumference of a circle is \(2\pi\cdot r\), its surface \(4\pi \cdot r^2\) and its volume \(4/3 \pi\cdot r^3\). Given this vector of circle radiuses,

radiuses <- c(0, 1, 2, pi, 4)

A
Calculate their cirumference.

2 * pi * radiuses

B
Calculate their surface.

4 * pi * radiuses^2

C
Calculate their volume.

4/3 * pi * radiuses^3

Creating vectors

Create the following vectors, as efficiently as possible. The functions rep(), seq() and paste0() and the colon operator : can be used, in any combination.

A
[1] 1 2 5 1 2 5

rep(c(1, 2, 5), times = 2)

B
[1] 9 9 9 8 8 8 7 7 7 6 6 6 5 5 5

rep(9:5, each = 3)

C
[1] 1 1 1 4 4 4 9 9 9 1 1 1 4 4 4 9 9 9

rep(c(1, 4, 9), times = 2, each = 3)

D
[1] "1a" "2b" "3c" "4d" "5e" "1a" "2b" "3c" "4d" "5e"

rep(paste0(1:5, letters[1:5]), times = 2)

E
[1] "0z" "0.2y" "0.4x" "0.6w" "0.8v" "1u"

paste0(seq(from = 0, to = 1, length.out = 6), letters[26:21])

F
[1] "505" "404" "303" "202" "101" "000"

paste0(5:0, 0, 5:0)

G [Challenge]
[1] "0.5A5.0" "0.4B4.0" "0.3C3.0" "0.2D2.0" "0.1E1.0"

paste0(seq(from = 0.5, to = 0.1, by = -0.1),  LETTERS[1:5], 5:1, ".0")

19.4 Complex datatypes

19.4.1 Creating factors

A

animal_risk <- c(2, 4, 1, 1, 2, 4, 1, 4, 1, 1, 2, 1)
animal_risk_factor <- factor(x = animal_risk,
                             levels = c(1, 2, 3, 4),
                             labels = c("harmless", "risky", "dangerous", "deadly"),
                             ordered = TRUE)

B

set.seed(1234)
wealth_male <- sample(x = letters[1:4], 
                 size = 1000,
                 replace= TRUE, 
                 prob = c(0.7, 0.17, 0.12, 0.01))
wealth_female <- sample(x = letters[1:4], 
                 size = 1000,
                 replace= TRUE, 
                 prob = c(0.8, 0.15, 0.497, 0.003))

wealth_labels <- c("poor", "middle class", "wealthy", "rich")

wealth_male_f <- factor(x = wealth_male,
                        levels = letters[1:4],
                        labels = wealth_labels,
                        ordered = TRUE)

wealth_female_f <- factor(x = wealth_female,
                        levels = letters[1:4],
                        labels = wealth_labels,
                        ordered = TRUE)

#combine
wealth_all_f <- c(wealth_male_f, wealth_female_f)

prop.table(table(wealth_all_f)) * 100
## wealth_all_f
##         poor middle class      wealthy         rich 
##        63.65        12.45        23.35         0.55

19.4.2 List actions

A

house_admin[1]

B

house_admin[2]
#OR
house_admin$Rose

C

house_admin[1:2]

D

house_admin$Mike$cooking
#OR
house_admin$Mike[[1]]
#OR
house_admin[[3]][["cooking"]]
#MORE POSSIBILITIES

E

house_admin$John$tasks[2]
#MANY MORE POSSIBILITIES

19.4.3 Named vectors

A

codons <- c("G", "P", "K", "S")
names(codons) <- c("GGA", "CCU", "AAA", "AGU")

my_DNA <- "GGACCUAAAAGU"
my_prot <- ""
for (i in seq(from = 1, to = nchar(my_DNA), by = 3)) {
        codon <- substr(my_DNA, i, i+2)
        my_prot <- paste0(my_prot, codons[codon])
}
print(my_prot)
## [1] "GPKS"

B

nuc_weights <- c(491.2, 467.2, 507.2, 482.2)
names(nuc_weights) <- c('A', 'C', 'G', 'U')

mol_weight <- 0
for (i in 1:nchar(my_DNA)) {
        nuc <- substr(my_DNA, i, i);
        print(nuc)
        mol_weight <- mol_weight + nuc_weights[nuc]
}
mol_weight

19.4.4 Lowry

bsa_conc <- c(0,    0,  0.025,  0.025,  0.075,  0.075,  0.125,  0.125)
OD750 <- c(0.063,   0.033,  0.16,   0.181,  0.346,  0.352,  0.491,  0.488)

A

dilution <- data.frame(bsa_conc, OD750)

B

names(dilution) <- c("prot_conc", "absorption")

C

bsa_conc2 <- c(0.175,   0.175,  0.25,   0.25)
OD750_2 <- c(0.597, 0.595,  0.743,  0.742)
df_temp <- data.frame("prot_conc" = bsa_conc2,
                      "absorption" = OD750_2)

D

dilution <- rbind(dilution, df_temp)

E

even <- dilution[c(T, F), ]
odd <- dilution[c(F, T), ]
dilution_duplo <- cbind(odd, even)
dilution_duplo

F

dilution_duplo[, 3] <- NULL
names(dilution_duplo) <- c("prot_conc", "abs1", "abs2")
dilution_duplo

G

{rlowry-duplo-mean-a, eval = FALSE} dilution_duplo$mean <- (dilution_duplo$abs1 + dilution_duplo$abs2) / 2 dilution_duplo

19.4.5 Island surfaces

A

islands_df <- data.frame(Island = names(islands),
                        Area = islands)
rownames(islands_df) <- NULL
islands_df

B

islands_df[order(islands_df$Area, decreasing = TRUE)[1:3], ]
#OR
islands_df[order(islands_df$Area, decreasing = TRUE), ][1:3,]

19.4.6 USArrests

The USArrests dataset is also one of the datasets included in the datasets package. It has info on the fifty states for these variables:

Murder arrests (per 100,000)
Assault arrests (per 100,000)
UrbanPop Percent urban population
Rape arrests (per 100,000)

We’ll explore this dataset for a few questions.

A

USArrests[rownames(USArrests) == "Montana", ]

B

USArrests[USArrests$Rape == max(USArrests$Rape), ]

C

USArrests[USArrests$Assault < USArrests$Murder * 10, ]

19.4.7 File reading practice

File 01

my_dir <- "data/file_reading"
my_file <- "file01.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    comment.char = "#",
    header = T,
    sep = ",",
    dec = ".",
    na.strings = "ND",
    as.is = c(1, 3))

File 02

my_file <- "file02.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    comment.char = "$",
    header = T,
    sep = "\t",
    dec = ",",
    na.strings = "?",
    as.is = c(1, 3)
)

File 03

my_file <- "file03.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    header = T,
    sep = ";",
    dec = ".",
    na.strings = "ND",
    as.is = c(1, 3)
)

File 04

my_file <- "file04.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    header = T,
    sep = "\t",
    dec = ",",
    na.strings = "no data",
    as.is = c(1, 3)
)

File 05

my_file <- "file05.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    comment.char = "#",
    header = T,
    sep = ";",
    dec = ".",
    na.strings = "ND",
    as.is = c(1, 3)
)

File 06

my_file <- "file06.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    comment.char = "#",
    header = T,
    sep = ",",
    dec = ".",
    na.strings = "-",
    as.is = c(1, 3))
my_data

File 07

my_file <- "file07.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    comment.char = "#",
    header = F,
    sep = ";",
    dec = ".",
    na.strings = "-",
    as.is = c(1, 3))

File 08

my_file <- "file08.txt"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(
    my_path,
    comment.char = "@",
    header = T,
    sep = ";",
    dec = ".",
    na.strings = "ND",
    as.is = c(1, 3))

(More files, but solutions omitted.)

19.5 Basics of the ggplot2 package

19.5.1 Trees

With the trees dataset from the datasets package, create a scatter plot according to these constraints.

  • x-axis: Height
  • y-axis: Girth
  • size of plot symbol: Volume
  • color of plot symbol: darkgreen
  • a smoother without error margins

NB: Where you specify aesthetics matters here!

As extra practice you could convert the values to the metric system first.

ggplot(data = trees,
       mapping = aes(x = Height, y = Girth)) +
  geom_point(aes(size = Volume), color = "darkgreen") +
  geom_smooth(method = "loess", se = FALSE, formula = y~x) +
  labs(x = "Height (ft)", y = "Girth (inches)")

19.5.2 Insect Sprays

The datasets package that is shipped with R has a dataset called ?. Type ?InsectSprays to get information on it.

A [❋]

ggplot(data = InsectSprays, 
       mapping = aes(x = spray, y = count)) +
    geom_boxplot() +
    ylab("Insect count") +
    xlab("Insect spray")
The counts of insects in agricultural experimental units treated with different insecticides

Figure 19.1: The counts of insects in agricultural experimental units treated with different insecticides

B [❋❋]

ggplot(data = InsectSprays, 
       mapping = aes(x = spray, y = count, color = spray)) +
    geom_jitter(height = 0, width = 0.1, shape = 18, size = 2, alpha = 0.7) +
    ylab("Insect count") +
    xlab("Insect spray")
The counts of insects in agricultural experimental units treated with different insecticides

Figure 19.2: The counts of insects in agricultural experimental units treated with different insecticides

19.5.3 Pharmacokinetics

The Theoph dataset contains pharmacokinetics of theophylline, the anti-asthmatic drug theophylline. Twelve subjects were given oral doses of theophylline, then serum concentrations were measured at 11 time points over the next 25 hours.

A

ggplot(data = Theoph,
       mapping = aes(x = Time, y = conc, color = Dose, linetype = Subject)) +
  geom_line() +
  labs(x = "Time (h)", y = "Concentration (mg/l)")

B

ggplot(data = Theoph,
       mapping = aes(x = Time, y = conc)) +
  geom_line(aes(group = Subject, color = Dose)) +
  scale_color_gradient(low = "green", high = "red") +
  labs(x = "Time (h)", y = "Concentration (mg/l)") +
  theme_minimal()

(Many more solutions possible)

19.5.4 USArrests (II)

A

ggplot(data = USArrests,
       mapping = aes(x = Assault,
                     y = Murder)) +
  geom_point(color = "darkgreen") +
  geom_smooth(method = "lm", formula = y~x, se = FALSE, color = "darkred") +
  labs(x = "Assault arrests (per 100,000)", y = "Murder arrests (per 100,000)")

B

scale <- 1
ggplot(data = USArrests,
       mapping = aes(x = Assault)) +
  geom_histogram(aes(y=..density..), fill = "lightblue", color = "black", bins = 30) +
  geom_density(color = "darkblue") +
  #scale_y_continuous(sec.axis = sec_axis(~ . * scale, name="Assault prob")) +
  labs(x = "Assault arrests (per 100,000)")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

19.5.5 Orchard Sprays

A

ggplot(data = OrchardSprays,
       mapping = aes(x = treatment, y = decrease)) +
  geom_boxplot() +
  geom_jitter(color = "darkred", width = 0.2, alpha = 0.7)

B

ggplot(data = OrchardSprays,
       mapping = aes(x = treatment, y = decrease)) +
  geom_violin(aes(fill=treatment))

19.5.6 Diauxic growth

A [❋❋]

remote <- "https://raw.githubusercontent.com/MichielNoback/datasets/master/diauxic_growth/monod_diauxic_growth.csv"
#local <- "../diauxic.csv"
#download.file(url = remote, destfile = local)
diauxic <- read.table(remote, sep = ";", header = T)
diauxic <- pivot_longer(data = diauxic,
                        cols = -1,
                        names_to = "Substrate",
                        values_to = "OD")

B [❋]

diauxic$Substrate <- factor(diauxic$Substrate, 
       levels = c("GlucMann", "GlucXyl", "GlucArab", "GlucRham"), 
       labels = c("Glucose Mannose", "Glucose Xylose", "Glucose Arabinose", "Glucose Rhamnose"))

C [❋❋]

Create a line plot with all four growth curves within a single graph.

ggplot(data = diauxic,
       mapping = aes(x = Time, y = OD, color = Substrate)) +
    geom_point() +
    stat_smooth(method = "loess", se = FALSE, span = 0.3) +
    theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
Monod's Diauxic shift experiment.

Figure 19.3: Monod’s Diauxic shift experiment.

D [❋❋❋]

ggplot(data = diauxic,
       mapping = aes(x = Time, y = OD)) +
    geom_point() +
    stat_smooth(method = "loess", se = FALSE, span = 0.3) +
    facet_wrap(. ~ Substrate, nrow = 2) +
    theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
Monod's Diauxic shift experiment.

Figure 19.4: Monod’s Diauxic shift experiment.

19.5.7 Virginia Death Rates

library(dplyr)
## %>% is used to pipe results from one operation to the other, just like '|' in Linux.
virginia_death_rates <- as_tibble(VADeaths)
virginia_death_rates <- virginia_death_rates %>% 
    mutate("Age Group" = factor(rownames(virginia_death_rates), ordered = TRUE)) %>% 
    select(`Age Group`, everything()) #reorder the columns

A (❋❋❋)

Pivot this table to long (tidy) format. This should generate a dataframe with four columns: Age Group, Habitat, Gender and DeathRate.

virginia_death_rates <- virginia_death_rates %>% pivot_longer(cols = -1, 
                 names_to = c("Habitat", "Gender"), 
                 names_sep = " ", 
                 values_to = "DeathRate")

B (❋❋)

ggplot(data = virginia_death_rates, aes(Gender)) +
    geom_bar(aes(weight = DeathRate, fill = Habitat), position = "dodge")
Virginia Death rates

Figure 19.5: Virginia Death rates

19.5.8 Global temperature

Load the data.

remote_file <- "https://raw.githubusercontent.com/MichielNoback/datasets/master/global_temperature/annual.csv"
global_temp <- read.table(remote_file,
                          header = TRUE,
                          sep = ",")

19.5.8.1 Create a scatter-and-line-plot [❋❋]

ggplot(data = global_temp,
       mapping = aes(x = Year, y = Mean, color = Source)) +
    geom_point(size = 0.5) +
    geom_line() +
    geom_smooth(se = FALSE, method = "loess", formula = 'y ~ x') +
    theme_bw()
Global temperature anomalies

Figure 19.6: Global temperature anomalies

19.5.8.2 Re-create the heatmap [❋❋❋]

ggplot(data = global_temp[global_temp$Source == "GCAG", ],
       mapping = aes(x = Year, y = 1)) +
    geom_tile(aes(fill = Mean), colour = "white") + 
    scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
    theme_bw() +
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())
Global temperature anomalies

Figure 19.7: Global temperature anomalies

Note: rescaling the temperature from 0 to 1 may yield even better results.

19.5.8.3 Extra practice

Level: [❋❋]

No solutions for this one.

19.5.9 Investigate new visualization

**Level: ❋❋

This assignment has no solution of course. It is included here solely to keep the numbering consistent for both assignment chapters.

19.6 Functions (II)

19.6.1 cut()

USArrests$UrbanPop_f <- cut(USArrests$UrbanPop, 
                  breaks = c(0, 60, 80, 100),
                  labels = c("low", "mid", "high"),
                  right = FALSE)

table(USArrests$UrbanPop_f)

19.6.2 quantile()

A

precip[precip < quantile(precip, probs = 0.1)]
##     Phoenix Los Angeles      Denver       Boise        Reno Albuquerque 
##         7.0        14.0        13.0        11.5         7.2         7.8 
##     El Paso 
##         7.8

B

sel_range <- quantile(infert$age, probs = c(0.4, 0.6))
infert[infert$age > sel_range["40%"] & infert$age < sel_range["60%"], ]

19.7 Flow control and scripting

19.7.1 Illegal reproductions

The mean

my_mean <- function(x) {
        sum(x, na.rm = T) / length(x)
}

Standard deviation

my_sd <- function(x) {
        sqrt(sum((x - mean(x))^2)/(length(x)-1))
}

Maximum

set.seed(123) 
my_nums <- sample(10000, 1000)
my_max <- function(x) {
    max_val <- 0
    for (i in my_nums) {
        if (i > max_val) {
            max_val = i
        }
    }
    return(max_val)
}

Median

my_median <- function(x) {
        sorted <- sort(x)
        if(length(x) %% 2 == 1) {
                #uneven length
                my_median <- sorted[ceiling(length(x)/2)]
        } else {
                my_median <- (sorted[length(x)/2] + sorted[(length(x)/2)+1]) / 2
        }
        return(my_median)
}

19.7.2 Various other functions

Find match locations

where_is_it <- function(x, look_for) {
    found <- integer(0)
    for (i in 1:length(x)) {
        elmnt <- x[i]
        if (elmnt == look_for) {
            found <- c(found, i)
        }
    }
    return(found)
}
x <- c("Pro", "Glu", "Lys", "Pro", "Ser")
where_is_it(x, "Pro")
##expected:
#[1] 1 4

Odd and even

count_odd_even <- function(x) {
    evens <- sum(x %% 2 == 0)
    odds <- sum(x %% 2 == 1)
    #OR, more efficient
    #odds <- length(x) - evens
    tmp <- c(even = evens, odd = odds)
    return(tmp)
}
count_odd_even(c(1,2,3,4,5,5))

Add column compared to mean

add_compared_to_mean <- function (df, name) {
    if (! is.data.frame(df)) {
        stop(paste0("'", df, "' is not a dataframe."))
    }
    if(! name %in% colnames(df)) {
        stop(paste0("'", name, "' is not an existing column."))
    }
    if(! is.numeric(df[, name])) {
        stop(paste0("'", name, " is not a numeric column."))
    }
    the_col <- df[, name]
    tmp <- ifelse(the_col > mean(the_col), "greater", "less")
    df$compared_to_mean <- tmp
    return(df)
}
my_df <- data.frame(a = c(2, 4, 2, 3, 4, 3, 1, 5), b = rep(c("x", "y"), 4))
add_compared_to_mean(my_df, "a")

Interquantile ranges

interquantile_range <- function(x, lower = 0, upper = 1) {
  if (! is.numeric(x) | 
      ! is.numeric(lower) |
      ! is.numeric(upper)) {
    stop("all three arguments should be numeric")
  }
  lower_val <- quantile(x, probs = lower)
  upper_val <- quantile(x, probs = upper)
  tmp <- upper_val - lower_val
  #a named vector is always nice, for acces but also for display purposes
  names(tmp) <- paste0(lower*100, "-", upper*100, "%")
  tmp
}
tst <- rnorm(1000)
interquantile_range(tst) # 0 to 1
interquantile_range(tst, 0.25, 0.75) # custom
#interquantile_range("foo") # error!

Vector distance

distance <- function(p, q) {
    if (! is.numeric(p) | ! is.numeric(q)) {
        stop("non-numeric vectors passed")
    }
    if (length(p) != length(q)) {
        stop("vectors have unequal length")
    }
    sqrt(sum((p - q)^2))
}

Other distance measures

my_distance <- function(p, q, method = "euclidean") {
  if (! (is.numeric(p) & is.numeric(q))) {
    stop("One or both of the vectors is not numeric")
  }
  if (length(p) != length(q)) {
    stop("Vectors are not of equal length")
  }
  if (method == "euclidean") {
    return(sqrt(sum((p - q)^2)))
  }
  else if (method == "manhattan") {
    return(sum(abs(p-q)))
  }
  else {
    stop(paste0("Method not found: ", method))
  }
}
my_distance(c(0,0,0), c(1, 1, 1))

G/C percentage

GC_perc <- function(seq, strict = TRUE) {
  if (is.na(seq)) {
    return(NA)
  }
  if (length(seq) == 0) {
    return(0)
  }
  seq.split <- strsplit(seq, "")[[1]]
  gc.count <- 0
  anom.count <- 0
  for (n in seq.split) {
    if (length(grep("[GATUCgatuc]", n)) > 0) {
      if (n == "G" || n == "C") {
        gc.count <- gc.count + 1
      }
    } else {
      if (strict) {
        stop(paste("Illegal character", n))
      } else {
        anom.count <- anom.count + 1     
      }
    }
  }
  ##return perc
  ##print(gc.count)
  if (anom.count > 0) {
    anom.perc <- anom.count / nchar(seq) * 100
    warning(paste("Non-DNA characters have percentage of", anom.perc))
  }
  return(gc.count / nchar(seq) * 100)
}

19.8 Tidying dataframes using Package tidyr

19.8.1 Small examples

The data folder of this eBook contains three small csv files that need tidying. Read them from file and convert them into tidy format, with data columns as clean as possible (e.g. not “creatine_T0” but “T0”).

A

Tidy file untidy1.csv.

untidy1 <- read.table("data/untidy1.csv", 
                      sep = ",", 
                      header = TRUE)
pivot_wider(data = untidy1,
            names_from = type,
            values_from = value)

B

Tidy file untidy2.csv.

untidy2 <- read.table("data/untidy2.csv", 
                      sep = ",", 
                      header = TRUE)
pivot_longer(data = untidy2,
             cols = -patient,
             names_prefix = "creatine_",
             names_to = "timepoint",
             values_to = "creatine")

C

Tidy file untidy3.csv.

untidy3 <- read.table("data/untidy3.csv", 
                      sep = ";", 
                      header = TRUE)
pivot_longer(data = untidy3,
             cols = -species,
             names_sep = "_",
             #OR
             #names_pattern = "(leashed|unleashed)_(caged|free)",
             names_to = c("leashed", "caged"),
             values_to = "wellbeing")

19.8.2 Lung cancer

lung_cancer_deaths <- data.frame(year = rep(1974:1979, each = 12),
                     month = rep(month.abb, times = 6),
                     male = as.integer(mdeaths),
                     female = as.integer(fdeaths),
                     total = as.integer(ldeaths))

A

lung_cancer_deaths <- pivot_longer(data = lung_cancer_deaths, 
             cols = 3:5, 
             names_to = "type",
             values_to = "count")

B

Prep:

lung_cancer_deaths$date <- as.Date(x = paste(lung_cancer_deaths$month, lung_cancer_deaths$year, 1, sep="/"),
        format = "%b/%Y/%d")

Figure:

ggplot(data = lung_cancer_deaths,
       mapping = aes(x = date, y = count)) +
  geom_line(aes(color = type))

C

ggplot(data = lung_cancer_deaths,
       mapping = aes(x = type, y = count)) +
  geom_boxplot(aes(fill=type))

ANSWER: You can see a single outlier in the female set. We can identify the year by finding out when this occurred:

maximum <- max(lung_cancer_deaths[lung_cancer_deaths$type == "female", "count"]) ## 1141
lung_cancer_deaths[lung_cancer_deaths$count == maximum, ]
## # A tibble: 1 × 5
##    year month type   count date      
##   <int> <chr> <chr>  <int> <date>    
## 1  1976 Feb   female  1141 1976-02-01

So this was February 1976. A quick Google search turned up a pdf document “CDC Influenza Surveillance” that states

“The 1975-1976 influenza season was noteworthy because of several events. a) An H3N2 influenza virus (A/Victoria/3/75), isolated first in April 1975, caused a wide- spread epidemic late in the influenza season in the United States. Based on pneumonia- and influenza-associated mortality which peaked in February and March 1976, this was the most severe epidemic experienced by the United States since the 1968-1969 Hong Kong epidemic.”

(direct link)

As you may know, (lung) cancer patients are especially vulnerable for influenza infections.

19.9 Old school data mangling

19.9.1 Whale selenium

whale_sel_url <- "https://raw.githubusercontent.com/MichielNoback/davur1/gh-pages/exercises/data/whale_selenium.txt"
whale_selenium <- read.table(whale_sel_url,
        header = T,
        row.names = 1)

A

apply(X = whale_selenium, MARGIN = 2, FUN = mean)

B

apply(X = whale_selenium, MARGIN = 2, FUN = sd)

C

my.sem <- function(x) {
        sem <- sd(x) / sqrt(length(x))
}
apply(X = whale_selenium, MARGIN = 2, FUN = my.sem)

D

whale_selenium$ratio <- apply(X = whale_selenium, 
            MARGIN = 1, 
            FUN = function(x){
                    x[2] / x[1]
            })
ggplot(data = whale_selenium, 
       mapping = aes(x = ratio)) +
  geom_histogram(bins = 15) +
  labs(x = "Tooth / Liver Selenium ratio", title = "Tooth / Liver Selenium ratios in whales")

E
Inline expressions are like this: 15.4 MpH.

19.9.2 Urine properties

urine_file_name <- "urine.csv"
url <- paste0("https://raw.githubusercontent.com/MichielNoback/datasets/master/urine/", urine_file_name)
local_name <- paste0("../", urine_file_name) #specifiy your own folder!
download.file(url = url, destfile = local_name)

A

urine <- read.table(local_name, 
                     sep = ",",
                     header = TRUE)

B

names(urine)[2] <- "ox_crystals"
urine$ox_crystals <- factor(urine$ox_crystals, levels = c(0, 1), labels = c("no", "yes"))

C

mean_sd <- function(x) {
    # returns a named vector
    c("mean" = round(mean(x, na.rm = T), 2), 
      "sd" = round(sd(x, na.rm = T), 2))
}
apply(X = urine[, 3:8], MARGIN = 2, FUN = mean_sd)
##      gravity   ph osmo  cond urea calc
## mean    1.02 6.03  615 20.90  266 4.14
## sd      0.01 0.72  238  7.95  131 3.26

D

aggregate(cbind(gravity, ph, osmo, cond, urea, calc) ~ ox_crystals, 
          data = urine, 
          FUN = function(x) round(mean(x, na.rm = T), 2))
##   ox_crystals gravity   ph osmo cond urea calc
## 1          no    1.02 6.13  562 20.6  232 2.63
## 2         yes    1.02 5.93  683 21.4  302 6.20

19.10 Data mangling with dplyr

19.10.1 Global temperature revisited

A [❋]

f <- "https://raw.githubusercontent.com/MichielNoback/datasets/master/global_temperature/GLB.Ts%2BdSST.csv"
#temp_dat <- readr::read_csv(f)
temp_dat <- read.table(f, sep = ",", header=TRUE, na.strings = "***")
temp_dat <- as_tibble(temp_dat)

B [❋❋]

temp_dat %>% 
    filter(JJA < -0.4) %>% 
    pull(Year)

C [❋❋]

# coldest
cold <- temp_dat %>% 
    arrange(J.D) %>% 
    select(Year, J.D) %>% 
    slice(1:5)
# warmest
warm <- temp_dat %>% 
    arrange(desc(J.D)) %>% 
    select(Year, J.D) %>% 
    slice(1:5)
# combine
bind_rows(cold, warm)

D [❋❋]

temp_dat %>%
    filter(MAM > (JJA+0.2)) %>%
    select(Year, MAM, JJA)

E [❋❋❋]

decade_avg <- temp_dat %>%
    filter(Year > 1969) %>%
    mutate(Decade = paste0(substr(as.character(Year), 1, 3), "0")) %>% # there probably is a better way to do this...
    select(Year, Decade, DJF, MAM, JJA, SON) %>%
    group_by(Decade) %>%
    summarize(across(-Year, mean, na.rm=T))
decade_avg %>% 
    pivot_longer(cols = -Decade, 
                 names_to = "Season",
                 values_to = "Anomaly") %>%
    mutate(Season = factor(Season, 
                           levels = c("DJF", "MAM", "JJA", "SON"), 
                           labels = c("Winter", "Spring", "Summer", "Autumn"),
                           ordered = TRUE)) %>%
    ggplot(mapping = aes(x = Decade)) +
        geom_bar(aes(weight = Anomaly, fill = Season), position = position_dodge()) +
    theme_classic()

F [❋❋❋❋]

max_temp <- max(temp_dat[, 2:13], na.rm=T)
temp_dat %>% 
    filter(if_any(Jan:Dec, ~ .x == max_temp)) %>% #across is deprecated
    select(Year, J.D, where(~ mean(.x, na.rm=T) == max_temp))

19.10.2 Pigs

A [❋]

pigs <- read.table("https://raw.githubusercontent.com/MichielNoback/datasets/master/pigs/dietox.csv", 
                   header = TRUE, 
                   sep = ",")

head(pigs)

B [❋❋]

pigs %>%
    filter(Time == 12 & Evit == 3 & Cu == 3) %>%
    select(Pig, Litter, Feed, Weight) %>%
    arrange(desc(Weight))

C [❋❋]

pigs %>% 
    distinct(Evit, Cu)

D [❋❋❋]

pigs %>%
    filter(Time == 12) %>%
    group_by(Evit) %>%
    summarize(mean_weight = mean(Weight),
              n = n())

E [❋❋❋]

gain <- pigs %>%
    select(Pig, Time, Weight, Feed, Evit, Cu) %>%
    group_by(Pig) %>%
    mutate(Cu = factor(Cu),
           Evit = factor(Evit),
           Lag_Weight = lag(Weight),
           Gain = (Weight - Lag_Weight)/Lag_Weight*100) 

ggplot(gain, aes(x = Cu, y = Gain, color = Cu)) +
    geom_violin() +
    facet_wrap(~Evit)

F [❋❋❋]

# select and pivot
pigs %>% 
    select(-X) %>%
    pivot_wider(names_from = Time,
                names_prefix = "w_",
                values_from = c(Weight, Feed))

19.10.3 Population numbers

A [❋]

pop_data_file <- "https://raw.githubusercontent.com/MichielNoback/datasets/master/population/EDU_DEM_05022020113225751.csv"
population <- read.table(pop_data_file,
                         header = TRUE, 
                         sep=",")

B [❋❋]
Which? str(population) tells me that Unit.Code, Unit and PowerCode are factors with one level. Using table(population$PowerCode.Code, useNA = "always") tells me that there are only zeros there. Same for Reference.Period.Code and Reference.Period. The variables Flags and Flag.Codes refer to the same, so one of them can be removed (I choose to remove Flag.Codes). The same counts for SEX/Sex, AGE/Age and YEAR/Year.

Select This is my selection:

keep <- names(population)[c(1, 2, 4, 6, 8, 15, 16)]
keep
population <- as_tibble(population[, keep]) #tibble is nicer!
head(population)

C [❋❋❋]

pop_totals <- dplyr::filter(population, Sex == "Total" & Age == "Total: All age groups")
##or, using base R
#population[population$Sex == "Total" & population$Age == "Total: All age groups", ]
pivot_wider(
    data = pop_totals[, c(1, 2, 5, 6)],
    names_from = Year,
    values_from = Value)

D [❋❋❋❋]
You could do this on the pop_totals dataset using the lag() function, after using group_by(), but since we have the wide format, you could also use a simple for loop, iterating the columns by index and subtracting the first from the second. I will demonstrate the only the first.

pop_totals %>% 
    group_by(Country) %>%
    mutate(Pop_change = as.integer(Value - lag(Value))) %>%
    ungroup() %>%
    select(-Value) %>%
    pivot_wider(names_from = Year,
        values_from = Pop_change) %>%
    select(-`2005`, -`2010`, -Flag.Codes) 
## backticks in above selection are required 
## because we are selecting names that are numbers!

E [❋❋❋]

sel <- population %>%
    filter(Sex == "Women" | Sex == "Men") %>%
    filter(COUNTRY %in% c("BEL", "CHE", "DNK", "FRA", "IRL", "DEU", "LUX", "NLD", "GBR")) %>%
    drop_na()

ggplot(sel, mapping = aes(Year)) +
    geom_bar(aes(weight = Value, fill = Sex))  + 
    facet_wrap(. ~ COUNTRY)

F [❋❋❋❋]

population %>%
    filter(Sex == "Total" & Age == "Total: All age groups" & (Year == 2005 | Year == 2017)) %>%
    group_by(Country) %>%
    mutate(Change = Value - lag(Value), 
           Previous = lag(Value)) %>%
    mutate(GrowthRate = Change / Previous * 100) %>%
    ungroup() %>%
    select(COUNTRY, Country, GrowthRate ) %>%
    arrange(desc(GrowthRate)) %>%
    head(3)

19.11 Text processing with regex

19.11.1 Restriction enzymes

A [❋❋]

pacI_re <- "TTAATTAA"
patterns <- c("T{2}A{2}T{2}A{2}",
           "(TTAA){2}",
           "(T{2}A{2}){2}")
for(ptrn in patterns){
    print(grepl(ptrn, pacI_re))
}

B [❋❋]

sfiI_re <- "GGCCACGTAGGCC"
patterns <- c("G{2}C{2}[GATC]{5}G{2}C{2}",
           "GGCC[GATC]{5}GGCC",
           "[GC]{4}[GATC]{5}[GC]{4}") #last one is less specific!
for(ptrn in patterns){
    print(grepl(ptrn, sfiI_re))
}

19.11.2 Prosite Patterns

A [❋❋]
PS00211: “[LIVMFYC]-[SA]-[SAPGLVFYKQH]-G-[DENQMW]-[KRQASPCLIMFW]-[KRNQSTAVM]-[KRACLVM]-[LIVMFYPAN]-{PHY}-[LIVMFW]-[SAGCLIVP]-{FYWHP}-{KRHP}-[LIVMFYWSTA].”

PS00211<- "[LIVMFYC][SA][SAPGLVFYKQH]G[DENQMW][KRQASPCLIMFW][KRNQSTAVM][KRACLVM][LIVMFYPAN][^PHY][LIVMFW][SAGCLIVP][^FYWHP][^KRHP][LIVMFYWSTA]"

B [❋❋]
PS00018: “D-{W}-[DNS]-{ILVFYW}-[DENSTG]-[DNQGHRK]-{GP}-[LIVMC]-[DENQSTAGC]-x(2)- [DE]-[LIVMFYW].”

PS00018 <- "D[^W][DNS][^ILVFYW][DENSTG][DNQGHRK][^GP][LIVMC][DENQSTAGC].{2} [DE][LIVMFYW]"

19.11.3 Fasta Headers

library(stringr)
fasta_headers <- readLines("./data/fasta_headers.txt")

A [❋❋❋]

str_match(fasta_headers, "\\[(.+)\\]")[, 2]
str_match(fasta_headers, "\\[([[:alpha:]]+ [[:alpha:]]+) ?(.+)?\\]")[, 2]

B [❋❋❋]

str_match(fasta_headers, ">([[:alpha:]]{2,3}\\|\\w+)\\|")[, 2]

C [❋❋❋]

str_match(fasta_headers, ">.+\\| (.+?) \\[")[, 2]

19.12 Package ggplot2 revisited

19.12.1 Eye colors

A [❋❋]

(ec_data <- readr::read_csv("data/eyecolor_data.csv", 
                            col_types = "cfiffffffff",
                            na="-"))

B [❋❋❋]

ec_data <- ec_data %>%
    mutate(across(4:11, ~ recode_factor(.x, blauw = "blue", bruin = "brown", groen = "green")))

C [❋❋❋❋]

## Majority for use in mutate()
majority <- function(x) {
    #print(as.character(x))
    # there are many other approaches to be taken
    t <- table(x)
    if (length(t) == 1) return(x[1]) # one entry = no problem
    else if(length(unique(t)) == 1) return("intermediate") # same counts = no majority
    else{
        # order by count and return the most abundant one
        df <- as.data.frame(t)
        return(as.character(df[order(df$Freq, decreasing = T)[1], 1]))
    }
}

# mutate to apply the function
# this requires rowwise() and c_across()
ec_data <- ec_data %>%
    rowwise() %>%
    mutate(majority = majority(c_across(-(1:3))))

D [❋❋❋]

ec_data_tidy <- ec_data %>%
    pivot_longer(cols = -c(Identifier, sex, age, majority),
                 names_sep = "_",
                 names_to = c("eye", "experimenter"),
                 values_to = "observed_color")

E [❋❋❋]

## create the contingency table
counts <- ec_data_tidy %>% select(sex, eye, observed_color) %>%
    group_by(sex, eye, observed_color) %>%
    summarize(count = n(), .groups = "drop") 

## The balloon plot. Not that quotes around variables are required!
ggpubr::ggballoonplot(data = counts, 
                      x = 'eye', 
                      y = 'observed_color', 
                      facet.by = 'sex', 
                      size = 'count',
                      fill = "blue")

19.13 Working with date-time data

19.13.1 Bacterial growth

A [❋❋]

bact_growth <- read.table('data/growth-exp-data.csv', 
           sep = ";",
           header = TRUE)

B [❋❋]

bact_growth <- bact_growth %>%
    unite(date_time, date, time, sep="@") %>%
    mutate(date_time = lubridate::dmy_hm(date_time))

C [❋❋❋]

start <- bact_growth$date_time[1]
bact_growth <- 
    bact_growth %>%
    mutate(elapsed_time = hour(seconds_to_period(date_time - start)))

D [❋❋❋]

# Value used to transform the data of the second axis
coeff <- 1

OD_color <- "#69b3a2"
glucose_color <- "deepskyblue3"
ggplot(data = bact_growth, mapping = aes(x=elapsed_time)) +
  geom_line(mapping = aes(y = OD600), linewidth = 1, color = OD_color) + 
  geom_line(mapping = aes(y=glucose / coeff), linewidth = 1, color = glucose_color) +
  scale_y_continuous(
    # Features of the first axis
    name = "OD600",
    # Add a second axis and specify its features
    sec.axis = sec_axis(~ . * coeff, name = "Glucose level")
  ) +
  xlab("elapsed time (h)") +
  theme_classic() +
  theme(
    axis.title.y = element_text(color = OD_color, size=13),
    axis.title.y.right = element_text(color = glucose_color, size=13)
  )