# Keep only the rows that have every measurement used in the plots below.
penguins <- drop_na(
  penguins,
  flipper_length_mm, body_mass_g, bill_length_mm, bill_depth_mm, sex
)
#Q1 theme_minimal() makes the background of the plot white. Biscoe has the most penguins.
# Q1 plot: penguin counts per island, one dodged bar per species.
penguins %>%
  ggplot(mapping = aes(x = island, fill = species)) +
  geom_bar(position = "dodge") +
  labs(x = "Island Name", y = "Penguin Population #") +
  theme_minimal()
#Q2 Overall, this distribution is skewed right but not bimodal. The bin width dictates the range of values counted within each bar of the histogram. Split by species, the distribution looks normal for Adelie and Chinstrap, but more bimodal for Gentoo. However, their means seem similar.
# Q2 plot: histogram of body mass, one panel per species.
# Every component has to be chained with `+`. Without the `+` after
# facet_wrap(), labs() + theme_minimal() is evaluated as a separate
# expression, so the axis labels and theme never reach the plot.
ggplot(data = penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~ species) +
  labs(x = "Body Mass (g)", y = "Count (#)") +
  theme_minimal(base_size = 21)
## NULL
#Q3 Gentoo penguins have the longest flippers. Males tend to have a higher body mass. The fill helps with readability because it colors each box plot, making it more eye-catching.
# Q3 plots: flipper length by species (boxes filled per species),
# then body mass by sex.
penguins %>%
  ggplot(aes(species, flipper_length_mm, fill = species)) +
  geom_boxplot()
penguins %>%
  ggplot(aes(sex, body_mass_g)) +
  geom_boxplot()
#Q4 The relationship is initially unclear because all the points are the same color. However, after using color = species, I can interpret that the relationship seems positive. The line fits the data well as it reflects this positive relationship.
# Q4 plot: bill depth against bill length, colored by species, with a
# linear (method = "lm") smoother drawn on top of the points.
ggplot(penguins, aes(bill_length_mm, bill_depth_mm, color = species)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#Q5 Gentoo penguins seem to show the strongest relationship. After using
# the facet_grid function, it seems that the group with the strongest
# positive correlation would be the male Chinstrap penguins.
# Q5 plot: body mass against flipper length with a linear smoother,
# faceted by sex (rows) and species (columns).
# Only one facet specification can be active on a plot, so the earlier
# facet_wrap(~species) was overridden by facet_grid() and is dropped here.
# theme_minimal() is now chained with `+`; as a stand-alone statement it
# only printed the theme object instead of styling the plot.
ggplot(data = penguins, aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  facet_grid(sex ~ species) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## <theme> List of 144
## $ line : <ggplot2::element_line>
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ lineend : chr "butt"
## ..@ linejoin : chr "round"
## ..@ arrow : logi FALSE
## ..@ arrow.fill : chr "black"
## ..@ inherit.blank: logi TRUE
## $ rect : <ggplot2::element_rect>
## ..@ fill : chr "white"
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ linejoin : chr "round"
## ..@ inherit.blank: logi TRUE
## $ text : <ggplot2::element_text>
## ..@ family : chr ""
## ..@ face : chr "plain"
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : chr "black"
## ..@ size : num 11
## ..@ hjust : num 0.5
## ..@ vjust : num 0.5
## ..@ angle : num 0
## ..@ lineheight : num 0.9
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 0
## ..@ debug : logi FALSE
## ..@ inherit.blank: logi TRUE
## $ title : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ point : <ggplot2::element_point>
## ..@ colour : chr "black"
## ..@ shape : num 19
## ..@ size : num 1.5
## ..@ fill : chr "white"
## ..@ stroke : num 0.5
## ..@ inherit.blank: logi TRUE
## $ polygon : <ggplot2::element_polygon>
## ..@ fill : chr "white"
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ linejoin : chr "round"
## ..@ inherit.blank: logi TRUE
## $ geom : <ggplot2::element_geom>
## ..@ ink : chr "black"
## ..@ paper : chr "white"
## ..@ accent : chr "#3366FF"
## ..@ linewidth : num 0.5
## ..@ borderwidth: num 0.5
## ..@ linetype : int 1
## ..@ bordertype : int 1
## ..@ family : chr ""
## ..@ fontsize : num 3.87
## ..@ pointsize : num 1.5
## ..@ pointshape : num 19
## ..@ colour : NULL
## ..@ fill : NULL
## $ spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ margins : <ggplot2::margin> num [1:4] 5.5 5.5 5.5 5.5
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 2.75 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.x.top : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 0
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 2.75 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.x.bottom : NULL
## $ axis.title.y : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : num 90
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.75 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.y.left : NULL
## $ axis.title.y.right : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : num -90
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 2.75
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : chr "#4D4D4DFF"
## ..@ size : 'rel' num 0.8
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 2.2 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x.top : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 4.95 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x.bottom : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 4.95 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 1
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.2 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y.left : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 4.95 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y.right : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 4.95
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.theta : NULL
## $ axis.text.r : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0.5
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.2 0 2.2
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.ticks : <ggplot2::element_blank>
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'rel' num 0.5
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : <ggplot2::element_blank>
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : <ggplot2::element_blank>
## $ legend.margin : NULL
## $ legend.spacing : 'rel' num 2
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : <ggplot2::element_blank>
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : NULL
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.key.justification : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : 'rel' num 0.8
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.text.position : NULL
## $ legend.title : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.title.position : NULL
## $ legend.position : chr "right"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## [list output truncated]
## @ complete: logi TRUE
## @ validate: logi TRUE
#Q6
# Q6 plot: flipper length against bill depth, colored by species, with one
# panel per sex.
# The axis labels name the variables actually mapped to x and y; sex is
# shown by the facet panels, not by an axis.
ggplot(data = penguins, aes(x = bill_depth_mm, y = flipper_length_mm, color = species)) +
  geom_point(size = 2) +
  facet_wrap(~sex) +
  labs(x = "Bill depth (mm)", y = "Flipper length (mm)") +
  theme_minimal()
#Q7 It was most fun to be able to color the plots however I wanted. It
# is confusing to figure out what some of the more obscure and
# non-self-explanatory functions, like facet_wrap, are used for.
# Lecture 07 ----
# Lecture 08 ----
library(tidyverse)
library(palmerpenguins)
# Start again from the data set as shipped with the package.
penguins <- palmerpenguins::penguins
#Q1 The original data set has 344 rows, while the cleaned data set has 333 rows.
# Drop every row that is missing bill length or sex.
penguins_clean <- drop_na(penguins, bill_length_mm, sex)
# Row counts: cleaned data first, then the original.
nrow(penguins_clean)
## [1] 333
nrow(penguins)
## [1] 344
#Q2 The number after duplication is 363, and after using distinct, the total number of rows is 333. Duplicates can cause error in summary statistics and plots because they may create an inaccurate overrepresentation of certain data points, which would alter data trends and further analyses.
# Q2: append 30 rows re-sampled (with replacement) from the cleaned data,
# then compare the row count before and after distinct().
resampled_rows <- slice_sample(penguins_clean, n = 30, replace = TRUE)
penguins_dup <- bind_rows(penguins_clean, resampled_rows)
nrow(penguins_dup)
## [1] 363
nrow(distinct(penguins_dup))
## [1] 333
#Q3 There are 8 columns in the data frame.
# Q3: select species, island and sex first, followed by every numeric column.
# The selection is stored so that the column count below is measured on the
# selected data frame itself rather than on penguins_clean.
penguins_selected <- penguins_clean %>%
  select(species, island, sex, where(is.numeric))
penguins_selected
## # A tibble: 333 × 8
## species island sex bill_length_mm bill_depth_mm flipper_length_mm
## <fct> <fct> <fct> <dbl> <dbl> <int>
## 1 Adelie Torgersen male 39.1 18.7 181
## 2 Adelie Torgersen female 39.5 17.4 186
## 3 Adelie Torgersen female 40.3 18 195
## 4 Adelie Torgersen female 36.7 19.3 193
## 5 Adelie Torgersen male 39.3 20.6 190
## 6 Adelie Torgersen female 38.9 17.8 181
## 7 Adelie Torgersen male 39.2 19.6 195
## 8 Adelie Torgersen female 41.1 17.6 182
## 9 Adelie Torgersen male 38.6 21.2 191
## 10 Adelie Torgersen male 34.6 21.1 198
## # ℹ 323 more rows
## # ℹ 2 more variables: body_mass_g <int>, year <int>
ncol(penguins_selected)
## [1] 8
#Q4
# Q4a: columns whose names begin with "bill".
select(penguins_clean, starts_with("bill"))
## # A tibble: 333 × 2
## bill_length_mm bill_depth_mm
## <dbl> <dbl>
## 1 39.1 18.7
## 2 39.5 17.4
## 3 40.3 18
## 4 36.7 19.3
## 5 39.3 20.6
## 6 38.9 17.8
## 7 39.2 19.6
## 8 41.1 17.6
## 9 38.6 21.2
## 10 34.6 21.1
## # ℹ 323 more rows
# Q4b: columns whose names end in "_mm".
select(penguins_clean, ends_with("_mm"))
## # A tibble: 333 × 3
## bill_length_mm bill_depth_mm flipper_length_mm
## <dbl> <dbl> <int>
## 1 39.1 18.7 181
## 2 39.5 17.4 186
## 3 40.3 18 195
## 4 36.7 19.3 193
## 5 39.3 20.6 190
## 6 38.9 17.8 181
## 7 39.2 19.6 195
## 8 41.1 17.6 182
## 9 38.6 21.2 191
## 10 34.6 21.1 198
## # ℹ 323 more rows
# Q4c: bill and flipper measurement columns, chosen by regular expression.
# The pattern is anchored to the start of the name and spells out both
# prefixes, so it cannot pick up an unrelated column that merely contains
# "bi" or "fli" somewhere in its name. The trailing ".*" of the earlier
# pattern added nothing, because matches() already accepts partial matches.
penguins_clean %>%
  select(matches("^(bill|flipper)_"))
## # A tibble: 333 × 3
## bill_length_mm bill_depth_mm flipper_length_mm
## <dbl> <dbl> <int>
## 1 39.1 18.7 181
## 2 39.5 17.4 186
## 3 40.3 18 195
## 4 36.7 19.3 193
## 5 39.3 20.6 190
## 6 38.9 17.8 181
## 7 39.2 19.6 195
## 8 41.1 17.6 182
## 9 38.6 21.2 191
## 10 34.6 21.1 198
## # ℹ 323 more rows
#Q5 Using across(where(is.numeric)) is much more efficient than typing out every numeric column name, because there may be a very large number of these columns and this will be useful for large datasets.
# Q5: mean of every numeric column, computed separately for each species.
penguins_clean %>%
  group_by(species) %>%
  summarise(across(.cols = where(is.numeric), .fns = mean))
## # A tibble: 3 × 6
## species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190. 3706. 2008.
## 2 Chinstrap 48.8 18.4 196. 3733. 2008.
## 3 Gentoo 47.6 15.0 217. 5092. 2008.
#Q6
# Q6: label each bill as "short" (< 40 mm), "medium" (40 to < 50 mm) or
# "long" (>= 50 mm). case_when() uses the first condition that is TRUE, so
# the "medium" test only needs an upper bound.
penguins_clean %>%
  mutate(
    bill_length = case_when(
      bill_length_mm < 40 ~ "short",
      bill_length_mm < 50 ~ "medium",
      bill_length_mm >= 50 ~ "long"
    )
  ) %>%
  select(bill_length_mm, bill_length)
## # A tibble: 333 × 2
## bill_length_mm bill_length
## <dbl> <chr>
## 1 39.1 short
## 2 39.5 short
## 3 40.3 medium
## 4 36.7 short
## 5 39.3 short
## 6 38.9 short
## 7 39.2 short
## 8 41.1 medium
## 9 38.6 short
## 10 34.6 short
## # ℹ 323 more rows
#Q7 There are 9 columns in penguins_instrumented and 8 columns in penguins_clean.
# Q7: lookup table with one row per instrument id.
instrument_meta <- tibble(
  instrument_id = c("caliper_A", "caliper_B", "caliper_C"),
  calibration_mm = c(0.2, 0.5, 1.0),
  manufacturer = c("Mitutoyo", "Fowler", "Generic")
)
# Add an instrument_id column derived from the year of each observation.
penguins_instrumented <- mutate(
  drop_na(penguins_clean, bill_length_mm),
  instrument_id = case_when(
    year == 2007 ~ "caliper_A",
    year == 2008 ~ "caliper_B",
    year == 2009 ~ "caliper_C"
  )
)
# Attach the lookup columns by instrument_id; the joined table is printed
# only and is not stored.
penguins_instrumented %>%
  left_join(instrument_meta, by = "instrument_id")
## # A tibble: 333 × 11
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # ℹ 323 more rows
## # ℹ 5 more variables: sex <fct>, year <int>, instrument_id <chr>,
## # calibration_mm <dbl>, manufacturer <chr>
# Column counts: the data with the added instrument_id column, then the
# cleaned data it was built from.
dim(penguins_instrumented)[2]
## [1] 9
dim(penguins_clean)[2]
## [1] 8
#Reflection The most fun part of this exercise was figuring out the regex for Question 4. The most confusing part was understanding how the left join function worked. I do think this lesson provided useful tips that I will probably use in future data analyses.