R
Beginner
2 min read
Grouping, Summarising, and Joins
Example
library(dplyr)
employees <- data.frame(
id = 1:8,
name = c("Alice","Bob","Carol","Dave","Eve","Frank","Gina","Hank"),
dept = c("HR","IT","IT","Finance","HR","IT","Finance","HR"),
salary = c(55000,72000,48000,95000,61000,88000,102000,58000),
years = c(3,7,2,12,5,9,15,4),
active = c(TRUE,TRUE,FALSE,TRUE,TRUE,TRUE,TRUE,FALSE)
)
# group_by + summarise
employees |>
group_by(dept) |>
summarise(
headcount = n(),
avg_salary = mean(salary),
max_salary = max(salary),
avg_tenure = mean(years),
.groups = "drop"
)
# across() — apply function to multiple columns
employees |>
group_by(dept) |>
summarise(across(where(is.numeric), mean, .names = "avg_{.col}"),
.groups = "drop")
# count() shortcut
employees |> count(dept, sort = TRUE)
employees |> count(dept, active)
# Joins
departments <- data.frame(
dept = c("HR", "IT", "Finance", "Legal"),
manager = c("Susan", "Tom", "Rachel", "Eric"),
budget = c(200000, 500000, 800000, 150000)
)
# left_join — all employees, matched dept info
left_join(employees, departments, by = "dept")
# inner_join — only employees in departments table
inner_join(employees, departments, by = "dept")
# anti_join — departments with no employees
anti_join(departments, employees, by = "dept")
# semi_join — employees whose dept appears in departments
semi_join(employees, filter(departments, budget > 400000), by = "dept")
# Full pipeline example
employees |>
filter(active) |>
left_join(departments, by = "dept") |>
group_by(dept, manager) |>
summarise(total_salary = sum(salary),
headcount = n(),
.groups = "drop") |>
mutate(salary_pct = total_salary / budget) |>
arrange(desc(total_salary))