###############################################################################
# R Optimization Tutorial — Profiling with profvis
#
# Run each profvis({ ... }) block in RStudio. profvis opens an interactive
# view with two parts:
#   (1) a flame graph — wide bars = where time is spent
#   (2) a data table  — line-by-line Time (ms) and Memory (MB)
#
# In the code panel of the profvis output, look for the longest memory and
# time bars.
#
# The code inside the SLOW VERSION block is intentionally inefficient: each
# bottleneck exists so that it shows up clearly in the profile. Do not
# "fix" it in place.
###############################################################################

library(profvis)

## ---- Setup: build a sample dataset -----------------------------------------
## Fixed seed so every run profiles exactly the same data.
set.seed(123)
n <- 100000
df <- data.frame(
  id    = seq_len(n),
  # 200 string groups: "g001" ... "g200", assigned at random
  group = sample(sprintf("g%03d", 1:200), n, replace = TRUE),
  value = rnorm(n, mean = 100, sd = 15)
)

###############################################################################
# SLOW VERSION
###############################################################################
profvis({

  ## --- Bottleneck 1: growing a vector inside a loop (no pre-allocation) ------
  ## Every c() builds a brand-new, longer vector and copies all prior elements.
  ## The loop bound comes from the data frame itself, so it stays correct even
  ## if `df` is rebuilt with a different number of rows than `n`.
  doubled <- c()
  for (i in seq_len(nrow(df))) {
    doubled <- c(doubled, df$value[i] * 2)
  }

  ## --- Bottleneck 2: apply() over data-frame rows ---------------------------
  ## apply() coerces the WHOLE data frame to one matrix. Because `group` is a
  ## character column, the matrix becomes character, so every number is turned
  ## into a string and back. The R closure is also called once per row.
  ## The numeric columns are rendered with format() during that coercion, so
  ## the round trip is not just slow but lossy: the result only agrees with
  ## df$value / 100 to roughly getOption("digits") significant digits.
  ## NOTE(review): this assignment appears to land in the `df` of the calling
  ## environment, so the column persists after the block and a re-run would
  ## profile apply() over one extra column — confirm before comparing runs.
  df$normalized <- apply(df, 1, function(row) {
    as.numeric(row["value"]) / 100
  })

  ## --- Bottleneck 3: repeated linear-scan subsetting -------------------------
  ## df[df$group == g, ] rescans every row of df (100,000 with the setup
  ## above) for each of the 200 groups and allocates a fresh subset every
  ## iteration. The result vector is also grown with c(), as in Bottleneck 1.
  group_means <- c()
  for (g in sort(unique(df$group))) {
    sub <- df[df$group == g, ]
    group_means <- c(group_means, mean(sub$value))
  }
})