# Number of simulations, samples
n_simulations <- 500
n_train <- 100
n_test <- 100
# Number of predictor variables
n_vars_total <- 10
# Number of predictor variables with non-zero true coefficients
n_vars_true <- 2
# Number of noise variables (total - true)
n_vars_noise <- n_vars_total - n_vars_true
# True regression coefficients
true_beta <- c(5, -3, rep(0, n_vars_noise))
# True intercept
true_intercept <- 2
# Standard deviation of the error term (noise in the Y value)
error_sd <- 3
# --- Initialization ---
rmse_train_all <- numeric(n_simulations)
rmse_test_all <- numeric(n_simulations)
rmse_train_true <- numeric(n_simulations)
rmse_test_true <- numeric(n_simulations)
rmse_train_plus1 <- numeric(n_simulations)
rmse_test_plus1 <- numeric(n_simulations) # Set seed for reproducibility
set.seed(1015)
# --- Simulation Loop ---
# Loop 50 times
for (i in 1:n_simulations) {
# Generate predictor variables (X) from a standard normal distribution
X_train_matrix <- matrix(rnorm(n_train * n_vars_total), nrow = n_train, ncol = n_vars_total)
# Generate the error term (epsilon)
epsilon_train <- rnorm(n_train, mean = 0, sd = error_sd)
# Calculate the response variable (Y) using the true model: Y = intercept + X * beta + epsilon
Y_train_vector <- true_intercept + X_train_matrix %*% true_beta + epsilon_train
# Combine predictors and response into a data frame (useful for lm function)
train_df <- data.frame(Y = Y_train_vector, X_train_matrix)
# Assign meaningful column names
colnames(train_df) <- c("Y", paste0("X", 1:n_vars_total))
# Generate independent test data using the same process
X_test_matrix <- matrix(rnorm(n_test * n_vars_total), nrow = n_test, ncol = n_vars_total)
epsilon_test <- rnorm(n_test, mean = 0, sd = error_sd)
Y_test_vector <- true_intercept + X_test_matrix %*% true_beta + epsilon_test
test_df <- data.frame(Y = Y_test_vector, X_test_matrix)
colnames(test_df) <- c("Y", paste0("X", 1:n_vars_total))
# Model 1: Using all predictor variables
model_all <- lm(Y ~ ., data = train_df)
# Model 2: Using only the true predictor variables (X1 and X2 in this case)
true_var_names <- paste0("X", 1:n_vars_true)
formula_true_str <- paste("Y ~", paste(true_var_names, collapse = " + "))
formula_true <- as.formula(formula_true_str)
model_true <- lm(formula_true, data = train_df)
# Model 3: Using true predictors + 1 randomly chosen noise predictor
noise_var_indices <- (n_vars_true + 1):n_vars_total
chosen_noise_index <- sample(noise_var_indices, 1)
chosen_noise_var_name <- paste0("X", chosen_noise_index)
vars_plus1 <- c(true_var_names, chosen_noise_var_name)
formula_plus1_str <- paste("Y ~", paste(vars_plus1, collapse = " + ")) # Create formula string
formula_plus1 <- as.formula(formula_plus1_str) # Convert to formula
model_plus1 <- lm(formula_plus1, data = train_df)
# Calculate fitted values using the estimated model on the original training data
fitted_train_all <- predict(model_all, newdata = train_df)
fitted_train_true <- predict(model_true, newdata = train_df)
fitted_train_plus1 <- predict(model_plus1, newdata = train_df)
# Make predictions using the estimated model on the unseen evaluation data
pred_test_all <- predict(model_all, newdata = eval_df)
pred_test_true <- predict(model_true, newdata = eval_df)
pred_test_plus1 <- predict(model_plus1, newdata = eval_df)
# RMSE on training data (using fitted values) - Measures goodness of fit to the training data
rmse_train_all[i] <- sqrt(mean((train_df$Y - fitted_train_all)^2))
rmse_train_true[i] <- sqrt(mean((train_df$Y - fitted_train_true)^2))
rmse_train_plus1[i] <- sqrt(mean((train_df$Y - fitted_train_plus1)^2))
# RMSE on evaluation data (using predicted values) - Measures prediction accuracy on unseen data
rmse_test_all[i] <- sqrt(mean((eval_df$Y - pred_test_all)^2))
rmse_test_true[i] <- sqrt(mean((eval_df$Y - pred_test_true)^2))
rmse_test_plus1[i] <- sqrt(mean((eval_df$Y - pred_test_plus1)^2))
} # End of the simulation loopERROR: Error in eval(expr, envir, enclos): object 'eval_df' not found
Error in eval(expr, envir, enclos): object 'eval_df' not found
Traceback:
1. predict(model_all, newdata = eval_df)
2. predict.lm(model_all, newdata = eval_df)
# --- Step 6: Create Scatter Plots ---
par(mfrow = c(1, 3), mar = c(4, 4, 3, 1), oma = c(0, 0, 2, 0)) # Adjust margins
# Determine common axis limits for better comparison across plots
all_rmse_values <- c(rmse_train_all, rmse_test_all,
rmse_train_true, rmse_test_true,
rmse_train_plus1, rmse_test_plus1)
plot_lim <- range(all_rmse_values, na.rm = TRUE)
# Plot 1: All Predictors
plot(x = rmse_train_all, y = rmse_test_all,
main = "Model: All Predictors",
xlab = "Training RMSE (Fit)",
ylab = "Evaluation RMSE (Predict)",
pch = 19, col = "dodgerblue",
xlim = plot_lim, # Use common limits
ylim = plot_lim)
abline(a = 0, b = 1, col = "red", lty = 2) # y=x line
# Plot 2: True Predictors Only
plot(x = rmse_train_true, y = rmse_test_true,
main = "Model: True Predictors",
xlab = "Training RMSE",
ylab = "Test RMSE",
pch = 19, col = "forestgreen",
xlim = plot_lim,
ylim = plot_lim)
abline(a = 0, b = 1, col = "red", lty = 2)
# Plot 3: True + 1 Noise Predictor
plot(x = rmse_train_plus1, y = rmse_test_plus1,
main = "Model: True + 1 Noise",
xlab = "Training RMSE (Fit)",
ylab = "Evaluation RMSE (Predict)",
pch = 19, col = "darkorange",
xlim = plot_lim,
ylim = plot_lim)
abline(a = 0, b = 1, col = "red", lty = 2)
# Add an overall title to the figure
mtext("Training RMSE (Goodness of Fit) vs. Evaluation RMSE (Prediction Accuracy)", outer = TRUE, cex = 1.2) # More descriptive title
# Reset plotting parameters to default (1 plot per device)
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1), oma = c(0, 0, 0, 0))