# Number of simulations, samples
n_simulations <- 500
n_train <- 100
n_test <- 100
# Number of predictor variables
n_vars_total <- 10
# Number of predictor variables with non-zero true coefficients
n_vars_true <- 2
# Number of noise variables (total - true)
n_vars_noise <- n_vars_total - n_vars_true
# True regression coefficients
true_beta <- c(5, -3, rep(0, n_vars_noise))
# True intercept
true_intercept <- 2
# Standard deviation of the error term (noise in the Y value)
error_sd <- 3
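
# Taken together, the true data-generating process is
# Y = 2 + 5*X1 - 3*X2 + eps with eps ~ N(0, 3^2); X3, ..., X10 carry no signal.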
# --- Initialization ---
rmse_train_all <- numeric(n_simulations)
rmse_test_all <- numeric(n_simulations)
rmse_train_true <- numeric(n_simulations)
rmse_test_true <- numeric(n_simulations)
rmse_train_plus1 <- numeric(n_simulations)
rmse_test_plus1 <- numeric(n_simulations)
# Set seed for reproducibility
set.seed(1015)
# --- Simulation Loop ---
# Repeat the experiment n_simulations (500) times
for (i in 1:n_simulations) {
  # Generate predictor variables (X) from a standard normal distribution
  X_train_matrix <- matrix(rnorm(n_train * n_vars_total), nrow = n_train, ncol = n_vars_total)
  # Generate the error term (epsilon)
  epsilon_train <- rnorm(n_train, mean = 0, sd = error_sd)
  # Calculate the response variable (Y) using the true model: Y = intercept + X * beta + epsilon
  Y_train_vector <- true_intercept + X_train_matrix %*% true_beta + epsilon_train
  # Combine predictors and response into a data frame (useful for lm function)
  train_df <- data.frame(Y = Y_train_vector, X_train_matrix)
  # Assign meaningful column names
  colnames(train_df) <- c("Y", paste0("X", 1:n_vars_total))
  # Generate independent test data using the same process
  X_test_matrix <- matrix(rnorm(n_test * n_vars_total), nrow = n_test, ncol = n_vars_total)
  epsilon_test <- rnorm(n_test, mean = 0, sd = error_sd)
  Y_test_vector <- true_intercept + X_test_matrix %*% true_beta + epsilon_test
  test_df <- data.frame(Y = Y_test_vector, X_test_matrix)
  colnames(test_df) <- c("Y", paste0("X", 1:n_vars_total))
  # Model 1: Using all predictor variables
  model_all <- lm(Y ~ ., data = train_df)
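  # The formula "Y ~ ." tells lm() to regress Y on every other column of
  # train_df, i.e. all 10 predictors including the 8 pure-noise ones.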
  # Model 2: Using only the true predictor variables (X1 and X2 in this case)
  true_var_names <- paste0("X", 1:n_vars_true)
  formula_true_str <- paste("Y ~", paste(true_var_names, collapse = " + "))
  formula_true <- as.formula(formula_true_str)
  model_true <- lm(formula_true, data = train_df)
  # Model 3: Using true predictors + 1 randomly chosen noise predictor
  noise_var_indices <- (n_vars_true + 1):n_vars_total
  chosen_noise_index <- sample(noise_var_indices, 1)
  chosen_noise_var_name <- paste0("X", chosen_noise_index)
  vars_plus1 <- c(true_var_names, chosen_noise_var_name)
  formula_plus1_str <- paste("Y ~", paste(vars_plus1, collapse = " + ")) # Create formula string
  formula_plus1 <- as.formula(formula_plus1_str) # Convert to formula
  model_plus1 <- lm(formula_plus1, data = train_df)
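  # Alternative (sketch): base R's reformulate() builds the same formula
  # without manual string pasting:
  #   formula_plus1 <- reformulate(vars_plus1, response = "Y")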
  # Calculate fitted values using the estimated model on the original training data
  fitted_train_all <- predict(model_all, newdata = train_df)
  fitted_train_true <- predict(model_true, newdata = train_df)
  fitted_train_plus1 <- predict(model_plus1, newdata = train_df)
  # Make predictions using the estimated model on the unseen test data
  pred_test_all <- predict(model_all, newdata = test_df)
  pred_test_true <- predict(model_true, newdata = test_df)
  pred_test_plus1 <- predict(model_plus1, newdata = test_df)
  # RMSE on training data (using fitted values) - Measures goodness of fit to the training data
  rmse_train_all[i] <- sqrt(mean((train_df$Y - fitted_train_all)^2))
  rmse_train_true[i] <- sqrt(mean((train_df$Y - fitted_train_true)^2))
  rmse_train_plus1[i] <- sqrt(mean((train_df$Y - fitted_train_plus1)^2))
  # RMSE on test data (using predicted values) - Measures prediction accuracy on unseen data
  rmse_test_all[i] <- sqrt(mean((test_df$Y - pred_test_all)^2))
  rmse_test_true[i] <- sqrt(mean((test_df$Y - pred_test_true)^2))
  rmse_test_plus1[i] <- sqrt(mean((test_df$Y - pred_test_plus1)^2))
} # End of the simulation loop
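
# --- Optional sanity check (a sketch added here; the name rmse_summary is
# illustrative, not part of the original analysis) ---
# Average RMSE per model across simulations. We would expect training RMSE
# a bit below error_sd (the fit absorbs some noise) and test RMSE a bit above.
rmse_summary <- data.frame(
  model      = c("all predictors", "true predictors", "true + 1 noise"),
  mean_train = c(mean(rmse_train_all), mean(rmse_train_true), mean(rmse_train_plus1)),
  mean_test  = c(mean(rmse_test_all), mean(rmse_test_true), mean(rmse_test_plus1))
)
print(rmse_summary)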
# --- Step 6: Create Scatter Plots ---
par(mfrow = c(1, 3), mar = c(4, 4, 3, 1), oma = c(0, 0, 2, 0)) # Adjust margins
# Determine common axis limits for better comparison across plots
all_rmse_values <- c(rmse_train_all, rmse_test_all,
                     rmse_train_true, rmse_test_true,
                     rmse_train_plus1, rmse_test_plus1)
plot_lim <- range(all_rmse_values, na.rm = TRUE)
# Plot 1: All Predictors
plot(x = rmse_train_all, y = rmse_test_all,
main = "Model: All Predictors",
xlab = "Training RMSE (Fit)",
ylab = "Evaluation RMSE (Predict)",
pch = 19, col = "dodgerblue",
xlim = plot_lim, # Use common limits
ylim = plot_lim)
abline(a = 0, b = 1, col = "red", lty = 2) # y=x line
# Plot 2: True Predictors Only
plot(x = rmse_train_true, y = rmse_test_true,
main = "Model: True Predictors",
xlab = "Training RMSE",
ylab = "Test RMSE",
pch = 19, col = "forestgreen",
xlim = plot_lim,
ylim = plot_lim)
abline(a = 0, b = 1, col = "red", lty = 2)
# Plot 3: True + 1 Noise Predictor
plot(x = rmse_train_plus1, y = rmse_test_plus1,
main = "Model: True + 1 Noise",
xlab = "Training RMSE (Fit)",
ylab = "Evaluation RMSE (Predict)",
pch = 19, col = "darkorange",
xlim = plot_lim,
ylim = plot_lim)
abline(a = 0, b = 1, col = "red", lty = 2)
# Add an overall title to the figure
mtext("Training RMSE (Goodness of Fit) vs. Evaluation RMSE (Prediction Accuracy)", outer = TRUE, cex = 1.2)
# Reset plotting parameters to default (1 plot per device)
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1), oma = c(0, 0, 0, 0))
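
# A possible follow-up (sketch): the share of simulations where the test RMSE
# exceeds the training RMSE, i.e. the points above the red y = x line above.
mean(rmse_test_all > rmse_train_all)
mean(rmse_test_true > rmse_train_true)
mean(rmse_test_plus1 > rmse_train_plus1)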