# Simulation: compare the in-sample (training) MSE of the sample mean with its
# out-of-sample (test) MSE over repeated draws from a Normal distribution.
# In each repetition the mean is estimated on a fresh training sample and then
# scored on both that training sample and an independent test sample.

# Set seed for reproducibility
set.seed(123)

# Number of repetitions for the entire process
n_repetitions <- 50

# Number of data points in each training set and each test set
n_samples <- 100

# Parameters for the Normal distribution
true_mean <- 3
true_variance <- 9
true_sd <- sqrt(true_variance) # rnorm uses standard deviation, not variance

# Preallocated vectors holding the mean squared error on the training and
# test data for each repetition
train_mse_results <- numeric(n_repetitions)
test_mse_results <- numeric(n_repetitions)

# --- Start the Simulation Loop ---
# seq_len() (rather than 1:n) keeps the loop empty when n_repetitions is 0.
for (i in seq_len(n_repetitions)) {
  # 1. Generate Training Data
  train_data <- rnorm(n_samples, mean = true_mean, sd = true_sd)

  # 2. Calculate the Sample Mean from Training Data
  train_sample_mean <- mean(train_data)

  # 3. Calculate MSE on Training Data
  train_squared_diff <- (train_data - train_sample_mean)^2
  train_mse <- mean(train_squared_diff)
  train_mse_results[i] <- train_mse

  # 4. Generate Test Data (Evaluation Data)
  test_data <- rnorm(n_samples, mean = true_mean, sd = true_sd)

  # 5. Calculate MSE on Test Data using the Training Sample Mean
  #    (the estimate is NOT refitted on the test data)
  test_squared_diff <- (test_data - train_sample_mean)^2
  test_mse <- mean(test_squared_diff)
  test_mse_results[i] <- test_mse
}
# End of the simulation loop

# --- Visualize the Results ---
# One point per repetition: x = training MSE, y = test MSE.
# The title is built from n_repetitions so it stays correct if the count changes.
plot(
  train_mse_results, test_mse_results,
  main = paste0(
    "Comparison of Training MSE and Test MSE (",
    n_repetitions, " Repetitions)"
  ),
  xlab = "MSE on Training Data",
  ylab = "MSE on Test Data",
  pch = 19,
  col = "blue"
)

# Add a diagonal line (y = x) for reference; points above it are repetitions
# where the test MSE exceeded the training MSE
abline(a = 0, b = 1, col = "red", lty = 2) # lty = 2 makes the line dashed

# Add a legend
legend("topleft", legend = "y = x line", col = "red", lty = 2)