8. Classifier

Author

이상민

Published

May 11, 2025

1. library

# install.packages(c("palmerpenguins", "class", "caret")) # Run this line once if packages are not installed
library(palmerpenguins)
library(class) # For k-NN
library(caret) # For confusionMatrix and createDataPartition
Loading required package: ggplot2

Loading required package: lattice

Warning message:
“Your system is mis-configured: ‘/var/db/timezone/localtime’ is not a symlink”
Warning message:
“‘/var/db/timezone/localtime’ is not identical to any known timezone file”

2. Data

# --- 1. Data Preparation ----
# Load `penguins` explicitly from palmerpenguins. R >= 4.5 also ships a
# `penguins` dataset in base `datasets` with different column names
# (e.g. `bill_len`), so naming the package removes any dependence on the
# search-path order.
data("penguins", package = "palmerpenguins")

# Drop every row that contains a missing value in any column.
penguins_complete <- na.omit(penguins)

# Columns used downstream: four body measurements, the survey year, and the
# two labels that the classifiers predict (sex and species).
model_columns <- c(
  "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g",
  "year", "sex", "species"
)
selected_data <- penguins_complete[, model_columns]

# Fix the RNG seed so the random operations that follow are reproducible.
set.seed(123)

- train, test 분리

# Stratified hold-out split on `species`: `train_fraction` of the rows go to
# the training set and the remainder to the test set.
train_fraction <- 0.7
train_index <- createDataPartition(
  y = selected_data[["species"]],
  p = train_fraction,
  list = FALSE # return row indices as a matrix instead of a list
)

# Rows picked by the partition form the training set; all others are held out.
testing_data <- selected_data[-train_index, ]
training_data <- selected_data[train_index, ]

3. Logistic Regression Classifier (성별(sex) 예측)

# --- 2. Logistic Regression Classifier (Predicting Sex) ----
lg_predictor_variables <- c(
  "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"
)
lg_target_variable <- "sex"
lg_threshold <- 0.5 # probability cut-off for assigning the second level

# Build `sex ~ bill_length_mm + ...` as a real formula object instead of
# pasting a string together.
lg_formula <- reformulate(lg_predictor_variables, response = lg_target_variable)

logistic_model <- glm(
  formula = lg_formula,
  data = training_data,
  family = binomial(link = "logit") # logistic regression
)

# Predicted probabilities on the held-out test set.
logistic_probabilities <- predict(
  logistic_model,
  newdata = testing_data,
  type = "response"
)

# For a factor response, binomial glm treats the first level as "failure" and
# the remaining level as "success", so the fitted probability refers to the
# second level. Take the labels from the factor itself rather than hard-coding
# "female"/"male", so a re-levelled factor cannot silently flip the classes.
lg_levels <- levels(training_data[[lg_target_variable]])
stopifnot(length(lg_levels) == 2L)

classified_sex <- ifelse(
  logistic_probabilities > lg_threshold,
  lg_levels[2],
  lg_levels[1]
)
classified_sex <- factor(classified_sex, levels = lg_levels)

4. Logistic Regression 평가

# Compare the predicted sex with the true sex of the test-set penguins.
lg_actual_sex <- testing_data[[lg_target_variable]]
lg_conf_matrix_details <- confusionMatrix(
  data = classified_sex,
  reference = lg_actual_sex
)

# Cross-tabulation of predictions (rows) against the reference (columns).
lg_conf_matrix_details[["table"]]

# Overall share of correctly classified test observations.
lg_conf_matrix_details[["overall"]]["Accuracy"]

# Per-class metrics reported for this two-class problem.
lg_class_metrics <- c(
  "Precision", "Recall", "Sensitivity", "Specificity",
  "Pos Pred Value", "Neg Pred Value"
)
lg_conf_matrix_details[["byClass"]][lg_class_metrics]
          Reference
Prediction female male
    female     42    4
    male        6   46
Accuracy: 0.897959183673469
Precision
0.91304347826087
Recall
0.875
Sensitivity
0.875
Specificity
0.92
Pos Pred Value
0.91304347826087
Neg Pred Value
0.884615384615385

5. k-Nearest Neighbor (k-NN) Classifier

# --- 3. k-Nearest Neighbor Classifier (Predicting Species) ----
knn_predictor_variables <- c(
  "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "year"
)
knn_target_variable <- "species"
knn_k <- 5 # number of neighbours that vote on each test observation

# Standardise the training predictors once and keep the centring/scaling
# parameters, so the same transformation can be applied to the test set
# without recomputing it.
training_scaled <- scale(training_data[knn_predictor_variables])
training_center <- attr(training_scaled, "scaled:center")
training_scale <- attr(training_scaled, "scaled:scale")

# A zero-variance predictor would scale to NaN and make knn() fail with an
# unrelated-looking error; stop early with a clear condition instead.
stopifnot(all(is.finite(training_scale)), all(training_scale > 0))

training_data_knn <- training_data
testing_data_knn <- testing_data
training_data_knn[knn_predictor_variables] <- training_scaled

# The test set is scaled with the TRAINING means and standard deviations, so
# no information from the test set leaks into the preprocessing.
testing_data_knn[knn_predictor_variables] <- scale(
  testing_data[knn_predictor_variables],
  center = training_center,
  scale = training_scale
)

# Classify each test observation by majority vote among its `knn_k` nearest
# training observations (class::knn breaks ties at random).
knn_classification <- knn(
  train = training_data_knn[, knn_predictor_variables], # scaled predictors
  test = testing_data_knn[, knn_predictor_variables],   # scaled predictors
  cl = training_data[[knn_target_variable]],            # training labels
  k = knn_k
)

6. k-Nearest Neighbor 평가

# Compare the k-NN species predictions with the true test-set species.
knn_actual_species <- testing_data[[knn_target_variable]]
knn_conf_matrix_details <- confusionMatrix(
  data = knn_classification,
  reference = knn_actual_species
)

# Cross-tabulation of predictions (rows) against the reference (columns).
knn_conf_matrix_details[["table"]]

# Overall share of correctly classified test observations.
knn_conf_matrix_details[["overall"]]["Accuracy"]

# One row per species; keep only the precision and recall columns.
knn_class_metrics <- c("Precision", "Recall")
knn_conf_matrix_details[["byClass"]][, knn_class_metrics]
           Reference
Prediction  Adelie Chinstrap Gentoo
  Adelie        43         0      0
  Chinstrap      0        20      0
  Gentoo         0         0     35
Accuracy: 1
A matrix: 3 × 2 of type dbl
Precision Recall
Class: Adelie 1 1
Class: Chinstrap 1 1
Class: Gentoo 1 1