# Get predicted probabilities on the test set
probs_test <- predict(cv_model, newdata = churn_test, type = "prob")[, "yes"] #predicting the probability of yes
#find the best cutoff
roc_obj <- roc(churn_test$churn, probs_test, levels = c("no", "yes")) #making sure that that the test has the right factor order
## Setting direction: controls < cases
plot(roc_obj,
main = paste("ROC Curve - Churn Model (AUC =", round(auc(roc_obj), 3), ")"),
col = "blue", lwd = 2)
auc(roc_curve)
## Area under the curve: 0.8452
#Calculate the best cutoff value
best_coords <- coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"), transpose = FALSE)
optimal_cutoff <- best_coords$threshold
print(optimal_cutoff)
## [1] 0.2209876
#Run prediction with the optimal cutt off and confusion matrix.
predicted_classes <- ifelse(probs_test >= optimal_cutoff, "yes", "no")
predicted_classes <- factor(predicted_classes, levels = c("no", "yes"))
confusionMatrix(predicted_classes, churn_test$churn, positive = "yes") #run the confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 731 45
## yes 71 92
##
## Accuracy : 0.8765
## 95% CI : (0.8537, 0.8968)
## No Information Rate : 0.8541
## P-Value [Acc > NIR] : 0.02706
##
## Kappa : 0.5405
##
## Mcnemar's Test P-Value : 0.02028
##
## Sensitivity : 0.67153
## Specificity : 0.91147
## Pos Pred Value : 0.56442
## Neg Pred Value : 0.94201
## Prevalence : 0.14590
## Detection Rate : 0.09798
## Detection Prevalence : 0.17359
## Balanced Accuracy : 0.79150
##
## 'Positive' Class : yes
##
#build a random forest model to test against the current turned logistic regression
#we can use the same training and testing sets that we used in for training the logistic model.
#rf formula is just the training formula for the random forest model. although, we are leaving out the event modifier b/c it's not necessary in an rf model
rf_formula <- churn ~ international_plan + number_customer_service_calls +
total_day_charge + voice_mail_plan + total_intl_calls
#we can also use the previously run control, but I am putting it in so, we have a reference for what is going on.
cv_control <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 3,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
#train the rf model
set.seed(123)
rf_model <- train(
rf_formula,
data = churn_train,
method = "rf",
metric = "ROC",
trControl = cv_control,
importance = TRUE
)
## Be prepared, this is going to take a while.
print(rf_model)
## Random Forest
##
## 2194 samples
## 5 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1974, 1974, 1975, 1975, 1975, 1975, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8718641 0.9761757 0.5114583
## 3 0.8600475 0.9626512 0.5281250
## 5 0.8535454 0.9548261 0.5447917
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#predict the churn, like we did for the logistic model
rf_probs <- predict(rf_model, newdata = churn_test, type = "prob")[, "yes"] #predict probability of Yes/ churn
#Optimize the cutoff
rf_roc <- roc(churn_test$churn, rf_probs, levels = c("no", "yes"))
## Setting direction: controls < cases
rf_cutoff <- coords(rf_roc, "best", ret = "threshold")$threshold
print(rf_cutoff)
## [1] 0.129
#Classify the cutoff
rf_predicted <- ifelse(rf_probs >= rf_cutoff, "yes", "no")
rf_predicted <- factor(rf_predicted, levels = c("no", "yes")) #classify the cutoff, so the confusion matrix looks correct.
#Run the confusion matrix.
confusionMatrix(rf_predicted, churn_test$churn, positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 722 38
## yes 80 99
##
## Accuracy : 0.8743
## 95% CI : (0.8514, 0.8949)
## No Information Rate : 0.8541
## P-Value [Acc > NIR] : 0.0415527
##
## Kappa : 0.5526
##
## Mcnemar's Test P-Value : 0.0001604
##
## Sensitivity : 0.7226
## Specificity : 0.9002
## Pos Pred Value : 0.5531
## Neg Pred Value : 0.9500
## Prevalence : 0.1459
## Detection Rate : 0.1054
## Detection Prevalence : 0.1906
## Balanced Accuracy : 0.8114
##
## 'Positive' Class : yes
##
#compare the results
library(lattice)
results <- resamples(list(LogReg = cv_model, RF = rf_model))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: LogReg, RF
## Number of resamples: 30
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LogReg 0.7540107 0.8109124 0.8510856 0.8396562 0.8744947 0.9107620 0
## RF 0.7773229 0.8433653 0.8759142 0.8718641 0.8845163 0.9566344 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LogReg 0.9468085 0.9625668 0.9733331 0.9720693 0.9839572 0.9946524 0
## RF 0.9414894 0.9680851 0.9786096 0.9761757 0.9840212 0.9946524 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LogReg 0.125 0.2265625 0.281250 0.2791667 0.3125 0.46875 0
## RF 0.375 0.4687500 0.515625 0.5114583 0.5625 0.59375 0
bwplot(results, layout = c(1, 3), metric = "ROC")
#creates a summary of both cross validated and optimized models and gives winner showing that overall, the RF model is better. when looking at ROC/Area Under the Curve, sensitivity (true positive) and specificity (true negative). the random forest model edges the logistic model out in every aspect.
#The box plot charts models ROC across the sampling and you can see that the rf model has is performing better.
#Load up the Customers to Predict set
load("C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Group Project/New folder/Customers_To_Predict.RData")
#Ensuring that the factors match
#making sure the factoring is correct.
Customers_To_Predict$international_plan <- as.factor(Customers_To_Predict$international_plan)
Customers_To_Predict$voice_mail_plan <- as.factor(Customers_To_Predict$voice_mail_plan)
Customers_To_Predict$state <- as.factor(Customers_To_Predict$state)
Customers_To_Predict$area_code <- as.factor(Customers_To_Predict$area_code)
glimpse(Customers_To_Predict) #validate that the character variables were changed to factors
## Rows: 1,600
## Columns: 19
## $ state <fct> UT, SD, KY, MS, AK, TX, WI, UT, MT, NE, …
## $ account_length <dbl> 93, 39, 124, 162, 112, 109, 13, 66, 138,…
## $ area_code <fct> area_code_415, area_code_408, area_code_…
## $ international_plan <fct> no, no, no, yes, no, yes, no, no, no, no…
## $ voice_mail_plan <fct> no, no, no, no, yes, no, no, no, no, no,…
## $ number_vmail_messages <dbl> 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 35,…
## $ total_day_minutes <dbl> 174.1, 179.0, 156.9, 172.1, 142.9, 159.6…
## $ total_day_calls <dbl> 127, 88, 74, 138, 92, 136, 117, 98, 106,…
## $ total_day_charge <dbl> 29.60, 30.43, 26.67, 29.26, 24.29, 27.13…
## $ total_eve_minutes <dbl> 176.8, 148.2, 195.8, 165.9, 233.8, 151.0…
## $ total_eve_calls <dbl> 73, 124, 82, 93, 107, 126, 102, 93, 110,…
## $ total_eve_charge <dbl> 15.03, 12.60, 16.64, 14.10, 19.87, 12.84…
## $ total_night_minutes <dbl> 240.0, 146.8, 181.0, 279.0, 329.2, 142.1…
## $ total_night_calls <dbl> 111, 116, 99, 81, 142, 53, 108, 82, 102,…
## $ total_night_charge <dbl> 10.80, 6.61, 8.15, 12.56, 14.81, 6.39, 9…
## $ total_intl_minutes <dbl> 10.7, 8.8, 8.8, 9.2, 10.4, 7.3, 9.0, 11.…
## $ total_intl_calls <dbl> 3, 4, 2, 6, 6, 4, 3, 1, 4, 2, 3, 3, 6, 6…
## $ total_intl_charge <dbl> 2.89, 2.38, 2.38, 2.48, 2.81, 1.97, 2.43…
## $ number_customer_service_calls <dbl> 0, 2, 1, 2, 0, 4, 1, 0, 0, 0, 1, 3, 0, 1…
# Predict the probability of yes/ churn
rf_probs <- predict(rf_model, Customers_To_Predict, type = "prob")[, "yes"] #again predict the probability of yes.
optimal_cutoff <- 0.1 # Setting cutoff from what we optimized during training
rf_predicted_class <- ifelse(rf_probs >= optimal_cutoff, "yes", "no")
Customers_To_Predict$churn_prob <- rf_probs #drop predicted probability into the new churn_prob label
head(Customers_To_Predict)
## # A tibble: 6 × 20
## state account_length area_code international_plan voice_mail_plan
## <fct> <dbl> <fct> <fct> <fct>
## 1 UT 93 area_code_415 no no
## 2 SD 39 area_code_408 no no
## 3 KY 124 area_code_408 no no
## 4 MS 162 area_code_415 yes no
## 5 AK 112 area_code_415 no yes
## 6 TX 109 area_code_510 yes no
## # ℹ 15 more variables: number_vmail_messages <dbl>, total_day_minutes <dbl>,
## # total_day_calls <dbl>, total_day_charge <dbl>, total_eve_minutes <dbl>,
## # total_eve_calls <dbl>, total_eve_charge <dbl>, total_night_minutes <dbl>,
## # total_night_calls <dbl>, total_night_charge <dbl>,
## # total_intl_minutes <dbl>, total_intl_calls <dbl>, total_intl_charge <dbl>,
## # number_customer_service_calls <dbl>, churn_prob <dbl>
View(Customers_To_Predict)
save(rf_model, optimal_cutoff, Customers_To_Predict,
file = "C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Group Project/Group2.RData")
write.csv(Customers_To_Predict,
file = "C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Customers_To_Predict_Output.csv",
row.names = FALSE)
#sample review
median(Customers_To_Predict$churn_prob)
## [1] 0.008
sum(Customers_To_Predict$churn_prob > 0.1) #customers
## [1] 345