ABC Wireless Churn Prediction Project - Final R Script

Group 2 - Summer 2025

Authors: Elizabeth Crawley, Matt Miller, Holly Victor

# load libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gtExtras)
## Loading required package: gt
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(corrplot)
## corrplot 0.95 loaded

##loading data

churn <- read.csv("C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Group Project/Churn_Train.csv", stringsAsFactors = FALSE)

##Clean the data and convert relevent variables to factors

churn$churn <- factor(churn$churn, levels = c("no", "yes")) # left this way on purpose
churn$international_plan <- as.factor(churn$international_plan)
churn$voice_mail_plan <- as.factor(churn$voice_mail_plan)
churn$state <- as.factor(churn$state)
churn$area_code <- as.factor(churn$area_code)
churn_clean <- churn[rowSums(is.na(churn)) <= 4, ] # get rid of any rows missing more than 4 values
table(churn_clean$churn) #show a new count
## 
##   no  yes 
## 2676  457
print(colSums(is.na(churn_clean))) #double check that it got us where we needed to be
##                         state                account_length 
##                             0                           301 
##                     area_code            international_plan 
##                             0                             0 
##               voice_mail_plan         number_vmail_messages 
##                             0                             0 
##             total_day_minutes               total_day_calls 
##                             0                             0 
##              total_day_charge             total_eve_minutes 
##                             0                           101 
##               total_eve_calls              total_eve_charge 
##                             0                             0 
##           total_night_minutes             total_night_calls 
##                             0                             0 
##            total_night_charge            total_intl_minutes 
##                             0                             0 
##              total_intl_calls             total_intl_charge 
##                           101                             0 
## number_customer_service_calls                         churn 
##                             0                             0

Impute the rest of the values.

numeric_cols <- sapply(churn_clean, is.numeric)
churn_clean[numeric_cols] <- lapply(churn_clean[numeric_cols], function(x) {
  ifelse(is.na(x), mean(x, na.rm = TRUE), x)
})

print(colSums(is.na(churn_clean)))
##                         state                account_length 
##                             0                             0 
##                     area_code            international_plan 
##                             0                             0 
##               voice_mail_plan         number_vmail_messages 
##                             0                             0 
##             total_day_minutes               total_day_calls 
##                             0                             0 
##              total_day_charge             total_eve_minutes 
##                             0                             0 
##               total_eve_calls              total_eve_charge 
##                             0                             0 
##           total_night_minutes             total_night_calls 
##                             0                             0 
##            total_night_charge            total_intl_minutes 
##                             0                             0 
##              total_intl_calls             total_intl_charge 
##                             0                             0 
## number_customer_service_calls                         churn 
##                             0                             0
#take a look at the data to insure that the formatting is correct. 
glimpse(churn_clean)
## Rows: 3,133
## Columns: 20
## $ state                         <fct> NV, HI, DC, HI, OH, MO, NC, PA, IA, IN, …
## $ account_length                <dbl> 125.00000, 108.00000, 82.00000, 97.32133…
## $ area_code                     <fct> area_code_510, area_code_415, area_code_…
## $ international_plan            <fct> no, no, no, no, no, no, no, no, no, no, …
## $ voice_mail_plan               <fct> no, no, no, yes, no, no, no, no, no, no,…
## $ number_vmail_messages         <int> 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 32, 0,…
## $ total_day_minutes             <dbl> 2013.4, 291.6, 300.3, 110.3, 337.4, 178.…
## $ total_day_calls               <int> 99, 99, 109, 71, 120, 81, 81, 87, 115, 1…
## $ total_day_charge              <dbl> 28.66, 49.57, 51.05, 18.75, 57.36, 30.38…
## $ total_eve_minutes             <dbl> 1107.6000, 221.1000, 181.0000, 182.4000,…
## $ total_eve_calls               <int> 107, 93, 100, 108, 116, 74, 114, 92, 112…
## $ total_eve_charge              <dbl> 14.93, 18.79, 15.39, 15.50, 19.33, 19.86…
## $ total_night_minutes           <dbl> 243.3, 229.2, 270.1, 183.8, 153.9, 131.9…
## $ total_night_calls             <int> 92, 110, 73, 88, 114, 120, 82, 112, 95, …
## $ total_night_charge            <dbl> 10.95, 10.31, 12.15, 8.27, 6.93, 5.94, 9…
## $ total_intl_minutes            <dbl> 10.9, 14.0, 11.7, 11.0, 15.8, 9.1, 10.3,…
## $ total_intl_calls              <dbl> 7, 9, 4, 8, 7, 4, 6, 3, 7, 6, 7, 4, 5, 4…
## $ total_intl_charge             <dbl> 2.94, 3.78, 3.16, 2.97, 4.27, 2.46, 2.78…
## $ number_customer_service_calls <int> 0, 2, 0, 2, 0, 1, 1, 3, 2, 4, 1, 3, 3, 3…
## $ churn                         <fct> no, yes, yes, no, yes, no, no, no, no, y…
#create model with modifiers, to improve model performance.
trimmed_model <- glm(
  churn ~ international_plan + number_customer_service_calls +
           total_day_charge + voice_mail_plan + total_intl_calls + number_customer_service_calls:total_day_charge, 
  data = churn_clean, family = binomial
)
# review the summary to check the improvement of the of the model
summary(trimmed_model)
## 
## Call:
## glm(formula = churn ~ international_plan + number_customer_service_calls + 
##     total_day_charge + voice_mail_plan + total_intl_calls + number_customer_service_calls:total_day_charge, 
##     family = binomial, data = churn_clean)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                    -9.213368   0.488242 -18.870
## international_planyes                           2.050020   0.155085  13.219
## number_customer_service_calls                   2.454148   0.167528  14.649
## total_day_charge                                0.199051   0.012455  15.982
## voice_mail_planyes                             -0.915060   0.152040  -6.019
## total_intl_calls                               -0.074589   0.026427  -2.822
## number_customer_service_calls:total_day_charge -0.059376   0.004765 -12.461
##                                                Pr(>|z|)    
## (Intercept)                                     < 2e-16 ***
## international_planyes                           < 2e-16 ***
## number_customer_service_calls                   < 2e-16 ***
## total_day_charge                                < 2e-16 ***
## voice_mail_planyes                             1.76e-09 ***
## total_intl_calls                                0.00477 ** 
## number_customer_service_calls:total_day_charge  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2603.3  on 3132  degrees of freedom
## Residual deviance: 1923.4  on 3126  degrees of freedom
## AIC: 1937.4
## 
## Number of Fisher Scoring iterations: 6
#Model was built adding one variable at at time, checking how each variable interacted and the highest impact modifiers were added to the model.
#Run ROC Curve and UAC for the full model.

Trimmed_prob <- predict(trimmed_model, newdata = churn_clean, type = "response")

roc_curve <- roc(churn_clean$churn, Trimmed_prob, levels = c("no", "yes"), direction = "<")
plot(roc_curve, col = "blue", main = "ROC Curve for trimmed_model")

auc(roc_curve)
## Area under the curve: 0.8452

We need to validate that the model works as well in practice via cross validation via K-fold

#lets validate the model by using K Fold.

set.seed(123)
train_index <- createDataPartition(churn_clean$churn, p = 0.7, list = FALSE)
churn_train <- churn_clean[train_index, ]
churn_test <- churn_clean[-train_index, ]

#splitting the data into the standard 70/30 by indexing first and then splitting the data.

#set up a traning formula for the KFold to run on the training set

training_formula <-training_formula <- churn ~ international_plan + number_customer_service_calls + 
  total_day_charge + voice_mail_plan + total_intl_calls +number_customer_service_calls:total_day_charge

cv_control <- trainControl(
  method = "repeatedcv",          # repeated cross-validation
  number = 10,                    # 10 folds
  repeats = 3,                    # repeat 3 times
  classProbs = TRUE,              # return probabilities
  summaryFunction = twoClassSummary  # for AUC, Sens, Spec
)

#run k fold on the training set

cv_model <- train(
  training_formula,
  data = churn_train,
  method = "glm",
  family = "binomial",
  metric = "ROC",
  trControl = cv_control  # repeatedcv, classProbs = TRUE, etc.
)

print(cv_model)
## Generalized Linear Model 
## 
## 2194 samples
##    5 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1975, 1975, 1974, 1974, 1975, 1975, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8396562  0.9720693  0.2791667

Optimize the cutoff, so we can run it on the test partition.

# Get predicted probabilities on the test set
probs_test <- predict(cv_model, newdata = churn_test, type = "prob")[, "yes"] #predicting the probability of yes

#find the best cutoff
roc_obj <- roc(churn_test$churn, probs_test, levels = c("no", "yes")) #making sure that that the test has the right factor order
## Setting direction: controls < cases
plot(roc_obj,
     main = paste("ROC Curve - Churn Model (AUC =", round(auc(roc_obj), 3), ")"),
     col = "blue", lwd = 2)

    auc(roc_curve)
## Area under the curve: 0.8452

#Calculate the best cutoff value

best_coords <- coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"), transpose = FALSE)
optimal_cutoff <- best_coords$threshold
print(optimal_cutoff)
## [1] 0.2209876

#Run prediction with the optimal cutt off and confusion matrix.

predicted_classes <- ifelse(probs_test >= optimal_cutoff, "yes", "no")
predicted_classes <- factor(predicted_classes, levels = c("no", "yes"))


confusionMatrix(predicted_classes, churn_test$churn, positive = "yes") #run the confusion matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  731  45
##        yes  71  92
##                                           
##                Accuracy : 0.8765          
##                  95% CI : (0.8537, 0.8968)
##     No Information Rate : 0.8541          
##     P-Value [Acc > NIR] : 0.02706         
##                                           
##                   Kappa : 0.5405          
##                                           
##  Mcnemar's Test P-Value : 0.02028         
##                                           
##             Sensitivity : 0.67153         
##             Specificity : 0.91147         
##          Pos Pred Value : 0.56442         
##          Neg Pred Value : 0.94201         
##              Prevalence : 0.14590         
##          Detection Rate : 0.09798         
##    Detection Prevalence : 0.17359         
##       Balanced Accuracy : 0.79150         
##                                           
##        'Positive' Class : yes             
## 

#build a random forest model to test against the current turned logistic regression

#we can use the same training and testing sets that we used in for training the logistic model.

#rf formula is just the training formula for the random forest model.  although, we are leaving out the event modifier b/c it's not necessary in an rf model

rf_formula <- churn ~ international_plan + number_customer_service_calls + 
  total_day_charge + voice_mail_plan + total_intl_calls

#we can also use the previously run control, but I am putting it in so, we have a reference for what is going on.
cv_control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

#train the rf model

set.seed(123)
rf_model <- train(
  rf_formula,
  data = churn_train,
  method = "rf",
  metric = "ROC",
  trControl = cv_control,
  importance = TRUE
  
)
## Be prepared, this is going to take a while.
print(rf_model)
## Random Forest 
## 
## 2194 samples
##    5 predictor
##    2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1974, 1974, 1975, 1975, 1975, 1975, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   2     0.8718641  0.9761757  0.5114583
##   3     0.8600475  0.9626512  0.5281250
##   5     0.8535454  0.9548261  0.5447917
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

#predict the churn, like we did for the logistic model

rf_probs <- predict(rf_model, newdata = churn_test, type = "prob")[, "yes"] #predict probability of Yes/ churn

#Optimize the cutoff

rf_roc <- roc(churn_test$churn, rf_probs, levels = c("no", "yes"))
## Setting direction: controls < cases
rf_cutoff <- coords(rf_roc, "best", ret = "threshold")$threshold
print(rf_cutoff)
## [1] 0.129

#Classify the cutoff

rf_predicted <- ifelse(rf_probs >= rf_cutoff, "yes", "no")
rf_predicted <- factor(rf_predicted, levels = c("no", "yes"))  #classify the cutoff, so the confusion matrix looks correct.

#Run the confusion matrix.
confusionMatrix(rf_predicted, churn_test$churn, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  722  38
##        yes  80  99
##                                           
##                Accuracy : 0.8743          
##                  95% CI : (0.8514, 0.8949)
##     No Information Rate : 0.8541          
##     P-Value [Acc > NIR] : 0.0415527       
##                                           
##                   Kappa : 0.5526          
##                                           
##  Mcnemar's Test P-Value : 0.0001604       
##                                           
##             Sensitivity : 0.7226          
##             Specificity : 0.9002          
##          Pos Pred Value : 0.5531          
##          Neg Pred Value : 0.9500          
##              Prevalence : 0.1459          
##          Detection Rate : 0.1054          
##    Detection Prevalence : 0.1906          
##       Balanced Accuracy : 0.8114          
##                                           
##        'Positive' Class : yes             
## 

#compare the results

library(lattice)
results <- resamples(list(LogReg = cv_model, RF = rf_model))  
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LogReg, RF 
## Number of resamples: 30 
## 
## ROC 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LogReg 0.7540107 0.8109124 0.8510856 0.8396562 0.8744947 0.9107620    0
## RF     0.7773229 0.8433653 0.8759142 0.8718641 0.8845163 0.9566344    0
## 
## Sens 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LogReg 0.9468085 0.9625668 0.9733331 0.9720693 0.9839572 0.9946524    0
## RF     0.9414894 0.9680851 0.9786096 0.9761757 0.9840212 0.9946524    0
## 
## Spec 
##         Min.   1st Qu.   Median      Mean 3rd Qu.    Max. NA's
## LogReg 0.125 0.2265625 0.281250 0.2791667  0.3125 0.46875    0
## RF     0.375 0.4687500 0.515625 0.5114583  0.5625 0.59375    0
bwplot(results, layout = c(1, 3), metric = "ROC")

#creates a summary of both cross validated and optimized models and gives winner showing that overall, the RF model is better. when looking at ROC/Area Under the Curve, sensitivity (true positive) and specificity (true negative). the random forest model edges the logistic model out in every aspect. 

#The box plot charts models ROC across the sampling and you can see that the rf model has is performing better.

#Load up the Customers to Predict set

load("C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Group Project/New folder/Customers_To_Predict.RData")

#Ensuring that the factors match

#making sure the factoring is correct.

Customers_To_Predict$international_plan <- as.factor(Customers_To_Predict$international_plan)
Customers_To_Predict$voice_mail_plan <- as.factor(Customers_To_Predict$voice_mail_plan)
Customers_To_Predict$state <- as.factor(Customers_To_Predict$state)
Customers_To_Predict$area_code <- as.factor(Customers_To_Predict$area_code)

glimpse(Customers_To_Predict) #validate that the character variables were changed to factors
## Rows: 1,600
## Columns: 19
## $ state                         <fct> UT, SD, KY, MS, AK, TX, WI, UT, MT, NE, …
## $ account_length                <dbl> 93, 39, 124, 162, 112, 109, 13, 66, 138,…
## $ area_code                     <fct> area_code_415, area_code_408, area_code_…
## $ international_plan            <fct> no, no, no, yes, no, yes, no, no, no, no…
## $ voice_mail_plan               <fct> no, no, no, no, yes, no, no, no, no, no,…
## $ number_vmail_messages         <dbl> 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 35,…
## $ total_day_minutes             <dbl> 174.1, 179.0, 156.9, 172.1, 142.9, 159.6…
## $ total_day_calls               <dbl> 127, 88, 74, 138, 92, 136, 117, 98, 106,…
## $ total_day_charge              <dbl> 29.60, 30.43, 26.67, 29.26, 24.29, 27.13…
## $ total_eve_minutes             <dbl> 176.8, 148.2, 195.8, 165.9, 233.8, 151.0…
## $ total_eve_calls               <dbl> 73, 124, 82, 93, 107, 126, 102, 93, 110,…
## $ total_eve_charge              <dbl> 15.03, 12.60, 16.64, 14.10, 19.87, 12.84…
## $ total_night_minutes           <dbl> 240.0, 146.8, 181.0, 279.0, 329.2, 142.1…
## $ total_night_calls             <dbl> 111, 116, 99, 81, 142, 53, 108, 82, 102,…
## $ total_night_charge            <dbl> 10.80, 6.61, 8.15, 12.56, 14.81, 6.39, 9…
## $ total_intl_minutes            <dbl> 10.7, 8.8, 8.8, 9.2, 10.4, 7.3, 9.0, 11.…
## $ total_intl_calls              <dbl> 3, 4, 2, 6, 6, 4, 3, 1, 4, 2, 3, 3, 6, 6…
## $ total_intl_charge             <dbl> 2.89, 2.38, 2.38, 2.48, 2.81, 1.97, 2.43…
## $ number_customer_service_calls <dbl> 0, 2, 1, 2, 0, 4, 1, 0, 0, 0, 1, 3, 0, 1…
# Predict the probability of yes/ churn
rf_probs <- predict(rf_model, Customers_To_Predict, type = "prob")[, "yes"] #again predict the probability of yes.

optimal_cutoff <- 0.1  # Setting cutoff from what we optimized during training

rf_predicted_class <- ifelse(rf_probs >= optimal_cutoff, "yes", "no")
Customers_To_Predict$churn_prob <- rf_probs  #drop predicted probability into the new churn_prob label

head(Customers_To_Predict)
## # A tibble: 6 × 20
##   state account_length area_code     international_plan voice_mail_plan
##   <fct>          <dbl> <fct>         <fct>              <fct>          
## 1 UT                93 area_code_415 no                 no             
## 2 SD                39 area_code_408 no                 no             
## 3 KY               124 area_code_408 no                 no             
## 4 MS               162 area_code_415 yes                no             
## 5 AK               112 area_code_415 no                 yes            
## 6 TX               109 area_code_510 yes                no             
## # ℹ 15 more variables: number_vmail_messages <dbl>, total_day_minutes <dbl>,
## #   total_day_calls <dbl>, total_day_charge <dbl>, total_eve_minutes <dbl>,
## #   total_eve_calls <dbl>, total_eve_charge <dbl>, total_night_minutes <dbl>,
## #   total_night_calls <dbl>, total_night_charge <dbl>,
## #   total_intl_minutes <dbl>, total_intl_calls <dbl>, total_intl_charge <dbl>,
## #   number_customer_service_calls <dbl>, churn_prob <dbl>
View(Customers_To_Predict)
save(rf_model, optimal_cutoff, Customers_To_Predict,
     file = "C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Group Project/Group2.RData")


write.csv(Customers_To_Predict, 
          file = "C:/Users/13309/OneDrive/Desktop/KSU Summer 2025/Business Analytics/Customers_To_Predict_Output.csv", 
          row.names = FALSE)
#sample review
median(Customers_To_Predict$churn_prob)
## [1] 0.008
sum(Customers_To_Predict$churn_prob > 0.1) #customers
## [1] 345