# Restart ctrl, shift, F10

# Packages that will be used for regression:

library(tidyverse)
library(dplyr)
library(plyr)
library(readr)
library(ggplot2)
library(gridExtra)
library(stats)
library(gplots)
library(tidycomm)
library(caTools)
library(glmnet)
library(pROC)
library(AICcmodavg)
library(wesanderson)

# Setting the working directory:

# Switch the working directory; the relative file path below is resolved
# against it.
setwd("C:/Users/.../Datasets/Churn")

# Read the churn data from disk.
churn_df <- read.csv(file = "churn_data.csv")

# Structure of the raw import.
str(churn_df)

# Keep the raw import as churn_df and do all further work on mydata.
mydata <- churn_df

# Structure, summary statistics and first rows of the working copy.
str(mydata)
summary(mydata)
head(mydata)

# Duplicate check: flag rows that repeat an earlier row, then count the flags.
dupes <- duplicated(mydata)
sum(dupes)


# Listing columns to be removed as they are not meaningful:
# Columns that are dropped because they are not used in the analysis.
columns_to_remove <- c(
  "CaseOrder", "Customer_id", "Interaction", "UID", "City",
  "State", "County", "Zip", "Lat", "Lng", "Population",
  "Area", "TimeZone", "Job", "Marital", "PaymentMethod"
)

# Remove the listed columns with a logical mask. A mask is used instead of
# `-which(...)` because `-which(...)` becomes an empty index when none of the
# names are present, which would drop every column instead of none.
# drop = FALSE keeps the result a data frame even if one column remains.
mydata <- mydata[, !(names(mydata) %in% columns_to_remove), drop = FALSE]


# Changing Column Names of Ordinal Data:

# Lookup from the generic Item1..Item8 column names to descriptive names.
item_renames <- c(
  Item1 = "Response",
  Item2 = "Fixes",
  Item3 = "Replacements",
  Item4 = "Reliability",
  Item5 = "Options",
  Item6 = "Respectful",
  Item7 = "CourtExchange",
  Item8 = "ActiveListening"
)

# Only columns found in the lookup are renamed; all others keep their name.
is_item_column <- colnames(mydata) %in% names(item_renames)
colnames(mydata)[is_item_column] <-
  unname(item_renames[colnames(mydata)[is_item_column]])

str(mydata)

# Categorical data summaries based on churn:

# Helper: dodged bar chart of `x_var`, filled by `fill_var`, where every bar
# is labelled with its share of all rows in the plot.
#
#   data     - data frame containing the columns named by x_var and fill_var
#   x_var    - name (string) of the column mapped to the x axis
#   fill_var - name (string) of the column mapped to fill and to the label
#              grouping
#   title    - plot title
#   x_label  - x axis label (defaults to x_var)
#
# Returns the ggplot object.
#
# The computed count is accessed through after_stat(); the `..count..`
# notation is deprecated in current ggplot2 releases.
plot_count_share <- function(data, x_var, fill_var, title, x_label = x_var) {
  ggplot(data, aes(x = .data[[x_var]], fill = .data[[fill_var]])) +
    geom_bar(position = "dodge", color = "black", show.legend = TRUE) +
    geom_text(
      stat = "count",
      aes(
        label = after_stat(scales::percent(count / sum(count))),
        y = after_stat(count),
        group = .data[[fill_var]]
      ),
      # Same dodge width as the bars so each label sits above its own bar.
      position = position_dodge(width = 0.9),
      vjust = -0.5
    ) +
    # The fill label is set explicitly so the legend title is the column name.
    labs(title = title, x = x_label, y = "Count", fill = fill_var) +
    scale_fill_manual(values = wes_palette("Royal2", n = 3)) +
    theme_minimal()
}

# Distributions split by churn (Churn on the x axis):

# Internet Service
cd1 <- plot_count_share(
  mydata, "Churn", "InternetService",
  title = "Churn Distribution by Internet Service"
)

# Gender
cd2 <- plot_count_share(
  mydata, "Churn", "Gender",
  title = "Churn Distribution by Gender"
)

# Contract
cd3 <- plot_count_share(
  mydata, "Churn", "Contract",
  title = "Churn Distribution by Contract"
)

# Without Churn (the variable itself on the x axis):

# Internet Service
cd_noc1 <- plot_count_share(
  mydata, "InternetService", "InternetService",
  title = "Distribution by Internet Service"
)

# Gender
cd_noc2 <- plot_count_share(
  mydata, "Gender", "Gender",
  title = "Distribution by Gender"
)

# Contract
cd_noc3 <- plot_count_share(
  mydata, "Contract", "Contract",
  title = "Distribution by Contract"
)

# Compact variants of the distribution charts, without percentage labels.

# Layers that are identical for all three compact charts.
compact_bar_layers <- list(
  geom_bar(position = "dodge", color = "black", show.legend = TRUE),
  scale_fill_manual(values = wes_palette("Royal2", n = 3)),
  theme_minimal()
)

# Internet Service
cd1a <- ggplot(mydata, aes(x = InternetService, fill = InternetService)) +
  compact_bar_layers +
  labs(
    title = "Distribution by Internet Service",
    x = "InternetService",
    y = "Count"
  )

# Gender
cd2a <- ggplot(mydata, aes(x = Gender, fill = Gender)) +
  compact_bar_layers +
  labs(
    title = "Distribution by Gender",
    x = "Gender",
    y = "Count"
  )

# Contract
cd3a <- ggplot(mydata, aes(x = Contract, fill = Contract)) +
  compact_bar_layers +
  labs(
    title = "Distribution by Contract",
    x = "Contract",
    y = "Count"
  )


# One page per variable: overall distribution on top, split by churn below.
grid.arrange(grobs = list(cd_noc1, cd1))
grid.arrange(grobs = list(cd_noc2, cd2))
grid.arrange(grobs = list(cd_noc3, cd3))

# The three compact charts together on one page.
grid.arrange(grobs = list(cd1a, cd2a, cd3a))

# ----------------------------------------------------------------

# Creating Dummy Variables for Categorical Data

# Level that is coded as 1 for each categorical column; every other value of
# that column is coded as 0. The order here is the order in which the Dummy*
# columns are appended to mydata.
indicator_level <- c(
  Gender = "Male",
  Churn = "Yes",
  Techie = "Yes",
  Contract = "Two Year",
  Port_modem = "Yes",
  Tablet = "Yes",
  InternetService = "Fiber Optic",
  Phone = "Yes",
  Multiple = "Yes",
  OnlineSecurity = "Yes",
  OnlineBackup = "Yes",
  DeviceProtection = "Yes",
  TechSupport = "Yes",
  StreamingTV = "Yes",
  StreamingMovies = "Yes",
  PaperlessBilling = "Yes"
)

# Add one Dummy<column> indicator per categorical column.
for (category in names(indicator_level)) {
  mydata[[paste0("Dummy", category)]] <-
    ifelse(mydata[[category]] == indicator_level[[category]], 1, 0)
}

# Dropping all old categorical variables:

remove_original_categories <- names(indicator_level)

# A logical mask is used instead of `-which(...)`: when no name matches,
# `-which(...)` is an empty index and would drop every column.
mydata <- mydata[, !(names(mydata) %in% remove_original_categories),
                 drop = FALSE]

str(mydata)



# Creating histograms for continuous variables by choosing variables first:
selected_columns <- c(
  "Children", "Age", "Income", "Outage_sec_perweek", "Email",
  "Contacts", "Yearly_equip_failure", "Tenure", "MonthlyCharge",
  "Bandwidth_GB_Year"
)

# Layout for multiple histograms in one visualization (2 rows, 5 columns).
# par() returns the previous settings, which are kept so they can be restored.
old_par <- par(mfrow = c(2, 5))

# One histogram per selected column:
for (column_name in selected_columns) {
  hist(mydata[[column_name]],
       main = column_name, xlab = column_name, col = "lightblue")
}

# Restore the previous layout; otherwise the boxplots below would each be
# drawn into one small cell of the 2 x 5 grid.
par(old_par)


# Boxplots to check for outliers; `$out` prints the values that boxplot()
# reports as outliers.

boxplot(mydata$Tenure, main = "Boxplot for Tenure")$out
boxplot(mydata$Bandwidth_GB_Year, main = "Boxplot for Bandwidth_GB_Year")$out
boxplot(mydata$MonthlyCharge, main = "Boxplot for MonthlyCharge")$out



# Summary of the 0/1 indicator variables

# Helper: bar chart of the counts of one indicator column, with each bar
# labelled by its percentage of all rows.
#
#   data      - data frame containing the column named by dummy_col
#   dummy_col - name (string) of the indicator column to plot
#
# Returns the ggplot object.
dummy_share_bar <- function(data, dummy_col) {
  ggplot(data, aes(x = .data[[dummy_col]])) +
    geom_bar(position = "dodge", stat = "count", fill = "lightblue") +
    geom_text(
      aes(label = paste0(round(prop.table(after_stat(count)) * 100, 2), "%")),
      stat = "count"
    ) +
    # Axis label set explicitly so it shows the plain column name.
    labs(x = dummy_col)
}

# NOTE(review): Tenure_Summary actually plots DummyChurn, and
# OnlineSecruity_Summary is a misspelling of "Security". Both names are kept
# unchanged so any existing references keep working.
Tenure_Summary <- dummy_share_bar(mydata, "DummyChurn")
Gender_Summary <- dummy_share_bar(mydata, "DummyGender")
Techie_Summary <- dummy_share_bar(mydata, "DummyTechie")
Port_modem_Summary <- dummy_share_bar(mydata, "DummyPort_modem")
Tablet_Summary <- dummy_share_bar(mydata, "DummyTablet")
Contract_Summary <- dummy_share_bar(mydata, "DummyContract")

PaperlessBilling_Summary <- dummy_share_bar(mydata, "DummyPaperlessBilling")
InternetService_Summary <- dummy_share_bar(mydata, "DummyInternetService")
Phone_Summary <- dummy_share_bar(mydata, "DummyPhone")
Multiple_Summary <- dummy_share_bar(mydata, "DummyMultiple")
OnlineSecruity_Summary <- dummy_share_bar(mydata, "DummyOnlineSecurity")
OnlineBackup_Summary <- dummy_share_bar(mydata, "DummyOnlineBackup")

DeviceProtection_Summary <- dummy_share_bar(mydata, "DummyDeviceProtection")
TechSupport_Summary <- dummy_share_bar(mydata, "DummyTechSupport")
StreamingTV_Summary <- dummy_share_bar(mydata, "DummyStreamingTV")
StreamingMovies_Summary <- dummy_share_bar(mydata, "DummyStreamingMovies")

# Three pages of charts. Every chart appears exactly once;
# OnlineBackup_Summary is shown on the second page only.
grid.arrange(Tenure_Summary, Gender_Summary, Techie_Summary,
             Port_modem_Summary, Tablet_Summary, Contract_Summary)
grid.arrange(PaperlessBilling_Summary, InternetService_Summary, Phone_Summary,
             Multiple_Summary, OnlineSecruity_Summary, OnlineBackup_Summary)
grid.arrange(DeviceProtection_Summary, TechSupport_Summary,
             StreamingTV_Summary, StreamingMovies_Summary)


# Save the transformed data as a .csv file in the working directory, without
# a row-name column.

write.csv(x = mydata, file = "modified_dataset.csv", row.names = FALSE)



# ---------------------------------------------------------------------

# Bivariate Analysis:


# Helper: scatterplot of one predictor against the 0/1 churn indicator.
#
#   data        - data frame containing x_var and DummyChurn
#   x_var       - name (string) of the column plotted on the x axis
#   title_label - predictor name shown in the title (defaults to x_var)
#   x_label     - x axis label (defaults to title_label)
#
# The second title line reports the squared Pearson correlation between the
# predictor and DummyChurn, rounded to 3 decimals.
# Returns the ggplot object.
churn_scatter <- function(data, x_var, title_label = x_var,
                          x_label = title_label) {
  r_squared <- round(cor(data[[x_var]], data$DummyChurn)^2, 3)

  ggplot(data, aes(x = .data[[x_var]], y = DummyChurn)) +
    geom_point(color = "red") +
    labs(
      title = paste(
        paste0("Scatterplot of ", title_label, " vs. Churn\n"),
        "R-squared:", r_squared
      ),
      x = x_label,
      y = "Churn"
    ) +
    theme_minimal()
}

# Scatterplots with y = Churn for every predictor:

sp1 <- churn_scatter(mydata, "Children")
sp2 <- churn_scatter(mydata, "Age")
sp3 <- churn_scatter(mydata, "Income")

# x = Gender (indicator), y = Churn
sp4 <- churn_scatter(mydata, "DummyGender", title_label = "Gender")

sp5 <- churn_scatter(mydata, "Outage_sec_perweek")
sp6 <- churn_scatter(mydata, "Bandwidth_GB_Year")
sp7 <- churn_scatter(mydata, "Email")
sp8 <- churn_scatter(mydata, "Contacts")
sp9 <- churn_scatter(mydata, "Yearly_equip_failure")
sp10 <- churn_scatter(mydata, "MonthlyCharge")

# Renamed survey items (Item1..Item8), shown with longer display labels:

sp11 <- churn_scatter(mydata, "Response", title_label = "TimelyResponse")
sp12 <- churn_scatter(mydata, "Fixes", title_label = "Timely Fixes")
sp13 <- churn_scatter(mydata, "Replacements",
                      title_label = "Timely Replacements")
sp14 <- churn_scatter(mydata, "Reliability")
sp15 <- churn_scatter(mydata, "Options")
sp16 <- churn_scatter(mydata, "Respectful",
                      title_label = "RespectfulResponse",
                      x_label = "Respectful Response")
sp17 <- churn_scatter(mydata, "CourtExchange", title_label = "Court Exchange")
sp18 <- churn_scatter(mydata, "ActiveListening",
                      title_label = "Evidence Active Listening")


# Six scatterplots per page:
grid.arrange(sp1, sp2, sp3, sp4, sp5, sp6)
grid.arrange(sp7, sp8, sp9, sp10, sp11, sp12)
grid.arrange(sp13, sp14, sp15, sp16, sp17, sp18)

# ------------------------------------------------------------------------------


# Fit a logistic regression model with Churn (DummyChurn) as the dependent
# variable and every other column of mydata as a predictor.
# The McFadden values are computed only after each model has been fitted;
# calling pscl::pR2() on a model before it exists stops the script with an
# "object not found" error in a fresh session.
logistic_model_all <- glm(DummyChurn ~ ., data = mydata, family = binomial)


# Display the summary of the model
summary(logistic_model_all)

print(logistic_model_all)

# McFadden pseudo R-squared of the full model:

pscl::pR2(logistic_model_all)["McFadden"]


# Reducing the model with backward stepwise selection
reduced_model <- step(logistic_model_all, direction = "backward")

print(reduced_model)

summary(reduced_model)

# McFadden pseudo R-squared of the reduced model:

pscl::pR2(reduced_model)["McFadden"]


# Predictors shared by reduced models 2, 3 and 4.
core_predictors <- c(
  "Children", "Age", "Tenure", "MonthlyCharge", "Bandwidth_GB_Year",
  "DummyTechie", "DummyContract", "DummyInternetService", "DummyPort_modem",
  "DummyDeviceProtection", "DummyPhone", "DummyMultiple",
  "DummyOnlineSecurity", "DummyStreamingTV", "DummyStreamingMovies"
)

# Reduced model 2 - Gender included / Paperless Billing excluded.
# DummyGender is inserted after the fifth predictor.
selected_variables_rm_1 <- append(core_predictors, "DummyGender", after = 5)

# Reduced model 3 - Paperless Billing included / Gender excluded.
selected_variables_rm_2 <- c(core_predictors, "DummyPaperlessBilling")

# Reduced model 4 - Paperless Billing and Gender both excluded.
selected_variables_rm_3 <- core_predictors

# Each model is fitted on DummyChurn plus its own predictor columns only, so
# the `.` in the formula expands to exactly those predictors.

# Reduced model 2
reduced_model_2 <- glm(
  DummyChurn ~ .,
  data = mydata[, c("DummyChurn", selected_variables_rm_1)],
  family = binomial
)

summary(reduced_model_2)

pscl::pR2(reduced_model_2)["McFadden"]

# Reduced model 3
reduced_model_3 <- glm(
  DummyChurn ~ .,
  data = mydata[, c("DummyChurn", selected_variables_rm_2)],
  family = binomial
)

summary(reduced_model_3)

pscl::pR2(reduced_model_3)["McFadden"]

# Reduced model 4
reduced_model_4 <- glm(
  DummyChurn ~ .,
  data = mydata[, c("DummyChurn", selected_variables_rm_3)],
  family = binomial
)

summary(reduced_model_4)

pscl::pR2(reduced_model_4)["McFadden"]

# ------------------------------------------------------------------------------

# Labels for the candidate models, in the same order as the models below.
model_names <- c("all.mod", "reduced.mod", "reduced.mod2")

# Candidate models: the full model, the stepwise-reduced model and reduced
# model 2.
model_list <- list(logistic_model_all, reduced_model, reduced_model_2)

# Build the model comparison table with aictab().
aictab_result <- aictab(cand.set = model_list, modnames = model_names)


# Show the comparison table
print(aictab_result)

# ------------------------------------------------------------------------------

# Confusion Matrix

# Predicted churn probabilities from the first (stepwise) reduced model
predicted_probabilities <- predict(reduced_model, newdata = mydata,
                                   type = "response")

# Convert probabilities to class labels (0 or 1) with a 0.5 threshold,
# because DummyChurn is coded 0/1.

predicted_labels <- ifelse(predicted_probabilities > 0.5, 1, 0)

# Actual labels for Churn

actual_labels <- mydata$DummyChurn

# Confusion matrix: rows = actual class, columns = predicted class.
# Both classes are fixed as factor levels so the table is always 2 x 2, even
# if one class is never predicted; the [2, 2] indexing below relies on that.

conf_matrix <- table(
  actual_labels = factor(actual_labels, levels = c(0, 1)),
  predicted_labels = factor(predicted_labels, levels = c(0, 1))
)
print(conf_matrix)


# Metrics are computed directly from the table of counts.

cm_data <- as.matrix(conf_matrix)

# Calculate metrics (accuracy as a percentage, the others as proportions)

accuracy <- sum(diag(cm_data)) / sum(cm_data) * 100
# Share of predicted churners (column 2) that actually churned.
precision <- cm_data[2, 2] / sum(cm_data[, 2])
# Share of actual churners (row 2) that were predicted to churn.
recall <- cm_data[2, 2] / sum(cm_data[2, ])
# Share of actual non-churners (row 1) that were predicted not to churn.
specificity <- cm_data[1, 1] / sum(cm_data[1, ])

# Print the metrics

cat("Accuracy:", accuracy, "% \n")
cat("Precision: ", precision, "\n")
cat("Recall: ", recall, "\n")
cat("Specificity: ", specificity, "\n")

str(mydata)


#-----------------------------------------------------


# Confusion matrix for the second reduced model (kept for reference only;
# the second reduced model was worse).

# Actual 0/1 churn labels.
actual_labels <- mydata$DummyChurn

# Predicted churn probabilities from reduced_model_2.
predicted_probabilities <- predict(
  object = reduced_model_2,
  newdata = mydata,
  type = "response"
)

# Probabilities above 0.5 become class 1, all others class 0.
predicted_labels_2 <- (predicted_probabilities > 0.5) * 1

# Cross-tabulate actual (rows) against predicted (columns) labels.
conf_matrix <- table(actual_labels, predicted_labels_2)
print(conf_matrix)



# -----------------------------------------------------------------------------


# Below are notes for the video to have easy access to the models:
# 5487.3

# Full model: Churn (DummyChurn) regressed on every other column of mydata.
logistic_model_all <- glm(
  DummyChurn ~ .,
  data = mydata,
  family = binomial
)

# Summary of the full model
summary(logistic_model_all)

# McFadden pseudo R-squared of the full model
pscl::pR2(logistic_model_all)["McFadden"]


# Backward stepwise reduction of the full model
reduced_model <- step(logistic_model_all, direction = "backward")

print(reduced_model)

summary(reduced_model)

pscl::pR2(reduced_model)["McFadden"]


# Predictors shared by reduced models 2, 3 and 4.
core_predictors <- c(
  "Children", "Age", "Tenure", "MonthlyCharge", "Bandwidth_GB_Year",
  "DummyTechie", "DummyContract", "DummyInternetService", "DummyPort_modem",
  "DummyDeviceProtection", "DummyPhone", "DummyMultiple",
  "DummyOnlineSecurity", "DummyStreamingTV", "DummyStreamingMovies"
)

# Reduced model 2 - Gender included / Paperless Billing excluded.
# DummyGender is inserted after the fifth predictor.
selected_variables_rm_1 <- append(core_predictors, "DummyGender", after = 5)

# Reduced model 3 - Paperless Billing included / Gender excluded.
selected_variables_rm_2 <- c(core_predictors, "DummyPaperlessBilling")

# Reduced model 4 - Paperless Billing and Gender both excluded.
selected_variables_rm_3 <- core_predictors

# Reduced model 2
reduced_model_2 <- glm(
  DummyChurn ~ .,
  data = mydata[, c("DummyChurn", selected_variables_rm_1)],
  family = binomial
)

summary(reduced_model_2)

pscl::pR2(reduced_model_2)["McFadden"]

# Reduced model 3
reduced_model_3 <- glm(
  DummyChurn ~ .,
  data = mydata[, c("DummyChurn", selected_variables_rm_2)],
  family = binomial
)

summary(reduced_model_3)

pscl::pR2(reduced_model_3)["McFadden"]

# Reduced model 4
reduced_model_4 <- glm(
  DummyChurn ~ .,
  data = mydata[, c("DummyChurn", selected_variables_rm_3)],
  family = binomial
)

summary(reduced_model_4)

pscl::pR2(reduced_model_4)["McFadden"]


# Initial: 5487.3
# First Reduced: 5462.4

# Second (w Gender / wo PB): 5462.9
# Third (w PB / wo Gender): 5462.5
# Fourth (wo Both): 5463

# The End