# Telco Customer Churn Analysis Script
#
# Reads the Telco churn CSV, saves exploratory plots, splits the rows into a
# training set and two validation sets, fits logistic regression, decision
# tree and naive Bayes classifiers, and reports their results on validation1.

# Load required libraries
library(ggplot2) # ggplot(), ggsave()
library(dplyr)   # no dplyr call is visible in this script; kept so that any
                 # code outside this view that relies on it keeps working
library(rpart)   # rpart() decision tree
library(e1071)   # naiveBayes()
library(caret)   # confusionMatrix()
library(pROC)    # roc(), auc()

# Load dataset
# Read the raw CSV from the working directory; character columns are read as
# factors.
telco <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = TRUE)

# Force TotalCharges to numeric. Going through as.character() matters when
# the column arrived as a factor: as.numeric() applied directly to a factor
# returns the integer level codes, not the printed values. Entries that do
# not parse as numbers become NA.
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))

# Drop the rows whose TotalCharges could not be converted.
telco <- telco[!is.na(telco$TotalCharges), ]

# Fix the level order explicitly so that "No" is the first (reference) level
# and "Yes" the second; the modelling code below relies on this order.
telco$Churn <- factor(telco$Churn, levels = c("No", "Yes"))

# Exploratory Data Visualizations

# Histogram for numeric variables
# Writes one PNG per column to the working directory: hist_<column>.png.
numeric_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var in numeric_vars) {
  # .data[[var]] maps a column selected by name at run time; it replaces the
  # deprecated aes_string().
  # bins = 30 lets the bin width follow each column's own range. The previous
  # fixed binwidth of 10 was shared by all three columns, which are presumably
  # on very different scales (tenure vs. cumulative charges) - TODO confirm
  # the chosen bin count suits each plot.
  p <- ggplot(telco, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
    labs(title = paste("Histogram of", var), x = var, y = "Frequency") +
    theme_minimal()
  ggsave(paste0("hist_", var, ".png"), plot = p)
}

# Bar plot for churn
# Number of rows in each Churn class, saved as bar_churn.png in the working
# directory.
p_churn <- ggplot(telco, aes(x = Churn, fill = Churn)) +
  geom_bar() +
  labs(title = "Churn Distribution", x = "Churn", y = "Count") +
  theme_minimal()
ggsave("bar_churn.png", plot = p_churn)

# Boxplot of MonthlyCharges by Churn
# One box per Churn class, saved as boxplot_monthlycharges_churn.png in the
# working directory.
p_box <- ggplot(telco, aes(x = Churn, y = MonthlyCharges, fill = Churn)) +
  geom_boxplot() +
  labs(
    title = "Monthly Charges by Churn",
    x = "Churn",
    y = "Monthly Charges"
  ) +
  theme_minimal()
ggsave("boxplot_monthlycharges_churn.png", plot = p_box)

# Split data into training, validation1, and validation2
# Random 70 % / 15 % / remainder split of the row indices; the seed is fixed
# so the split is reproducible.
set.seed(100)
n <- nrow(telco)

# seq_len(n) is empty when n == 0, whereas 1:n would yield c(1, 0).
all.index <- seq_len(n)
train.index <- sample(all.index, size = round(0.70 * n))
remaining.index <- setdiff(all.index, train.index)

# Draw positions within remaining.index instead of calling
# sample(remaining.index, ...): sample() treats a single number x as 1:x, so
# with one row left it could return an index that is already in the training
# set. For longer vectors this draws exactly the same indices as before.
valid1.pos <- sample.int(length(remaining.index), size = round(0.15 * n))
valid1.index <- remaining.index[valid1.pos]
valid2.index <- setdiff(remaining.index, valid1.index)

train.df <- telco[train.index, ]
valid1.df <- telco[valid1.index, ]
# valid2.df is not referenced again in the visible part of the script;
# presumably it is reserved as a final hold-out set - confirm before removing.
valid2.df <- telco[valid2.index, ]

# Logistic regression model (simplified)
# Binomial GLM for Churn fitted on the training set with a hand-picked subset
# of predictors.
logit.reg <- glm(
  Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines +
    InternetService + Contract + PaperlessBilling + PaymentMethod +
    MonthlyCharges + TotalCharges,
  data = train.df,
  family = "binomial"
)

# Print explicitly: a bare top-level summary() call is only auto-printed when
# the script is run interactively or via Rscript, not under source().
print(summary(logit.reg))

# Evaluate on validation set
# With Churn levels c("No", "Yes"), the binomial GLM models the probability of
# the second level, so these response-scale predictions are P(Churn = "Yes").
valid1_pred_probs <- predict(logit.reg, newdata = valid1.df, type = "response")

# Classify at a 0.5 cut-off; the levels are set explicitly so the factor
# matches valid1.df$Churn even if only one class is ever predicted.
valid1_pred <- factor(
  ifelse(valid1_pred_probs > 0.5, "Yes", "No"),
  levels = c("No", "Yes")
)
logit_conf <- confusionMatrix(valid1_pred, valid1.df$Churn, positive = "Yes")

# Pin the control/case levels and the comparison direction. Left to its
# defaults, roc() chooses the direction from the data, which can silently flip
# a worse-than-chance model into one with AUC above 0.5.
logit_roc <- roc(
  response = valid1.df$Churn,
  predictor = valid1_pred_probs,
  levels = c("No", "Yes"),
  direction = "<"
)

# Decision Tree model
# Classification tree on four predictors, fitted on the training set and
# scored on validation1.
dt_model <- rpart(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df,
  method = "class"
)
dt_pred <- predict(dt_model, valid1.df, type = "class")

# positive = "Yes" matches the logistic regression evaluation; without it
# confusionMatrix() treats the first level ("No") as the positive class, so
# sensitivity and specificity would not be comparable across models.
dt_conf <- confusionMatrix(dt_pred, valid1.df$Churn, positive = "Yes")

# Naive Bayes model
# Same four predictors as the decision tree, fitted on the training set and
# scored on validation1.
nb_model <- naiveBayes(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df
)

# Class labels for the confusion matrix, and per-class posterior
# probabilities (type = "raw") for the ROC curve.
nb_pred <- predict(nb_model, valid1.df)
nb_probs <- predict(nb_model, valid1.df, type = "raw")

# positive = "Yes" matches the logistic regression evaluation; without it
# confusionMatrix() treats the first level ("No") as the positive class.
nb_conf <- confusionMatrix(nb_pred, valid1.df$Churn, positive = "Yes")

# Pin levels and direction so roc() does not choose the direction from the
# data (see the pROC documentation for roc()).
nb_roc <- roc(
  response = valid1.df$Churn,
  predictor = nb_probs[, "Yes"],
  levels = c("No", "Yes"),
  direction = "<"
)

# Print evaluations
# Confusion-matrix reports for all three models, plus AUC for the two models
# that have a ROC object. No ROC/AUC is computed for the decision tree in
# this script.
cat("\nLogistic Regression Confusion Matrix:\n")
print(logit_conf)
cat("\nAUC (Logistic):", auc(logit_roc), "\n")

cat("\nDecision Tree Confusion Matrix:\n")
print(dt_conf)

cat("\nNaive Bayes Confusion Matrix:\n")
print(nb_conf)
cat("\nAUC (Naive Bayes):", auc(nb_roc), "\n")

# Save ROC curve plots
# Each curve is drawn on its own PNG device in the working directory.
# dev.off() closes the device and flushes the file; invisible() suppresses
# the device id it would otherwise print at top level.
png("logistic_roc_curve.png")
plot(logit_roc, main = "ROC Curve - Logistic Regression", col = "darkgreen")
invisible(dev.off())

png("naive_bayes_roc_curve.png")
plot(nb_roc, main = "ROC Curve - Naive Bayes", col = "blue")
invisible(dev.off())