# Telco Customer Churn Analysis Script
#
# Reads "Telco-Customer-Churn.csv", saves exploratory plots as PNG files,
# splits the rows into training / validation1 / validation2 sets, then fits
# logistic regression, decision tree, and naive Bayes classifiers on the
# training set and evaluates each of them on validation1.

# Load required libraries ----
library(ggplot2)
library(dplyr)
library(rpart)
library(e1071)
library(caret)
library(pROC)

# Load dataset ----
telco <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = TRUE)

# Convert via character so that, if the column was read as a factor, its
# labels (not its integer codes) become the numbers. Values that cannot be
# parsed turn into NA and those rows are dropped.
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))
telco <- telco[!is.na(telco$TotalCharges), ]

# Fix the level order so "No" is the reference and "Yes" the event class.
telco$Churn <- factor(telco$Churn, levels = c("No", "Yes"))

# Exploratory data visualizations ----

# Histogram for numeric variables, one PNG per variable.
# NOTE(review): binwidth = 10 is shared by all three variables; if they are
# on very different scales a per-variable width (or `bins =`) may read
# better -- confirm against the data.
numeric_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var in numeric_vars) {
  # `.data[[var]]` maps a column chosen by its name held in a string; it
  # replaces the deprecated aes_string().
  p <- ggplot(telco, aes(x = .data[[var]])) +
    geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
    labs(title = paste("Histogram of", var), x = var, y = "Frequency") +
    theme_minimal()
  ggsave(paste0("hist_", var, ".png"), plot = p)
}

# Bar plot for churn
p_churn <- ggplot(telco, aes(x = Churn, fill = Churn)) +
  geom_bar() +
  labs(title = "Churn Distribution", x = "Churn", y = "Count") +
  theme_minimal()
ggsave("bar_churn.png", plot = p_churn)

# Boxplot of MonthlyCharges by Churn
p_box <- ggplot(telco, aes(x = Churn, y = MonthlyCharges, fill = Churn)) +
  geom_boxplot() +
  labs(
    title = "Monthly Charges by Churn",
    x = "Churn",
    y = "Monthly Charges"
  ) +
  theme_minimal()
ggsave("boxplot_monthlycharges_churn.png", plot = p_box)

# Split data into training, validation1, and validation2 ----
# 70% of the rows go to training, 15% to validation1, and whatever is left
# to validation2. The seed makes the split reproducible.
set.seed(100)
n <- nrow(telco)
# seq_len(n) instead of 1:n so an empty data frame yields an empty index.
train.index <- sample(seq_len(n), size = round(0.70 * n))
remaining.index <- setdiff(seq_len(n), train.index)
valid1.index <- sample(remaining.index, size = round(0.15 * n))
valid2.index <- setdiff(remaining.index, valid1.index)

train.df <- telco[train.index, ]
valid1.df <- telco[valid1.index, ]
# valid2.df is created here but not used in the rest of this script.
valid2.df <- telco[valid2.index, ]

# Logistic regression model (simplified) ----
logit.reg <- glm(
  Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines +
    InternetService + Contract + PaperlessBilling + PaymentMethod +
    MonthlyCharges + TotalCharges,
  data = train.df,
  family = "binomial"
)
# Explicit print() so the summary is shown under source() as well.
print(summary(logit.reg))

# Evaluate on validation set: predicted probabilities above 0.5 are
# labelled "Yes".
valid1_pred_probs <- predict(
  logit.reg,
  newdata = valid1.df,
  type = "response"
)
valid1_pred <- factor(
  ifelse(valid1_pred_probs > 0.5, "Yes", "No"),
  levels = c("No", "Yes")
)
logit_conf <- confusionMatrix(valid1_pred, valid1.df$Churn, positive = "Yes")
# Pin control/case levels and direction so pROC does not auto-detect them:
# higher predicted values are expected for the "Yes" (case) class.
logit_roc <- roc(
  response = valid1.df$Churn,
  predictor = valid1_pred_probs,
  levels = c("No", "Yes"),
  direction = "<"
)

# Decision tree model ----
dt_model <- rpart(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df,
  method = "class"
)
dt_pred <- predict(dt_model, valid1.df, type = "class")
# positive = "Yes" keeps the reported metrics comparable with logit_conf.
dt_conf <- confusionMatrix(dt_pred, valid1.df$Churn, positive = "Yes")

# Naive Bayes model ----
nb_model <- naiveBayes(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df
)
nb_pred <- predict(nb_model, valid1.df)
nb_probs <- predict(nb_model, valid1.df, type = "raw")
nb_conf <- confusionMatrix(nb_pred, valid1.df$Churn, positive = "Yes")
nb_roc <- roc(
  response = valid1.df$Churn,
  predictor = nb_probs[, "Yes"],
  levels = c("No", "Yes"),
  direction = "<"
)

# Print evaluations ----
cat("\nLogistic Regression Confusion Matrix:\n")
print(logit_conf)
cat("\nAUC (Logistic):", as.numeric(auc(logit_roc)), "\n")

cat("\nDecision Tree Confusion Matrix:\n")
print(dt_conf)

cat("\nNaive Bayes Confusion Matrix:\n")
print(nb_conf)
cat("\nAUC (Naive Bayes):", as.numeric(auc(nb_roc)), "\n")

# Save ROC curve plots ----
png("logistic_roc_curve.png")
plot(logit_roc, main = "ROC Curve - Logistic Regression", col = "darkgreen")
dev.off()

png("naive_bayes_roc_curve.png")
plot(nb_roc, main = "ROC Curve - Naive Bayes", col = "blue")
dev.off()