# Repository: anpham/Datamining-Class


								# Telco Customer Churn Analysis Script


								# Load required libraries


								library(ggplot2)

								library(dplyr)

								library(rpart)

								library(e1071)

								library(caret)

								library(pROC)


								# Load dataset ----
								# Read the raw CSV; text columns are converted to factors.
								telco <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = TRUE)

								# Coerce TotalCharges to numeric. Going through as.character() keeps the
								# printed values (not factor level codes) if the column was read as a
								# factor; entries that cannot be parsed become NA.
								telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))

								# Drop rows whose TotalCharges is missing.
								telco <- telco[!is.na(telco$TotalCharges), ]

								# Fix the level order so "No" is the first (reference) level and "Yes"
								# the second.
								telco$Churn <- factor(telco$Churn, levels = c("No", "Yes"))


								# Exploratory Data Visualizations ----

								# Histogram for each numeric variable, written to hist_<variable>.png.
								numeric_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
								for (var in numeric_vars) {
								  # .data[[var]] looks the column up by its string name; it replaces the
								  # deprecated aes_string().
								  # bins = 30 sizes the bins from each variable's own range; one fixed
								  # binwidth cannot suit all three variables unless they share a scale.
								  p <- ggplot(telco, aes(x = .data[[var]])) +
								    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
								    labs(title = paste("Histogram of", var), x = var, y = "Frequency") +
								    theme_minimal()
								  ggsave(paste0("hist_", var, ".png"), plot = p)
								}


								# Bar plot for churn ----
								# Count of customers per Churn level, written to bar_churn.png.
								p_churn <- ggplot(telco, aes(x = Churn, fill = Churn)) +
								  geom_bar() +
								  labs(title = "Churn Distribution", x = "Churn", y = "Count") +
								  theme_minimal()
								ggsave("bar_churn.png", plot = p_churn)


								# Boxplot of MonthlyCharges by Churn ----
								# One box per Churn level, written to boxplot_monthlycharges_churn.png.
								p_box <- ggplot(
								  telco,
								  aes(x = Churn, y = MonthlyCharges, fill = Churn)
								) +
								  geom_boxplot() +
								  labs(
								    title = "Monthly Charges by Churn",
								    x = "Churn",
								    y = "Monthly Charges"
								  ) +
								  theme_minimal()
								ggsave("boxplot_monthlycharges_churn.png", plot = p_box)


								# Split data into training (70%), validation1 (15%), and validation2
								# (the remaining rows) ----
								set.seed(100)
								n <- nrow(telco)

								# sample.int()/seq_len() behave correctly when n is 0, unlike 1:n, and
								# draw the same indices as sample(1:n, ...) for a given seed.
								train.index <- sample.int(n, size = round(0.70 * n))
								remaining.index <- setdiff(seq_len(n), train.index)

								# Index remaining.index explicitly: sample(x, ...) would treat a
								# length-one numeric x as 1:x instead of sampling from x itself.
								valid1.pos <- sample.int(length(remaining.index), size = round(0.15 * n))
								valid1.index <- remaining.index[valid1.pos]
								valid2.index <- setdiff(remaining.index, valid1.index)

								# Materialise the three disjoint row subsets.
								train.df <- telco[train.index, ]
								valid1.df <- telco[valid1.index, ]
								valid2.df <- telco[valid2.index, ]


								# Logistic regression model (simplified) ----
								# Binomial GLM of Churn on a subset of predictors, fitted on the
								# training split only.
								logit.reg <- glm(
								  Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines +
								    InternetService + Contract + PaperlessBilling + PaymentMethod +
								    MonthlyCharges + TotalCharges,
								  data = train.df,
								  family = "binomial"
								)
								# Explicit print() so the summary also appears when the script is run
								# through source(), which does not auto-print top-level values.
								print(summary(logit.reg))

								# Evaluate on validation set
								# type = "response" returns probabilities of the second factor level
								# ("Yes"); they are thresholded at 0.5 to obtain class labels.
								valid1_pred_probs <- predict(
								  logit.reg,
								  newdata = valid1.df,
								  type = "response"
								)
								valid1_pred <- factor(
								  ifelse(valid1_pred_probs > 0.5, "Yes", "No"),
								  levels = c("No", "Yes")
								)
								logit_conf <- confusionMatrix(
								  valid1_pred,
								  valid1.df$Churn,
								  positive = "Yes"
								)
								# Explicit levels (control, case) and direction stop pROC from
								# auto-selecting the direction, which can silently flip the curve.
								logit_roc <- roc(
								  response = valid1.df$Churn,
								  predictor = valid1_pred_probs,
								  levels = c("No", "Yes"),
								  direction = "<"
								)


								# Decision Tree model ----
								# Classification tree on four predictors, fitted on the training split.
								dt_model <- rpart(
								  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
								  data = train.df,
								  method = "class"
								)
								dt_pred <- predict(dt_model, newdata = valid1.df, type = "class")
								# positive = "Yes" so sensitivity/specificity refer to churners, matching
								# the logistic-regression evaluation; the default would use the first
								# level ("No").
								dt_conf <- confusionMatrix(dt_pred, valid1.df$Churn, positive = "Yes")


								# Naive Bayes model ----
								# Naive Bayes classifier on the same four predictors as the decision
								# tree, fitted on the training split.
								nb_model <- naiveBayes(
								  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
								  data = train.df
								)
								# Class labels and per-class posterior probabilities for validation1.
								nb_pred <- predict(nb_model, newdata = valid1.df)
								nb_probs <- predict(nb_model, newdata = valid1.df, type = "raw")
								# positive = "Yes" so sensitivity/specificity refer to churners, matching
								# the logistic-regression evaluation.
								nb_conf <- confusionMatrix(nb_pred, valid1.df$Churn, positive = "Yes")
								# Explicit levels (control, case) and direction stop pROC from
								# auto-selecting the direction, which can silently flip the curve.
								nb_roc <- roc(
								  response = valid1.df$Churn,
								  predictor = nb_probs[, "Yes"],
								  levels = c("No", "Yes"),
								  direction = "<"
								)


								# Print evaluations ----
								# Confusion matrices for all three models; AUC only for the two models
								# that have a ROC object.
								cat("\nLogistic Regression Confusion Matrix:\n")
								print(logit_conf)
								cat("\nAUC (Logistic):", auc(logit_roc), "\n")

								cat("\nDecision Tree Confusion Matrix:\n")
								print(dt_conf)

								cat("\nNaive Bayes Confusion Matrix:\n")
								print(nb_conf)
								cat("\nAUC (Naive Bayes):", auc(nb_roc), "\n")


								# Save ROC curve plots ----
								# Each curve is drawn on its own PNG device; dev.off() closes the device
								# and flushes the file to disk.
								png("logistic_roc_curve.png")
								plot(
								  logit_roc,
								  main = "ROC Curve - Logistic Regression",
								  col = "darkgreen"
								)
								dev.off()

								png("naive_bayes_roc_curve.png")
								plot(nb_roc, main = "ROC Curve - Naive Bayes", col = "blue")
								dev.off()