# Diff hunk header: @@ -0,0 +1,112 @@
# Telco Customer Churn Analysis Script

# Load required libraries ----
library(ggplot2)
library(dplyr)
library(rpart)
library(e1071)
library(caret)
library(pROC)

# Load dataset ----
# Strings are read as factors so categorical columns can be used directly in
# model formulas.
telco <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = TRUE)

# Convert via character: if read.csv() parsed TotalCharges as a factor this
# recovers the printed values rather than the internal level codes. Entries
# that are not valid numbers become NA (R warns about the coercion).
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))

# Drop the rows whose TotalCharges could not be parsed.
telco <- telco[!is.na(telco$TotalCharges), ]

# Fix the level order: "No" first, "Yes" second.
telco$Churn <- factor(telco$Churn, levels = c("No", "Yes"))
# Exploratory Data Visualizations ----
# Histogram for numeric variables: one PNG per column, written to the working
# directory as hist_<column>.png.
numeric_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var_name in numeric_vars) {
  # `.data[[var_name]]` maps a column whose name is held in a string; it
  # replaces the deprecated aes_string().
  # `bins = 30` keeps the number of bars the same for every column. A fixed
  # binwidth of 10 cannot suit all three, which are presumably on very
  # different scales (months vs. cumulative charges) -- confirm against data.
  p <- ggplot(telco, aes(x = .data[[var_name]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
    labs(
      title = paste("Histogram of", var_name),
      x = var_name,
      y = "Frequency"
    ) +
    theme_minimal()
  ggsave(paste0("hist_", var_name, ".png"), plot = p)
}
# Bar plot for churn: number of rows in each Churn class.
p_churn <- ggplot(telco, aes(x = Churn, fill = Churn)) +
  geom_bar() +
  labs(title = "Churn Distribution", x = "Churn", y = "Count") +
  theme_minimal()
ggsave("bar_churn.png", plot = p_churn)

# Boxplot of MonthlyCharges by Churn: compares the distribution of monthly
# charges between the two Churn classes.
p_box <- ggplot(telco, aes(x = Churn, y = MonthlyCharges, fill = Churn)) +
  geom_boxplot() +
  labs(
    title = "Monthly Charges by Churn",
    x = "Churn",
    y = "Monthly Charges"
  ) +
  theme_minimal()
ggsave("boxplot_monthlycharges_churn.png", plot = p_box)
# Split data into training (70%), validation1 (15%) and validation2 (the
# remaining rows). The seed makes the partition reproducible.
set.seed(100)
n <- nrow(telco)
if (n == 0) {
  stop("`telco` has no rows to split.", call. = FALSE)
}

# sample.int(n, k) draws the same indices as sample(1:n, k) but stays correct
# for n == 1, where sample() would reinterpret its first argument.
train.index <- sample.int(n, size = round(0.70 * n))
remaining.index <- setdiff(seq_len(n), train.index)

# Index into remaining.index explicitly instead of calling sample() on it, so
# a length-one remainder is not expanded to 1:remaining.index.
valid1.index <- remaining.index[
  sample.int(length(remaining.index), size = round(0.15 * n))
]
valid2.index <- setdiff(remaining.index, valid1.index)

train.df <- telco[train.index, ]
valid1.df <- telco[valid1.index, ]
valid2.df <- telco[valid2.index, ]
# Logistic regression model (simplified): Churn modelled on a subset of the
# available predictors, fitted on the training partition.
logit.reg <- glm(
  Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines +
    InternetService + Contract + PaperlessBilling + PaymentMethod +
    MonthlyCharges + TotalCharges,
  data = train.df,
  family = "binomial"
)
# print() explicitly: a bare top-level value is not shown under source().
print(summary(logit.reg))

# Evaluate on validation set
# type = "response" returns fitted probabilities; with Churn levels
# c("No", "Yes") these are probabilities of "Yes".
valid1_pred_probs <- predict(logit.reg, newdata = valid1.df, type = "response")

# Classify as "Yes" when the predicted probability exceeds 0.5.
valid1_pred <- factor(
  ifelse(valid1_pred_probs > 0.5, "Yes", "No"),
  levels = c("No", "Yes")
)
logit_conf <- confusionMatrix(valid1_pred, valid1.df$Churn, positive = "Yes")

# State control/case levels and direction explicitly so pROC does not
# auto-select them (auto-selected direction can bias the AUC upwards).
logit_roc <- roc(
  response = valid1.df$Churn,
  predictor = valid1_pred_probs,
  levels = c("No", "Yes"),
  direction = "<"
)
# Decision Tree model: classification tree on four predictors, fitted on the
# training partition and scored on validation1.
dt_model <- rpart(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df,
  method = "class"
)
dt_pred <- predict(dt_model, valid1.df, type = "class")

# positive = "Yes" so sensitivity/specificity refer to churners, matching the
# logistic-regression confusion matrix; the default would use "No".
dt_conf <- confusionMatrix(dt_pred, valid1.df$Churn, positive = "Yes")
# Naive Bayes model: same four predictors as the decision tree.
# NOTE(review): SeniorCitizen is used as read by read.csv(); if it is a 0/1
# integer column it is treated as a numeric predictor -- confirm intended.
nb_model <- naiveBayes(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df
)

# Class labels for the confusion matrix, class probabilities for the ROC.
nb_pred <- predict(nb_model, valid1.df)
nb_probs <- predict(nb_model, valid1.df, type = "raw")

# positive = "Yes" so the statistics refer to churners, matching the
# logistic-regression confusion matrix; the default would use "No".
nb_conf <- confusionMatrix(nb_pred, valid1.df$Churn, positive = "Yes")

# State control/case levels and direction explicitly so pROC does not
# auto-select them.
nb_roc <- roc(
  response = valid1.df$Churn,
  predictor = nb_probs[, "Yes"],
  levels = c("No", "Yes"),
  direction = "<"
)
# Print evaluations: confusion matrices for all three models, and AUC for the
# two models that have a ROC object (logistic regression and naive Bayes).
cat("\nLogistic Regression Confusion Matrix:\n")
print(logit_conf)
cat("\nAUC (Logistic):", auc(logit_roc), "\n")

cat("\nDecision Tree Confusion Matrix:\n")
print(dt_conf)

cat("\nNaive Bayes Confusion Matrix:\n")
print(nb_conf)
cat("\nAUC (Naive Bayes):", auc(nb_roc), "\n")
# Save ROC curve plots

# Draw one ROC curve into a PNG file.
#   roc_obj: ROC object to plot.
#   path:    output file name.
#   title:   plot title.
#   colour:  line colour.
# Returns `path` invisibly. on.exit() closes the device even if plot() fails,
# so a failed plot does not leave a graphics device open.
save_roc_png <- function(roc_obj, path, title, colour) {
  png(path)
  on.exit(dev.off(), add = TRUE)
  plot(roc_obj, main = title, col = colour)
  invisible(path)
}

save_roc_png(
  logit_roc,
  "logistic_roc_curve.png",
  "ROC Curve - Logistic Regression",
  "darkgreen"
)
save_roc_png(
  nb_roc,
  "naive_bayes_roc_curve.png",
  "ROC Curve - Naive Bayes",
  "blue"
)
# Diff hunk header: @@ -0,0 +1,111 @@
# Telco Customer Churn Analysis Script

# Load required libraries ----
library(ggplot2)
library(dplyr)
library(rpart)
library(e1071)
library(caret)
library(pROC)

# Load dataset ----
# Strings are read as factors so categorical columns can be used directly in
# model formulas.
telco <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = TRUE)

# Convert via character: if read.csv() parsed TotalCharges as a factor this
# recovers the printed values rather than the internal level codes. Entries
# that are not valid numbers become NA (R warns about the coercion).
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))

# Drop the rows whose TotalCharges could not be parsed.
telco <- telco[!is.na(telco$TotalCharges), ]

# Fix the level order: "No" first, "Yes" second.
telco$Churn <- factor(telco$Churn, levels = c("No", "Yes"))
# Exploratory Data Visualizations ----
# Histogram for numeric variables: one PNG per column, written to the working
# directory as hist_<column>.png.
numeric_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var_name in numeric_vars) {
  # `.data[[var_name]]` maps a column whose name is held in a string; it
  # replaces the deprecated aes_string().
  # `bins = 30` keeps the number of bars the same for every column. A fixed
  # binwidth of 10 cannot suit all three, which are presumably on very
  # different scales (months vs. cumulative charges) -- confirm against data.
  p <- ggplot(telco, aes(x = .data[[var_name]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
    labs(
      title = paste("Histogram of", var_name),
      x = var_name,
      y = "Frequency"
    ) +
    theme_minimal()
  ggsave(paste0("hist_", var_name, ".png"), plot = p)
}
# Bar plot for churn: number of rows in each Churn class.
p_churn <- ggplot(telco, aes(x = Churn, fill = Churn)) +
  geom_bar() +
  labs(title = "Churn Distribution", x = "Churn", y = "Count") +
  theme_minimal()
ggsave("bar_churn.png", plot = p_churn)

# Boxplot of MonthlyCharges by Churn: compares the distribution of monthly
# charges between the two Churn classes.
p_box <- ggplot(telco, aes(x = Churn, y = MonthlyCharges, fill = Churn)) +
  geom_boxplot() +
  labs(
    title = "Monthly Charges by Churn",
    x = "Churn",
    y = "Monthly Charges"
  ) +
  theme_minimal()
ggsave("boxplot_monthlycharges_churn.png", plot = p_box)
# Split data into training (70%), validation1 (15%) and validation2 (the
# remaining rows). The seed makes the partition reproducible.
set.seed(100)
n <- nrow(telco)
if (n == 0) {
  stop("`telco` has no rows to split.", call. = FALSE)
}

# sample.int(n, k) draws the same indices as sample(1:n, k) but stays correct
# for n == 1, where sample() would reinterpret its first argument.
train.index <- sample.int(n, size = round(0.70 * n))
remaining.index <- setdiff(seq_len(n), train.index)

# Index into remaining.index explicitly instead of calling sample() on it, so
# a length-one remainder is not expanded to 1:remaining.index.
valid1.index <- remaining.index[
  sample.int(length(remaining.index), size = round(0.15 * n))
]
valid2.index <- setdiff(remaining.index, valid1.index)

train.df <- telco[train.index, ]
valid1.df <- telco[valid1.index, ]
valid2.df <- telco[valid2.index, ]
# Logistic regression model (simplified): Churn modelled on a subset of the
# available predictors, fitted on the training partition.
logit.reg <- glm(
  Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines +
    InternetService + Contract + PaperlessBilling + PaymentMethod +
    MonthlyCharges + TotalCharges,
  data = train.df,
  family = "binomial"
)
# print() explicitly: a bare top-level value is not shown under source().
print(summary(logit.reg))

# Evaluate on validation set
# type = "response" returns fitted probabilities; with Churn levels
# c("No", "Yes") these are probabilities of "Yes".
valid1_pred_probs <- predict(logit.reg, newdata = valid1.df, type = "response")

# Classify as "Yes" when the predicted probability exceeds 0.5.
valid1_pred <- factor(
  ifelse(valid1_pred_probs > 0.5, "Yes", "No"),
  levels = c("No", "Yes")
)
logit_conf <- confusionMatrix(valid1_pred, valid1.df$Churn, positive = "Yes")

# State control/case levels and direction explicitly so pROC does not
# auto-select them (auto-selected direction can bias the AUC upwards).
logit_roc <- roc(
  response = valid1.df$Churn,
  predictor = valid1_pred_probs,
  levels = c("No", "Yes"),
  direction = "<"
)
# Decision Tree model: classification tree on four predictors, fitted on the
# training partition and scored on validation1.
dt_model <- rpart(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df,
  method = "class"
)
dt_pred <- predict(dt_model, valid1.df, type = "class")

# positive = "Yes" so sensitivity/specificity refer to churners, matching the
# logistic-regression confusion matrix; the default would use "No".
dt_conf <- confusionMatrix(dt_pred, valid1.df$Churn, positive = "Yes")
# Naive Bayes model: same four predictors as the decision tree.
# NOTE(review): SeniorCitizen is used as read by read.csv(); if it is a 0/1
# integer column it is treated as a numeric predictor -- confirm intended.
nb_model <- naiveBayes(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train.df
)

# Class labels for the confusion matrix, class probabilities for the ROC.
nb_pred <- predict(nb_model, valid1.df)
nb_probs <- predict(nb_model, valid1.df, type = "raw")

# positive = "Yes" so the statistics refer to churners, matching the
# logistic-regression confusion matrix; the default would use "No".
nb_conf <- confusionMatrix(nb_pred, valid1.df$Churn, positive = "Yes")

# State control/case levels and direction explicitly so pROC does not
# auto-select them.
nb_roc <- roc(
  response = valid1.df$Churn,
  predictor = nb_probs[, "Yes"],
  levels = c("No", "Yes"),
  direction = "<"
)
# Print evaluations: confusion matrices for all three models, and AUC for the
# two models that have a ROC object (logistic regression and naive Bayes).
cat("\nLogistic Regression Confusion Matrix:\n")
print(logit_conf)
cat("\nAUC (Logistic):", auc(logit_roc), "\n")

cat("\nDecision Tree Confusion Matrix:\n")
print(dt_conf)

cat("\nNaive Bayes Confusion Matrix:\n")
print(nb_conf)
cat("\nAUC (Naive Bayes):", auc(nb_roc), "\n")
# Save ROC curve plots

# Draw one ROC curve into a PNG file.
#   roc_obj: ROC object to plot.
#   path:    output file name.
#   title:   plot title.
#   colour:  line colour.
# Returns `path` invisibly. on.exit() closes the device even if plot() fails,
# so a failed plot does not leave a graphics device open.
save_roc_png <- function(roc_obj, path, title, colour) {
  png(path)
  on.exit(dev.off(), add = TRUE)
  plot(roc_obj, main = title, col = colour)
  invisible(path)
}

save_roc_png(
  logit_roc,
  "logistic_roc_curve.png",
  "ROC Curve - Logistic Regression",
  "darkgreen"
)
save_roc_png(
  nb_roc,
  "naive_bayes_roc_curve.png",
  "ROC Curve - Naive Bayes",
  "blue"
)
# Diff hunk header: @@ -0,0 +1,38 @@
# Telco Customer Churn Analysis Script

# Load required libraries ----
library(ggplot2)
library(dplyr)
library(rpart)
library(e1071)
library(caret)
library(pROC)

# Load dataset ----
# Strings are read as factors so categorical columns can be used directly in
# model formulas.
telco <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = TRUE)

# Convert via character: if read.csv() parsed TotalCharges as a factor this
# recovers the printed values rather than the internal level codes. Entries
# that are not valid numbers become NA (R warns about the coercion).
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))

# Drop the rows whose TotalCharges could not be parsed.
telco <- telco[!is.na(telco$TotalCharges), ]

# Fix the level order: "No" first, "Yes" second.
telco$Churn <- factor(telco$Churn, levels = c("No", "Yes"))
# Split data: roughly 70% of rows go to `train`, the rest to `test`.
# createDataPartition() samples within each Churn class, so both partitions
# keep approximately the same class balance. The seed makes it reproducible.
set.seed(42)
trainIndex <- createDataPartition(telco$Churn, p = 0.7, list = FALSE)
train <- telco[trainIndex, ]
test <- telco[-trainIndex, ]
# Decision Tree model: classification tree on four predictors, fitted on
# `train` and scored on `test`.
dt_model <- rpart(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train,
  method = "class"
)
dt_pred <- predict(dt_model, test, type = "class")

# positive = "Yes" so sensitivity/specificity describe churners; the default
# would take the first factor level, "No", as the positive class.
dt_conf <- confusionMatrix(dt_pred, test$Churn, positive = "Yes")
# Naive Bayes model: same four predictors as the decision tree, fitted on
# `train` and scored on `test`.
nb_model <- naiveBayes(
  Churn ~ tenure + MonthlyCharges + TotalCharges + SeniorCitizen,
  data = train
)
nb_pred <- predict(nb_model, test)

# positive = "Yes" so sensitivity/specificity describe churners; the default
# would take the first factor level, "No", as the positive class.
nb_conf <- confusionMatrix(nb_pred, test$Churn, positive = "Yes")
# ROC