# Load necessary libraries ----
# The attach order is kept exactly as in the original script: a package
# attached later masks same-named exports of packages attached earlier.
library(forecast)  # not referenced in the visible part of this script
library(ggplot2)   # ggplot(), aes(), geom_*(), labs(), theme_minimal()
library(gplots)    # not referenced in the visible part of this script
library(reshape)   # not referenced in the visible part of this script
library(GGally)    # not referenced in the visible part of this script
library(MASS)      # not referenced in the visible part of this script
library(naniar)    # gg_miss_var()
library(psych)     # describe()

# Load dataset ----
# The data directory defaults to the folder used by the original script and
# can be overridden with the TELCO_DATA_DIR environment variable, so the
# script can run on another machine without editing the source.
data_dir <- Sys.getenv(
  "TELCO_DATA_DIR",
  unset = "/home/anpham/Nextcloud/cpp/Data Mining/Project"
)

# Fail early with an actionable message instead of the generic setwd() error.
if (!dir.exists(data_dir)) {
  stop(
    "Data directory not found: ", data_dir,
    " (set TELCO_DATA_DIR to the folder that holds the CSV file)",
    call. = FALSE
  )
}

# setwd() is retained so that relative paths keep resolving against the data
# directory, exactly as they did in the original script.
setwd(data_dir)

# stringsAsFactors = FALSE keeps text columns as character on every R
# version; the factor conversion is done explicitly later in the script.
telco.df <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = FALSE)

# Preview the first six rows ----
head(telco.df, n = 6L)

# Compact listing of every column with its type and leading values ----
str(telco.df)

# Per-column descriptive statistics (psych package) ----
psych::describe(telco.df)

# Convert categorical variables to factors ----
# Columns listed here are treated as categorical for the rest of the script.
categorical_vars <- c(
  "gender",
  "SeniorCitizen",
  "Partner",
  "Dependents",
  "PhoneService",
  "MultipleLines",
  "InternetService",
  "OnlineSecurity",
  "OnlineBackup",
  "DeviceProtection",
  "TechSupport",
  "StreamingTV",
  "StreamingMovies",
  "Contract",
  "PaperlessBilling",
  "PaymentMethod",
  "Churn"
)

# factor() is applied column by column with its default level ordering.
telco.df[categorical_vars] <- lapply(
  X = telco.df[categorical_vars],
  FUN = factor
)

# Clean TotalCharges, audit missing values, derive features ----

# Coerce TotalCharges to numeric; entries that cannot be parsed become NA.
telco.df$TotalCharges <- as.numeric(telco.df$TotalCharges)

# Audit missing values BEFORE they are imputed. In the original ordering this
# audit ran after the NAs had already been overwritten with 0, so it could
# never reveal the gaps in TotalCharges.
sum(is.na(telco.df))      # total number of missing cells
colSums(is.na(telco.df))  # missing cells per column
gg_miss_var(telco.df)     # naniar plot of missing counts per variable

# Replace missing TotalCharges with 0.
# NOTE(review): this presumes the missing totals belong to customers who have
# not been billed yet (tenure == 0) -- confirm against the data.
telco.df$TotalCharges[is.na(telco.df$TotalCharges)] <- 0

# New feature: average monthly spend.
# For tenure == 0 the ratio is undefined, so MonthlyCharges is used instead.
telco.df$AvgMonthlySpend <- with(
  telco.df,
  ifelse(tenure > 0, TotalCharges / tenure, MonthlyCharges)
)

# Bucket tenure into three groups. cut() intervals are right-closed by
# default: tenure <= 12 is Short-Term, 12 < tenure <= 48 is Mid-Term, and
# tenure > 48 is Long-Term.
telco.df$TenureCategory <- cut(
  telco.df$tenure,
  breaks = c(-Inf, 12, 48, Inf),
  labels = c("Short-Term", "Mid-Term", "Long-Term")
)

# Re-check after imputation and feature creation (the original script's
# check): reports any NA that is still present, including the new columns.
sum(is.na(telco.df))
colSums(is.na(telco.df))

# Boxplots to detect outliers ----

# Single box for MonthlyCharges.
ggplot(data = telco.df, mapping = aes(y = MonthlyCharges)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot of Monthly Charges") +
  theme_minimal()

# Single box for TotalCharges.
ggplot(data = telco.df, mapping = aes(y = TotalCharges)) +
  geom_boxplot(fill = "lightcoral") +
  labs(title = "Boxplot of Total Charges") +
  theme_minimal()

# Distribution plots for the numeric charge variables ----

# Histograms. An explicit xlab is supplied because hist() otherwise labels
# the axis with the raw expression (e.g. "telco.df$TotalCharges"), unlike the
# other plots in this script, which use readable axis labels.
hist(
  telco.df$TotalCharges,
  col = "blue",
  main = "Distribution of TotalCharges",
  xlab = "Total Charges ($)"
)
hist(
  telco.df$MonthlyCharges,
  col = "lightblue",
  border = "black",
  main = "Distribution of Monthly Charges",
  xlab = "Monthly Charges ($)"
)

# Kernel density estimate of MonthlyCharges; na.rm = TRUE drops missing
# values before the estimate is computed.
plot(
  density(telco.df$MonthlyCharges, na.rm = TRUE),
  col = "red",
  main = "Density of MonthlyCharges"
)

# Monthly charges split by churn status (base graphics) ----
# One box per level of Churn; the fill colours are assigned in level order.
boxplot(
  MonthlyCharges ~ Churn,
  data = telco.df,
  col = c("red", "blue"),
  main = "Monthly Charges by Churn Status",
  xlab = "Churn (Yes/No)",
  ylab = "Monthly Charges"
)

# Tenure against monthly charges (base graphics) ----
# pch = 16 draws solid circles.
with(
  telco.df,
  plot(
    tenure, MonthlyCharges,
    pch = 16,
    col = "blue",
    main = "Tenure vs. Monthly Charges",
    xlab = "Tenure (Months)",
    ylab = "Monthly Charges ($)"
  )
)

# Jittered strip plot of tenure for each churn status ----
# Points are spread horizontally only. height = 0 is set explicitly because
# geom_jitter() otherwise also adds vertical noise, which would move the
# plotted tenure values away from their true values (and can push points
# below zero months).
ggplot(telco.df, aes(x = Churn, y = tenure, color = Churn)) +
  geom_jitter(width = 0.2, height = 0, alpha = 0.6) +
  labs(
    title = "Tenure by Churn Status",
    x = "Churn (Yes/No)",
    y = "Tenure (Months)"
  ) +
  theme_minimal()

# Tenure expressed in years (months / 12) ----
telco.df[["TenureYears"]] <- telco.df[["tenure"]] / 12

# Rows whose InternetService equals "DSL", keeping only the two columns that
# the plot below needs. which() skips rows where the comparison is NA, which
# is the same row selection subset() performs.
dsl_data <- telco.df[
  which(telco.df$InternetService == "DSL"),
  c("TenureYears", "MonthlyCharges")
]

# Scatter plot for DSL customers ----
ggplot(data = dsl_data, mapping = aes(x = TenureYears, y = MonthlyCharges)) +
  geom_point(alpha = 0.6, color = "blue") +
  labs(
    title = "Monthly Charges vs. Tenure (DSL Customers)",
    x = "Tenure (Years)",
    y = "Monthly Charges ($)"
  ) +
  theme_minimal()