# Exploratory analysis of the Telco customer churn data: load the CSV, type
# and clean the columns, derive a few features, audit missing values, and
# draw the exploratory plots.

# Load necessary libraries ----
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)

# Load dataset ----
# NOTE(review): setwd() with a machine-specific absolute path ties this
# script to one workstation. It is kept because code outside this section may
# rely on the working directory; prefer a project-relative path long term.
setwd("/home/anpham/Nextcloud/cpp/Data Mining/Project")
telco.df <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = FALSE)

# Display first few rows
head(telco.df)

# Check structure
str(telco.df)

# Summary statistics
describe(telco.df)

# Type conversion ----
# Convert categorical variables to factors
categorical_vars <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
  "Contract", "PaperlessBilling", "PaymentMethod", "Churn"
)
telco.df[categorical_vars] <- lapply(telco.df[categorical_vars], factor)

# Convert TotalCharges to numeric; anything that cannot be parsed becomes NA.
telco.df$TotalCharges <- as.numeric(telco.df$TotalCharges)

# Report how many values are about to be imputed. Without this the imputation
# is invisible, because the missing-value audit further down runs after the
# NAs have already been replaced.
total_charges_missing <- is.na(telco.df$TotalCharges)
message(sprintf(
  "Imputing %d missing TotalCharges value(s) with 0",
  sum(total_charges_missing)
))
# NOTE(review): 0 presumably stands for customers with no billing history
# yet -- confirm against the data before modelling.
telco.df$TotalCharges[total_charges_missing] <- 0

# Feature engineering ----
# Average monthly spend: lifetime charges divided by months of tenure. Rows
# with zero tenure fall back to MonthlyCharges to avoid dividing by zero.
telco.df$AvgMonthlySpend <- with(
  telco.df,
  ifelse(tenure > 0, TotalCharges / tenure, MonthlyCharges)
)

# Categorize tenure into groups. cut() intervals are right-closed by default,
# so the bins are (-Inf, 12], (12, 48] and (48, Inf).
telco.df$TenureCategory <- cut(
  telco.df$tenure,
  breaks = c(-Inf, 12, 48, Inf),
  labels = c("Short-Term", "Mid-Term", "Long-Term")
)

# Missing-value audit ----
# These checks run after the TotalCharges imputation above, so they show what
# is still missing at this point rather than what the raw file contained.
sum(is.na(telco.df))
colSums(is.na(telco.df))

# Visualize missing values
gg_miss_var(telco.df)

# Distribution plots ----
# Boxplots to detect outliers
ggplot(telco.df, aes(y = MonthlyCharges)) +
  geom_boxplot(fill = "skyblue") +
  ggtitle("Boxplot of Monthly Charges") +
  theme_minimal()

ggplot(telco.df, aes(y = TotalCharges)) +
  geom_boxplot(fill = "lightcoral") +
  ggtitle("Boxplot of Total Charges") +
  theme_minimal()

# Histograms for numeric variables
hist(
  telco.df$TotalCharges,
  col = "blue",
  main = "Distribution of TotalCharges"
)
hist(
  telco.df$MonthlyCharges,
  col = "lightblue",
  border = "black",
  main = "Distribution of Monthly Charges"
)

# Density plot
plot(
  density(telco.df$MonthlyCharges, na.rm = TRUE),
  col = "red",
  main = "Density of MonthlyCharges"
)

# Relationship plots ----
# Boxplot of Monthly Charges by Churn status
boxplot(
  MonthlyCharges ~ Churn,
  data = telco.df,
  main = "Monthly Charges by Churn Status",
  xlab = "Churn (Yes/No)",
  ylab = "Monthly Charges",
  col = c("red", "blue")
)

# Scatter plot of tenure vs Monthly Charges
plot(
  telco.df$tenure,
  telco.df$MonthlyCharges,
  xlab = "Tenure (Months)",
  ylab = "Monthly Charges ($)",
  main = "Tenure vs. Monthly Charges",
  col = "blue",
  pch = 16
)

# Jittered plot of tenure by churn status; jitter spreads overlapping points
# horizontally within each churn group.
ggplot(telco.df, aes(x = Churn, y = tenure, color = Churn)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Tenure by Churn Status",
    x = "Churn (Yes/No)",
    y = "Tenure (Months)"
  ) +
  theme_minimal()

# DSL customers ----
# Convert tenure from months to years
telco.df$TenureYears <- telco.df$tenure / 12

# Keep only DSL customers and the two columns needed for the plot below
dsl_data <- subset(
  telco.df,
  InternetService == "DSL",
  select = c(TenureYears, MonthlyCharges)
)

# Scatter plot for DSL customers
ggplot(dsl_data, aes(x = TenureYears, y = MonthlyCharges)) +
  geom_point(color = "blue", alpha = 0.6) +
  labs(
    title = "Monthly Charges vs. Tenure (DSL Customers)",
    x = "Tenure (Years)",
    y = "Monthly Charges ($)"
  ) +
  theme_minimal()