# Load necessary libraries
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)

# Load dataset
# The data directory defaults to the original project location, but it can be
# overridden with the TELCO_DATA_DIR environment variable so the script runs
# on other machines without editing the source.
data_dir <- Sys.getenv("TELCO_DATA_DIR")
if (!nzchar(data_dir)) {
  data_dir <- "/home/anpham/Nextcloud/cpp/Data Mining/Project"
}

# Fail early with an actionable message instead of setwd()'s generic error.
if (!dir.exists(data_dir)) {
  stop(
    "Data directory not found: ", data_dir,
    " (set TELCO_DATA_DIR to the folder containing Telco-Customer-Churn.csv)",
    call. = FALSE
  )
}

# NOTE(review): setwd() is retained so the working directory is the same as
# before for any code that relies on relative paths; consider replacing it
# with file.path(data_dir, ...) once all such uses are confirmed.
setwd(data_dir)
telco.df <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = FALSE)

# Preview the first six rows of the raw data
head(telco.df, n = 6L)

# Inspect column names, types, and example values
str(telco.df)

# Descriptive statistics via psych::describe()
psych::describe(telco.df)

# Columns that hold categorical data; each one is re-encoded as a factor
categorical_vars <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
  "Contract", "PaperlessBilling", "PaymentMethod", "Churn"
)
telco.df[categorical_vars] <- lapply(
  categorical_vars,
  function(col_name) factor(telco.df[[col_name]])
)

# Coerce TotalCharges to numeric, then replace every resulting NA with 0
telco.df$TotalCharges <- local({
  charges <- as.numeric(telco.df$TotalCharges)
  replace(charges, is.na(charges), 0)
})

# Derived feature: total charges per unit of tenure. Rows whose tenure is not
# above zero fall back to MonthlyCharges, which avoids dividing by zero.
telco.df$AvgMonthlySpend <- ifelse(
  telco.df$tenure > 0,
  telco.df$TotalCharges / telco.df$tenure,
  telco.df$MonthlyCharges
)

# Bucket tenure into three bands: (-Inf, 12], (12, 48], and (48, Inf)
telco.df$TenureCategory <- cut(
  telco.df$tenure,
  breaks = c(-Inf, 12, 48, Inf),
  labels = c("Short-Term", "Mid-Term", "Long-Term"),
  right = TRUE
)

# Count missing cells across the whole data frame, then per column
sum(is.na(telco.df))
colSums(is.na(telco.df), na.rm = FALSE)

# Plot the number of missing values for each variable
naniar::gg_miss_var(telco.df)

# Boxplot of MonthlyCharges for spotting outliers
ggplot(data = telco.df, mapping = aes(y = MonthlyCharges)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot of Monthly Charges") +
  theme_minimal()

# Boxplot of TotalCharges for spotting outliers
ggplot(data = telco.df, mapping = aes(y = TotalCharges)) +
  geom_boxplot(fill = "lightcoral") +
  labs(title = "Boxplot of Total Charges") +
  theme_minimal()

# Histograms of the two charge columns
hist(
  x = telco.df$TotalCharges,
  main = "Distribution of TotalCharges",
  col = "blue"
)
hist(
  x = telco.df$MonthlyCharges,
  main = "Distribution of Monthly Charges",
  col = "lightblue",
  border = "black"
)

# Kernel density estimate of MonthlyCharges (missing values dropped)
plot(
  density(telco.df$MonthlyCharges, na.rm = TRUE),
  main = "Density of MonthlyCharges",
  col = "red"
)

# MonthlyCharges split by Churn level, one box per level
boxplot(
  MonthlyCharges ~ Churn,
  data = telco.df,
  col = c("red", "blue"),
  main = "Monthly Charges by Churn Status",
  xlab = "Churn (Yes/No)",
  ylab = "Monthly Charges"
)

# MonthlyCharges plotted against tenure, one filled point per customer row
with(
  telco.df,
  plot(
    x = tenure,
    y = MonthlyCharges,
    pch = 16,
    col = "blue",
    main = "Tenure vs. Monthly Charges",
    xlab = "Tenure (Months)",
    ylab = "Monthly Charges ($)"
  )
)

# Jittered points of tenure for each Churn level, coloured by Churn
ggplot(data = telco.df, mapping = aes(x = Churn, y = tenure, color = Churn)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ggtitle("Tenure by Churn Status") +
  xlab("Churn (Yes/No)") +
  ylab("Tenure (Months)") +
  theme_minimal()

# Express tenure in years (tenure divided by 12)
telco.df$TenureYears <- telco.df$tenure / 12

# Keep only rows whose InternetService is "DSL", with the two plotted columns.
# which() drops rows where the comparison is NA, as subset() does.
dsl_data <- telco.df[
  which(telco.df$InternetService == "DSL"),
  c("TenureYears", "MonthlyCharges")
]

# MonthlyCharges against tenure in years for the DSL rows
ggplot(data = dsl_data, mapping = aes(x = TenureYears, y = MonthlyCharges)) +
  geom_point(alpha = 0.6, color = "blue") +
  ggtitle("Monthly Charges vs. Tenure (DSL Customers)") +
  xlab("Tenure (Years)") +
  ylab("Monthly Charges ($)") +
  theme_minimal()