# Project/Project_code.R
# Exploratory analysis of the Telco customer churn data: load the CSV, type
# the columns, inspect missing values, derive spend/tenure features and draw
# the first outlier plots.

# Load necessary libraries
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)

# Load dataset ----
# The CSV is read through an explicit path instead of setwd(), so the
# session's working directory is left untouched. The directory defaults to
# the original location and can be overridden with the TELCO_DATA_DIR
# environment variable.
default_data_dir <- "/home/anpham/Nextcloud/cpp/Data Mining/Project"
data_dir <- Sys.getenv("TELCO_DATA_DIR", unset = default_data_dir)
if (!nzchar(data_dir)) {
  # An empty override would resolve to the filesystem root; use the default.
  data_dir <- default_data_dir
}
data_file <- file.path(data_dir, "Telco-Customer-Churn.csv")
if (!file.exists(data_file)) {
  stop("Dataset not found: ", data_file, call. = FALSE)
}
telco.df <- read.csv(data_file, stringsAsFactors = FALSE)

# Display first few rows
head(telco.df)

# Check structure
str(telco.df)

# Summary statistics
describe(telco.df)

# Convert categorical variables to factors ----
categorical_vars <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
  "Contract", "PaperlessBilling", "PaymentMethod", "Churn"
)
# Fail with a readable message rather than an "undefined columns" error when
# the file does not have the expected layout.
absent_vars <- setdiff(categorical_vars, names(telco.df))
if (length(absent_vars) > 0) {
  stop(
    "Columns missing from dataset: ", paste(absent_vars, collapse = ", "),
    call. = FALSE
  )
}
telco.df[categorical_vars] <- lapply(telco.df[categorical_vars], factor)

# Convert TotalCharges to numeric; entries that cannot be parsed become NA
telco.df$TotalCharges <- as.numeric(telco.df$TotalCharges)

# Check missing values ----
# This has to run BEFORE the imputation below: once the NAs in TotalCharges
# are replaced there is nothing left for these diagnostics to report.
sum(is.na(telco.df))
colSums(is.na(telco.df))

# Visualize missing values
gg_miss_var(telco.df)

# Handle missing values: treat a missing TotalCharges as no charges so far
telco.df$TotalCharges[is.na(telco.df$TotalCharges)] <- 0

# Create new feature: Average Monthly Spend
# Rows with zero tenure fall back to MonthlyCharges to avoid dividing by zero.
telco.df$AvgMonthlySpend <- with(
  telco.df,
  ifelse(tenure > 0, TotalCharges / tenure, MonthlyCharges)
)

# Categorize tenure into groups
# Intervals are right-closed: (-Inf, 12], (12, 48], (48, Inf).
telco.df$TenureCategory <- cut(
  telco.df$tenure,
  breaks = c(-Inf, 12, 48, Inf),
  labels = c("Short-Term", "Mid-Term", "Long-Term")
)

# Remaining missing values after imputation and feature creation; expected
# to be 0 when TotalCharges was the only column with gaps.
sum(is.na(telco.df))

# Boxplots to detect outliers ----
ggplot(telco.df, aes(y = MonthlyCharges)) +
  geom_boxplot(fill = "skyblue") +
  ggtitle("Boxplot of Monthly Charges") +
  theme_minimal()

ggplot(telco.df, aes(y = TotalCharges)) +
  geom_boxplot(fill = "lightcoral") +
  ggtitle("Boxplot of Total Charges") +
  theme_minimal()

# Histogram for numeric variables
# The first argument is kept as `telco.df$...` because hist() derives its
# default x-axis label from that expression.
hist(
  telco.df$TotalCharges,
  main = "Distribution of TotalCharges",
  col = "blue"
)
hist(
  telco.df$MonthlyCharges,
  main = "Distribution of Monthly Charges",
  col = "lightblue",
  border = "black"
)

# Kernel density estimate of the monthly charges
plot(
  density(telco.df$MonthlyCharges, na.rm = TRUE),
  main = "Density of MonthlyCharges",
  col = "red"
)

# Monthly charges split by churn status, one box per Churn level
boxplot(
  MonthlyCharges ~ Churn,
  data = telco.df,
  col = c("red", "blue"),
  main = "Monthly Charges by Churn Status",
  xlab = "Churn (Yes/No)",
  ylab = "Monthly Charges"
)

# Tenure against monthly charges, one filled point per row
with(
  telco.df,
  plot(
    tenure, MonthlyCharges,
    pch = 16, col = "blue",
    main = "Tenure vs. Monthly Charges",
    xlab = "Tenure (Months)",
    ylab = "Monthly Charges ($)"
  )
)

# Tenure by churn status; horizontal jitter spreads overlapping points
ggplot(telco.df, aes(x = Churn, y = tenure, color = Churn)) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  ggtitle("Tenure by Churn Status") +
  xlab("Churn (Yes/No)") +
  ylab("Tenure (Months)") +
  theme_minimal()

# Express tenure in years (the source column is in months)
telco.df[["TenureYears"]] <- telco.df[["tenure"]] / 12

# Keep only the DSL rows and the two columns needed for the plot.
# which() drops rows where the comparison is NA, as subset() would.
dsl_rows <- which(telco.df[["InternetService"]] == "DSL")
dsl_data <- telco.df[dsl_rows, c("TenureYears", "MonthlyCharges")]

# Monthly charges against tenure for DSL customers only
ggplot(dsl_data, aes(x = TenureYears, y = MonthlyCharges)) +
  geom_point(color = "blue", alpha = 0.6) +
  ggtitle("Monthly Charges vs. Tenure (DSL Customers)") +
  xlab("Tenure (Years)") +
  ylab("Monthly Charges ($)") +
  theme_minimal()