|
@ -1,25 +1,104 @@ |
|
|
# Read the West Roxbury housing data from the current working directory.
house.df <- read.csv(file = "WestRoxbury.csv")

# Collect the row names of the data frame (a character vector). They serve as
# the pool of row identifiers that the partitioning step draws from.
house.rows <- rownames(house.df)
|
|
|
|
|
|
|
|
|
|
|
# Randomly split the row identifiers into 10 disjoint folds.
# Fold k requests 1 / (10 - k + 1) of the identifiers that are still
# unassigned, so the final fold takes everything that is left and house.rows
# is empty once the loop has finished.
set.seed(100)  # fixed seed keeps the partition reproducible
house.rows.list <- vector("list", 10L)
for (fold in seq_len(10)) {
  folds_left <- 10 - fold + 1
  drawn <- sample(house.rows, length(house.rows) / folds_left)
  house.rows.list[[fold]] <- drawn
  # Remove the identifiers just drawn so later folds cannot reuse them.
  house.rows <- setdiff(house.rows, drawn)
}
|
|
|
|
|
|
|
|
|
|
|
# Build the data subsets: one data frame per fold, containing the rows of
# house.df selected by that fold's row identifiers.
house.df.list <- lapply(
  house.rows.list,
  function(fold_rows) house.df[fold_rows, ]
)
|
|
|
|
|
|
|
|
|
|
|
# Print the number of observations (rows) held by each of the 10 subsets.
for (subset_id in seq_len(10)) {
  n_obs <- nrow(house.df.list[[subset_id]])
  print(paste("Number of observations in subset", subset_id, "=", n_obs))
}
|
|
|
|
|
|
|
|
# Load necessary libraries |
|
|
|
|
|
library(forecast) |
|
|
|
|
|
library(ggplot2) |
|
|
|
|
|
library(gplots) |
|
|
|
|
|
library(reshape) |
|
|
|
|
|
library(GGally) |
|
|
|
|
|
library(MASS) |
|
|
|
|
|
library(naniar) |
|
|
|
|
|
library(psych) |
|
|
|
|
|
|
|
|
|
|
|
# Load the Telco customer-churn dataset.
# The project directory is an absolute path on the original author's machine.
# Switch to it only when it exists; otherwise stay in the current working
# directory and warn, so the script can still run from any folder that holds
# the CSV instead of aborting inside setwd().
project_dir <- "/home/anpham/Nextcloud/cpp/Data Mining/Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Project directory not found: ", project_dir,
    "; reading Telco-Customer-Churn.csv from ", getwd(),
    call. = FALSE
  )
}

# stringsAsFactors = FALSE keeps text columns as character vectors; the
# categorical ones are converted to factors explicitly later in the script.
telco.df <- read.csv("Telco-Customer-Churn.csv", stringsAsFactors = FALSE)
|
|
|
|
|
|
|
|
|
|
|
# Preview the first rows (6 is head()'s default row count).
head(telco.df, n = 6L)

# Compact overview of every column: type and leading values.
str(object = telco.df)

# Descriptive statistics for the data frame.
# NOTE(review): presumably psych::describe, since psych is the last package
# attached above -- confirm no other attached package masks describe().
describe(x = telco.df)
|
|
|
|
|
|
|
|
|
|
|
# Names of the columns that hold categorical information.
categorical_vars <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
  "Contract", "PaperlessBilling", "PaymentMethod", "Churn"
)

# Replace each of those columns with its factor version in a single assignment.
telco.df[categorical_vars] <- lapply(
  telco.df[categorical_vars],
  function(column) factor(column)
)
|
|
|
|
|
|
|
|
|
|
|
# Make TotalCharges numeric and fill the gaps with 0.
# Entries that as.numeric() cannot parse come back as NA; every NA is then
# replaced by 0, so the column ends up with no missing values.
telco.df$TotalCharges <- local({
  charges <- as.numeric(telco.df$TotalCharges)
  replace(charges, is.na(charges), 0)
})
|
|
|
|
|
|
|
|
|
|
|
# Derived feature: average spend per month of tenure.
# Customers with tenure > 0 get TotalCharges / tenure; for the others the
# division is skipped and MonthlyCharges is used as the value instead.
telco.df$AvgMonthlySpend <- ifelse(
  test = telco.df$tenure > 0,
  yes = telco.df$TotalCharges / telco.df$tenure,
  no = telco.df$MonthlyCharges
)
|
|
|
|
|
|
|
|
|
|
|
# Bin tenure into three labelled groups. cut() uses right-closed intervals by
# default, so: tenure <= 12 is "Short-Term", 12 < tenure <= 48 is "Mid-Term",
# and tenure > 48 is "Long-Term".
telco.df$TenureCategory <- cut(
  x = telco.df$tenure,
  breaks = c(-Inf, 12, 48, Inf),
  labels = c("Short-Term", "Mid-Term", "Long-Term")
)
|
|
|
|
|
|
|
|
|
|
|
# Count missing values: the overall total first, then the count per column.
# NOTE(review): this runs after the NA values in TotalCharges were replaced
# with 0 earlier in the script, so that column reports nothing missing here.
missing_mask <- is.na(telco.df)
sum(missing_mask)
colSums(missing_mask)

# Plot the amount of missing data per variable (naniar).
gg_miss_var(x = telco.df)
|
|
|
|
|
|
|
|
|
|
|
# Boxplot of MonthlyCharges, used to look for outliers.
ggplot(data = telco.df, mapping = aes(y = MonthlyCharges)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot of Monthly Charges") +
  theme_minimal()
|
|
|
|
|
|
|
|
|
|
|
# Boxplot of TotalCharges, used to look for outliers.
ggplot(data = telco.df, mapping = aes(y = TotalCharges)) +
  geom_boxplot(fill = "lightcoral") +
  labs(title = "Boxplot of Total Charges") +
  theme_minimal()
|
|
|
|
|
|
|
|
|
|
|
# Histograms of the two charge variables.
# The x expressions are written out in full because hist() derives its default
# x-axis label from the expression it is given.
hist(
  x = telco.df$TotalCharges,
  main = "Distribution of TotalCharges",
  col = "blue"
)
hist(
  x = telco.df$MonthlyCharges,
  main = "Distribution of Monthly Charges",
  col = "lightblue",
  border = "black"
)
|
|
|
|
|
|
|
|
|
|
|
# Kernel density estimate of MonthlyCharges (NA values dropped), drawn in red.
plot(
  x = density(telco.df$MonthlyCharges, na.rm = TRUE),
  main = "Density of MonthlyCharges",
  col = "red"
)
|
|
|
|
|
|
|
|
|
|
|
# Compare the distribution of MonthlyCharges across the levels of Churn,
# one box per level.
boxplot(
  MonthlyCharges ~ Churn,
  data = telco.df,
  col = c("red", "blue"),
  main = "Monthly Charges by Churn Status",
  xlab = "Churn (Yes/No)",
  ylab = "Monthly Charges"
)
|
|
|
|
|
|
|
|
|
|
|
# Scatter plot: tenure on the x-axis against MonthlyCharges on the y-axis,
# drawn with solid blue points (pch = 16).
plot(
  x = telco.df$tenure,
  y = telco.df$MonthlyCharges,
  main = "Tenure vs. Monthly Charges",
  xlab = "Tenure (Months)",
  ylab = "Monthly Charges ($)",
  pch = 16,
  col = "blue"
)
|
|
|
|
|
|
|
|
|
|
|
# Tenure for each Churn level, shown as jittered, semi-transparent points so
# overlapping observations remain visible; points are coloured by Churn.
ggplot(
  data = telco.df,
  mapping = aes(x = Churn, y = tenure, colour = Churn)
) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(
    title = "Tenure by Churn Status",
    x = "Churn (Yes/No)",
    y = "Tenure (Months)"
  ) +
  theme_minimal()
|
|
|
|
|
|
|
|
|
|
|
# Express tenure in years (the plot labels in this script treat tenure as
# months, hence the division by 12).
telco.df[["TenureYears"]] <- telco.df[["tenure"]] / 12

# Keep only the rows whose InternetService is "DSL", and only the two columns
# needed for plotting. Rows where the comparison is NA are dropped, matching
# what subset() does.
is_dsl <- telco.df$InternetService == "DSL"
dsl_data <- telco.df[
  is_dsl & !is.na(is_dsl),
  c("TenureYears", "MonthlyCharges"),
  drop = FALSE
]
|
|
|
|
|
|
|
|
|
|
|
# Monthly charges against tenure in years, restricted to the DSL subset.
ggplot(data = dsl_data, mapping = aes(x = TenureYears, y = MonthlyCharges)) +
  geom_point(colour = "blue", alpha = 0.6) +
  labs(
    title = "Monthly Charges vs. Tenure (DSL Customers)",
    x = "Tenure (Years)",
    y = "Monthly Charges ($)"
  ) +
  theme_minimal()
|
|
|
|
|
|
|
|
|
|
|
|