Compare commits

...

1 Commits
master ... an

Author SHA1 Message Date
  An Pham a1aa5ccff8 update Danniel Code 1 month ago
1 changed files with 104 additions and 25 deletions
Unified View
  1. +104
    -25
      Project/Solution-Coding-02-Data Mining Process.R

+ 104
- 25
Project/Solution-Coding-02-Data Mining Process.R View File

@ -1,25 +1,104 @@
# Load data
house.df <- read.csv("WestRoxbury.csv")
# Extract the row numbers (row.names() returns them as character strings)
house.rows <- row.names(house.df)
# Number of subsets to build. Defined once so that the sampling loop, the
# data split and the report below always agree with each other.
n.subsets <- 10
# Partition the row numbers into n.subsets (nearly) equal subsets.
# The seed is fixed so the random partition is reproducible.
set.seed(100)
# Preallocate the result instead of growing the list inside the loop
house.rows.list <- vector("list", n.subsets)
for (i in seq_len(n.subsets)) {
  # With (n.subsets - i + 1) subsets still to fill, this subset takes that
  # share of the rows not yet assigned. Integer division makes the rounding
  # down explicit instead of handing sample() a fractional size.
  subset.size <- length(house.rows) %/% (n.subsets - i + 1)
  house.rows.list[[i]] <- sample(house.rows, subset.size)
  # Drop the rows just drawn so that later subsets cannot pick them again
  house.rows <- setdiff(house.rows, house.rows.list[[i]])
}
# Partition the data into n.subsets subsets: one data frame per group of rows
house.df.list <- lapply(house.rows.list, function(rows) house.df[rows, ])
# Display the number of observations in each subset
for (i in seq_len(n.subsets)) {
  print(paste("Number of observations in subset", i,
              "=", nrow(house.df.list[[i]])))
}
# Load necessary libraries
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)
# Load dataset
# The file is located through an explicit path instead of setwd(), so running
# this script no longer changes the working directory of the R session.
# Set the TELCO_DATA_DIR environment variable to point at another folder; when
# it is unset or empty, the folder the script originally used is the default.
telco_data_dir <- Sys.getenv("TELCO_DATA_DIR")
if (!nzchar(telco_data_dir)) {
  telco_data_dir <- "/home/anpham/Nextcloud/cpp/Data Mining/Project"
}
telco_csv_path <- file.path(telco_data_dir, "Telco-Customer-Churn.csv")
# Fail early, with the full path in the message, when the file is missing
if (!file.exists(telco_csv_path)) {
  stop("Telco data file not found: ", telco_csv_path,
       " (set TELCO_DATA_DIR to the folder that contains it)",
       call. = FALSE)
}
# Keep text columns as character; the categorical ones are converted to
# factors explicitly further down
telco.df <- read.csv(telco_csv_path, stringsAsFactors = FALSE)
# Display first few rows
head(telco.df)
# Check structure
str(telco.df)
# Summary statistics
# NOTE(review): describe() is not base R; presumably psych::describe from the
# library(psych) call above -- confirm no other attached package masks it
describe(telco.df)
# Names of the columns to be stored as factors
categorical_vars <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
  "Contract", "PaperlessBilling", "PaymentMethod", "Churn"
)
# Recode those columns as factors, one column at a time
for (var_name in categorical_vars) {
  telco.df[[var_name]] <- factor(telco.df[[var_name]])
}
# Coerce TotalCharges to numeric, then replace every resulting NA with 0
total_charges <- as.numeric(telco.df$TotalCharges)
total_charges[is.na(total_charges)] <- 0
telco.df$TotalCharges <- total_charges
# New feature: average spend per month of tenure
spend_per_month <- telco.df$TotalCharges / telco.df$tenure
# Rows whose tenure is not above 0 would divide by zero, so they take the
# MonthlyCharges value instead
telco.df$AvgMonthlySpend <- ifelse(
  telco.df$tenure > 0,
  spend_per_month,
  telco.df$MonthlyCharges
)
# Bin tenure into three groups: up to 12, above 12 up to 48, and above 48
# (cut() closes each interval on the right by default)
tenure_breaks <- c(-Inf, 12, 48, Inf)
tenure_labels <- c("Short-Term", "Mid-Term", "Long-Term")
telco.df$TenureCategory <- cut(
  telco.df$tenure,
  breaks = tenure_breaks,
  labels = tenure_labels
)
# Flag every missing cell once, then report the total and the per-column counts
missing_flags <- is.na(telco.df)
sum(missing_flags)
colSums(missing_flags)
# Plot the amount of missing data per variable
gg_miss_var(telco.df)
# Boxplots of the two charge columns, used to look for outliers
monthly_charges_box <- ggplot(
  data = telco.df,
  mapping = aes(y = MonthlyCharges)
) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot of Monthly Charges") +
  theme_minimal()
monthly_charges_box

total_charges_box <- ggplot(
  data = telco.df,
  mapping = aes(y = TotalCharges)
) +
  geom_boxplot(fill = "lightcoral") +
  labs(title = "Boxplot of Total Charges") +
  theme_minimal()
total_charges_box
# Histograms of the two charge columns.
# The data argument is kept as the literal `telco.df$...` expression because
# hist() uses that text as its default x-axis label.
hist(telco.df$TotalCharges,
     main = "Distribution of TotalCharges",
     col = "blue")
hist(telco.df$MonthlyCharges,
     main = "Distribution of Monthly Charges",
     col = "lightblue",
     border = "black")
# Density plot: estimate the density first (ignoring NAs), then draw it
monthly_density <- density(telco.df$MonthlyCharges, na.rm = TRUE)
plot(monthly_density, main = "Density of MonthlyCharges", col = "red")
# Monthly charges split by churn status, one box per Churn level
boxplot(MonthlyCharges ~ Churn,
        data = telco.df,
        col = c("red", "blue"),
        main = "Monthly Charges by Churn Status",
        xlab = "Churn (Yes/No)",
        ylab = "Monthly Charges")
# Tenure against monthly charges, drawn with solid blue points
with(
  telco.df,
  plot(tenure, MonthlyCharges,
       pch = 16, col = "blue",
       main = "Tenure vs. Monthly Charges",
       xlab = "Tenure (Months)",
       ylab = "Monthly Charges ($)")
)
# Tenure for each churn group, with points jittered so they overlap less
tenure_by_churn <- ggplot(
  data = telco.df,
  mapping = aes(x = Churn, y = tenure, color = Churn)
) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Tenure by Churn Status",
    x = "Churn (Yes/No)",
    y = "Tenure (Months)"
  )
tenure_by_churn
# Express tenure in years (the tenure column is labelled in months elsewhere)
telco.df$TenureYears <- telco.df$tenure / 12
# Keep only the rows whose InternetService is "DSL", and only the two columns
# that are plotted. which() drops rows where the comparison is NA, as
# subset() would.
dsl_rows <- which(telco.df$InternetService == "DSL")
dsl_data <- telco.df[dsl_rows, c("TenureYears", "MonthlyCharges"), drop = FALSE]
# Scatter plot for DSL customers
dsl_scatter <- ggplot(
  data = dsl_data,
  mapping = aes(x = TenureYears, y = MonthlyCharges)
) +
  geom_point(color = "blue", alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Monthly Charges vs. Tenure (DSL Customers)",
    x = "Tenure (Years)",
    y = "Monthly Charges ($)"
  )
dsl_scatter

Loading…
Cancel
Save