anpham
commented 1 month ago
Owner
```
# Load necessary libraries
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)
# ---- Import the raw data ----
# NOTE: hard-coded, machine-specific project folder; the relative file names
# used by this script are resolved against it.
setwd('/home/anpham/Nextcloud/cpp/Data Mining/Project')
telco.df <- read.csv(file = "Telco-Customer-Churn.csv")

# ---- First look at the data ----
# Preview the top six rows (the default for head()).
head(x = telco.df, n = 6L)
# Column types plus a sample of values for every variable.
str(object = telco.df)
# Descriptive statistics: mean, sd, min, max, skewness, kurtosis.
describe(telco.df)
# ---- Categorical columns -> factors ----
categorical_vars <- c(
  "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
  "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
  "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
  "Contract", "PaperlessBilling", "PaymentMethod", "Churn"
)
# Re-encode each listed column in place; all other columns are left untouched.
telco.df[categorical_vars] <- Map(factor, telco.df[categorical_vars])

# ---- Overview of every column ----
summary(object = telco.df)

# ---- Missing values ----
# Overall NA count first, then the per-column breakdown, then a plot.
sum(is.na(x = telco.df))
colSums(x = is.na(x = telco.df))
gg_miss_var(telco.df) # Visualize missing values
# Remove outliers using the IQR method: a row is dropped when a numeric
# variable lies more than 1.5 * IQR below the first or above the third quartile.
# Variables are processed one after another, so the quartiles of each variable
# are computed on the rows that survived the filters of the previous ones.
num_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var in num_vars) {
  values <- telco.df[[var]]
  Q1 <- quantile(values, 0.25, na.rm = TRUE)
  Q3 <- quantile(values, 0.75, na.rm = TRUE)
  # Named iqr_width (not IQR) so the stats::IQR() function is not shadowed.
  iqr_width <- Q3 - Q1
  lower_fence <- Q1 - 1.5 * iqr_width
  upper_fence <- Q3 + 1.5 * iqr_width
  # A missing value cannot be classified as an outlier, so such rows are kept.
  # The is.na() guard is essential: an NA inside a logical row index makes `[`
  # return a row consisting entirely of NA, which would silently blank out
  # every column of each record that has a missing value in `var`.
  keep <- is.na(values) | (values >= lower_fence & values <= upper_fence)
  telco.df <- telco.df[keep, , drop = FALSE]
}
# ---- Feature engineering ----
# 1. Tenure Segments: bucket tenure into five right-closed intervals,
#    (-Inf, 6], (6, 12], (12, 24], (24, 48] and (48, Inf].
telco.df[["TenureSegment"]] <- cut(
  telco.df[["tenure"]],
  breaks = c(-Inf, 6, 12, 24, 48, Inf),
  labels = c("New", "Recent", "1-2 Years", "2-4 Years", "Loyal")
)

# 2. Cost Per Year: total charges divided by tenure expressed in years
#    (tenure / 12); rows whose tenure is not positive fall back to twelve
#    times the monthly charge.
telco.df[["CostPerYear"]] <- ifelse(
  telco.df[["tenure"]] > 0,
  telco.df[["TotalCharges"]] / (telco.df[["tenure"]] / 12),
  telco.df[["MonthlyCharges"]] * 12
)

# 3. Contract Type Encoding: 1 for "Month-to-month" contracts, otherwise 0.
telco.df[["IsMonthToMonth"]] <- ifelse(
  telco.df[["Contract"]] == "Month-to-month", 1, 0
)

# 4. Has Multiple Services: despite the name this is a count, per row, of the
#    six listed add-on services whose value is "Yes".
telco.df[["HasMultipleServices"]] <- rowSums(
  telco.df[c(
    "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies"
  )] == "Yes"
)

# 5. Payment Method Type: 1 when paying by "Electronic check", otherwise 0.
telco.df[["IsElectronicPayment"]] <- ifelse(
  telco.df[["PaymentMethod"]] == "Electronic check", 1, 0
)

# 6. Senior Citizen Flag: "Yes"/"No" label derived from SeniorCitizen == 1.
telco.df[["IsSenior"]] <- ifelse(telco.df[["SeniorCitizen"]] == 1, "Yes", "No")

# 7. Engagement Score: product of tenure, the service count from step 4 and
#    the monthly charge.
telco.df[["EngagementScore"]] <- telco.df[["tenure"]] *
  telco.df[["HasMultipleServices"]] *
  telco.df[["MonthlyCharges"]]
# ---- Persist the processed data ----
# Written relative to the current working directory, without a row-name column.
write.csv(
  x = telco.df,
  file = "Processed-Telco-Customer-Churn.csv",
  row.names = FALSE
)
# Final data check: preview the top six rows of the processed data frame.
head(x = telco.df, n = 6L)
```