#2 Add new feature part 2

Open
opened 1 month ago by anpham · 0 comments
anpham commented 1 month ago
Owner
# Load necessary libraries
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)

# Telco churn preprocessing script.
# Reads the raw churn CSV, prints exploratory summaries, converts the
# categorical columns to factors, drops IQR outliers from the numeric columns,
# derives seven new features, and writes the processed table back to disk.

# Load dataset ----
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable. It is kept because the relative file names used for reading
# and writing below depend on it.
setwd('/home/anpham/Nextcloud/cpp/Data Mining/Project')
telco.df <- read.csv("Telco-Customer-Churn.csv")

# Display the first few rows
head(telco.df)

# Check structure of dataset
str(telco.df)

# Provides mean, sd, min, max, skewness, kurtosis
describe(telco.df)

# Convert categorical variables to factors ----
categorical_vars <- c("gender", "SeniorCitizen", "Partner", "Dependents",
                      "PhoneService", "MultipleLines", "InternetService",
                      "OnlineSecurity", "OnlineBackup", "DeviceProtection",
                      "TechSupport", "StreamingTV", "StreamingMovies",
                      "Contract", "PaperlessBilling", "PaymentMethod", "Churn")

telco.df[categorical_vars] <- lapply(telco.df[categorical_vars], factor)

# Make sure the numeric columns really are numeric ----
# read.csv() returns a character column as soon as one cell is not a number
# (for example a blank cell), and quantile() below would then fail.
# Blank cells become NA so they are reported by the missing-value checks.
# NOTE(review): presumably only TotalCharges is affected - confirm on the CSV.
num_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var in num_vars) {
  if (!is.numeric(telco.df[[var]])) {
    cell_text <- trimws(as.character(telco.df[[var]]))
    cell_text[!nzchar(cell_text)] <- NA
    # Any remaining non-numeric text triggers a coercion warning on purpose.
    telco.df[[var]] <- as.numeric(cell_text)
  }
}

# Summary statistics
summary(telco.df)

# Check missing values
sum(is.na(telco.df))
colSums(is.na(telco.df))

gg_miss_var(telco.df)  # Visualize missing values

# Remove outliers using IQR method ----
# A row is kept when its value lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The filters are applied one column after another, so the quartiles of each
# column are computed on the rows that survived the previous columns.
for (var in num_vars) {
  values <- telco.df[[var]]
  q1 <- quantile(values, 0.25, na.rm = TRUE)
  q3 <- quantile(values, 0.75, na.rm = TRUE)
  # Named iqr_width so that the stats::IQR() function is not shadowed.
  iqr_width <- q3 - q1
  lower_fence <- q1 - 1.5 * iqr_width
  upper_fence <- q3 + 1.5 * iqr_width
  # Rows with a missing value are not outliers and are kept unchanged.
  # Indexing with an NA in the logical mask would instead replace the whole
  # row by NAs.
  keep_row <- is.na(values) | (values >= lower_fence & values <= upper_fence)
  telco.df <- telco.df[keep_row, , drop = FALSE]
}

# Create new features ----
# 1. Tenure Segments: right-closed bins (-Inf, 6], (6, 12], (12, 24],
#    (24, 48], (48, Inf) on the tenure column.
telco.df$TenureSegment <- cut(
  telco.df$tenure,
  breaks = c(-Inf, 6, 12, 24, 48, Inf),
  labels = c("New", "Recent", "1-2 Years", "2-4 Years", "Loyal")
)

# 2. Cost Per Year: total charges divided by tenure expressed in years;
#    rows with zero tenure fall back to twelve times the monthly charge.
telco.df$CostPerYear <- with(
  telco.df,
  ifelse(tenure > 0, TotalCharges / (tenure / 12), MonthlyCharges * 12)
)

# 3. Contract Type Encoding: 1 for month-to-month contracts, 0 otherwise.
telco.df$IsMonthToMonth <- ifelse(telco.df$Contract == "Month-to-month", 1, 0)

# 4. Has Multiple Services: despite the name this is a count (0-6) of the
#    add-on service columns whose value is "Yes".
service_vars <- c("OnlineSecurity", "OnlineBackup", "DeviceProtection",
                  "TechSupport", "StreamingTV", "StreamingMovies")
telco.df$HasMultipleServices <- rowSums(telco.df[, service_vars] == "Yes")

# 5. Payment Method Type: 1 when the payment method is "Electronic check".
telco.df$IsElectronicPayment <- ifelse(
  telco.df$PaymentMethod == "Electronic check", 1, 0
)

# 6. Senior Citizen Flag: "Yes"/"No" label derived from SeniorCitizen == 1.
telco.df$IsSenior <- ifelse(telco.df$SeniorCitizen == 1, "Yes", "No")

# 7. Engagement Score: product of tenure, service count and monthly charge.
telco.df$EngagementScore <- with(
  telco.df,
  tenure * HasMultipleServices * MonthlyCharges
)

# Save processed dataset ----
write.csv(telco.df, "Processed-Telco-Customer-Churn.csv", row.names = FALSE)

# Final data check
head(telco.df)
```
# Telco churn preprocessing script.
# Reads the raw churn CSV, prints exploratory summaries, converts the
# categorical columns to factors, drops IQR outliers from the numeric columns,
# derives seven new features, and writes the processed table back to disk.

# Load necessary libraries ----
library(forecast)
library(ggplot2)
library(gplots)
library(reshape)
library(GGally)
library(MASS)
library(naniar)
library(psych)

# Load dataset ----
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable. It is kept because the relative file names used for reading
# and writing below depend on it.
setwd('/home/anpham/Nextcloud/cpp/Data Mining/Project')
telco.df <- read.csv("Telco-Customer-Churn.csv")

# Display the first few rows
head(telco.df)

# Check structure of dataset
str(telco.df)

# Provides mean, sd, min, max, skewness, kurtosis
describe(telco.df)

# Convert categorical variables to factors ----
categorical_vars <- c("gender", "SeniorCitizen", "Partner", "Dependents",
                      "PhoneService", "MultipleLines", "InternetService",
                      "OnlineSecurity", "OnlineBackup", "DeviceProtection",
                      "TechSupport", "StreamingTV", "StreamingMovies",
                      "Contract", "PaperlessBilling", "PaymentMethod", "Churn")

telco.df[categorical_vars] <- lapply(telco.df[categorical_vars], factor)

# Make sure the numeric columns really are numeric ----
# read.csv() returns a character column as soon as one cell is not a number
# (for example a blank cell), and quantile() below would then fail.
# Blank cells become NA so they are reported by the missing-value checks.
# NOTE(review): presumably only TotalCharges is affected - confirm on the CSV.
num_vars <- c("tenure", "MonthlyCharges", "TotalCharges")
for (var in num_vars) {
  if (!is.numeric(telco.df[[var]])) {
    cell_text <- trimws(as.character(telco.df[[var]]))
    cell_text[!nzchar(cell_text)] <- NA
    # Any remaining non-numeric text triggers a coercion warning on purpose.
    telco.df[[var]] <- as.numeric(cell_text)
  }
}

# Summary statistics
summary(telco.df)

# Check missing values
sum(is.na(telco.df))
colSums(is.na(telco.df))

gg_miss_var(telco.df)  # Visualize missing values

# Remove outliers using IQR method ----
# A row is kept when its value lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The filters are applied one column after another, so the quartiles of each
# column are computed on the rows that survived the previous columns.
for (var in num_vars) {
  values <- telco.df[[var]]
  q1 <- quantile(values, 0.25, na.rm = TRUE)
  q3 <- quantile(values, 0.75, na.rm = TRUE)
  # Named iqr_width so that the stats::IQR() function is not shadowed.
  iqr_width <- q3 - q1
  lower_fence <- q1 - 1.5 * iqr_width
  upper_fence <- q3 + 1.5 * iqr_width
  # Rows with a missing value are not outliers and are kept unchanged.
  # Indexing with an NA in the logical mask would instead replace the whole
  # row by NAs.
  keep_row <- is.na(values) | (values >= lower_fence & values <= upper_fence)
  telco.df <- telco.df[keep_row, , drop = FALSE]
}

# Create new features ----
# 1. Tenure Segments: right-closed bins (-Inf, 6], (6, 12], (12, 24],
#    (24, 48], (48, Inf) on the tenure column.
telco.df$TenureSegment <- cut(
  telco.df$tenure,
  breaks = c(-Inf, 6, 12, 24, 48, Inf),
  labels = c("New", "Recent", "1-2 Years", "2-4 Years", "Loyal")
)

# 2. Cost Per Year: total charges divided by tenure expressed in years;
#    rows with zero tenure fall back to twelve times the monthly charge.
telco.df$CostPerYear <- with(
  telco.df,
  ifelse(tenure > 0, TotalCharges / (tenure / 12), MonthlyCharges * 12)
)

# 3. Contract Type Encoding: 1 for month-to-month contracts, 0 otherwise.
telco.df$IsMonthToMonth <- ifelse(telco.df$Contract == "Month-to-month", 1, 0)

# 4. Has Multiple Services: despite the name this is a count (0-6) of the
#    add-on service columns whose value is "Yes".
service_vars <- c("OnlineSecurity", "OnlineBackup", "DeviceProtection",
                  "TechSupport", "StreamingTV", "StreamingMovies")
telco.df$HasMultipleServices <- rowSums(telco.df[, service_vars] == "Yes")

# 5. Payment Method Type: 1 when the payment method is "Electronic check".
telco.df$IsElectronicPayment <- ifelse(
  telco.df$PaymentMethod == "Electronic check", 1, 0
)

# 6. Senior Citizen Flag: "Yes"/"No" label derived from SeniorCitizen == 1.
telco.df$IsSenior <- ifelse(telco.df$SeniorCitizen == 1, "Yes", "No")

# 7. Engagement Score: product of tenure, service count and monthly charge.
telco.df$EngagementScore <- with(
  telco.df,
  tenure * HasMultipleServices * MonthlyCharges
)

# Save processed dataset ----
write.csv(telco.df, "Processed-Telco-Customer-Churn.csv", row.names = FALSE)

# Final data check
head(telco.df)
```
anpham started working 1 month ago
Sign in to join this conversation.
No Label
No Milestone
No project
No Assignees
1 Participant
Notifications
Due Date

No due date set.

Dependencies

This issue currently doesn't have any dependencies.

Loading…
There is no content yet.