library(tidyverse)
library(caret)
library(knitr)
library(pscl)
library(plotROC)
library(pROC)
library(scales)
library(gridExtra)
library(kableExtra)
library(Rpdb)
library(stargazer) # Source: ^[Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables. R package version 5.2.2. https://CRAN.R-project.org/package=stargazer]
options(scipen=10000000)

root.dir =
paste0("https://raw.githubusercontent.com/urbanSpatial",
"/Public-Policy-Analytics-Landing/master/DATA/")
source(
paste0("https://raw.githubusercontent.com/urbanSpatial/",
"Public-Policy-Analytics-Landing/master/functions.r"))

palette5 <- c("#f9b294","#f2727f","#c06c86","#6d5c7e","#315d7f")
palette4 <- c("#f9b294","#f2727f","#c06c86","#6d5c7e")
palette2 <- c("#f9b294","#f2727f")
palette1_main <- "#F2727F"
palette1_assist <- '#F9B294'

plotTheme <- function(base_size = 35, title_size = 45) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(hjust = 0.5, size = title_size, colour = "black",face = "bold"), 
    plot.subtitle = element_text(hjust = 0.5,size=base_size,face="italic"),
    plot.caption = element_text(size=base_size,hjust=0),
    axis.ticks = element_blank(),
    panel.spacing = unit(6, 'lines'),
    panel.background = element_blank(),
    panel.grid.major = element_line("grey80", size = 0.1),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=6),
    strip.background = element_rect(fill = "grey80", color = "white"),
    strip.text = element_text(size=base_size),
    axis.title = element_text(face = "bold",size=base_size),
    axis.text = element_text(size=base_size),
    plot.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(size=base_size,colour = "black", face = "italic"),
    legend.text = element_text(size=base_size,colour = "black", face = "italic"),
    strip.text.x = element_text(size = base_size,face = "bold")
  )
}

customTwoClassSummary <- function(data, lev = NULL, model = NULL, positive = NULL, negative=NULL) 
{
  lvls <- levels(data$obs)
  if (length(lvls) > 2) 
    stop(paste("Your outcome has", length(lvls), "levels. The twoClassSummary() function isn't appropriate."))
  caret:::requireNamespaceQuietStop("ModelMetrics")
  if (!all(levels(data[, "pred"]) == lvls)) 
    stop("levels of observed and predicted data do not match")
  rocAUC <- ModelMetrics::auc(ifelse(data$obs == lev[2], 0, 
                                     1), data[, lvls[1]])
  out <- c(rocAUC, 
           # Only change happens here!
           sensitivity(data[, "pred"], data[, "obs"], positive=positive), 
           specificity(data[, "pred"], data[, "obs"], negative=negative))
  names(out) <- c("ROC", "Sens", "Spec")
  out
}

HCD <-
  read.csv(file="D:/OneDrive/WORK/Upenn/MUSA_508_Public_Policy_Analytics_Michael/Public-Policy-Analytics-Landing/DATA/Chapter6/housingSubsidy.csv") %>%
  rename(HCD_Outcome = y,HCD_Outcome_Num = y_numeric)%>%
  dplyr::select(-X)%>%
  na.omit()

HCD <- HCD%>%mutate(HCD_Outcome_Num = factor(HCD_Outcome_Num))
HCD <- HCD%>%mutate(campaign = as.character(campaign))
HCD <- HCD%>%mutate(pdays = as.character(pdays))

1 Motivation

Most organizations work on behalf of ‘clients’, a significant effect will be made toward the organizations if a client of them no long participate, donate or purchase a good. Therefore, prediction toward these risks should be made by data scientists to stem the negative exaggeration in the early stage.

Emil City is considering a more proactive approach for targeting home owners who qualify for a home repair tax credit program. This tax credit program has been around for close to 20 years, and while the Department of Housing and Community Development (HCD) tries to proactively reach out to eligible homeowners ever year, the uptake of the credit is woefully inadequate.

This project aims to develop a model to help Emil City better target home owners who are willing to participate in a home repair tax credit program and thereby achieve a better Cost-Benefit Balance and a improvement in the local community value. Using a set of records that includes a series of features of homeowners who did or did not participate in the program in the past, the model is trained to predict the future probability and classify the likely participants from the unlikely.

2 Data Exploration

To further train a better model, we need to firstly check the indicators we have. Only if we get and wrangle indicators in a better way, can we have the chance to built up a better model. In the following, numeric and categorical indicators are separately plot out and their distribution and the abilities to distinguish homeowners who take credit and not take credit are observed.

# Numeric Indicators
HCD %>%
  dplyr::select(HCD_Outcome,age, previous, unemploy_rate,cons.price.idx,cons.conf.idx,inflation_rate,spent_on_repairs) %>%
  gather(Variable, value, -HCD_Outcome) %>%
  ggplot(aes(HCD_Outcome, value, fill=HCD_Outcome)) +
  geom_bar(position ="dodge", stat ="summary", fun ="mean") +
  geom_hline(aes(yintercept = 0), colour = palette5[4],linetype = 6, size = 4)+
  facet_wrap(~Variable, scales = "free",ncol=4) +
  scale_fill_manual(values = palette2) +
  labs(x="HCD_Outcome", y="Mean",
       title ="Feature associations with the likelihood of Take_Credit",
       subtitle = "Continous outcomes") +
  plotTheme() + 
  theme(legend.position = "none")

# Numeric Indicators
HCD %>%
  dplyr::select(HCD_Outcome,age, previous, unemploy_rate,cons.price.idx,cons.conf.idx,inflation_rate,spent_on_repairs) %>%
  gather(Variable, value, -HCD_Outcome) %>%
  ggplot() +
  geom_density(aes(value, color=HCD_Outcome), fill = "transparent",size=4) + 
  facet_wrap(~Variable, scales = "free",ncol=4) +
  scale_color_manual(values = palette2) +
  labs(title ="Feature distributions Take_Credit vs. No_Credit",
       subtitle = "Continous outcomes") +
  plotTheme() + 
  theme(legend.position = "bottom")

These two graphs above illustrate the importance of seven numeric indicators. They are age, cons.conf.idx - Consumer confidence index at time of campaign, cons.price.idx - Consumer Price Idex at campaign time, inflation_rate, previous - # of contacts before this campaign for this ind, spent onrepairs, and unemployment rate.

We can tell from the graphs that ‘cons.conf.idx’ and ‘cons.price.idx’ have the ability to distinguish homeowners who take credit from homeowners who do not take credit in some intervals. These intervals are where we should exemplify the distinguishing effect.

The way to exemplify the distinguishing effect is that instead of using these variables as numeric indicators, we should convert them into categorical indicators. Because in this project, qualitative distinguish is vital. Using numeric variables may reduce its qualitative ability.

# Categorical Indicators with two categories,figwitdth shouldbe 48
HCD %>%
  dplyr::select(HCD_Outcome,job,marital,education,taxLien,mortgage,taxbill_in_phl,contact,month,day_of_week,poutcome,campaign,pdays) %>%
  gather(Variable, value, -HCD_Outcome) %>%
  count(Variable, value, HCD_Outcome) %>%
  ggplot(aes(value, n, fill = HCD_Outcome)) +
  geom_bar(position = "fill", stat="identity") +
  facet_wrap(~Variable, scales = "free", ncol=4) +
  scale_fill_manual(values = palette2) +
  labs(x=" ", y="Count",
       title = "Feature distributions Take_Credit vs. No_Credit",
       subtitle = "Categorical features with two categories") +
  plotTheme() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
                                   legend.position = "bottom")

The above graph displays correlation between 12 categorical features and “Take Credit” and “No Take Credit”. These variables are campaign - # of contacts for this ind for this campaign , contact - How have we previously contacted individual? , day_of_week - Day of the week we last contacted individual, education - Educational attainment, job, marital, month - Month we last contacted individual, mortgage - Is the owner carrying a mortgage, pdays - # days after ind. last contacted from a previous program, poutcome - Outcome of the previous marketing campaign, taxbill_in_phl - Is the owner’s full-time residence not in Philadelphia, taxLien - Is there a lien against the owner’s property?.

From these graphs, we can say ‘day_of_work’, and ‘taxbill_in_phl’ do not have abilities to distinguish these two types of people. And these graphs also tell us that ‘campaign’, ‘education’, ‘month’, and ‘pdays’ have a good discrimination ability among these two types of people as long as we further engineer these features. The engineering process will be carried out in the next section.

3 Modeling

A Logit model takes a dummy (two-category variable of either TRUE or FALSE) as the dependent variable. It classifies each observation into two categories - in this case, either taking credit or not taking credit. The objective of the model is to correctly classify those who will take credit as TRUE and, at the same time, not to falsely identify those who will not take credit.

The data-set is separated into a training set containing 65% of the observations and a test set containing 35% of the observations. We are using this way (test data which our training models have never seen) to test generalities. In the end, we are using K-fold cross-validation to have analyses on our baseline model and optimized model.

3.1 Initial Model

This section is divided into two parts. The first part is directly using raw metadata to run the logit regression and obtaining a baseline model, which we will use in the further to testify the effect of our model engineering.

set.seed(666)
trainIndex <- createDataPartition(y=paste(HCD$HCD_Outcome,HCD$education,HCD$taxLien,HCD$campaign,HCD$pdays),
                                  p = .65,
                                  list = FALSE,
                                  times = 1)
HCDTrain <- HCD[ trainIndex,]
HCDTest <- HCD[-trainIndex,]
HCD_regIni <- glm(HCD_Outcome_Num ~ .,data=HCDTrain%>%dplyr::select(-HCD_Outcome),family="binomial" (link="logit"))

testProbs <- data.frame(Outcome =as.factor(HCDTest$HCD_Outcome_Num),Probs = predict(HCD_regIni, HCDTest,type= "response"))
testProbs <-testProbs %>%mutate(predOutcome = as.factor(ifelse(testProbs$Probs > 0.5,1,0)))

3.2 Variable Engineering Model

The second part is using after-transformation indicators to rerun the logit regression model and compare effect of these two regression models.

As mentioned above, different numeric and categorical variables are engineered to exemplify their prediction ability. ‘Campaign’ predictor is categorized into three tiers, ‘education’ is divided into two tiers, ‘month’ is divided into four parts, while ‘pdays’ is divided into three tiers.

At the same time, numeric variables are also transformed. ‘Age’ is divided into 4 categories, transformed from numeric to categorical variables. ‘cons.conf.idx’ is categorized into six tiers, while “cons.price.idx” is divided into six variables.

# Categorical variables
campaign_keep <- c("1","2","3","4")
campaign_keep2 <- c("5","6","7","8","9","10","11")
edu_tier1 <- c("professional.course","university.degree","unknown")
month_tier1 <- c("apr","may","jun","jul","aug","nov")
month_tier2 <- c("dec","mar")
month_tier3 <- c("oct","sep")
pdays_t1 <- c("1","11","13","14","16","17")
pdays_t2 <- c("0","5","19","21")

HCD_VarEng <-HCD%>%
  mutate(HCD, campaign =ifelse(campaign %in% campaign_keep,"tier1",
                       ifelse(campaign %in%campaign_keep2,"tier2","others")),
  education=ifelse(education == "illiterate","illiterate",
                   ifelse(education %in% edu_tier1,"above_uni","others")),
  month=ifelse (month %in% month_tier1,"tier1",
                ifelse(month %in% month_tier2,"tier2","others")),
  pdays=ifelse(pdays %in% pdays_t1,"tier1",
               ifelse(pdays %in% pdays_t2,"tier2",pdays))
  )%>%
  dplyr::select(-day_of_week,-taxbill_in_phl)

HCD_VarEng2 <- HCD_VarEng %>%
   mutate(age=case_when(age < 30 ~ "age30",
                      age >= 30 & age < 50 ~ "age30-50",
                      age >= 50 & age < 65 ~ "age50-65",
                      age >= 65 ~ "age65+"),
          cons.conf.idx = case_when(
      cons.conf.idx < -50 ~ "conf_50-",
      cons.conf.idx >= -50 & cons.conf.idx < -45 ~ "conf-50to-45",
      cons.conf.idx >= -45 & cons.conf.idx < -40 ~ "conf-45to-40",
      cons.conf.idx >= -40 & cons.conf.idx < -35 ~ "conf-40to-35",
      cons.conf.idx >= -35 & cons.conf.idx < -30 ~ "conf-35to-30",
      cons.conf.idx >= -30 & cons.conf.idx < -25 ~ "conf-30to-25"),
      cons.price.idx=case_when(
      cons.price.idx < 93 ~ "price93-",
      cons.price.idx >= 93 & cons.price.idx < 93.6 ~ "price93-93.5",
      cons.price.idx >= 93.6 & cons.price.idx < 93.8 ~ "price93.5-94",
      cons.price.idx >= 93.8 & cons.price.idx < 94.2 ~ "price94-94.5",
      cons.price.idx >= 94.2 ~ "price94.5+"))

set.seed(666)
trainIndex <- createDataPartition(y=paste(HCD_VarEng2$HCD_Outcome,
                                          HCD_VarEng2$education,
                                          HCD_VarEng2$taxLien,
                                          HCD_VarEng2$campaign,
                                          HCD_VarEng2$pdays),
                                  p = .65,
                                  list = FALSE,
                                  times = 1)
HCDTrain <- HCD_VarEng2[ trainIndex,]
HCDTest <- HCD_VarEng2[-trainIndex,]
HCD_regVarEng <- glm(HCD_Outcome_Num ~ .,data=HCDTrain%>%dplyr::select(-HCD_Outcome),family="binomial" (link="logit"))

testProbs2 <- data.frame(Outcome =as.factor(HCDTest$HCD_Outcome_Num),Probs = predict(HCD_regVarEng, HCDTest,type= "response"))
testProbs2 <-testProbs2 %>%mutate(predOutcome = as.factor(ifelse(testProbs2$Probs > 0.5,1,0)))

In this process, the method to apply the transformation and the exact interval is obscure. Much effort are put into attempts, considering which kind of division can effectively improve prediction accuracy. In the next part, the outcomes of two regressions are put into one summary table to compare prediciton effect of these two regression model.

stargazer(HCD_regIni,HCD_regVarEng, type="text",
          digits = 1,
          title = "Table 1. Summary statistics",
          out.header = TRUE,
          column.labels = c("Before Feature Engineering", "After Feature Engineering"),
          single.row = TRUE,
          flip = FALSE)
   
   Table 1. Summary statistics
   =================================================================================
                                                Dependent variable:                 
                                ----------------------------------------------------
                                                  HCD_Outcome_Num                   
                                Before Feature Engineering After Feature Engineering
                                           (1)                        (2)           
   ---------------------------------------------------------------------------------
   age                                0.03*** (0.01)                                
   ageage30-50                                                     0.3 (0.2)        
   ageage50-65                                                    0.7** (0.3)       
   ageage65+                                                       0.1 (0.5)        
   jobblue-collar                       -0.2 (0.3)                -0.3 (0.2)        
   jobentrepreneur                      -0.4 (0.4)                -0.9 (0.5)        
   jobhousemaid                         0.2 (0.4)                 -0.1 (0.5)        
   jobmanagement                        -0.5 (0.3)                -0.6* (0.3)       
   jobretired                          -0.7* (0.4)                0.05 (0.4)        
   jobself-employed                     -0.6 (0.4)                -0.4 (0.4)        
   jobservices                          0.1 (0.3)                 -0.04 (0.3)       
   jobstudent                          -0.04 (0.4)                0.04 (0.4)        
   jobtechnician                        0.1 (0.2)                  0.1 (0.2)        
   jobunemployed                        0.2 (0.4)                  0.2 (0.4)        
   jobunknown                           -0.5 (0.8)                -0.4 (0.8)        
   maritalmarried                      -0.02 (0.2)               -0.003 (0.2)       
   maritalsingle                        0.3 (0.3)                  0.1 (0.3)        
   maritalunknown                       0.4 (1.3)               -15.8 (1,613.8)     
   educationbasic.6y                    0.4 (0.4)                                   
   educationbasic.9y                    0.2 (0.3)                                   
   educationhigh.school                 0.1 (0.3)                                   
   educationilliterate               -16.4 (3,956.2)            -17.2 (3,956.2)     
   educationprofessional.course         0.2 (0.3)                                   
   educationuniversity.degree           0.3 (0.3)                                   
   educationunknown                     0.3 (0.4)                                   
   educationothers                                                -0.05 (0.2)       
   taxLienunknown                       0.02 (0.2)                0.01 (0.2)        
   taxLienyes                        -13.9 (3,956.2)            -14.4 (3,956.2)     
   mortgageunknown                      -0.3 (0.5)                -0.3 (0.5)        
   mortgageyes                          -0.1 (0.1)                -0.1 (0.1)        
   taxbill_in_phlyes                    -0.1 (0.2)                                  
   contacttelephone                   -0.9*** (0.3)              -1.3*** (0.3)      
   monthaug                             -0.5 (0.4)                                  
   monthdec                             0.8 (0.8)                                   
   monthjul                             -0.3 (0.3)                                  
   monthjun                             -0.2 (0.4)                                  
   monthmar                            1.8*** (0.6)                                 
   monthmay                            -0.7** (0.3)                                 
   monthnov                            -0.8** (0.4)                                 
   monthoct                            -1.0* (0.6)                                  
   monthsep                             -0.6 (0.6)                                  
   day_of_weekmon                       -0.2 (0.2)                                  
   day_of_weekthu                      0.005 (0.2)                                  
   day_of_weektue                       -0.2 (0.2)                                  
   day_of_weekwed                       0.2 (0.2)                                   
   campaign10                           0.02 (1.1)                                  
   campaign11                           0.1 (1.1)                                   
   campaign12                        -14.6 (1,005.1)                                
   campaign13                        -14.7 (1,302.3)                                
   campaign14                        -14.7 (1,589.3)                                
   campaign15                        -14.6 (2,797.0)                                
   campaign16                        -15.0 (1,477.6)                                
   campaign17                        -14.6 (1,076.0)                                
   campaign18                        -14.5 (3,956.2)                                
   campaign19                        -15.2 (2,763.9)                                
   campaign2                            -0.1 (0.2)                                  
   campaign22                        -14.6 (2,793.0)                                
   campaign23                        -15.0 (2,797.3)                                
   campaign24                        -14.9 (3,956.2)                                
   campaign27                        -14.9 (3,956.2)                                
   campaign29                        -15.0 (2,797.3)                                
   campaign3                            0.2 (0.2)                                   
   campaign35                        -15.2 (3,956.2)                                
   campaign4                            0.5* (0.3)                                  
   campaign5                            -0.2 (0.4)                                  
   campaign6                            -0.7 (0.6)                                  
   campaign7                           -2.0* (1.1)                                  
   campaign8                            0.1 (0.8)                                   
   campaign9                            -0.5 (1.0)                                  
   pdays1                            -32.9 (3,504.9)                                
   pdays10                           -15.0 (2,795.0)                                
   pdays11                           -32.3 (4,843.9)                                
   monthtier1                                                    -1.2** (0.5)       
   monthtier2                                                     1.5** (0.6)       
   campaigntier1                                                 14.7 (567.3)       
   campaigntier2                                                 14.2 (567.3)       
   pdays12                           -15.7 (2,795.0)              -0.6 (1.5)        
   pdays13                           -34.4 (3,880.4)                                
   pdays14                           -33.8 (4,843.9)                                
   pdays15                           -17.0 (2,795.0)              -1.9 (1.8)        
   pdays16                           -35.0 (3,950.7)                                
   pdays17                           -34.6 (4,843.9)                                
   pdays18                           -16.8 (2,795.0)              -1.6 (1.8)        
   pdays19                            1.6 (4,843.9)                                 
   pdays2                            -16.5 (2,795.0)              -1.8 (1.6)        
   pdays21                            -1.0 (4,843.9)                                
   pdays3                            -15.0 (2,795.0)               0.4 (1.1)        
   pdays4                            -16.6 (2,795.0)              -1.6 (1.2)        
   pdays5                             2.5 (3,255.1)                                 
   pdays6                            -16.3 (2,795.0)              -1.0 (1.1)        
   pdays7                            -15.2 (2,795.0)              -0.2 (1.3)        
   pdays9                            -18.4 (2,795.0)              -2.7 (1.7)        
   pdays999                          -17.4 (2,795.0)              -2.2* (1.2)       
   pdaystier1                                                   -18.7 (1,277.8)     
   pdaystier2                                                   17.3 (1,408.1)      
   previous                             0.2 (0.2)                 -0.2 (0.2)        
   poutcomenonexistent                 0.7** (0.3)                0.6* (0.3)        
   poutcomesuccess                     -0.03 (0.9)                 0.2 (0.8)        
   unemploy_rate                       -0.9* (0.5)               -0.8** (0.3)       
   cons.price.idx                       1.3 (0.8)                                   
   cons.conf.idx                       0.1* (0.03)                                  
   cons.price.idxprice93-93.5                                     0.5* (0.3)        
   cons.price.idxprice93.5-94                                    2.9*** (0.8)       
   cons.price.idxprice94-94.5                                    2.1*** (0.5)       
   cons.price.idxprice94.5+                                      3.0*** (0.7)       
   cons.conf.idxconf-35to-30                                       0.9 (0.6)        
   cons.conf.idxconf-40to-35                                      1.5** (0.7)       
   cons.conf.idxconf-45to-40                                       0.8 (0.6)        
   cons.conf.idxconf-50to-45                                       0.2 (0.7)        
   cons.conf.idxconf_50-                                           0.7 (1.1)        
   inflation_rate                       0.2 (0.4)                 -0.6 (0.4)        
   spent_on_repairs                   -0.001 (0.01)              0.01* (0.01)       
   Constant                          -95.9 (2,797.8)             -83.8 (568.5)      
   ---------------------------------------------------------------------------------
   Observations                           2,826                      2,707          
   Log Likelihood                         -775.6                    -704.7          
   Akaike Inf. Crit.                     1,739.3                    1,521.4         
   =================================================================================
   Note:                                                 *p<0.1; **p<0.05; ***p<0.01

From this fined table we can see that the engineered model has imoroved the logit regression model. On the one hand, it effectively reduces the number of variables, optimizing the model. On the other hand, the discrimination abilities of some variables has been improved (exemplified), making them have a significant effect on the Prediciton. At last, the lower Akaike Inf. Crit. indicates it is a better model.

3.3 Models Comparison

What about we plot the comparison and difference out to have a more intuitive contrast. In this seciton, the density distributions before and after feature engineering are plot. The K-fold cross-validation is used to have a comprehensive estimate of these two models. Besides, the ROC curve of after-engineering model is plot.

Dist_before <- ggplot(testProbs, aes(x = Probs, fill = as.factor(Outcome))) +
  geom_density(color=palette1_main,size=3) +
  facet_grid(Outcome ~ .) +
  scale_fill_manual(values = palette2) + xlim(0, 1) +
  labs(x = "HCD_Outcome", y = "Density of probabilities",title = "Distribution before Fearture Engineering") +
  plotTheme() + 
  theme(legend.position = "bottom")

Dist_after <-ggplot(testProbs2, aes(x = Probs, fill = as.factor(Outcome))) +
  geom_density(color=palette1_main,size=3) +
  facet_grid(Outcome ~ .) +
  scale_fill_manual(values = palette2) + xlim(0, 1) +
  labs(x = "HCD_Outcome", y = "",title = "Distribution after Fearture Engineering") +
  plotTheme() + 
  theme(legend.position = "bottom")

grid.arrange(ncol = 2,Dist_before,Dist_after)

From this density distribution graph, we can tell that the after-engineering model effectively skewed the ‘No take credit’ to left, further improve the ability of model to predict who is not willing to take the credit, indirectly improve the model to predict people who is going to have ‘Credit’. Because the benefit of finding one more homeowner who is willing to take credit is larger than the marketing cost to some customers. The after-engineering model can be told successfully.

ctrl <- trainControl(method = "cv", number = 100,
                     classProbs=TRUE,
                     summaryFunction=function(...) customTwoClassSummary(..., 
                                       positive = "yes", negative="no"))

cvFit_before <- train(HCD_Outcome ~ ., data = HCD %>%
                 dplyr::select(-HCD_Outcome_Num),
               method="glm", family="binomial",
               metric="ROC", trControl = ctrl)

cvFit_after <- train(HCD_Outcome ~ ., data = HCD_VarEng2 %>%
                 dplyr::select(-HCD_Outcome_Num),
               method="glm", family="binomial",
               metric="ROC", trControl = ctrl)

cvFit_before2 <- data.frame(matrix(unlist(lapply(cvFit_before[4], function(x) as.numeric(as.character(x)))), nrow=1,ncol = 7,byrow=TRUE))%>%
  gather()%>%
  mutate_if(is.numeric, round,digits=3)%>%
  spread(key,value)%>%
  mutate(Method="Before")%>%rename(ROC=X2,Specificity=X4,Sensitivity=X3)%>%dplyr::select(ROC,Sensitivity,Specificity,Method)

cvFit_after2 <- data.frame(matrix(unlist(lapply(cvFit_after[4], function(x) as.numeric(as.character(x)))), nrow=1,ncol = 7,byrow=TRUE))%>%
  gather()%>%
  mutate_if(is.numeric, round,digits=3)%>%
  spread(key,value)%>%
  mutate(Method="After")%>%rename(ROC=X2,Specificity=X4,Sensitivity=X3)%>%dplyr::select(ROC,Sensitivity,Specificity,Method)

cvFit_Outcome <- rbind(cvFit_before2,cvFit_after2)

kable(cvFit_Outcome,align = 'c',caption = '<center>Table.Metric Comparison')%>%
  kable_classic(full_width = T)%>%
  kable_styling(position = "center")%>%
  footnote()
Table.Metric Comparison
ROC Sensitivity Specificity Method
0.761 0.238 0.981 Before
0.771 0.248 0.983 After
plot_before <- dplyr::select(cvFit_before$resample, -Resample) %>%
  gather(metric, value) %>%
  left_join(gather(cvFit_before$results[2:4], metric, mean)) %>%
ggplot(aes(value)) +
geom_histogram(bins=35, fill = palette1_main) +
  facet_wrap(~metric) +
  geom_vline(aes(xintercept = mean), colour = palette1_assist,linetype = 6, size = 5) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(x="Goodness of Fit", y="Count",
       title="CV Goodness of Fit Metrics",
       subtitle ="Across-fold mean reprented as dotted lines")+
  plotTheme()+
  theme(legend.position = "bottom")

plot_after <- dplyr::select(cvFit_after$resample, -Resample) %>%
  gather(metric, value) %>%
  left_join(gather(cvFit_after$results[2:4], metric, mean)) %>%
ggplot(aes(value)) +
geom_histogram(bins=35, fill = palette1_main) +
  facet_wrap(~metric) +
  geom_vline(aes(xintercept = mean), colour = palette1_assist,linetype = 6, size = 5) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(x="Goodness of Fit", y="Count",
       title="CV Goodness of Fit Metrics",
       subtitle ="Across-fold mean reprented as dotted lines")+
  plotTheme()+
  theme(legend.position = "bottom")

grid.arrange(ncol = 1,plot_before,plot_after)

In the above, six subplots are made to take a look at the outcome of the K-Fold Cross Validation. These indicators are ROC, Sensitivity and Specificity. From these specific metrics, we can tell that ROC is more concentrated so are sensitivity and specificity, even though the concentrated effect is nor so obvious.

AREA <- pROC::auc(testProbs$Outcome, testProbs$Probs)

ggplot(testProbs2, aes(d = as.numeric(testProbs2$Outcome),m = Probs)) +
  geom_roc(n.cuts = 50, labels = FALSE, colour = "#F2727F",size=7) +
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 5,color = '#F9B294') +
  geom_text(aes(label = paste("AUC =", round(AREA[1], 2))),x=-Inf, y=Inf, vjust = 1.5, hjust = -.1,size=15) +
  labs(title = "ROC Curve - Model after Feature Engineering")+
  plotTheme()+
  theme()

The ROC curve also support out opinions. The curve has a steeper slope in the left, which means at this interval, the true positive fraction will grow faster than false positive fraction, adding our total benefit. From this graph, we can also tell that when False positive fraction is around 10%, the marginal cost of True positive is lower. That means at this interval, we can obviously improve the effect of True Positive witout obviously adding the probability of false positive.

4 Cost Benefit Analysis

The above analysis is based on the assumption of 50% threshold, while the prediction comes to a price. Different thresholds can have a significantly different effect on the prediction result, which influences the model output. In this section, cost-benefit analysis is introduced to further optimize the model, choosing a optimal threshold to reduce the cost and increase the benefit. Let the model be optimized, taking into account of the cost-benefit.

4.1 Equation

A cost-benefit equation is used to optimize the threshold for the best outcome. As is estimated, marketing resources cost is $2,850 per homeowner, the credit costs $5,000 per homeowner, and houses that transacted after taking the credit has a average premium of $10,000, and surrounding the repaired home see an aggregate premium of $56,000, so the sum of $66,000 is considered as campaign benefit.

True Positive - Predicted correctly homeowner would take the credit; allocated the marketing resources, and they took the credit.

True Negative - Predicted correctly homeowner would not take the credit, no marketing resources were allocated, and no credit was allocated.

False Positive - Predicted incorrectly homeowner would take the credit; allocated marketing resources; no credit allocated.

False Negative - We predicted that a homeowner would not take the credit but they did. These are likely homeowners who signed up for reasons unrelated to the marketing campaign. Thus, we ‘0 out’ this category, assuming the cost/benefit of this is $0.

Table.Cost-Benefit Equation
Type Count Revenue..US.dollar. Explanation
True Positive Count_TP Count_TP * ((66,000 - 5,000)*0.25 - 2,850) Predicted correctly homeowner would take the credit; allocated the marketing resources, and they took the credit
True Negative Count_TN Count_TN * 0 Predicted correctly homeowner would not take the credit, no marketing resources were allocated, and no credit was allocated
False Positive Count_FP Count_FP * (- 2,850) Predicted incorrectly homeowner would take the credit; allocated marketing resources; no credit allocated
False Negative Count_FN Count_FN * 0 Predicted that a homeowner would not take the credit but they did. These are likely homeowners who signed up for reasons unrelated to the marketing campaign. Thus, we ‘0 out’ this category, assuming the cost/benefit of this is $0

4.2 Opatimal Threshold for Revenue

In this section, different thresholds will be test, and their related total revenue will be calculated and compared. In this way, we are going to find out a threshold that is suitable to our after-engineering model and can produce a realatively highest total revenue.

costbenefit_50_table <-testProbs2 %>%
  count(predOutcome, Outcome) %>%
  summarize(
    True_Positive = sum(n[predOutcome==1 & Outcome==1]),
    True_Negative = sum(n[predOutcome==0 & Outcome==0]),
    False_Positive = sum(n[predOutcome==1 & Outcome==0]),
    False_Negative = sum(n[predOutcome==0 & Outcome==1]),) %>%
  gather(Variable, Count) %>%
  mutate(Revenue =
           case_when(
             Variable == "True_Positive" ~ Count * 12400,
             Variable == "True_Negative" ~ 0,
             Variable == "False_Positive" ~ (-2850) * Count,
             Variable == "False_Negative" ~ 0)) %>%
  bind_cols(data.frame(Description = c("Predict:Take Credit ; In fact:Take Credit",
                                       "Predict:No Take Credit ; In fact:No Take Credit",
                                       "Predict:Take Credit ; In fact:No Take Credit",
                                       "Predict:No Take Credit ; In fact:Take Credit")))

costbenefit_50_kb <- kable(costbenefit_50_table,align = 'l',caption = '<center>Baseline:Revenue for 50% Threshold')%>%
  kable_classic(full_width = T)%>%
  kable_styling(position = "center",)%>%
  footnote()

column_spec(costbenefit_50_kb,1:4,width_min = "8em",width_max= "32em",extra_css = "vertical-align:middle;")
Baseline:Revenue for 50% Threshold
Variable Count Revenue Description
True_Positive 40 496000 Predict:Take Credit ; In fact:Take Credit
True_Negative 1235 0 Predict:No Take Credit ; In fact:No Take Credit
False_Positive 32 -91200 Predict:Take Credit ; In fact:No Take Credit
False_Negative 105 0 Predict:No Take Credit ; In fact:Take Credit
whichThreshold <-
  iterateThresholds(
    data=testProbs2, observedClass = Outcome,
    predictedProbs = Probs)

whichThreshold <-whichThreshold %>%
  dplyr::select(starts_with("Count"), Threshold) %>%
  gather(Variable, Count, -Threshold) %>%
  mutate(Revenue =
           case_when(
             Variable == "Count_TP" ~ Count * 12400,
             Variable == "Count_TN" ~ 0,
             Variable == "Count_FP" ~ (-2850) * Count,
             Variable == "Count_FN" ~ 0))
whichThreshold %>%
  ggplot(.,aes(Threshold, Revenue, colour = Variable)) +
  geom_point(size=6) +
  scale_colour_manual(values = palette4) +
  labs(title = "Revenue by confusion matrix type and threshold",
       subtitle = "Count_FN & Count_TN are coincided",
       y = "Revenue") +
  plotTheme() +
  theme(legend.position = "bottom")

  guides(colour=guide_legend(title = "Confusion Matrix"))
whichThreshold_revenue <-
  whichThreshold %>%
  mutate(Total_MarketingCredit_Count =ifelse(Variable == "Count_TP",Count,0),
         Total_Credit_Count=ifelse(Variable == "Count_TP",Count,
                                ifelse(Variable == "Count_FN",Count,0)),
         No_Credit_Count=ifelse(Variable == "Count_NP",Count,
                                ifelse(Variable == "Count_TN",Count,0))) %>%
  group_by(Threshold) %>%
  summarize(Total_Revenue = sum(Revenue),
            Total_MarketingCredit_Count = sum(Total_MarketingCredit_Count),
            Total_Credit_Count = sum(Total_Credit_Count),
            No_Credit_Count = sum(No_Credit_Count),
            Marketing_Credit_Rate=Total_MarketingCredit_Count/ sum(Count),
            No_Credit_Rate = No_Credit_Count/ sum(Count))
whichThreshold_revenue %>%
  dplyr::select(Threshold, Total_Revenue, Total_MarketingCredit_Count,No_Credit_Count,Marketing_Credit_Rate,No_Credit_Rate) %>%
  gather(Variable, Value, -Threshold) %>%
  ggplot(aes(Threshold, Value, colour = Variable)) +
  geom_point(size=6) +
  geom_vline(xintercept = pull(arrange(whichThreshold_revenue,
                                       -Total_Revenue)[1,1]),colour = palette5[4],size = 5) +
  scale_colour_manual(values = palette5) +
  facet_wrap(~Variable,scales = "free",ncol = 1)+
  labs(title = "Threshold as a function of Differnet Metrics",
       subtitle = "Purple Vertitle line is the best optimal threshold for biggest revenue")+
  plotTheme() +
  theme(legend.position = "bottom")

As shown above, total revenue increases when threshold is small, and slowly declines after the 14% threshold. Form here,we can know that a 14% threshold is the most suitable threshold for this after-engineering model. For it can take into account all revenues and loses and comprehensively obtain a reasonable high total revenue.

Besides, from the plots we can also know that the total count of credits that are affected by marking policies goes down as threshold goes down. The same as it rate of total credit account which includes the count of credits that are not affected by marketing policies.

Also, the total count of homeowners who do not take credits will significantly increase in the low threshold interval. They will arrive at a stationary period after the threshold reaches around 0.14.

4.3 Comparison between two Threshold

tablefinal <- filter(whichThreshold_revenue,Threshold==0.50 |Threshold==pull(arrange(whichThreshold_revenue,-Total_Revenue)[1,1]))%>%
  dplyr::select(Threshold,Total_Revenue,Total_MarketingCredit_Count,Total_Credit_Count)

tablefinal_kb <- kable(tablefinal,align = 'l',caption = '<center>Comparison between the Baseline Threshold and the Optimal Threshold')%>%
  kable_classic(full_width = T)%>%
  kable_styling(position = "center",)%>%
  footnote()

column_spec(tablefinal_kb,1:4,width_min = "10em",extra_css = "vertical-align:middle;")
Comparison between the Baseline Threshold and the Optimal Threshold
Threshold Total_Revenue Total_MarketingCredit_Count Total_Credit_Count
0.24 676200 72 145
0.50 404800 40 145

From the above table, we can have an intuitive understanding of the thresholds’ effect. From the table we can tell that our optimal threshold significantly increase our revenues, from $914,800 to $1,688,950, reachign out to more homeowners who have intentions to take credits.

5 Conclusion

From the outcome of this research, it is not difficult to draw the conclusion that this model should be put into markets. Because in the one hand, this data-driven approach improves business revenues relative to the business-as-usual approach in a great extent. Compared with a model wtih 0.5 threshold, this 0.24 threshold model brings an additional 67.0% return. On the other hand, it can effectively reach out to more potential homeowners. Compared to the business-as-usual approach which has a sensitivity of 0.15, this after-engineering model has a sensitivity of 0.27,increasing by 80%. The optimized model can predict more than a half of the credit takers, saving plenty of campaign resources

However, the limits also should not be ignored. Since we only used GLM regression in this project, we could not compare it with the effects of other regressions to find a more suitable method to implement this optimization. Besides, the data also limits the improvement. Due to the limited data linked with credit-taker, we can not further improve the sensitivity to have a better prediction on who is potential.

To make sure the marketing materials results in a better response. HCD should pay more attention to using mobile telecom, more frequently, to reach out to more potential credit takers, especially those property owners who have a lien against their property. At the same time, they should also pay attention to those homeowners who don’t have a contact for the campaign, those who spend much more money on house repair and those who are with jobs. In this way, could the marketing materials have a better response.