Incident Impact Prediction With Deployment

Business Objective :

To Predict The Impact Of The Incident Raised By The Customer.

Null Value Tableau Graph

Data Set Details:

If you need the data set, please email me.

train – this data set is provided for viewing purposes only.

Note: For data visualisation, I use Tableau software.

EDA

We have two data set files: test.csv and train.csv.

# Quick structural overview of the training data.
dim(train)    # number of rows and columns
names(train)  # column names
str(train)    # type of every column plus a preview of its values

Convert Logical To Factor.

# Convert every logical column of `train` and `test` to a factor.
#
# is.logical() always returns a single TRUE/FALSE, whereas
# class(x) == 'logical' returns one value per class and makes `if` fail on
# columns that carry several classes. Selecting the column positions up front
# also copes with a data frame that has no columns, where 1:ncol(df) would
# iterate over c(1, 0).
for (i in which(vapply(train, is.logical, logical(1)))) {
  train[[i]] <- as.factor(train[[i]])
}

for (i in which(vapply(test, is.logical, logical(1)))) {
  test[[i]] <- as.factor(test[[i]])
}

Data Cleaning & Imputation

# Frequency of each target class and of each incident status.
table(train$impact)
table(train$ID_status)

# Keep only the rows whose status differs from '-100'. Rows where the
# comparison is NA are dropped as well, and the result stays a data frame.
train <- train[which(train$ID_status != '-100'), , drop = FALSE]

Removing features with more than 50% missing data, and the timestamp features.

# Count the '?' entries per column of the training data, then keep every
# column except positions 9, 11, 13, 24 and 25.
colSums(train == '?')
train <- train[, setdiff(seq_along(train), c(9, 11, 13, 24, 25))]

# Same for the test data, where the dropped positions are 9, 11, 13, 23 and 24.
colSums(test == '?')
test <- test[, setdiff(seq_along(test), c(9, 11, 13, 23, 24))]

Imputing with the mode for features that have only a small number of missing records.

# Replace the '?' entries of three columns with a fixed value per column.
# The same replacement values are applied to the training and the test data.
train$ID_caller <- replace(train$ID_caller, train$ID_caller == '?', 'Caller 1904')
train$location <- replace(train$location, train$location == '?', 'Location 204')
train$category_ID <- replace(train$category_ID, train$category_ID == '?', 'Category 26')

test$ID_caller <- replace(test$ID_caller, test$ID_caller == '?', 'Caller 1904')
test$location <- replace(test$location, test$location == '?', 'Location 204')
test$category_ID <- replace(test$category_ID, test$category_ID == '?', 'Category 26')

Imputing with a decision tree for features that have a large number of missing records.

# Columns of the training data that still hold more than one '?', in
# ascending order of their '?' count.
sort(Filter(function(n) n > 1, colSums(train == '?')))

# Turn the remaining '?' entries into NA and discard factor levels that are
# no longer used.
train[train == '?'] <- NA
train <- droplevels(train)

# Impute the listed columns one after another; each call receives the data
# frame returned by the previous one.
# NOTE(review): `impute` is defined outside the visible code; the meaning of
# its second argument (16) should be confirmed against that definition.
train <- Reduce(
  function(df, target) impute(df, 16, target),
  c('opened_by', 'Support_group', 'support_incharge', 'user_symptom', 'Created_by'),
  train
)


# Same procedure for the test data.
sort(Filter(function(n) n > 1, colSums(test == '?')))
test[test == '?'] <- NA
test <- droplevels(test)
test <- Reduce(
  function(df, target) impute(df, 16, target),
  c('opened_by', 'Support_group', 'support_incharge', 'user_symptom'),
  test
)
# NOTE(review): this call passes 18 where every other call passes 16 -
# confirm that the difference is intentional.
test <- impute(test, 18, 'Created_by')

# End of imputation.

Load Imputed Train & Test Data

Then convert the logical columns of the newly imputed data to factors.

# Convert every logical column of the imputed `train` and `test` data to a
# factor.
#
# is.logical() always returns a single TRUE/FALSE, whereas
# class(x) == 'logical' returns one value per class and makes `if` fail on
# columns that carry several classes. Selecting the column positions up front
# also copes with a data frame that has no columns, where 1:ncol(df) would
# iterate over c(1, 0).
for (i in which(vapply(train, is.logical, logical(1)))) {
  train[[i]] <- as.factor(train[[i]])
}
for (i in which(vapply(test, is.logical, logical(1)))) {
  test[[i]] <- as.factor(test[[i]])
}

Feature Selection

I use Boruta feature selection.

# Fit the Boruta model with `impact` as the response and every other column
# as a candidate predictor.
boruta_output <- Boruta(
  impact ~ .,
  data = train,
  doTrace = 1,
  maxRuns = 15
)

# Save the importance statistics of each variable.
boruta_importance <- attStats(boruta_output)

# Plot the variable importance; las = 2 draws the axis labels perpendicular
# to the axis and cex.axis shrinks them.
plot(
  boruta_output,
  las = 2,
  cex.axis = 0.55,
  xlab = ""
)
attStats(boruta_output)

Boruta Feature Selection

Factor Variable Encoding

# Encode factor columns as numeric level codes. Train and test are stacked
# first so that the codes are derived from one combined set of levels.

# Set the target aside: it is not part of the test data and keeps its
# original labels. The column is dropped by name rather than by position so
# the step does not depend on where `impact` sits in the data frame.
train.impact <- train['impact']
train <- train[names(train) != 'impact']

# Tag every row with its origin so the stacked data can be split again.
train['df_type'] <- 'train'
test['df_type'] <- 'test'
ori_data <- rbind(train, test)
ori_data <- droplevels(ori_data)

# as.numeric() on a factor returns its integer level codes.
# is.factor() is used instead of class(x) == 'factor' because it yields a
# single TRUE/FALSE even for ordered factors, whose class has two elements.
for (i in which(vapply(ori_data, is.factor, logical(1)))) {
  ori_data[[i]] <- as.numeric(ori_data[[i]])
}

# Getting Train Dataset: recover the train rows, drop the helper column and
# re-attach the target.
train <- ori_data[ori_data$df_type == 'train', ]
train <- train[names(train) != 'df_type']
train['impact'] <- train.impact

# Getting Test Dataset: recover the test rows and drop the helper column.
test <- ori_data[ori_data$df_type == 'test', ]
test <- test[names(test) != 'df_type']

Sampling Data Before Model Building

# Split the training rows by impact class.
low <- subset(train, train$impact == '3 - Low')
medium <- subset(train, train$impact == '2 - Medium')
high <- subset(train, train$impact == '1 - High')

# Draw 94032 rows with replacement from the low and the high class; the
# medium class is used as it is.
# NOTE(review): 94032 is presumably the row count of the medium class -
# confirm against table(train$impact).
# `replace = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, which would silently change the sampling.
low <- low[sample(nrow(low), 94032, replace = TRUE), ]
high <- high[sample(nrow(high), 94032, replace = TRUE), ]
train <- rbind(low, medium, high)

Model Building

Decision Tree

library(C50)

# Settings passed to C5.0(): winnowing switched on, CF set to 1, fuzzy
# thresholds switched off, sample set to 0.999, and "impact" used as the
# outcome label.
controls <- C5.0Control(
  winnow = TRUE,
  CF = 1,
  fuzzyThreshold = FALSE,
  sample = 0.999,
  label = "impact"
)

# Fit on every column except the target. A logical mask is used instead of
# -which(...) because a negative empty index would select no columns at all
# if `impact` were missing.
train_model <- C5.0(
  train[, colnames(train) != 'impact'],
  train$impact,
  control = controls
)
summary(train_model)
plot(train_model)
train_pred <- predict(train_model, train)
# NOTE(review): `sample_train` is not defined in the visible code. The
# confusion matrix below compares predictions made on the previous `train`
# with the `impact` column of `sample_train` - confirm both hold the same
# rows in the same order.
train <- sample_train
# Accuracy
library(caret)
confusionMatrix(train_pred, train$impact)


# Predicting on test data

test_pred <- predict(train_model, newdata = test)
test_pred <- as.data.frame(test_pred)
# Number the predictions from the actual row count instead of a hard-coded
# 1:42514, which fails as soon as the test set has a different size.
test_pred['id'] <- seq_len(nrow(test_pred))
colnames(test_pred) <- c('prediction1', 'ID')
test_pred <- test_pred[c('ID', 'prediction1')]

Random Forest

library(randomForest)

# Fit a random forest on every column except the target. A logical mask is
# used instead of -which(...) because a negative empty index would select no
# columns at all if `impact` were missing.
train_rand_forest <- randomForest(
  train[, colnames(train) != 'impact'],
  train$impact,
  mtry = 7,
  ntree = 1500,
  importance = TRUE
)
summary(train_rand_forest)
train_rand_forest$importance
train_rand_pred <- predict(train_rand_forest, train)
# Accuracy
library(caret)
confusionMatrix(train_rand_pred, train$impact)

# Predicting on test data

test_rand_pred <- predict(train_rand_forest, test)
test_rand_pred <- as.data.frame(test_rand_pred)
# Number the predictions from the actual row count instead of a hard-coded
# 1:42514, which fails as soon as the test set has a different size.
test_rand_pred['id'] <- seq_len(nrow(test_rand_pred))
colnames(test_rand_pred) <- c('prediction1', 'ID')
test_rand_pred <- test_rand_pred[c('ID', 'prediction1')]

Deployment R-Shiny

R Shiny deployment at the link below.

READ Also  R for data science

R Shiny Code : Incident Impact Prediction

Python Deployment Using Flask

The model is deployed on Heroku; check the URL below.

Live Url : https://incident-impact-prediction.herokuapp.com

GitHub Full Code Link

Flask : https://github.com/jaydipkumar/flask

By Jay Patel

I completed my data science studies in 2018 at Innodatatics. I have 5 years of experience in data science, Python and R.