# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## Warning: As of rlang 0.4.0, dplyr must be at least version 0.8.0.
## x dplyr 0.7.5 is too old for rlang 0.4.1.
## i Please update dplyr to the latest version.
## i Updating packages on Windows requires precautions:
## <https://github.com/jennybc/what-they-forgot/issues/62>
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.5
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(janitor)
## Warning: package 'janitor' was built under R version 3.5.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(GGally)
## Warning: package 'GGally' was built under R version 3.5.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(stringr)
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.3
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
Load the data
df<-read_excel('HR_data.xlsx')
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
Check the dfa type
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1470 obs. of 35 variables:
## $ Age : num 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : num 1102 279 1373 1392 591 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : num 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : num 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : num 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : num 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr "Female" "Male" "Male" "Female" ...
## $ HourlyRate : num 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : num 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num 5 7 0 0 2 6 0 0 8 7 ...
check for nulls
is.null(df)
## [1] FALSE
No nulls exist in the dfaframe.
fsct_var<-c('BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime')
df_num<-df
df_num$Gender <- str_replace(df_num$Gender, "Female", "1")
df_num$Gender <- str_replace(df_num$Gender, "Male", "2")
df_num$Attrition <- str_replace(df_num$Attrition, "No", "0")
df_num$Attrition <- str_replace(df_num$Attrition, "Yes", "1")
df_num$BusinessTravel <- str_replace(df_num$BusinessTravel, "Non-Travel", "0")
df_num$BusinessTravel <- str_replace(df_num$BusinessTravel, "Travel_Frequently", "2")
df_num$BusinessTravel <- str_replace(df_num$BusinessTravel, "Travel_Rarely", "1")
df_num$Department <- str_replace(df_num$Department, "Human Resources", "1")
df_num$Department <- str_replace(df_num$Department, "Research & Development", "2")
df_num$Department <- str_replace(df_num$Department, "Sales", "3")
df_num$Department <- str_replace(df_num$Department, "Human Resources", "1")
df_num$Department <- str_replace(df_num$Department, "Research & Development", "2")
df_num$Department <- str_replace(df_num$Department, "Sales", "3")
df_num$EducationField <- str_replace(df_num$EducationField, "Human Resources", "1")
df_num$EducationField <- str_replace(df_num$EducationField, "Life Sciences", "2")
df_num$EducationField <- str_replace(df_num$EducationField, "Marketing", "3")
df_num$EducationField <- str_replace(df_num$EducationField, "Medical", "4")
df_num$EducationField <- str_replace(df_num$EducationField, "Other", "5")
df_num$EducationField <- str_replace(df_num$EducationField, "Technical Degree", "6")
df_num$JobRole <- str_replace(df_num$JobRole, "Healthcare Representative", "1")
df_num$JobRole <- str_replace(df_num$JobRole, "Human Resources", "2")
df_num$JobRole <- str_replace(df_num$JobRole, "Laboratory Technician", "3")
df_num$JobRole <- str_replace(df_num$JobRole, "Manager", "4")
df_num$JobRole <- str_replace(df_num$JobRole, "Manufacturing Director", "5")
df_num$JobRole <- str_replace(df_num$JobRole, "Research Director", "6")
df_num$JobRole <- str_replace(df_num$JobRole, "Research Scientist", "7")
df_num$JobRole <- str_replace(df_num$JobRole, "Sales Executive", "8")
df_num$JobRole <- str_replace(df_num$JobRole, "Sales Representative", "9")
df_num$MaritalStatus <- str_replace(df_num$MaritalStatus, "Divorced", "1")
df_num$MaritalStatus <- str_replace(df_num$MaritalStatus, "Married", "2")
df_num$MaritalStatus <- str_replace(df_num$MaritalStatus, "Single", "3")
df_num$Over18 <- str_replace(df_num$Over18, "Y", "1")
df_num$OverTime <- str_replace(df_num$OverTime, "No", "0")
df_num$OverTime <- str_replace(df_num$OverTime, "Yes", "1")
df_num <- df_num %>%
mutate_if(is.character,as.numeric)
## Warning: `lgl_len()` is deprecated as of rlang 0.2.0.
## Please use `new_logical()` instead.
## This warning is displayed once per session.
## Warning: `is_lang()` is deprecated as of rlang 0.2.0.
## Please use `is_call()` instead.
## This warning is displayed once per session.
## Warning: `lang()` is deprecated as of rlang 0.2.0.
## Please use `call2()` instead.
## This warning is displayed once per session.
## Warning: `mut_node_car()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
#df_new <- bind_cols(df_num, df_fact)
df_num %>%
select(fsct_var)%>%
cor(use = "pairwise.complete.obs") %>%
corrplot(method="number",type="upper")
## Warning: `new_overscope()` is deprecated as of rlang 0.2.0.
## Please use `new_data_mask()` instead.
## This warning is displayed once per session.
## Warning: `overscope_eval_next()` is deprecated as of rlang 0.2.0.
## Please use `eval_tidy()` with a data mask instead.
## This warning is displayed once per session.
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero
Keeping department and Jobrole
df_num %>%
select(-fsct_var)%>%
cor(use = "pairwise.complete.obs") %>%
corrplot(method="number",type="upper")
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero
df_num %>%
select(c("Age","JobLevel","MonthlyIncome","TotalWorkingYears","YearsAtCompany","YearsInCurrentRole","YearsSinceLastPromotion","YearsWithCurrManager"))%>%
cor(use = "pairwise.complete.obs") %>%
corrplot(method="number",type="upper")
The important variables that are being kept are Attrition,Age,JobLevel,MonthlyIncome,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion, Department ,JobRole and YearsWithCurrManager
Change some variables into factors
df$Attrition<-as.factor(df$Attrition)
levels(df$Attrition)<-c("0","1")
df$BusinessTravel<-as.factor(df$BusinessTravel)
df$Department<-as.factor(df$Department)
df$EducationField<-as.factor(df$EducationField)
df$Gender<-as.factor(df$Gender)
df$JobRole<-as.factor(df$JobRole)
df$MaritalStatus<-as.factor(df$MaritalStatus)
df$Over18<-as.factor(df$Over18)
df$OverTime<-as.factor(df$OverTime)
df$Education<-as.factor(df$Education)
df$JobSatisfaction<-as.factor(df$JobSatisfaction)
df$EnvironmentSatisfaction<-as.factor(df$EnvironmentSatisfaction)
df$JobLevel<-as.factor(df$JobLevel)
df$TrainingTimesLastYear<-as.factor(df$TrainingTimesLastYear)
df$TrainingSet <- ifelse(levels(df$TrainingTimesLastYear)==0,"0","1")
df$Training <- ifelse(levels(df$TrainingTimesLastYear)==0,"No","Yes")
Summary of the data
summary(df)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 0:1233 Non-Travel : 150 Min. : 102.0
## 1st Qu.:30.00 1: 237 Travel_Frequently: 277 1st Qu.: 465.0
## Median :36.00 Travel_Rarely :1043 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## Department DistanceFromHome Education
## Human Resources : 63 Min. : 1.000 1:170
## Research & Development:961 1st Qu.: 2.000 2:282
## Sales :446 Median : 7.000 3:572
## Mean : 9.193 4:398
## 3rd Qu.:14.000 5: 48
## Max. :29.000
##
## EducationField EmployeeCount EmployeeNumber
## Human Resources : 27 Min. :1 Min. : 1.0
## Life Sciences :606 1st Qu.:1 1st Qu.: 491.2
## Marketing :159 Median :1 Median :1020.5
## Medical :464 Mean :1 Mean :1024.9
## Other : 82 3rd Qu.:1 3rd Qu.:1555.8
## Technical Degree:132 Max. :1 Max. :2068.0
##
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement
## 1:284 Female:588 Min. : 30.00 Min. :1.00
## 2:287 Male :882 1st Qu.: 48.00 1st Qu.:2.00
## 3:453 Median : 66.00 Median :3.00
## 4:446 Mean : 65.89 Mean :2.73
## 3rd Qu.: 83.75 3rd Qu.:3.00
## Max. :100.00 Max. :4.00
##
## JobLevel JobRole JobSatisfaction MaritalStatus
## 1:543 Sales Executive :326 1:289 Divorced:327
## 2:534 Research Scientist :292 2:280 Married :673
## 3:218 Laboratory Technician :259 3:442 Single :470
## 4:106 Manufacturing Director :145 4:459
## 5: 69 Healthcare Representative:131
## Manager :102
## (Other) :215
## MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime
## Min. : 1009 Min. : 2094 Min. :0.000 Y:1470 No :1054
## 1st Qu.: 2911 1st Qu.: 8047 1st Qu.:1.000 Yes: 416
## Median : 4919 Median :14236 Median :2.000
## Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.: 8379 3rd Qu.:20462 3rd Qu.:4.000
## Max. :19999 Max. :26999 Max. :9.000
##
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## Min. :11.00 Min. :3.000 Min. :1.000
## 1st Qu.:12.00 1st Qu.:3.000 1st Qu.:2.000
## Median :14.00 Median :3.000 Median :3.000
## Mean :15.21 Mean :3.154 Mean :2.712
## 3rd Qu.:18.00 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :25.00 Max. :4.000 Max. :4.000
##
## StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## Min. :80 Min. :0.0000 Min. : 0.00 0: 54
## 1st Qu.:80 1st Qu.:0.0000 1st Qu.: 6.00 1: 71
## Median :80 Median :1.0000 Median :10.00 2:547
## Mean :80 Mean :0.7939 Mean :11.28 3:491
## 3rd Qu.:80 3rd Qu.:1.0000 3rd Qu.:15.00 4:123
## Max. :80 Max. :3.0000 Max. :40.00 5:119
## 6: 65
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :1.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 Median : 5.000 Median : 3.000
## Mean :2.761 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :4.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager TrainingSet
## Min. : 0.000 Min. : 0.000 Length:1470
## 1st Qu.: 0.000 1st Qu.: 2.000 Class :character
## Median : 1.000 Median : 3.000 Mode :character
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
## Training
## Length:1470
## Class :character
## Mode :character
##
##
##
##
Observations
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = Age))+
ggtitle('Age Distribution')+
labs(x = '', y = 'Age')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=Age))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
# grid.arrange(df %>% ggplot(mapping = aes(x = Age)) + geom_histogram(),
# df %>% ggplot(mapping = aes(x = 1, y = Age)) + geom_boxplot() + coord_flip(),
# ncol = 1)
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = MonthlyIncome))+
ggtitle('MonthlyIncome Distribution')+
labs(x = '', y = 'MonthlyIncome')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=MonthlyIncome))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = TotalWorkingYears))+
ggtitle('TotalWorkingYears Distribution')+
labs(x = '', y = 'TotalWorkingYears')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=TotalWorkingYears))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = YearsAtCompany))+
ggtitle('YearsAtCompany Distribution')+
labs(x = '', y = 'YearsAtCompany')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=YearsAtCompany))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = YearsInCurrentRole))+
ggtitle('YearsInCurrentRole Distribution')+
labs(x = '', y = 'YearsInCurrentRole')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=YearsInCurrentRole))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = YearsSinceLastPromotion))+
ggtitle('YearsSinceLastPromotion Distribution')+
labs(x = '', y = 'YearsSinceLastPromotion')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=YearsSinceLastPromotion))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
grid.arrange(ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = YearsWithCurrManager))+
ggtitle('YearsWithCurrManager Distribution')+
labs(x = '', y = 'YearsWithCurrManager')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(df, aes(x=YearsWithCurrManager))+
geom_density(color="darkblue", fill="lightblue"),
ncol=2)
df %>% ggplot(mapping = aes(x = JobLevel)) + geom_bar()
Most jobs are associated with level 1 and 2
df %>% ggplot(mapping = aes(x = Attrition)) + geom_bar()
Only about 19% have left the company
df %>% ggplot(mapping = aes(x = EnvironmentSatisfaction)) + geom_bar()
Most Employee are satisfied with their working environment
df %>% ggplot(mapping = aes(x = JobSatisfaction)) + geom_bar()
Most employees are satisfied with their job enviroment
df %>% ggplot(mapping = aes(x = TrainingSet)) + geom_bar()
Most employees have been trained.
df %>% ggplot(mapping = aes(x = JobRole)) + geom_bar()
Questions: 1.So many employees have high satisfaction with job and environment, then why are so many being trained? 2. How does training affect promotion? 3.Does Training make employess more satisfied with their environment and job? 4. Do employees being trained have higher median monthly income? 5.How long do they spend in their current role and maanger before being put into training?
df %>%
tabyl(JobRole, JobLevel) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting()%>%
adorn_ns()
## Warning: `.named` can no longer be a width
## This warning is displayed once per session.
## Warning: `list_len()` is deprecated as of rlang 0.2.0.
## Please use `new_list()` instead.
## This warning is displayed once per session.
## Warning: `chr_along()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
## JobRole 1 2 3 4
## Healthcare Representative 0.0% (0) 59.5% (78) 33.6% (44) 6.9% (9)
## Human Resources 63.5% (33) 25.0% (13) 11.5% (6) 0.0% (0)
## Laboratory Technician 77.2% (200) 21.6% (56) 1.2% (3) 0.0% (0)
## Manager 0.0% (0) 0.0% (0) 11.8% (12) 46.1% (47)
## Manufacturing Director 0.0% (0) 62.1% (90) 31.0% (45) 6.9% (10)
## Research Director 0.0% (0) 0.0% (0) 35.0% (28) 32.5% (26)
## Research Scientist 80.1% (234) 19.5% (57) 0.3% (1) 0.0% (0)
## Sales Executive 0.0% (0) 71.5% (233) 24.2% (79) 4.3% (14)
## Sales Representative 91.6% (76) 8.4% (7) 0.0% (0) 0.0% (0)
## Total 36.9% (543) 36.3% (534) 14.8% (218) 7.2% (106)
## 5 Total
## 0.0% (0) 100.0% (131)
## 0.0% (0) 100.0% (52)
## 0.0% (0) 100.0% (259)
## 42.2% (43) 100.0% (102)
## 0.0% (0) 100.0% (145)
## 32.5% (26) 100.0% (80)
## 0.0% (0) 100.0% (292)
## 0.0% (0) 100.0% (326)
## 0.0% (0) 100.0% (83)
## 4.7% (69) 100.0% (1470)
Level 1 consists of Human rEources, Laboratory Technicians, Research Scientist and Sale Representative Level 2 consists of Mostly Healthcare Representatives, Manufacturing director and Sales Executive Level 3 consists ofhealthcare representatives,human resources, manager, manufacturing director, research director and sales executives level 4 consists of mostly managers and research directors level 5 consists of managers and research directors
df %>%
tabyl(Attrition, JobLevel) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("row") %>%
adorn_pct_formatting()%>%
adorn_ns()
## Attrition 1 2 3 4 5
## 0 32.4% (400) 39.1% (482) 15.1% (186) 8.2% (101) 5.2% (64)
## 1 60.3% (143) 21.9% (52) 13.5% (32) 2.1% (5) 2.1% (5)
## Total 36.9% (543) 36.3% (534) 14.8% (218) 7.2% (106) 4.7% (69)
## Total
## 100.0% (1233)
## 100.0% (237)
## 100.0% (1470)
Most of the employees that leave are from Job Level 1,2,3. We see more people leaving from job level 1 than staying.
#df$TrainingSet <- ifelse(levels(df$TrainingTimesLastYear)==0,"0","1")
#df$Training <- ifelse(levels(df$TrainingTimesLastYear)==0,"No","Yes")
df %>%
tabyl( Training,Attrition) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("col") %>%
adorn_pct_formatting()%>%
adorn_ns()
## Training 0 1 Total
## No 13.9% (172) 16.0% (38) 14.3% (210)
## Yes 86.1% (1061) 84.0% (199) 85.7% (1260)
## Total 100.0% (1233) 100.0% (237) 100.0% (1470)
There are three times more employees leaving with training than without training. There is a negative correlation with training and attrition. The more employees are trained, the more the employees leave.
df %>%
tabyl( JobSatisfaction,Training,Attrition) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("col") %>%
adorn_pct_formatting()%>%
adorn_ns()
## $`0`
## JobSatisfaction No Yes Total
## 1 20.9% (36) 17.6% (187) 18.1% (223)
## 2 14.5% (25) 19.7% (209) 19.0% (234)
## 3 32.6% (56) 29.5% (313) 29.9% (369)
## 4 32.0% (55) 33.2% (352) 33.0% (407)
## Total 100.0% (172) 100.0% (1061) 100.0% (1233)
##
## $`1`
## JobSatisfaction No Yes Total
## 1 26.3% (10) 28.1% (56) 27.8% (66)
## 2 21.1% (8) 19.1% (38) 19.4% (46)
## 3 21.1% (8) 32.7% (65) 30.8% (73)
## 4 31.6% (12) 20.1% (40) 21.9% (52)
## Total 100.0% (38) 100.0% (199) 100.0% (237)
Employees who are trained and leave, are less satisfied with their job.
df %>%
tabyl( EnvironmentSatisfaction,Training,Attrition) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages("col") %>%
adorn_pct_formatting()%>%
adorn_ns()
## $`0`
## EnvironmentSatisfaction No Yes Total
## 1 14.5% (25) 17.6% (187) 17.2% (212)
## 2 16.9% (29) 20.3% (215) 19.8% (244)
## 3 29.7% (51) 32.0% (340) 31.7% (391)
## 4 39.0% (67) 30.1% (319) 31.3% (386)
## Total 100.0% (172) 100.0% (1061) 100.0% (1233)
##
## $`1`
## EnvironmentSatisfaction No Yes Total
## 1 28.9% (11) 30.7% (61) 30.4% (72)
## 2 13.2% (5) 19.1% (38) 18.1% (43)
## 3 31.6% (12) 25.1% (50) 26.2% (62)
## 4 26.3% (10) 25.1% (50) 25.3% (60)
## Total 100.0% (38) 100.0% (199) 100.0% (237)
Employees who are trained and leave, are less satisfied with their environment.
df_num$TrainingSet <- ifelse(df_num$TrainingTimesLastYear==0,0,1)
df_num %>% select(Attrition,Age,TotalWorkingYears,TrainingSet, MonthlyIncome, YearsSinceLastPromotion, YearsInCurrentRole,YearsAtCompany,YearsWithCurrManager) %>% cor()
## Attrition Age TotalWorkingYears
## Attrition 1.00000000 -0.1592050069 -0.171063246
## Age -0.15920501 1.0000000000 0.680380536
## TotalWorkingYears -0.17106325 0.6803805358 1.000000000
## TrainingSet -0.06189441 0.0003507411 0.008414448
## MonthlyIncome -0.15983958 0.4978545669 0.772893246
## YearsSinceLastPromotion -0.03301878 0.2165133679 0.404857759
## YearsInCurrentRole -0.16054500 0.2129010556 0.460364638
## YearsAtCompany -0.13439221 0.3113087697 0.628133155
## YearsWithCurrManager -0.15619932 0.2020886024 0.459188397
## TrainingSet MonthlyIncome
## Attrition -0.0618944070 -0.15983958
## Age 0.0003507411 0.49785457
## TotalWorkingYears 0.0084144481 0.77289325
## TrainingSet 1.0000000000 0.01199263
## MonthlyIncome 0.0119926260 1.00000000
## YearsSinceLastPromotion -0.0099478556 0.34497764
## YearsInCurrentRole 0.0033744170 0.36381767
## YearsAtCompany 0.0386414799 0.51428483
## YearsWithCurrManager 0.0300599048 0.34407888
## YearsSinceLastPromotion YearsInCurrentRole
## Attrition -0.033018775 -0.160545004
## Age 0.216513368 0.212901056
## TotalWorkingYears 0.404857759 0.460364638
## TrainingSet -0.009947856 0.003374417
## MonthlyIncome 0.344977638 0.363817667
## YearsSinceLastPromotion 1.000000000 0.548056248
## YearsInCurrentRole 0.548056248 1.000000000
## YearsAtCompany 0.618408865 0.758753737
## YearsWithCurrManager 0.510223636 0.714364762
## YearsAtCompany YearsWithCurrManager
## Attrition -0.13439221 -0.1561993
## Age 0.31130877 0.2020886
## TotalWorkingYears 0.62813316 0.4591884
## TrainingSet 0.03864148 0.0300599
## MonthlyIncome 0.51428483 0.3440789
## YearsSinceLastPromotion 0.61840887 0.5102236
## YearsInCurrentRole 0.75875374 0.7143648
## YearsAtCompany 1.00000000 0.7692124
## YearsWithCurrManager 0.76921243 1.0000000
Are younger or older people receiving more training?
monthly_incr <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(YearsAtCompany)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No))
monthly_incr
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 4 3 -1
## 2 2 7 6 -1
## 3 3 10 9 -1
## 4 4 8 14 6
## 5 5 4 19.5 15.5
df_monthly_incr <- df %>%
group_by(Training,Attrition) %>%
summarise(med_inc = median(MonthlyIncome)) %>%
select(Training,Attrition, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_monthly_incr
## # A tibble: 2 x 4
## Attrition No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 0 5008. 5210 4.04
## 2 1 3903 3102 -20.5
The monthly income decreases by 20.5% with training than no training for employees that were turned over. The monthly income increases by 4.04% with training than no training employees that were retained.
df_yearsatcompany <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(YearsAtCompany)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_yearsatcompany
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 4 3 -25
## 2 2 7 6 -14.3
## 3 3 10 9 -10
## 4 4 8 14 75
## 5 5 4 19.5 388.
For job levels 1-3 with training , the median years in the company tend to be lower by between 10-25%. For job level 4 and 5, the median years in the company tend to increase with training.
df_yearsatcurrentrole <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(YearsInCurrentRole)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_yearsatcurrentrole
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 2 2 0
## 2 2 4 3 -25
## 3 3 8 7 -12.5
## 4 4 6 7 16.7
## 5 5 2 7 250
For job levels 2-3 with training , the median years in the current role tend to be lower by between 10-25%. For job level 1, there is no difference with or without training for the years spent in the current role. For job level 4 and 5, the median years in the current role tend to increase with training.
df_yearswithcurrmanager <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(YearsWithCurrManager)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_yearswithcurrmanager
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 2 2 0
## 2 2 4 3 -25
## 3 3 7 7 0
## 4 4 5.5 7 27.3
## 5 5 2 7 250
For job levels2 with training , the median years with the manager tend to be lower by between 25%. For job level 1 and 3, there is no difference with or without training for the years spent with the manager. For job level 4 and 5, the median years with the manager tend to increase with training.
df_yearsSincelastpromo <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(YearsSinceLastPromotion)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_yearsSincelastpromo
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 1 1 0
## 2 2 1 1 0
## 3 3 4 1 -75
## 4 4 2.5 3 20
## 5 5 2 2 0
For job level 1,2 and 5, there is no difference with or without training for the years since last promotion. For job levels 3 with training , the median years since last promotion tend to be lower by between 75%. For job levels 4 with training , the median years since last promotion tend to be higher by between 20%.
df_age <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(Age)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_age
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 32 31 -3.12
## 2 2 35 36 2.86
## 3 3 41 38 -7.32
## 4 4 50 47.5 -5
## 5 5 47 48 2.13
Younger people tend to receive more training in job level 1,3 and 4.
df_ttworkingyrs <- df %>%
group_by(Training,JobLevel) %>%
summarise(med_inc = median(TotalWorkingYears)) %>%
select(Training,JobLevel, (med_inc)) %>%
spread(key = Training, value = med_inc) %>%
mutate(Training0vs1 = (Yes-No)/No*100)
df_ttworkingyrs
## # A tibble: 5 x 4
## JobLevel No Yes Training0vs1
## <fct> <dbl> <dbl> <dbl>
## 1 1 6 6 0
## 2 2 10 10 0
## 3 3 16 13 -18.8
## 4 4 24 25 4.17
## 5 5 24 25 4.17
Job level three with training tend to give lower total working years in the company.
df_JL <- df %>%
filter(Attrition==1)
# Job level 1 across Training,attrition
grid.arrange(
df_JL %>%
ggplot(mapping = aes(x = JobLevel, fill = Training)) +
geom_bar(position = "dodge", width = 0.5) +
theme_classic(),
# job level 1 job satisfaction and environment
df_JL %>%
ggplot(mapping = aes(x = JobLevel, fill = JobSatisfaction)) +
geom_bar(position = "fill", width = 0.5) +
theme_classic(),
df_JL %>%
ggplot(mapping = aes(x = JobLevel, fill = EnvironmentSatisfaction)) +
geom_bar(position = "fill", width = 0.3) +
theme_classic(),
ncol=1)
df_JL <- df %>%
filter(Attrition==0)
# Job level 1 across Training,attrition
grid.arrange(
df_JL %>%
ggplot(mapping = aes(x = JobLevel, fill = Training)) +
geom_bar(position = "dodge", width = 0.5) +
theme_classic(),
# job level 1 job satisfaction and environment
df_JL %>%
ggplot(mapping = aes(x = JobLevel, fill = JobSatisfaction)) +
geom_bar(position = "fill", width = 0.5) +
theme_classic(),
df_JL %>%
ggplot(mapping = aes(x = JobLevel, fill = EnvironmentSatisfaction)) +
geom_bar(position = "fill", width = 0.3) +
theme_classic(),
ncol=1)
From the two graphs, we can definitely confirm that employees who leave the company are far more dissatisfied with the job and environment than employees who stay.
ggplot(df,aes(x=JobSatisfaction,group=Attrition,Training))+geom_bar(stat="count",aes(y=..prop..,fill=factor(..x..)))+labs(x="Job Satisfaction",y="Percentage",title="Job Satisfaction Vs Attrition Rates")+facet_wrap(~Attrition+Training)+theme(legend.position="none",plot.title=element_text(hjust=0.5,size=14))+geom_text(aes(label=scales::percent(..prop..),y=..prop..),stat="count",vjust=-0.5)
Training decreases jobsatisfaction,employee who left with training are less satisfied.
ggplot(df,aes(x=EnvironmentSatisfaction,group=Attrition,Training))+geom_bar(stat="count",aes(y=..prop..,fill=factor(..x..)))+labs(x="Job Satisfaction",y="Percentage",title="Environment Satisfaction Vs Attrition Rates")+facet_wrap(~Attrition+Training)+theme(legend.position="none",plot.title=element_text(hjust=0.5,size=14))+geom_text(aes(label=scales::percent(..prop..),y=..prop..),stat="count",vjust=-0.5)
All Employees who are being training become less satisfied with the environment.
grid.arrange(
df%>%group_by(YearsAtCompany) %>%
summarise(MnthlyInc = median(MonthlyIncome, na.rm = TRUE))%>%
ggplot(aes(x = YearsAtCompany, y = MnthlyInc)) +
geom_point() + geom_line(),
df%>%group_by(YearsInCurrentRole) %>%
summarise(MnthlyInc = median(MonthlyIncome, na.rm = TRUE))%>%
ggplot(aes(x = YearsInCurrentRole, y = MnthlyInc)) +
geom_point() + geom_line(),
df%>%group_by(YearsSinceLastPromotion) %>%
summarise(MnthlyInc = median(MonthlyIncome, na.rm = TRUE))%>%
ggplot(aes(x = YearsSinceLastPromotion, y = MnthlyInc)) +
geom_point() + geom_line(),
df%>%group_by(YearsWithCurrManager) %>%
summarise(MnthlyInc = median(MonthlyIncome, na.rm = TRUE))%>%
ggplot(aes(x = YearsWithCurrManager, y = MnthlyInc)) +
geom_point() + geom_line()
)
There seems to be a general positive relationship between years worked in the company with the median income except for 15,24,30,32,36 and 40 yyears in the company. Being 6 years in the current role give higher monthly income than anywhere from above 6 years to below 13 years. People who havee not has any recent promotions tend to have a higher monthly income, except for year 10. The more time you spend with your manger the higer the monthly income.
df%>%group_by(YearsInCurrentRole) %>%
summarise(MnthlyInc = median(MonthlyIncome, na.rm = TRUE))%>%
ggplot(aes(x = YearsInCurrentRole, y = MnthlyInc)) +
geom_point() + geom_line()
df%>%group_by(YearsInCurrentRole,YearsAtCompany) %>%
summarise(YearsCuuRole = median(YearsInCurrentRole, na.rm = TRUE))%>%
ggplot(aes(x = YearsAtCompany, y = YearsCuuRole)) +
geom_point()
There is a linear positive relationship with years at company with current role up till 20 years, but then there is not a linear relationship.This means after 20 years staying in the company,there are employees who switch their positions.
df %>%
ggplot(mapping = aes(x = YearsAtCompany, y = YearsWithCurrManager)) +
geom_point()
There is a linear positive relationship with years at company with current manager up till 20 years, but then there is not a linear relationship.This means after 20 years staying in the company,there are employees who switch their managers.
df %>%
ggplot(mapping = aes(x = YearsAtCompany, y = YearsSinceLastPromotion)) +
geom_point()
There is somewhat positive linear relationship, but weak. Not all employees who have been working for the company for a long time receive more promotions.
df %>%
group_by(JobLevel, Attrition, Training) %>%
summarise(med_mnthlyInc = median(MonthlyIncome)) %>%
ggplot(mapping = aes(x = Attrition, y = med_mnthlyInc, color = JobLevel)) +
geom_point() +
facet_grid(. ~ Training) +
theme_bw() +
theme(axis.text.x = element_text(size = 10, angle = 0))
For job level 5, there is no difference for both the employees who leave and stay for their monthly income with training. There is no data for monthly income for job level 4 with attrition and no training. For Job Level 4 who stay in the company, they earn less with training. For Job Level 3, both the employee who stay and leave earn less with training. People with job level two, who leave the company earned less with training. People with job level one, both who stayed and left, earned more with training.
df %>%
group_by(JobLevel, Attrition, Training) %>%
summarise(JobSatisfaction = mean(as.numeric(JobSatisfaction))) %>%
ggplot(mapping = aes(x = Attrition, y = JobSatisfaction, color = JobLevel)) +
geom_point() +
facet_grid(. ~ Training) +
theme_bw() +
theme(axis.text.x = element_text(size = 10, angle = 0))
Employees who are training regardless of leaving the company, are less happy
df %>%
group_by(JobLevel, Attrition, Training) %>%
summarise(EnvSatisfaction = mean(as.numeric(EnvironmentSatisfaction))) %>%
ggplot(mapping = aes(x = Attrition, y = EnvSatisfaction, color = JobLevel)) +
geom_point() +
facet_grid(. ~ Training) +
theme_bw() +
theme(axis.text.x = element_text(size = 10, angle = 0))
demo <- df %>% group_by(Training, Attrition) %>% summarise(med_inc = median(MonthlyIncome),
n = n()) %>% ungroup()
sub_title = paste0("Employees who leave with attrion have the least monthly income")
demo<-demo %>% mutate(scenario = as.factor(paste(Training,Attrition)),
scenario = factor(scenario,
levels = c("No 0","No 1","Yes 0","Yes 1"),
labels = c('NoTraining & NoAttrition', 'NoTraining & Attrition',"Training & NoAttrition", "Training & Attrition")
)
)
demo %>% ggplot(aes(x = reorder(scenario,-as.numeric(scenario)), y = med_inc, fill = scenario)) +
geom_bar(stat = 'identity', position = 'dodge') + coord_flip() +
scale_y_continuous(labels = dollar) +
ggtitle("Employees who leave with training have the least
monthly income") +
labs(x = "", y = "Median Income") +
scale_fill_manual(values=c("grey65", "grey65", "grey65", "grey25"))+
theme(legend.position = "none")
demo
## # A tibble: 4 x 5
## Training Attrition med_inc n scenario
## <chr> <fct> <dbl> <int> <fct>
## 1 No 0 5008. 172 NoTraining & NoAttrition
## 2 No 1 3903 38 NoTraining & Attrition
## 3 Yes 0 5210 1061 Training & NoAttrition
## 4 Yes 1 3102 199 Training & Attrition
demo3 <- df%>%group_by(Training,Attrition) %>% summarise(med_inc = mean(YearsWithCurrManager),n = n()) %>% ungroup()
sub_title = paste0("Employees who leave with attrion have the least monthly income")
demo3<-demo3 %>% mutate(scenario = as.factor(paste(Training,Attrition)),
scenario = factor(scenario,
levels = c("No 0","No 1","Yes 0","Yes 1"),
labels = c('NoTraining & NoAttrition', 'NoTraining & Attrition',"Training & NoAttrition", "Training & Attrition")
)
)
demo3 %>% ggplot(aes(x = reorder(scenario,-as.numeric(scenario)), y = med_inc, fill = scenario)) +
geom_bar(stat = 'identity', position = 'dodge') + coord_flip() +
ggtitle("Trained Employee who leave spent at least 40% less time with their manager
than employees without training") +
labs(x = "", y = "Time with Manager in Years") +
scale_fill_manual(values=c("grey65", "grey65", "grey65", "grey25"))+
theme(legend.position = "none")
demo3
## # A tibble: 4 x 5
## Training Attrition med_inc n scenario
## <chr> <fct> <dbl> <int> <fct>
## 1 No 0 4.53 172 NoTraining & NoAttrition
## 2 No 1 3.53 38 NoTraining & Attrition
## 3 Yes 0 4.34 1061 Training & NoAttrition
## 4 Yes 1 2.72 199 Training & Attrition
Training doesn’t make them spend much time with the manager so they leave
demo4 <- df%>%group_by(Training,Attrition) %>% summarise(med_inc = mean(YearsInCurrentRole),n = n()) %>% ungroup()
sub_title = paste0("Employees who leave with attrion have the least monthly income")
demo4<-demo4 %>% mutate(scenario = as.factor(paste(Training,Attrition)),
scenario = factor(scenario,
levels = c("No 0","No 1","Yes 0","Yes 1"),
labels = c('NoTraining & NoAttrition', 'NoTraining & Attrition',"Training & NoAttrition", "Training & Attrition")
)
)
demo4 %>% ggplot(aes(x = reorder(scenario,-as.numeric(scenario)), y = med_inc, fill = scenario)) +
geom_bar(stat = 'identity', position = 'dodge') + coord_flip() +
ggtitle('Trained Employee who leave spent 50% less time
in their current role than employees without training') +
labs(x = "", y = "Time in Current Role in years") +
scale_fill_manual(values=c("grey65", "grey65", "grey65", "grey25"))+
theme(legend.position = "none")
demo4
## # A tibble: 4 x 5
## Training Attrition med_inc n scenario
## <chr> <fct> <dbl> <int> <fct>
## 1 No 0 4.68 172 NoTraining & NoAttrition
## 2 No 1 4.13 38 NoTraining & Attrition
## 3 Yes 0 4.45 1061 Training & NoAttrition
## 4 Yes 1 2.67 199 Training & Attrition
They dont get to spend much time in their current role before receiving training
demo5 <- df%>%group_by(Training,Attrition) %>% summarise(PercentageofyearsinCuurrRole = mean(YearsInCurrentRole)/mean(TotalWorkingYears)*100,n = n()) %>% ungroup()
sub_title = paste0("Employees who leave with attrion have the least monthly income")
demo5<-demo5 %>% mutate(scenario = as.factor(paste(Training,Attrition)),
scenario = factor(scenario,
levels = c("No 0","No 1","Yes 0","Yes 1"),
labels = c('NoTraining & NoAttrition', 'NoTraining & Attrition',"Training & NoAttrition", "Training & Attrition")
)
)
demo5
## # A tibble: 4 x 5
## Training Attrition PercentageofyearsinCuurrRole n scenario
## <chr> <fct> <dbl> <int> <fct>
## 1 No 0 38.4 172 NoTraining & NoAt~
## 2 No 1 41.5 38 NoTraining & Attr~
## 3 Yes 0 37.7 1061 Training & NoAttr~
## 4 Yes 1 33.7 199 Training & Attrit~
It is important to give the employees enough time to adjust to their current role before training them. By taking 12% off the current role time with training and attrition and multiply it with 12 months, that amounts to additional 4 months. If we were to give our employees additional 4 months to adjust to their current role before putting them into training, then they would not leave.
demo6 <- df%>%group_by(Training,Attrition) %>% summarise(PercentageofyearswithManager = mean(YearsWithCurrManager)/mean(TotalWorkingYears)*100,n = n()) %>% ungroup()
sub_title = paste0("Employees who leave with attrion have the least monthly income")
demo6<-demo6 %>% mutate(scenario = as.factor(paste(Training,Attrition)),
scenario = factor(scenario,
levels = c("No 0","No 1","Yes 0","Yes 1"),
labels = c('NoTraining & NoAttrition', 'NoTraining & Attrition',"Training & NoAttrition", "Training & Attrition")
)
)
demo6
## # A tibble: 4 x 5
## Training Attrition PercentageofyearswithManager n scenario
## <chr> <fct> <dbl> <int> <fct>
## 1 No 0 37.2 172 NoTraining & NoAt~
## 2 No 1 35.4 38 NoTraining & Attr~
## 3 Yes 0 36.7 1061 Training & NoAttr~
## 4 Yes 1 34.4 199 Training & Attrit~
It is important to give the employees enough time to adjust to their manager before training them. By taking 6% off the time they spent with manager with training and attrition and multiply it with 12 months, that amounts to additional 2 months. If we were to give our employees additional 2 months to adjust to manager before putting them into training, then they would not leave.
# 95% CI, get z-value for upper tail, use 0.975 since is one sided
z <- qnorm(0.975)
# Incorporate CI into bar graph of means
sig_curr<-df %>%
group_by(Attrition,Training) %>%
summarise(mean = mean(YearsInCurrentRole), sd = sd(YearsInCurrentRole),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = Attrition, y = mean, fill=Training)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = mean - ci, ymax = mean + ci),
width = 0.5, position = position_dodge(0.9)) +
expand_limits(y=c(0, 5))+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
labs(title = "Mean Years difference for current Role based on Attrition with Training",
subtitle = "Training is a reliable measure for case with Attrition",
x = "Attrition",
y = "mean Current Years in Role")
sig_curr
The mean current role years spent difference is significant for employees leaving with and without training.
# 95% CI, get z-value for upper tail, use 0.975 since is one sided
z <- qnorm(0.975)
# Incorporate CI into bar graph of means
sig_man<-df %>%
group_by(Attrition,Training) %>%
summarise(mean = mean(YearsWithCurrManager), sd = sd(YearsWithCurrManager),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = Attrition, y = mea, fill=Training)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = mean - ci, ymax = mean + ci),
width = 0.5, position = position_dodge(0.9)) +
expand_limits(y=c(0, 5))+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
labs(title = "Mean Years difference for time spent with manager based on Attrition with Training",
subtitle = "Training is a reliable measure for case with Attrition",
x = "Attrition",
y = "mean Years with Manager")
sig_curr
The mean years spent with maanger difference is significant for employees leaving with and without training.
summary(glm(df$Attrition~df$MonthlyIncome+df$Training+df$YearsAtCompany+df$YearsInCurrentRole+df$YearsWithCurrManager+df$TotalWorkingYears+as.numeric(df$JobSatisfaction)+df$JobLevel,family = "binomial"))
##
## Call:
## glm(formula = df$Attrition ~ df$MonthlyIncome + df$Training +
## df$YearsAtCompany + df$YearsInCurrentRole + df$YearsWithCurrManager +
## df$TotalWorkingYears + as.numeric(df$JobSatisfaction) + df$JobLevel,
## family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1329 -0.6499 -0.4550 -0.2954 2.5616
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.214e-01 3.361e-01 1.849 0.06447 .
## df$MonthlyIncome -1.252e-04 6.639e-05 -1.885 0.05939 .
## df$TrainingYes -2.260e-01 2.061e-01 -1.097 0.27264
## df$YearsAtCompany 7.750e-02 2.884e-02 2.687 0.00721 **
## df$YearsInCurrentRole -1.089e-01 3.975e-02 -2.741 0.00613 **
## df$YearsWithCurrManager -9.541e-02 3.948e-02 -2.417 0.01565 *
## df$TotalWorkingYears -3.379e-02 1.928e-02 -1.753 0.07960 .
## as.numeric(df$JobSatisfaction) -2.807e-01 6.691e-02 -4.195 2.72e-05 ***
## df$JobLevel2 -5.704e-01 2.521e-01 -2.263 0.02365 *
## df$JobLevel3 6.728e-01 5.086e-01 1.323 0.18588
## df$JobLevel4 1.310e-01 9.802e-01 0.134 0.89372
## df$JobLevel5 1.120e+00 1.206e+00 0.929 0.35301
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1298.6 on 1469 degrees of freedom
## Residual deviance: 1177.1 on 1458 degrees of freedom
## AIC: 1201.1
##
## Number of Fisher Scoring iterations: 5
We can see that yearsatcompany,years in currentrole,Job satisfaction,years with current manager are all significant at .05 significance level.