# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## Warning: As of rlang 0.4.0, dplyr must be at least version 0.8.0.
## x dplyr 0.7.5 is too old for rlang 0.4.1.
## i Please update dplyr to the latest version.
## i Updating packages on Windows requires precautions:
## <https://github.com/jennybc/what-they-forgot/issues/62>
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.5
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.1
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(janitor)
## Warning: package 'janitor' was built under R version 3.5.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(GGally)
## Warning: package 'GGally' was built under R version 3.5.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(stringr)
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Load the qp1_data.csv data into df
df<-read.csv('mtp_data.csv')
Check the data type
str(df)
## 'data.frame': 21850 obs. of 11 variables:
## $ UPC : Factor w/ 114 levels "00-01-16000-11653",..: 1 1 1 2 2 2 3 3 3 3 ...
## $ iri_key: int 644347 248741 535806 675634 205272 248741 932600 278738 660222 644937 ...
## $ week : int 1484 1483 1489 1489 1491 1492 1517 1513 1523 1483 ...
## $ units : int 5 2 3 2 8 5 6 1 4 14 ...
## $ brand : Factor w/ 15 levels "GENERAL MILLS CHEERIOS",..: 2 2 2 1 1 1 2 2 2 2 ...
## $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
## $ package: Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
## $ volume : num 0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
## $ price : num 0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
## $ promo : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ad : Factor w/ 3 levels "A","B","NONE": 1 3 3 3 3 3 3 3 3 3 ...
Change some integers to factors
df$promo <- as.factor(df$promo)
df$week<-as.factor(df$week)
df$revenue<-as.integer(df$price)*as.integer(df$units)
kellogs <- df[which(str_detect(df$brand, "KELLOGGS")),]
gm <- df[which(str_detect(df$brand, "GENERAL MILLS")),]
post <- df[which(str_detect(df$brand, "POST")),]
extractTitle <- function(brand) {
brand <- as.character(brand)
if (length(grep("KELLOGGS", brand)) > 0) {
return ("KELLOGGS")
} else if (length(grep("GENERAL MILLS", brand)) > 0) {
return ("GENERAL MILLS")
} else if (length(grep("POST", brand)) > 0) {
return ("POST")
} else {
return ("Other")
}
}
Producers <- NULL
for (i in 1:nrow(df)) {
Producers <- c(Producers, extractTitle(df[i,"brand"]))
}
df$Producers <- as.factor(Producers)
levels(df$week) <- c("1","2","3","4","5","6","7","8","9","10","11","12",
"13","14","15","16","17","18","19","20","21","22",
"23","24","25","26","27","28","29","30","31","32","33",
"34","35","36","37","38","39","40","41","42","43","44",
"45","46","47","48","49","50","51","52")
str(df)
## 'data.frame': 21850 obs. of 13 variables:
## $ UPC : Factor w/ 114 levels "00-01-16000-11653",..: 1 1 1 2 2 2 3 3 3 3 ...
## $ iri_key : int 644347 248741 535806 675634 205272 248741 932600 278738 660222 644937 ...
## $ week : Factor w/ 52 levels "1","2","3","4",..: 6 5 11 11 13 14 39 35 45 5 ...
## $ units : int 5 2 3 2 8 5 6 1 4 14 ...
## $ brand : Factor w/ 15 levels "GENERAL MILLS CHEERIOS",..: 2 2 2 1 1 1 2 2 2 2 ...
## $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
## $ package : Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
## $ volume : num 0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
## $ price : num 0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
## $ promo : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ad : Factor w/ 3 levels "A","B","NONE": 1 3 3 3 3 3 3 3 3 3 ...
## $ revenue : int 0 0 0 0 0 0 6 1 4 14 ...
## $ Producers: Factor w/ 3 levels "GENERAL MILLS",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(df)
## UPC iri_key week
## 00-01-43000-10521: 676 Min. : 200171 22 : 475
## 00-01-38000-01621: 666 1st Qu.: 241553 12 : 473
## 00-01-38000-00828: 660 Median : 273720 37 : 464
## 00-01-16000-27569: 639 Mean : 454405 29 : 449
## 00-02-38000-66330: 618 3rd Qu.: 648547 20 : 445
## 00-01-38000-01611: 612 Max. :8057418 43 : 445
## (Other) :17979 (Other):19099
## units brand
## Min. : 1.000 KELLOGGS FROSTED FLAKES : 2295
## 1st Qu.: 3.000 KELLOGGS FROOT LOOPS : 2192
## Median : 7.000 GENERAL MILLS CINNAMON TST CR: 1834
## Mean : 8.579 GENERAL MILLS LUCKY CHARMS : 1681
## 3rd Qu.:12.000 KELLOGGS FROSTED MINI WHEATS : 1574
## Max. :28.000 GENERAL MILLS CHEERIOS : 1458
## (Other) :10816
## flavor package volume price
## CINNAMON TOAST:1834 BOX:21306 Min. :0.040 Min. :0.250
## COCOA :1901 CUP: 544 1st Qu.:0.750 1st Qu.:3.190
## FRUIT :2192 Median :1.060 Median :3.790
## REGULAR :8816 Mean :1.016 Mean :3.763
## TOASTED :7107 3rd Qu.:1.120 3rd Qu.:4.390
## Max. :4.000 Max. :9.990
##
## promo ad revenue Producers
## 0:17305 A : 1456 Min. : 0.00 GENERAL MILLS: 7189
## 1: 4545 B : 1061 1st Qu.: 9.00 KELLOGGS :12183
## NONE:19333 Median : 21.00 POST : 2478
## Mean : 26.28
## 3rd Qu.: 36.00
## Max. :140.00
##
Look for into the depth of the count in week and brand
mean(table(df$week))
## [1] 420.1923
median(table(df$week))
## [1] 421
quantile(table(df$week))
## 0% 25% 50% 75% 100%
## 369.00 400.75 421.00 436.75 475.00
table(df$week)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 383 401 402 409 427 428 435 386 398 389 412 473 421 429 397 403 414 435
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 421 445 389 475 399 425 423 420 432 424 449 439 409 439 393 394 399 436
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## 464 441 414 441 412 418 445 441 439 420 431 445 429 369 400 388
table(df$brand)
##
## GENERAL MILLS CHEERIOS GENERAL MILLS CINNAMON TST CR
## 1458 1834
## GENERAL MILLS COCOA PUFFS GENERAL MILLS KIX
## 1020 1196
## GENERAL MILLS LUCKY CHARMS KELLOGGS COCOA KRISPIES
## 1681 881
## KELLOGGS FROOT LOOPS KELLOGGS FROSTED FLAKES
## 2192 2295
## KELLOGGS FROSTED MINI WHEATS KELLOGGS RAISIN BRAN
## 1574 1266
## KELLOGGS RICE KRISPIES KELLOGGS SMART START
## 1450 1134
## KELLOGGS SPECIAL K POST GRAPE NUTS
## 1391 1289
## POST SHREDDED WHEAT
## 1189
grid.arrange(
ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = revenue))+
ggtitle('revenue per package')+
labs(x = '', y = 'Reveneu')+
scale_y_continuous(labels = scales::dollar)+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = price))+
ggtitle('price for package')+
labs(x = '', y = 'Price')+
scale_y_continuous(labels = scales::dollar)+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = units))+
ggtitle('units sold')+
labs(x = '', y = 'Units sold')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm")),
ncol=3
)
ggplot(data = df) +
geom_boxplot(mapping = aes(x = 1,y = volume))+
ggtitle('volume of packages')+
labs(x = '', y = 'volume')+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),#delete the x axis lables
plot.title = element_text(hjust = 0.5),# title center
text = element_text(size=10),plot.margin = margin(2,1.5,2,1.5, "cm"))
grid.arrange(
ggplot(data = df, mapping = aes(x = promo ))+
geom_bar(width=0.5,fill = '#32a0a8')+
ggtitle('promotion distribution')+
labs(x = 'Promotion', y = 'Count')+
theme(plot.title = element_text(hjust = 0.5,size= 15),text = element_text(size= 12),plot.margin = margin(2,1.5,2,1.5, "cm")),
ggplot(data = df, mapping = aes(x = ad ))+
geom_bar(width=0.5,fill = '#32a0a8')+
ggtitle('advertisement distribution')+
labs(x = 'Advertisement', y = 'Count')+
theme(plot.title = element_text(hjust = 0.5,size= 15),text = element_text(size= 12),plot.margin = margin(2,1.5,2,1.5, "cm")),
ncol=2
)
From the two graphs, we see that promotion and advertisement might not have such a big impact on revenue.Lets look at some more categorical variables
grid.arrange(
ggplot(data = df, mapping = aes(x = package ))+
geom_bar(width=0.5,fill = '#32a0a8')+
ggtitle('package distribution')+
labs(x = 'Packaging', y = 'Count')+
theme(plot.title = element_text(hjust = 0.5,size= 15),text = element_text(size= 12),plot.margin = margin(2,1.5,2,1.5, "cm"))
)
We see that box is the most popular packaging
ggplot(data = df, mapping = aes(x = flavor ))+
geom_bar(width=0.5,fill = '#32a0a8')+
ggtitle('flavour distribution')+
labs(x = 'Flavour', y = 'Count')+
theme(plot.title = element_text(hjust = 0.5,size= 15),text = element_text(size= 12),plot.margin = margin(2,1.5,2,1.5, "cm"))
We see that the most popular flavour is regular followed by toasted then fruit.
ggplot(data = df, mapping = aes(x = Producers ))+
geom_bar(width=0.5,fill = '#32a0a8')+
ggtitle('Producer distribution')+
labs(x = 'Producers', y = 'Count')+
theme(plot.title = element_text(hjust = 0.5,size= 15),text = element_text(size= 12),plot.margin = margin(2,1.5,2,1.5, "cm"))
Comments: We will see later if brand or producers effect revenue,promotion,ads or prices.So far we see kellogss sells the most but we do not know how much of it has an impact.
df %>%
tabyl(promo,ad) %>%
adorn_totals(where = c("row", "col"))
## Warning: `lang()` is deprecated as of rlang 0.2.0.
## Please use `call2()` instead.
## This warning is displayed once per session.
## Warning: `new_overscope()` is deprecated as of rlang 0.2.0.
## Please use `new_data_mask()` instead.
## This warning is displayed once per session.
## Warning: `overscope_eval_next()` is deprecated as of rlang 0.2.0.
## Please use `eval_tidy()` with a data mask instead.
## This warning is displayed once per session.
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
## Warning: `.named` can no longer be a width
## This warning is displayed once per session.
## Warning: `list_len()` is deprecated as of rlang 0.2.0.
## Please use `new_list()` instead.
## This warning is displayed once per session.
## Warning: `chr_along()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
## Warning: `lgl_len()` is deprecated as of rlang 0.2.0.
## Please use `new_logical()` instead.
## This warning is displayed once per session.
## promo A B NONE Total
## 0 683 402 16220 17305
## 1 773 659 3113 4545
## Total 1456 1061 19333 21850
Comment: We see that most cereals do not have promo and ads, other than that promo with ads A gives the most sales.
df %>%
tabyl(promo,ad,flavor) %>%
adorn_totals(where = c("row", "col"))
## $`CINNAMON TOAST`
## promo A B NONE Total
## 0 60 24 1453 1537
## 1 54 27 216 297
## Total 114 51 1669 1834
##
## $COCOA
## promo A B NONE Total
## 0 45 21 1350 1416
## 1 83 70 332 485
## Total 128 91 1682 1901
##
## $FRUIT
## promo A B NONE Total
## 0 59 44 1570 1673
## 1 94 64 361 519
## Total 153 108 1931 2192
##
## $REGULAR
## promo A B NONE Total
## 0 231 161 6665 7057
## 1 285 278 1196 1759
## Total 516 439 7861 8816
##
## $TOASTED
## promo A B NONE Total
## 0 288 152 5182 5622
## 1 257 220 1008 1485
## Total 545 372 6190 7107
Comments: We see that regular and toasted have the most ads and promos, so flavours could be having an effect on prices,promotions and ads
df %>%
tabyl(promo,ad,Producers) %>%
adorn_totals(where = c("row", "col"))
## $`GENERAL MILLS`
## promo A B NONE Total
## 0 218 100 5591 5909
## 1 224 172 884 1280
## Total 442 272 6475 7189
##
## $KELLOGGS
## promo A B NONE Total
## 0 445 269 8756 9470
## 1 458 395 1860 2713
## Total 903 664 10616 12183
##
## $POST
## promo A B NONE Total
## 0 20 33 1873 1926
## 1 91 92 369 552
## Total 111 125 2242 2478
Comment: From here, we are starting to see that the producer has effect on promo and ads, there are more promo and ads for kelloggs than general mills or post.
df %>%
tabyl(flavor,promo,Producers) %>%
adorn_totals(where = c("row", "col"))
## $`GENERAL MILLS`
## flavor 0 1 Total
## CINNAMON TOAST 1537 297 1834
## COCOA 769 251 1020
## FRUIT 0 0 0
## REGULAR 1004 199 1203
## TOASTED 2599 533 3132
## Total 5909 1280 7189
##
## $KELLOGGS
## flavor 0 1 Total
## CINNAMON TOAST 0 0 0
## COCOA 647 234 881
## FRUIT 1673 519 2192
## REGULAR 4127 1008 5135
## TOASTED 3023 952 3975
## Total 9470 2713 12183
##
## $POST
## flavor 0 1 Total
## CINNAMON TOAST 0 0 0
## COCOA 0 0 0
## FRUIT 0 0 0
## REGULAR 1926 552 2478
## TOASTED 0 0 0
## Total 1926 552 2478
Comments: +General Mills promote toasted flavour cereal more over regular where Kelloggs promotes regular over toasted. +Post only sells regular, but promotes more regular cereals than general mills
df %>%
tabyl(brand,promo) %>%
adorn_totals(where = c("row", "col"))
## brand 0 1 Total
## GENERAL MILLS CHEERIOS 1240 218 1458
## GENERAL MILLS CINNAMON TST CR 1537 297 1834
## GENERAL MILLS COCOA PUFFS 769 251 1020
## GENERAL MILLS KIX 997 199 1196
## GENERAL MILLS LUCKY CHARMS 1366 315 1681
## KELLOGGS COCOA KRISPIES 647 234 881
## KELLOGGS FROOT LOOPS 1673 519 2192
## KELLOGGS FROSTED FLAKES 1819 476 2295
## KELLOGGS FROSTED MINI WHEATS 1248 326 1574
## KELLOGGS RAISIN BRAN 1060 206 1266
## KELLOGGS RICE KRISPIES 1162 288 1450
## KELLOGGS SMART START 812 322 1134
## KELLOGGS SPECIAL K 1049 342 1391
## POST GRAPE NUTS 1002 287 1289
## POST SHREDDED WHEAT 924 265 1189
## Total 17305 4545 21850
Comments: The brand names of general mills lucky charms, kelloggs froot loops and kelloggs frosted flakes seems to be being promoted more based on brand name. Want to verify if it is on brand name or promoted on flavour.
df %>%
tabyl(flavor,brand) %>%
adorn_totals(where = c("row", "col"))
## flavor GENERAL MILLS CHEERIOS GENERAL MILLS CINNAMON TST CR
## CINNAMON TOAST 0 1834
## COCOA 0 0
## FRUIT 0 0
## REGULAR 4 0
## TOASTED 1454 0
## Total 1458 1834
## GENERAL MILLS COCOA PUFFS GENERAL MILLS KIX GENERAL MILLS LUCKY CHARMS
## 0 0 0
## 1020 0 0
## 0 0 0
## 0 1196 3
## 0 0 1678
## 1020 1196 1681
## KELLOGGS COCOA KRISPIES KELLOGGS FROOT LOOPS KELLOGGS FROSTED FLAKES
## 0 0 0
## 881 0 0
## 0 2192 0
## 0 0 2295
## 0 0 0
## 881 2192 2295
## KELLOGGS FROSTED MINI WHEATS KELLOGGS RAISIN BRAN KELLOGGS RICE KRISPIES
## 0 0 0
## 0 0 0
## 0 0 0
## 1574 1266 0
## 0 0 1450
## 1574 1266 1450
## KELLOGGS SMART START KELLOGGS SPECIAL K POST GRAPE NUTS
## 0 0 0
## 0 0 0
## 0 0 0
## 0 0 1289
## 1134 1391 0
## 1134 1391 1289
## POST SHREDDED WHEAT Total
## 0 1834
## 0 1901
## 0 2192
## 1189 8816
## 0 7107
## 1189 21850
Comment: Froot loops are promoted based on brand name. Otherwise, it seems that producers focused on flavour have a bigger impact on promotion.
df %>%
tabyl(brand,ad) %>%
adorn_totals(where = c("row", "col"))
## brand A B NONE Total
## GENERAL MILLS CHEERIOS 79 44 1335 1458
## GENERAL MILLS CINNAMON TST CR 114 51 1669 1834
## GENERAL MILLS COCOA PUFFS 76 54 890 1020
## GENERAL MILLS KIX 70 54 1072 1196
## GENERAL MILLS LUCKY CHARMS 103 69 1509 1681
## KELLOGGS COCOA KRISPIES 52 37 792 881
## KELLOGGS FROOT LOOPS 153 108 1931 2192
## KELLOGGS FROSTED FLAKES 141 124 2030 2295
## KELLOGGS FROSTED MINI WHEATS 127 88 1359 1574
## KELLOGGS RAISIN BRAN 67 47 1152 1266
## KELLOGGS RICE KRISPIES 114 88 1248 1450
## KELLOGGS SMART START 124 90 920 1134
## KELLOGGS SPECIAL K 125 82 1184 1391
## POST GRAPE NUTS 46 63 1180 1289
## POST SHREDDED WHEAT 65 62 1062 1189
## Total 1456 1061 19333 21850
Comment: we see that the general mill focus on advertising their brands of cinnamon toast and lucky charms with ad A. Kelloggs makes more ad A for froot loops,frosted flakes,mini wheats rice krispies,smart tart,special K and makes ad B also for froot loops,frosted lakes.
df %>%
tabyl(flavor,ad,Producers) %>%
adorn_totals(where = c("row", "col"))
## $`GENERAL MILLS`
## flavor A B NONE Total
## CINNAMON TOAST 114 51 1669 1834
## COCOA 76 54 890 1020
## FRUIT 0 0 0 0
## REGULAR 70 55 1078 1203
## TOASTED 182 112 2838 3132
## Total 442 272 6475 7189
##
## $KELLOGGS
## flavor A B NONE Total
## CINNAMON TOAST 0 0 0 0
## COCOA 52 37 792 881
## FRUIT 153 108 1931 2192
## REGULAR 335 259 4541 5135
## TOASTED 363 260 3352 3975
## Total 903 664 10616 12183
##
## $POST
## flavor A B NONE Total
## CINNAMON TOAST 0 0 0 0
## COCOA 0 0 0 0
## FRUIT 0 0 0 0
## REGULAR 111 125 2242 2478
## TOASTED 0 0 0 0
## Total 111 125 2242 2478
Questions:we can investigate on weekly basis to see when promos and ads are put up and compare them amongst themselves.
df %>%
select_if(is.numeric) %>% # Use to select just the numeric variables
cor() %>%
round(2) %>%
kable()
iri_key | units | volume | price | revenue | |
---|---|---|---|---|---|
iri_key | 1.00 | 0.03 | -0.01 | -0.01 | 0.03 |
units | 0.03 | 1.00 | 0.02 | -0.19 | 0.88 |
volume | -0.01 | 0.02 | 1.00 | 0.54 | 0.21 |
price | -0.01 | -0.19 | 0.54 | 1.00 | 0.18 |
revenue | 0.03 | 0.88 | 0.21 | 0.18 | 1.00 |
comment:We can see that revenue has a high positive correaltion with units sold rather than price and volume.
We want to see deeper into the weekly promos and ads for each brand from each producer.
grid.arrange(
df %>%
filter(as.integer(week)<14) %>%
group_by(Producers,week) %>%
summarize(count=n()) %>%
ggplot(aes(x = Producers, y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=0,vjust="inward",hjust="inward",check_overlap = TRUE),
df %>%
filter(as.integer(week)<27 & as.integer(week)>13 ) %>%
group_by(Producers,week) %>%
summarize(count=n()) %>%
ggplot(aes(x = Producers, y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=0,vjust="inward",hjust="inward",check_overlap = TRUE),
df %>%
filter(as.integer(week)<40 & as.integer(week)>27 ) %>%
group_by(Producers,week) %>%
summarize(count=n()) %>%
ggplot(aes(x = Producers, y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=0,vjust="inward",hjust="inward",check_overlap = TRUE),
df %>%
filter(as.integer(week)<53 & as.integer(week)>40) %>%
group_by(Producers,week) %>%
summarize(count=n()) %>%
ggplot(aes(x = Producers, y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=0,vjust="inward",hjust="inward",check_overlap = TRUE)
)
Comment: We do not really see a big difference in quarterly sales between the producers
df %>%
filter(as.integer(week)<14) %>%
group_by(brand) %>%
summarize(count=n()) %>%
ggplot(aes(reorder(x = brand,count), y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=90,vjust=1,hjust=1,check_overlap = TRUE)+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1)
)+
labs(title = "Number of sales of first 14 weeks",
y = "sales",x="brand")
df %>%
filter(as.integer(week)<27 & as.integer(week)>13 ) %>%
group_by(brand) %>%
summarize(count=n()) %>%
ggplot(aes(reorder(x = brand,count) ,y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=90,vjust=1,hjust=1,check_overlap = TRUE)+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1)
)+
labs(title = "Number of sales from 13 weeks to 27 weeks",
y = "sales",x="brand")
df %>%
filter(as.integer(week)<40 & as.integer(week)>27 ) %>%
group_by(brand) %>%
summarize(count=n()) %>%
ggplot(aes(reorder(x = brand,count), y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=90,vjust=1,hjust=1,check_overlap = TRUE)+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1)
)+
labs(title = "Number of sales from 27 weeks to 40 weeks",
y = "sales",x="brand")
df %>%
filter(as.integer(week)<53 & as.integer(week)>40) %>%
group_by(brand) %>%
summarize(count=n()) %>%
ggplot(aes(reorder(x = brand,count) ,y = count)) + geom_bar(stat = "identity")+
geom_text(aes(label = count),
angle=90,vjust=1,hjust=1,check_overlap = TRUE)+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1))+
labs(title = "Number of sales from 40 weeks to 52 weeks",
y = "sales",x="brand")
Comment: We can see there is a difference in the histogram as we move from one quarter sales to the others. I think sales are affected by brand rather than producer.
grid.arrange(
df %>% filter(as.integer(week)<14 )%>%
group_by(promo, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(promo, Producers)) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 )%>%
group_by(promo, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(promo, Producers)) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 )%>%
group_by(promo, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(promo, Producers)) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 )%>%
group_by(promo, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(promo, Producers)) +
geom_tile(aes(fill = count))
)
Comment: There is almost not difference between the quartly heat mapps for promotion. So promotion is probably more brand based than producer. Let check it out below:
grid.arrange(
df %>% filter(as.integer(week)<14 )%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 )%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)))
grid.arrange(
df %>% filter(as.integer(week)<40 & as.integer(week)>27 )%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 )%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count))
)
Comment: Comments: We can see that brands affect promoted over the quarters.We want to look at how each brand is being promoted over the 52 weeks.We will check it out later.
grid.arrange(
df %>% filter(as.integer(week)<14 & ad!="NONE")%>%
group_by(ad, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(ad, Producers)) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & ad!="NONE" )%>%
group_by(ad, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(ad, Producers)) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & ad!= "NONE")%>%
group_by(ad, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(ad, Producers)) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & ad!="NONE")%>%
group_by(ad, Producers) %>%
summarise(count = n()) %>%
ggplot(aes(ad, Producers)) +
geom_tile(aes(fill = count))
)
Comment: Yet again, producers do not really have an impact on ads quaterly
grid.arrange(
df %>% filter(as.integer(week)<14 & ad!="NONE" )%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & ad!="NONE" )%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)))
grid.arrange(
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & ad!= "NONE")%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & ad!="NONE")%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count))
)
Comments: We can see that brands affect ads over the quarters. We need to now carefully see for each producers’ brands how do they affect advertisement?
grid.arrange(
df %>% filter(as.integer(week)<14 & str_detect(df$brand, "KELLOGGS" )& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & str_detect(df$brand, "KELLOGGS") & promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & str_detect(df$brand, "KELLOGGS")& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & str_detect(df$brand, "KELLOGGS")& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count))
)
grid.arrange(
df %>% filter(as.integer(week)<14 & str_detect(df$brand, "GENERAL MILLS" )& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & str_detect(df$brand, "GENERAL MILLS") & promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & str_detect(df$brand, "GENERAL MILLS")& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & str_detect(df$brand, "GENERAL MILLS")& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count))
)
grid.arrange(
df %>% filter(as.integer(week)<14 & str_detect(df$brand, "POST" )& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & str_detect(df$brand, "POST") & promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & str_detect(df$brand, "POST")& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & str_detect(df$brand, "POST")& promo!="0")%>%
group_by(promo, brand) %>%
summarise(count = n()) %>%
ggplot((aes(promo, reorder(as.factor(brand),count)))) +
geom_tile(aes(fill = count))
)
Comments: Post promotes grape nuts and shredded wheat alternatingly through the quarters of 52 weeks.
grid.arrange(
df %>% filter(as.integer(week)<14 & ad!="NONE" & str_detect(df$brand, "KELLOGGS") )%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & ad!="NONE" & str_detect(df$brand, "KELLOGGS"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & ad!= "NONE" & str_detect(df$brand, "KELLOGGS"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & ad!="NONE" & str_detect(df$brand, "KELLOGGS"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count))
)
grid.arrange(
df %>% filter(as.integer(week)<14 & ad!="NONE" & str_detect(df$brand, "GENERAL MILLS") )%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & ad!="NONE" & str_detect(df$brand, "GENERAL MILLS"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & ad!= "NONE" & str_detect(df$brand, "GENERAL MILLS"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & ad!="NONE" & str_detect(df$brand, "GENERAL MILLS"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count))
)
grid.arrange(
df %>% filter(as.integer(week)<14 & ad!="NONE" & str_detect(df$brand, "POST") )%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & ad!="NONE" & str_detect(df$brand, "POST"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & ad!= "NONE" & str_detect(df$brand, "POST"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & ad!="NONE" & str_detect(df$brand, "POST"))%>%
group_by(ad, brand) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(brand),count))) +
geom_tile(aes(fill = count))
)
grid.arrange(
df %>% filter(as.integer(week)<14 & ad!="NONE" )%>%
group_by(ad, flavor) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(flavor),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<27 & as.integer(week)>13 & ad!="NONE" )%>%
group_by(ad, flavor) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(flavor),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<40 & as.integer(week)>27 & ad!= "NONE" )%>%
group_by(ad, flavor) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(flavor),count))) +
geom_tile(aes(fill = count)),
df %>% filter(as.integer(week)<53 & as.integer(week)>40 & ad!="NONE")%>%
group_by(ad, flavor) %>%
summarise(count = n()) %>%
ggplot(aes(ad, reorder(as.factor(flavor),count))) +
geom_tile(aes(fill = count))
)
df %>%
group_by(week) %>%
summarize(med_rev = median(revenue)) %>%
ggplot(mapping = aes(x = as.integer(week), y = med_rev)) +
geom_line()+
scale_y_continuous(labels=scales::dollar)+
theme_classic()+
scale_x_continuous(breaks=seq(0,52,by=2))+
labs(title = "Median revenue by weeks",
x = "week",
y = "median revenue")
line_graph_3firm<-df %>%
group_by(week,Producers) %>%
summarize(med_rev = median(revenue)) %>%
ggplot(mapping = aes(x = as.integer(week), y = med_rev,colour=Producers)) +
geom_line()+
scale_y_continuous(labels=scales::dollar)+
theme_classic()+
scale_x_continuous(breaks=seq(0,52,by=2))+
labs(title = "Median revenue by weeks",
subtitle = "General Mills consistently has higher revenues throughout 52 weeks",
x = "week",
y = "median revenue")
line_graph_3firm
ggsave(filename = "line_graph_3firm.png", plot = line_graph_3firm)
## Saving 7 x 5 in image
df %>%
filter(str_detect(df$brand, "GENERAL MILLS"))%>%
group_by(week,brand) %>%
summarize(med_rev = median(revenue)) %>%
ggplot(mapping = aes(x = as.integer(week), y = med_rev,colour=brand)) +
geom_line()+
scale_y_continuous(labels=scales::dollar)+
theme_classic()+
scale_x_continuous(breaks=seq(0,52,by=2))+
labs(title = "Median revenue by weeks",
subtitle = "higher grades do not mean higher prices below grade of 5",
x = "week",
y = "median revenue")
df %>%
filter(str_detect(df$brand, "KELLOGGS"))%>%
group_by(week,brand) %>%
summarize(med_rev = median(revenue)) %>%
ggplot(mapping = aes(x = as.integer(week), y = med_rev,colour=brand)) +
geom_line()+
scale_y_continuous(labels=scales::dollar)+
theme_classic()+
scale_x_continuous(breaks=seq(0,52,by=2))+
labs(title = "Median revenue by weeks",
subtitle = "higher grades do not mean higher prices below grade of 5",
x = "week",
y = "median revenue")
df %>%
filter(str_detect(df$brand, "POST"))%>%
group_by(week,brand) %>%
summarize(med_rev = median(revenue)) %>%
ggplot(mapping = aes(x = as.integer(week), y = med_rev,colour=brand)) +
geom_line()+
scale_y_continuous(labels=scales::dollar)+
theme_classic()+
scale_x_continuous(breaks=seq(0,52,by=2))+
labs(title = "Median revenue by weeks",
x = "week",
y = "median revenue")
Promo_ad_tile<-df %>%
group_by(ad, promo) %>%
summarise(med_rev = median(revenue)) %>%
ggplot(aes(ad, promo)) +
geom_tile(aes(fill = med_rev))+
labs(title = "Promotion and Advertisment effect on median Revenue",subtitle = "promotion adn advertisement increase median revenue")
ggsave(filename = "Promo_ad_tile.png", plot = Promo_ad_tile)
## Saving 7 x 5 in image
Promo_ad_tile
Comment: We can clearly see that Ad B with promo accounts for higher revenue. But which brands? and which parts of the weeks? Same for Ad A with promo.
df %>%
group_by(brand,flavor)%>%
summarize(med_rev=median(revenue),n_eth=n()) %>%
ggplot(aes(x = brand, y = med_rev)) + geom_bar(stat = "identity")+
facet_wrap( ~ flavor,ncol = 5)+
geom_text(aes(label = dollar(med_rev)),
angle=90,vjust="inward",hjust="inward")+
scale_y_continuous(labels=scales::dollar)+
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 6)
)
Comment: we can see that general mills $14 median revenue more than kelloggs and $138 median dollars more than post. The difference of median revenue between kellogs and GM is probably insignificant but between GM and Post seems to be significant.
By promo:
df1<-df %>%
group_by(promo)%>%
mutate(rev_diff = c(0,diff(revenue)))
r_diff<-ggplot(df1, mapping=aes(x=brand,y=rev_diff)) +
geom_boxplot() +
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
labs(title = "Revenue difference based on promotion for each brand",subtitle = "median revenue is not different for any brand",y="Revenue")
ggsave(filename = "rev_diff.png", plot = r_diff)
## Saving 7 x 5 in image
r_diff
Comment: There is no significant difference in revenue for each of the brands by using or not using promos. By ads:
df2<-df %>%
group_by(ad)%>%
mutate(rev_diff = c(0,diff(revenue)))
r_diff_ad<-ggplot(df2, mapping=aes(x=brand,y=rev_diff)) +
geom_boxplot() +
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
labs(title = "Revenue difference based on advertisement for each brand",subtitle = "median revenue is not different for any brand",y="Revenue")
ggsave(filename = "rev_diff_ad.png", plot = r_diff_ad)
## Saving 7 x 5 in image
r_diff_ad
Comment:There is not significant difference in revenue for each of the brands among using ad A,ad B and no ad.
# 95% CI, get z-value for upper tail, use 0.975 since is one sided
z <- qnorm(0.975)
# Incorporate CI into bar graph of means
df1 %>%
group_by(brand,promo) %>%
summarise(mean_rev_diff = mean(rev_diff), sd = sd(rev_diff),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = brand, y = mean_rev_diff,fill=promo)) +
geom_bar(stat = "identity", position = "dodge") +
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
geom_errorbar(aes(ymin = mean_rev_diff - ci, ymax = mean_rev_diff + ci),
width = 0.5, position = position_dodge(0.9))
Comment: having promo is not reliable in determining sales
# 95% CI, get z-value for upper tail, use 0.975 since is one sided
z <- qnorm(0.975)
# Incorporate CI into bar graph of means
df2 %>%
group_by(brand,ad) %>%
summarise(mean_rev_diff = mean(rev_diff), sd = sd(rev_diff),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = brand, y = mean_rev_diff,fill=ad)) +
geom_bar(stat = "identity", position = "dodge") +
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))
# 95% CI, get z-value for upper tail, use 0.975 since is one sided
z <- qnorm(0.975)
# Incorporate CI into bar graph of means
sig_unit_promo<-df %>%
group_by(brand,promo) %>%
summarise(mean_unit_diff = mean(units), sd = sd(units),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = brand, y = mean_unit_diff,fill=promo)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = round(mean_unit_diff, 2)), vjust = -.8, color = "black", # vjust moves lables above CI
position = position_dodge(0.9), size = 3)+
geom_errorbar(aes(ymin = mean_unit_diff - ci, ymax = mean_unit_diff + ci),
width = 0.5, position = position_dodge(0.9)) +
expand_limits(y=c(0, 17))+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
labs(title = "Mean Units difference based on Promotion by brand",
subtitle = "Promotion is a reliable measure for units sold for each brand",
x = "brand",
y = "unit sold difference")
sig_unit_promo
ggsave(filename = "sig_unit_promo.png", plot = sig_unit_promo)
## Saving 7 x 5 in image
comment:mean price difference for with and without promo is reliable across brands
# 95% CI, get z-value for upper tail, use 0.975 since is one sided
z <- qnorm(0.975)
# Incorporate CI into bar graph of means
sig_price_ad<-df %>%
group_by(brand,ad) %>%
summarise(mean_price_diff = mean(price), sd = sd(price),
n = n(), ci = z * sd/sqrt(n)) %>%
ggplot(aes(x = brand, y = mean_price_diff,fill=ad)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = round(mean_price_diff, 2)), vjust = -1.7, color = "black", # vjust moves lables above CI
position = position_dodge(0.9), size = 2)+
geom_errorbar(aes(ymin = mean_price_diff - ci, ymax = mean_price_diff + ci),
width = 0.5, position = position_dodge(0.9)) +
expand_limits(y=c(0, 5))+
theme_classic()+
theme(
axis.text.x = element_text(angle=90, hjust=1,size = 8))+
labs(title = "Mean Price difference based on Advertisement by brand",
subtitle = "advertisement is a reliable measure for price for each brand",
x = "brand",
y = "mean price difference")
sig_price_ad
ggsave(filename = "sig_price_ad.png", plot = sig_price_ad)
## Saving 7 x 5 in image
Comment: mean price difference are reliable across the brands with ads taken into account
mod <- glm(rev_diff ~ brand+promo+ad+package+volume,
data = df1)
summary(mod)
##
## Call:
## glm(formula = rev_diff ~ brand + promo + ad + package + volume,
## data = df1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -120.449 -14.642 0.149 14.035 126.876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.714e-01 1.136e+00 -0.591 0.5544
## brandGENERAL MILLS CINNAMON TST CR -2.233e-01 9.389e-01 -0.238 0.8120
## brandGENERAL MILLS COCOA PUFFS -1.536e-01 1.078e+00 -0.143 0.8867
## brandGENERAL MILLS KIX 8.814e-06 1.031e+00 0.000 1.0000
## brandGENERAL MILLS LUCKY CHARMS -1.126e-02 9.498e-01 -0.012 0.9905
## brandKELLOGGS COCOA KRISPIES -1.274e-01 1.128e+00 -0.113 0.9101
## brandKELLOGGS FROOT LOOPS -1.232e-01 8.918e-01 -0.138 0.8901
## brandKELLOGGS FROSTED FLAKES -6.498e-02 9.131e-01 -0.071 0.9433
## brandKELLOGGS FROSTED MINI WHEATS -2.539e-01 9.912e-01 -0.256 0.7978
## brandKELLOGGS RAISIN BRAN -2.155e-01 1.082e+00 -0.199 0.8422
## brandKELLOGGS RICE KRISPIES -2.133e-01 9.784e-01 -0.218 0.8275
## brandKELLOGGS SMART START -1.696e-01 1.053e+00 -0.161 0.8721
## brandKELLOGGS SPECIAL K -4.998e-02 9.914e-01 -0.050 0.9598
## brandPOST GRAPE NUTS -2.915e-01 1.093e+00 -0.267 0.7896
## brandPOST SHREDDED WHEAT -2.115e-01 1.039e+00 -0.204 0.8387
## promo1 -1.420e-01 4.663e-01 -0.304 0.7608
## adB 2.058e+00 1.066e+00 1.931 0.0535
## adNONE 3.382e-01 7.387e-01 0.458 0.6471
## packageCUP -2.831e-03 1.306e+00 -0.002 0.9983
## volume 4.365e-01 6.615e-01 0.660 0.5094
##
## (Intercept)
## brandGENERAL MILLS CINNAMON TST CR
## brandGENERAL MILLS COCOA PUFFS
## brandGENERAL MILLS KIX
## brandGENERAL MILLS LUCKY CHARMS
## brandKELLOGGS COCOA KRISPIES
## brandKELLOGGS FROOT LOOPS
## brandKELLOGGS FROSTED FLAKES
## brandKELLOGGS FROSTED MINI WHEATS
## brandKELLOGGS RAISIN BRAN
## brandKELLOGGS RICE KRISPIES
## brandKELLOGGS SMART START
## brandKELLOGGS SPECIAL K
## brandPOST GRAPE NUTS
## brandPOST SHREDDED WHEAT
## promo1
## adB .
## adNONE
## packageCUP
## volume
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 694.8016)
##
## Null deviance: 15171166 on 21849 degrees of freedom
## Residual deviance: 15167518 on 21830 degrees of freedom
## AIC: 205008
##
## Number of Fisher Scoring iterations: 2
mod1 <- glm(price ~ brand+promo+ad+package+volume,
data = df1)
summary(mod1)
##
## Call:
## glm(formula = price ~ brand + promo + ad + package + volume,
## data = df1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9452 -0.4142 -0.0443 0.4117 6.0602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.235618 0.027160 82.314 < 2e-16
## brandGENERAL MILLS CINNAMON TST CR 0.021795 0.022453 0.971 0.3317
## brandGENERAL MILLS COCOA PUFFS -0.136537 0.025770 -5.298 1.18e-07
## brandGENERAL MILLS KIX 0.114476 0.024649 4.644 3.43e-06
## brandGENERAL MILLS LUCKY CHARMS 0.371259 0.022714 16.345 < 2e-16
## brandKELLOGGS COCOA KRISPIES 0.080672 0.026987 2.989 0.0028
## brandKELLOGGS FROOT LOOPS 0.002528 0.021327 0.119 0.9056
## brandKELLOGGS FROSTED FLAKES -0.321631 0.021836 -14.729 < 2e-16
## brandKELLOGGS FROSTED MINI WHEATS -0.472586 0.023704 -19.937 < 2e-16
## brandKELLOGGS RAISIN BRAN -0.880740 0.025875 -34.039 < 2e-16
## brandKELLOGGS RICE KRISPIES 0.031118 0.023398 1.330 0.1836
## brandKELLOGGS SMART START -0.037521 0.025194 -1.489 0.1364
## brandKELLOGGS SPECIAL K 0.407257 0.023709 17.177 < 2e-16
## brandPOST GRAPE NUTS -0.879209 0.026135 -33.641 < 2e-16
## brandPOST SHREDDED WHEAT -0.440622 0.024841 -17.738 < 2e-16
## promo1 -0.858783 0.011151 -77.011 < 2e-16
## adB 0.014909 0.025487 0.585 0.5586
## adNONE 0.225012 0.017666 12.737 < 2e-16
## packageCUP -1.310767 0.031244 -41.953 < 2e-16
## volume 1.648435 0.015819 104.206 < 2e-16
##
## (Intercept) ***
## brandGENERAL MILLS CINNAMON TST CR
## brandGENERAL MILLS COCOA PUFFS ***
## brandGENERAL MILLS KIX ***
## brandGENERAL MILLS LUCKY CHARMS ***
## brandKELLOGGS COCOA KRISPIES **
## brandKELLOGGS FROOT LOOPS
## brandKELLOGGS FROSTED FLAKES ***
## brandKELLOGGS FROSTED MINI WHEATS ***
## brandKELLOGGS RAISIN BRAN ***
## brandKELLOGGS RICE KRISPIES
## brandKELLOGGS SMART START
## brandKELLOGGS SPECIAL K ***
## brandPOST GRAPE NUTS ***
## brandPOST SHREDDED WHEAT ***
## promo1 ***
## adB
## adNONE ***
## packageCUP ***
## volume ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3973636)
##
## Null deviance: 21723.1 on 21849 degrees of freedom
## Residual deviance: 8674.4 on 21830 degrees of freedom
## AIC: 41864
##
## Number of Fisher Scoring iterations: 2