Departmental Store Analysis using R
Departmental Store Analysis using R
In this case study, I got a dataset from a departmental store, covering product details during the challenging period of May 2020, marked by the COVID-19 pandemic. The primary objective is to identify the most profitable product investments. Leveraging R programming, my task involved delving into the dataset to analyze product patterns and trends, with the ultimate aim of extracting valuable insights for informed and strategic decision-making.
PROBLEM STATEMENT:
You are provided with a dataset of a departmental store. It contains details of products from May, 2020, a period marked by Covid-19. Your manager wants you to find out that investing in which products will be more profitable. Your objective is to analyze the patterns and trends of the products, and gather insights for strategic decision making.
##Packages used for analysis are:
dplyr, ggplot2, corrplot, ggcorrplot
DATA MANIPULATION WITH dplyr
#Load the data
store <- read.csv("C:/Users/priti/Downloads/MY DEPARTMENTAL STORE .csv")
#Load the dplyr package
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#glimpse() function
glimpse(store)
## Rows: 550
## Columns: 8
## $ UNIQUE_ID <chr> "GSK01", "GSK02", "GSK03", "GSK04", "GSK05", "GSK06"~
## $ PRODUCT_NAME <chr> "A Masks", "N hand sanitizer,350 ml", "G Channa Dal,~
## $ COMPANY <chr> "A", "N", "G", "I", "S", "S", "S", "G", "Z", "A", "Z~
## $ PRODUCT_TYPE <chr> "hygiene", "hygiene", "foodgrains&spices", "Organic ~
## $ PRODUCT_CATEGORY <chr> "mask", "sanitizer", "pulses(dal)", "Dry Fruits", "b~
## $ COST_PRICE <dbl> 160.00, 248.00, 162.00, 77.14, 494.50, 445.05, 270.0~
## $ SELLING_PRICE <dbl> 200.000, 400.000, 180.000, 133.000, 593.400, 534.060~
## $ QUANTITY_DEMANDED <int> 890, 800, 456, 100, 111, 111, 360, 52, 27, 353, 500,~
#filter() function
Q1 GET THE INFORMATION OF THE PRODUCT WHERE PRODUCT TYPE IS “SNACKS”.
s1 <- filter(store, PRODUCT_TYPE=="snacks")
Q2 GET THE INFORMATION OF THE PRODUCT WHERE IT BELONGS TO COMPANY S AND M.
s2 <- filter(store, COMPANY == c("S", "M"))
#slice() function
Q3 GET THE DETAILS OF FIRST 10 ROWS
s3 <- store %>% slice(1:10)
Q4 GET THE FIRST 6 ROWS USING HEAD TOOL
s4 <- store %>% slice_head(n = 6)
#arrange() function
Q5 ARRANGE THE DATASET ASCENDING ORDER OF QUANTITY DEMANDED.
s5 <- arrange(store, QUANTITY_DEMANDED)
Q6 ARRANGE THE DATATSET DESCENDING ORDER OF SELLING PRICE.
s6 <- arrange(store, desc(SELLING_PRICE))
#select() function
Q7 GET THE INFORMATION OF COLUMNS SELLING PRICE AND COMPANY
s7 <- select(store, SELLING_PRICE, COMPANY)
Q8 GET THE INFORMATION OF COLUMNS 2 AND 5
s8 <- select(store, c(2,5))
Q9 GET THE INFORMATION OF RANGE OF COLUMNS 2 TO 6
s9 <- select(store, 2:6)
Q10 GET THE INFORMATION OF COLUMNS STARTS WITH "P"
s10 <- select(store, starts_with("P"))
Q11 GET THE INFORMATION OF COLUMNS ENDS WITH "E"
s11 <- select(store, ends_with("E"))
#mutate function()
Q12 ADD A COLUMN TO SHOW PROFIT
s12 <- mutate(store, PROFIT = SELLING_PRICE-COST_PRICE)
Q13 ADD A COLUMN TO SHOW PROFIT PERCENTAGE
s13 <- mutate(s12, PROFIT_PERCENTAGE = PROFIT/COST_PRICE*100) #HERE TAKEN
DATAFRAME 's12' INSTEAD 'store' AS ALL NEW COLUMNS SHOULD BE STORED IN ONE FILE.
Q14 ADD A COLUMN TO SHOW NET PROFIT
s14 <- mutate(s13, NET_PROFIT = PROFIT*QUANTITY_DEMANDED)
store.new <- mutate(store, PROFIT = SELLING_PRICE-COST_PRICE, PROFIT_PERCENTAGE = PROFIT/COST_PRICE*100, NET_PROFIT = PROFIT*QUANTITY_DEMANDED)
STATISTICAL INTERPRETATIONS
#summarise() function
Q15 FIND THE MEAN AND MEDIAN
summarise(store.new, AVG = mean(NET_PROFIT))
## AVG
## 1 12092.49
summarise(store.new, AVG = median(NET_PROFIT))
## SUM
## 1 6650872
Q16 FIND THE SUM
summarise(store.new, SUM = sum(NET_PROFIT))
## SUM
## 1 6650872
Q17 FIND THE MINIMUM AND MAXIMUM
summarise(store.new, MINIMUM = min(NET_PROFIT))
## MINIMUM
## 1 223.44
summarise(store.new, MAXIMUM = max(NET_PROFIT))
## MAXIMUM
## 1 136800
Q18 FIND THE VARIANCE AND STANDARD DEVIATION
summarise(store.new, VARIANCE = var(NET_PROFIT))
## VARIANCE
## 1 231656689
summarise(store.new, STANDARD_DEVIATION = sd(NET_PROFIT))
## STANDARD_DEVIATION
## 1 15220.27
Q19 FIND THE SUMMARY, QUANTILE, AND RANGE
summarise(store.new, SUMMARY = summary(NET_PROFIT))
## SUMMARY
## 1 223.44
## 2 4299.39
## 3 9350.00
## 4 12092.49
## 5 13661.89
## 6 136800.00
summarise(store.new, QUANTILE = quantile(NET_PROFIT))
## QUANTILE
## 1 223.44
## 2 4299.39
## 3 9350.00
## 4 13661.89
## 5 136800.00
summarise(store.new, RANGE = range(NET_PROFIT))
## RANGE
## 1 223.44
## 2 136800.00
#group_by() function
Q20 SORT THE DATA GROUPED BY PRODUCT_TYPE
s20 <- group_by(store.new, PRODUCT_TYPE)
#Find the following descriptive statistics for the net profit for the sorted data grouped by product type.
Q1 Find the Mean and Median
summarise(s20, AVG = mean(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE AVG
## <chr> <dbl>
## 1 baby product 27383.
## 2 beauty products 8952.
## 3 beverage 10187.
## 4 dairy products 9820
## 5 foodgrains&spices 7348.
## 6 household 17890.
## 7 hygiene 39051.
## 8 Organic food 6003.
## 9 Packed Food 11891.
## 10 pet food 10096.
## 11 snacks 6063.
## 12 Spreads&Sauces 15856.
summarise(s20, AVG = median(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE AVG
## <chr> <dbl>
## 1 baby product 28400
## 2 beauty products 9131.
## 3 beverage 6958.
## 4 dairy products 9610
## 5 foodgrains&spices 6820.
## 6 household 19320
## 7 hygiene 28839.
## 8 Organic food 3728.
## 9 Packed Food 12312
## 10 pet food 10000
## 11 snacks 4520
## 12 Spreads&Sauces 15606
Q2 Find the SUM
summarise(s20, SUM = sum(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE SUM
## <chr> <dbl>
## 1 baby product 164300
## 2 beauty products 895172.
## 3 beverage 213921.
## 4 dairy products 78560
## 5 foodgrains&spices 668639.
## 6 household 375683.
## 7 hygiene 2108732.
## 8 Organic food 492251.
## 9 Packed Food 998867.
## 10 pet food 60574
## 11 snacks 388041.
## 12 Spreads&Sauces 206130.
Q3 Find the MINIMUM AND MAXIMUM
summarise(s20, MINIMUM = min(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE MINIMUM
## <chr> <dbl>
## 1 baby product 12520
## 2 beauty products 1152.
## 3 beverage 2880
## 4 dairy products 5200
## 5 foodgrains&spices 251.
## 6 household 6426
## 7 hygiene 3300
## 8 Organic food 223.
## 9 Packed Food 3408
## 10 pet food 10000
## 11 snacks 592
## 12 Spreads&Sauces 13107.
summarise(s20, MAXIMUM = max(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE MAXIMUM
## <chr> <dbl>
## 1 baby product 40200
## 2 beauty products 16925.
## 3 beverage 19706.
## 4 dairy products 14440
## 5 foodgrains&spices 30600
## 6 household 29652
## 7 hygiene 136800
## 8 Organic food 48770.
## 9 Packed Food 29232
## 10 pet food 10314
## 11 snacks 11248
## 12 Spreads&Sauces 19200
Q4 Find the VARIANCE AND STANDARD DEVIATION
summarise(s20, VARIANCE = var(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE VARIANCE
## <chr> <dbl>
## 1 baby product 108127427.
## 2 beauty products 15055790.
## 3 beverage 33898319.
## 4 dairy products 9337800
## 5 foodgrains&spices 31113432.
## 6 household 33580807.
## 7 hygiene 1213862270.
## 8 Organic food 50637939.
## 9 Packed Food 15620461.
## 10 pet food 22257.
## 11 snacks 16904446.
## 12 Spreads&Sauces 4270949.
summarise(s20, STANDARD_DEVIATION = sd(NET_PROFIT))
## # A tibble: 12 x 2
## PRODUCT_TYPE STANDARD_DEVIATION
## <chr> <dbl>
## 1 baby product 10398.
## 2 beauty products 3880.
## 3 beverage 5822.
## 4 dairy products 3056.
## 5 foodgrains&spices 5578.
## 6 household 5795.
## 7 hygiene 34841.
## 8 Organic food 7116.
## 9 Packed Food 3952.
## 10 pet food 149.
## 11 snacks 4112.
## 12 Spreads&Sauces 2067.
Q5 Find the QUANTILE AND RANGE
summarise(s20, QUANTILE = quantile(NET_PROFIT))
## `summarise()` has grouped output by 'PRODUCT_TYPE'. You can override using the
## `.groups` argument.
## # A tibble: 60 x 2
## # Groups: PRODUCT_TYPE [12]
## PRODUCT_TYPE QUANTILE
## <chr> <dbl>
## 1 baby product 12520
## 2 baby product 20785
## 3 baby product 28400
## 4 baby product 34500
## 5 baby product 40200
## 6 beauty products 1152.
## 7 beauty products 8526.
## 8 beauty products 9131.
## 9 beauty products 11352.
## 10 beauty products 16925.
## # ... with 50 more rows
summarise(s20, RANGE = range(NET_PROFIT))
## `summarise()` has grouped output by 'PRODUCT_TYPE'. You can override using the
## `.groups` argument.
## # A tibble: 24 x 2
## # Groups: PRODUCT_TYPE [12]
## PRODUCT_TYPE RANGE
## <chr> <dbl>
## 1 baby product 12520
## 2 baby product 40200
## 3 beauty products 1152.
## 4 beauty products 16925.
## 5 beverage 2880
## 6 beverage 19706.
## 7 dairy products 5200
## 8 dairy products 14440
## 9 foodgrains&spices 251.
## 10 foodgrains&spices 30600
## # ... with 14 more rows
DATA VISUALIZATION WITH ggplot2
#Load the ggplot2 package
library(ggplot2)
#Column Plot
Q1 PLOT FOR AVERAGE_QUANTITY AND PRODUCT TYPE
store.new %>% group_by(PRODUCT_TYPE) %>%
summarise(AVG_QUANTITY = mean(QUANTITY_DEMANDED)) %>%
ggplot(aes(x = PRODUCT_TYPE, y = AVG_QUANTITY))+geom_col(width = 0.6, fill="red")+
theme(text=element_text(size = 9))
Q2 PLOT FOR NET_PROFIT AND PRODUCT TYPE
store.new %>% group_by(PRODUCT_TYPE) %>%
summarise(AVG_NET_PROFIT = mean(NET_PROFIT)) %>%
ggplot(aes(x = PRODUCT_TYPE, y = AVG_NET_PROFIT))+geom_col(width = 0.6, fill="red")+
theme(text=element_text(size = 9))
#Scatter Plot
Q3 PLOT FOR NET PROFIT AND COMPANY
store.new %>%
ggplot(aes(x = COMPANY, y = NET_PROFIT, color = PRODUCT_CATEGORY))+geom_point()
Q4 PLOT FOR PROFIT AND QUANTITY DEMANDED WHERE PRODUCT TYPE == "Beauty products"
store.new %>% filter(PRODUCT_TYPE == "beauty products") %>%
ggplot(aes(x = QUANTITY_DEMANDED, y = PROFIT, color = PRODUCT_CATEGORY))+geom_point()
#Line Plot
Q5 PLOT FOR SELLING PRICE VS QUANTITY DEMANDED - PRICE-DEMAND RELATIONSHIP
ggplot(store.new, aes(x = QUANTITY_DEMANDED, y = SELLING_PRICE))+geom_line(color="orange")
Q6 PLOT FOR AVG NET PROFIT AND COMPANY
store.new %>% group_by(PRODUCT_TYPE, COMPANY) %>%
summarise(AVG_NET_PROFIT = mean(NET_PROFIT)) %>%
ggplot(aes(x = PRODUCT_TYPE, y = AVG_NET_PROFIT, group = COMPANY, color = COMPANY))+
geom_line()+theme(text = element_text(size = 10))
#Histogram
Q7 PLOT FOR PROFIT PERCENT OF PRODUCT CATEGORY
store.new %>%
ggplot(aes(x = PROFIT_PERCENTAGE, fill = PRODUCT_CATEGORY))+
geom_histogram(binwidth = 30)
Q8 PLOT FOR QUANTITY DEMANDED OF PRODUCT CATEGORY WHERE PRODUCT TYPE IS "Snacks"
store.new %>% filter(PRODUCT_TYPE == "snacks") %>%
ggplot(aes(x = QUANTITY_DEMANDED, fill = PRODUCT_CATEGORY))+
geom_histogram(binwidth = 30)
#Pie Chart
Q9 MAKE A PIE CHART FOR EACH HYGIENE PRODUCT QUANTITY DEMANDED
STEP1: LETS PREPARE REQUIRED DATA
S21 <- filter(store.new, PRODUCT_TYPE == "hygiene") %>%
group_by(PRODUCT_CATEGORY) %>%
summarise(QUANTITY_DEMANDED = sum(QUANTITY_DEMANDED))
STEP2: LETS CALCULATE THE PERCENTAGE OF EACH PRODUCT
S22 <- S21 %>%
arrange(desc(PRODUCT_CATEGORY)) %>%
mutate(percentage = round(QUANTITY_DEMANDED*100/sum(QUANTITY_DEMANDED))) %>%
mutate(y_pos = cumsum(percentage)-0.5*percentage)
STEP3: LETS CREATE THE PIE CHART
S22 %>%
ggplot(aes(x="", percentage, fill = PRODUCT_CATEGORY))+
geom_bar(width = 1, stat = "identity", color = "white", alpha = .5)+
coord_polar("y", start = 0)+
geom_text(aes(y = y_pos, label = paste0(percentage, "%")), color = "black")+
scale_fill_manual(values = rainbow(7))+ theme_void()
Q10 MAKE A DONUT CHART FOR EACH HYGIENE PRODUCT QUANTITY DEMANDED
Using same data that created for Pie chart
S22 %>%
ggplot(aes(x = 2, percentage, fill = PRODUCT_CATEGORY))+
geom_bar(stat = "identity", color = "white", alpha = .5)+
coord_polar(theta = "y", start = 0)+
geom_text(aes(y = y_pos, label = paste0(percentage, "%")), color = "black")+
scale_fill_manual(values = rainbow(7))+ theme_void()+ xlim(0.6,2.6)
Q11 MAKE A COXCOMB CHART FOR. EACH HYGIENE PRODUCT QUANTITY DEMANDED
Using same data that created for Pie chart
S22 %>%
ggplot(aes(PRODUCT_CATEGORY, QUANTITY_DEMANDED, fill = PRODUCT_CATEGORY))+
geom_bar(stat = "identity")+
coord_polar("x", start = 0, direction = -1)+
xlab("DIFFERENT HYGIENE PRODUCTS")+
ylab("QUANTITY")
CORRELATION
#Loading packages
library(ggplot2)
library(corrplot)
library(ggcorrplot)
#Load the dataset
store.rat <- read.csv("C:/Users/priti/Downloads/RATINGS.csv")
FIND AND PLOT THE CORRELATION BETWEEN RATINGS AND QUANTITY DEMANDED
cor(store.rat$RATINGS, store.rat$QUANTITY_DEMANDED)
## [1] 0.7682532
plot(store.rat$RATINGS, store.rat$QUANTITY_DEMANDED)
#Correlation test
#Pearson's correlation
cor.test(store.rat$RATINGS, store.rat$QUANTITY_DEMANDED)
##
## Pearson's product-moment correlation
##
## data: store.rat$RATINGS and store.rat$QUANTITY_DEMANDED
## t = 6.0006, df = 25, p-value = 2.881e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5483194 0.8887882
## sample estimates:
## cor
## 0.7682532
#Correlation matrix
To get the greater insight lets works with our primary dataset i.e. 'store.new'
FIND AND PLOT THE CORRELATION MATRIX
store.new1 <- dplyr::select_if(store.new, is.numeric)
r <- cor(store.new, use = "complete.obs")
round(r,2)
## COST_PRICE SELLING_PRICE QUANTITY_DEMANDED PROFIT
## COST_PRICE 1.00 0.98 -0.53 0.62
## SELLING_PRICE 0.98 1.00 -0.58 0.78
## QUANTITY_DEMANDED -0.53 -0.58 1.00 -0.55
## PROFIT 0.62 0.78 -0.55 1.00
## PROFIT_PERCENTAGE -0.18 0.00 -0.35 0.50
## NET_PROFIT 0.15 0.20 0.34 0.27
## PROFIT_PERCENTAGE NET_PROFIT
## COST_PRICE -0.18 0.15
## SELLING_PRICE 0.00 0.20
## QUANTITY_DEMANDED -0.35 0.34
## PROFIT 0.50 0.27
## PROFIT_PERCENTAGE 1.00 0.14
## NET_PROFIT 0.14 1.00
PLOTTING CORRELATION MATRIX
#Heat Map
ggcorrplot(r)
PLOTTING THE SORTED LOWER TRIANGLE
ggcorrplot(r,
hc.order = TRUE,
type = "lower",
lab = TRUE)