Pritish Jain - Departmental store Analysis with R

Departmental Store Analysis using R

In this case study, I got a dataset from a departmental store, covering product details during the challenging period of May 2020, marked by the COVID-19 pandemic. The primary objective is to identify the most profitable product investments. Leveraging R programming, my task involved delving into the dataset to analyze product patterns and trends, with the ultimate aim of extracting valuable insights for informed and strategic decision-making.

PROBLEM STATEMENT:

You are provided with a dataset of a departmental store. It contains details of products from May, 2020, a period marked by Covid-19. Your manager wants you to find out that investing in which products will be more profitable. Your objective is to analyze the patterns and trends of the products, and gather insights for strategic decision making.

##Packages used for analysis are:

dplyr, ggplot2, corrplot, ggcorrplot

DATA MANIPULATION WITH dplyr

#Load the data

store <- read.csv("C:/Users/priti/Downloads/MY DEPARTMENTAL STORE .csv")

#Load the dplyr package

library(dplyr)

##

## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':

##

## filter, lag

## The following objects are masked from 'package:base':

##

## intersect, setdiff, setequal, union

#glimpse() function

glimpse(store)

## Rows: 550

## Columns: 8

## $ UNIQUE_ID <chr> "GSK01", "GSK02", "GSK03", "GSK04", "GSK05", "GSK06"~

## $ PRODUCT_NAME <chr> "A Masks", "N hand sanitizer,350 ml", "G Channa Dal,~

## $ COMPANY <chr> "A", "N", "G", "I", "S", "S", "S", "G", "Z", "A", "Z~

## $ PRODUCT_TYPE <chr> "hygiene", "hygiene", "foodgrains&spices", "Organic ~

## $ PRODUCT_CATEGORY <chr> "mask", "sanitizer", "pulses(dal)", "Dry Fruits", "b~

## $ COST_PRICE <dbl> 160.00, 248.00, 162.00, 77.14, 494.50, 445.05, 270.0~

## $ SELLING_PRICE <dbl> 200.000, 400.000, 180.000, 133.000, 593.400, 534.060~

## $ QUANTITY_DEMANDED <int> 890, 800, 456, 100, 111, 111, 360, 52, 27, 353, 500,~

#filter() function

Q1 GET THE INFORMATION OF THE PRODUCT WHERE PRODUCT TYPE IS “SNACKS”.

s1 <- filter(store, PRODUCT_TYPE=="snacks")

Q2 GET THE INFORMATION OF THE PRODUCT WHERE IT BELONGS TO COMPANY S AND M.

s2 <- filter(store, COMPANY == c("S", "M"))

#slice() function

Q3 GET THE DETAILS OF FIRST 10 ROWS

s3 <- store %>% slice(1:10)

Q4 GET THE FIRST 6 ROWS USING HEAD TOOL

s4 <- store %>% slice_head(n = 6)

#arrange() function

Q5 ARRANGE THE DATASET ASCENDING ORDER OF QUANTITY DEMANDED.

s5 <- arrange(store, QUANTITY_DEMANDED)

Q6 ARRANGE THE DATATSET DESCENDING ORDER OF SELLING PRICE.

s6 <- arrange(store, desc(SELLING_PRICE))

#select() function

Q7 GET THE INFORMATION OF COLUMNS SELLING PRICE AND COMPANY

s7 <- select(store, SELLING_PRICE, COMPANY)

Q8 GET THE INFORMATION OF COLUMNS 2 AND 5

s8 <- select(store, c(2,5))

Q9 GET THE INFORMATION OF RANGE OF COLUMNS 2 TO 6

s9 <- select(store, 2:6)

Q10 GET THE INFORMATION OF COLUMNS STARTS WITH "P"

s10 <- select(store, starts_with("P"))

Q11 GET THE INFORMATION OF COLUMNS ENDS WITH "E"

s11 <- select(store, ends_with("E"))

#mutate function()

Q12 ADD A COLUMN TO SHOW PROFIT

s12 <- mutate(store, PROFIT = SELLING_PRICE-COST_PRICE)

Q13 ADD A COLUMN TO SHOW PROFIT PERCENTAGE

s13 <- mutate(s12, PROFIT_PERCENTAGE = PROFIT/COST_PRICE*100) #HERE TAKEN

DATAFRAME 's12' INSTEAD 'store' AS ALL NEW COLUMNS SHOULD BE STORED IN ONE FILE.

Q14 ADD A COLUMN TO SHOW NET PROFIT

s14 <- mutate(s13, NET_PROFIT = PROFIT*QUANTITY_DEMANDED)

store.new <- mutate(store, PROFIT = SELLING_PRICE-COST_PRICE, PROFIT_PERCENTAGE = PROFIT/COST_PRICE*100, NET_PROFIT = PROFIT*QUANTITY_DEMANDED)

STATISTICAL INTERPRETATIONS

#summarise() function

Q15 FIND THE MEAN AND MEDIAN

summarise(store.new, AVG = mean(NET_PROFIT))

## AVG

## 1 12092.49

summarise(store.new, AVG = median(NET_PROFIT))

## SUM

## 1 6650872

Q16 FIND THE SUM

summarise(store.new, SUM = sum(NET_PROFIT))

## SUM

## 1 6650872

Q17 FIND THE MINIMUM AND MAXIMUM

summarise(store.new, MINIMUM = min(NET_PROFIT))

## MINIMUM

## 1 223.44

summarise(store.new, MAXIMUM = max(NET_PROFIT))

## MAXIMUM

## 1 136800

Q18 FIND THE VARIANCE AND STANDARD DEVIATION

summarise(store.new, VARIANCE = var(NET_PROFIT))

## VARIANCE

## 1 231656689

summarise(store.new, STANDARD_DEVIATION = sd(NET_PROFIT))

## STANDARD_DEVIATION

## 1 15220.27

Q19 FIND THE SUMMARY, QUANTILE, AND RANGE

summarise(store.new, SUMMARY = summary(NET_PROFIT))

## SUMMARY

## 1 223.44

## 2 4299.39

## 3 9350.00

## 4 12092.49

## 5 13661.89

## 6 136800.00

summarise(store.new, QUANTILE = quantile(NET_PROFIT))

## QUANTILE

## 1 223.44

## 2 4299.39

## 3 9350.00

## 4 13661.89

## 5 136800.00

summarise(store.new, RANGE = range(NET_PROFIT))

## RANGE

## 1 223.44

## 2 136800.00

#group_by() function

Q20 SORT THE DATA GROUPED BY PRODUCT_TYPE

s20 <- group_by(store.new, PRODUCT_TYPE)

#Find the following descriptive statistics for the net profit for the sorted data grouped by product type.

Q1 Find the Mean and Median

summarise(s20, AVG = mean(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE AVG

## <chr> <dbl>

## 1 baby product 27383.

## 2 beauty products 8952.

## 3 beverage 10187.

## 4 dairy products 9820

## 5 foodgrains&spices 7348.

## 6 household 17890.

## 7 hygiene 39051.

## 8 Organic food 6003.

## 9 Packed Food 11891.

## 10 pet food 10096.

## 11 snacks 6063.

## 12 Spreads&Sauces 15856.

summarise(s20, AVG = median(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE AVG

## <chr> <dbl>

## 1 baby product 28400

## 2 beauty products 9131.

## 3 beverage 6958.

## 4 dairy products 9610

## 5 foodgrains&spices 6820.

## 6 household 19320

## 7 hygiene 28839.

## 8 Organic food 3728.

## 9 Packed Food 12312

## 10 pet food 10000

## 11 snacks 4520

## 12 Spreads&Sauces 15606

Q2 Find the SUM

summarise(s20, SUM = sum(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE SUM

## <chr> <dbl>

## 1 baby product 164300

## 2 beauty products 895172.

## 3 beverage 213921.

## 4 dairy products 78560

## 5 foodgrains&spices 668639.

## 6 household 375683.

## 7 hygiene 2108732.

## 8 Organic food 492251.

## 9 Packed Food 998867.

## 10 pet food 60574

## 11 snacks 388041.

## 12 Spreads&Sauces 206130.

Q3 Find the MINIMUM AND MAXIMUM

summarise(s20, MINIMUM = min(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE MINIMUM

## <chr> <dbl>

## 1 baby product 12520

## 2 beauty products 1152.

## 3 beverage 2880

## 4 dairy products 5200

## 5 foodgrains&spices 251.

## 6 household 6426

## 7 hygiene 3300

## 8 Organic food 223.

## 9 Packed Food 3408

## 10 pet food 10000

## 11 snacks 592

## 12 Spreads&Sauces 13107.

summarise(s20, MAXIMUM = max(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE MAXIMUM

## <chr> <dbl>

## 1 baby product 40200

## 2 beauty products 16925.

## 3 beverage 19706.

## 4 dairy products 14440

## 5 foodgrains&spices 30600

## 6 household 29652

## 7 hygiene 136800

## 8 Organic food 48770.

## 9 Packed Food 29232

## 10 pet food 10314

## 11 snacks 11248

## 12 Spreads&Sauces 19200

Q4 Find the VARIANCE AND STANDARD DEVIATION

summarise(s20, VARIANCE = var(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE VARIANCE

## <chr> <dbl>

## 1 baby product 108127427.

## 2 beauty products 15055790.

## 3 beverage 33898319.

## 4 dairy products 9337800

## 5 foodgrains&spices 31113432.

## 6 household 33580807.

## 7 hygiene 1213862270.

## 8 Organic food 50637939.

## 9 Packed Food 15620461.

## 10 pet food 22257.

## 11 snacks 16904446.

## 12 Spreads&Sauces 4270949.

summarise(s20, STANDARD_DEVIATION = sd(NET_PROFIT))

## # A tibble: 12 x 2

## PRODUCT_TYPE STANDARD_DEVIATION

## <chr> <dbl>

## 1 baby product 10398.

## 2 beauty products 3880.

## 3 beverage 5822.

## 4 dairy products 3056.

## 5 foodgrains&spices 5578.

## 6 household 5795.

## 7 hygiene 34841.

## 8 Organic food 7116.

## 9 Packed Food 3952.

## 10 pet food 149.

## 11 snacks 4112.

## 12 Spreads&Sauces 2067.

Q5 Find the QUANTILE AND RANGE

summarise(s20, QUANTILE = quantile(NET_PROFIT))

## `summarise()` has grouped output by 'PRODUCT_TYPE'. You can override using the

## `.groups` argument.

## # A tibble: 60 x 2

## # Groups: PRODUCT_TYPE [12]

## PRODUCT_TYPE QUANTILE

## <chr> <dbl>

## 1 baby product 12520

## 2 baby product 20785

## 3 baby product 28400

## 4 baby product 34500

## 5 baby product 40200

## 6 beauty products 1152.

## 7 beauty products 8526.

## 8 beauty products 9131.

## 9 beauty products 11352.

## 10 beauty products 16925.

## # ... with 50 more rows

summarise(s20, RANGE = range(NET_PROFIT))

## `summarise()` has grouped output by 'PRODUCT_TYPE'. You can override using the

## `.groups` argument.

## # A tibble: 24 x 2

## # Groups: PRODUCT_TYPE [12]

## PRODUCT_TYPE RANGE

## <chr> <dbl>

## 1 baby product 12520

## 2 baby product 40200

## 3 beauty products 1152.

## 4 beauty products 16925.

## 5 beverage 2880

## 6 beverage 19706.

## 7 dairy products 5200

## 8 dairy products 14440

## 9 foodgrains&spices 251.

## 10 foodgrains&spices 30600

## # ... with 14 more rows

DATA VISUALIZATION WITH ggplot2

#Load the ggplot2 package

library(ggplot2)

#Column Plot

Q1 PLOT FOR AVERAGE_QUANTITY AND PRODUCT TYPE

store.new %>% group_by(PRODUCT_TYPE) %>%

summarise(AVG_QUANTITY = mean(QUANTITY_DEMANDED)) %>%

ggplot(aes(x = PRODUCT_TYPE, y = AVG_QUANTITY))+geom_col(width = 0.6, fill="red")+

theme(text=element_text(size = 9))

Q2 PLOT FOR NET_PROFIT AND PRODUCT TYPE

store.new %>% group_by(PRODUCT_TYPE) %>%

summarise(AVG_NET_PROFIT = mean(NET_PROFIT)) %>%

ggplot(aes(x = PRODUCT_TYPE, y = AVG_NET_PROFIT))+geom_col(width = 0.6, fill="red")+

theme(text=element_text(size = 9))

#Scatter Plot

Q3 PLOT FOR NET PROFIT AND COMPANY

store.new %>%

ggplot(aes(x = COMPANY, y = NET_PROFIT, color = PRODUCT_CATEGORY))+geom_point()

Q4 PLOT FOR PROFIT AND QUANTITY DEMANDED WHERE PRODUCT TYPE == "Beauty products"

store.new %>% filter(PRODUCT_TYPE == "beauty products") %>%

ggplot(aes(x = QUANTITY_DEMANDED, y = PROFIT, color = PRODUCT_CATEGORY))+geom_point()

#Line Plot

Q5 PLOT FOR SELLING PRICE VS QUANTITY DEMANDED - PRICE-DEMAND RELATIONSHIP

ggplot(store.new, aes(x = QUANTITY_DEMANDED, y = SELLING_PRICE))+geom_line(color="orange")

Q6 PLOT FOR AVG NET PROFIT AND COMPANY

store.new %>% group_by(PRODUCT_TYPE, COMPANY) %>%

summarise(AVG_NET_PROFIT = mean(NET_PROFIT)) %>%

ggplot(aes(x = PRODUCT_TYPE, y = AVG_NET_PROFIT, group = COMPANY, color = COMPANY))+

geom_line()+theme(text = element_text(size = 10))

#Histogram

Q7 PLOT FOR PROFIT PERCENT OF PRODUCT CATEGORY

store.new %>%

ggplot(aes(x = PROFIT_PERCENTAGE, fill = PRODUCT_CATEGORY))+

geom_histogram(binwidth = 30)

Q8 PLOT FOR QUANTITY DEMANDED OF PRODUCT CATEGORY WHERE PRODUCT TYPE IS "Snacks"

store.new %>% filter(PRODUCT_TYPE == "snacks") %>%

ggplot(aes(x = QUANTITY_DEMANDED, fill = PRODUCT_CATEGORY))+

geom_histogram(binwidth = 30)

#Pie Chart

Q9 MAKE A PIE CHART FOR EACH HYGIENE PRODUCT QUANTITY DEMANDED

STEP1: LETS PREPARE REQUIRED DATA

S21 <- filter(store.new, PRODUCT_TYPE == "hygiene") %>%

group_by(PRODUCT_CATEGORY) %>%

summarise(QUANTITY_DEMANDED = sum(QUANTITY_DEMANDED))

STEP2: LETS CALCULATE THE PERCENTAGE OF EACH PRODUCT

S22 <- S21 %>%

arrange(desc(PRODUCT_CATEGORY)) %>%

mutate(percentage = round(QUANTITY_DEMANDED*100/sum(QUANTITY_DEMANDED))) %>%

mutate(y_pos = cumsum(percentage)-0.5*percentage)

STEP3: LETS CREATE THE PIE CHART

S22 %>%

ggplot(aes(x="", percentage, fill = PRODUCT_CATEGORY))+

geom_bar(width = 1, stat = "identity", color = "white", alpha = .5)+

coord_polar("y", start = 0)+

geom_text(aes(y = y_pos, label = paste0(percentage, "%")), color = "black")+

scale_fill_manual(values = rainbow(7))+ theme_void()

Q10 MAKE A DONUT CHART FOR EACH HYGIENE PRODUCT QUANTITY DEMANDED

Using same data that created for Pie chart

S22 %>%

ggplot(aes(x = 2, percentage, fill = PRODUCT_CATEGORY))+

geom_bar(stat = "identity", color = "white", alpha = .5)+

coord_polar(theta = "y", start = 0)+

geom_text(aes(y = y_pos, label = paste0(percentage, "%")), color = "black")+

scale_fill_manual(values = rainbow(7))+ theme_void()+ xlim(0.6,2.6)

Q11 MAKE A COXCOMB CHART FOR. EACH HYGIENE PRODUCT QUANTITY DEMANDED

Using same data that created for Pie chart

S22 %>%

ggplot(aes(PRODUCT_CATEGORY, QUANTITY_DEMANDED, fill = PRODUCT_CATEGORY))+

geom_bar(stat = "identity")+

coord_polar("x", start = 0, direction = -1)+

xlab("DIFFERENT HYGIENE PRODUCTS")+

ylab("QUANTITY")

CORRELATION

#Loading packages

library(ggplot2)

library(corrplot)

library(ggcorrplot)

#Load the dataset

store.rat <- read.csv("C:/Users/priti/Downloads/RATINGS.csv")

FIND AND PLOT THE CORRELATION BETWEEN RATINGS AND QUANTITY DEMANDED

cor(store.rat$RATINGS, store.rat$QUANTITY_DEMANDED)

## [1] 0.7682532

plot(store.rat$RATINGS, store.rat$QUANTITY_DEMANDED)

#Correlation test

#Pearson's correlation

cor.test(store.rat$RATINGS, store.rat$QUANTITY_DEMANDED)

##

## Pearson's product-moment correlation

##

## data: store.rat$RATINGS and store.rat$QUANTITY_DEMANDED

## t = 6.0006, df = 25, p-value = 2.881e-06

## alternative hypothesis: true correlation is not equal to 0

## 95 percent confidence interval:

## 0.5483194 0.8887882

## sample estimates:

## cor

## 0.7682532

#Correlation matrix

To get the greater insight lets works with our primary dataset i.e. 'store.new'

FIND AND PLOT THE CORRELATION MATRIX

store.new1 <- dplyr::select_if(store.new, is.numeric)

r <- cor(store.new, use = "complete.obs")

round(r,2)

## COST_PRICE SELLING_PRICE QUANTITY_DEMANDED PROFIT

## COST_PRICE 1.00 0.98 -0.53 0.62

## SELLING_PRICE 0.98 1.00 -0.58 0.78

## QUANTITY_DEMANDED -0.53 -0.58 1.00 -0.55

## PROFIT 0.62 0.78 -0.55 1.00

## PROFIT_PERCENTAGE -0.18 0.00 -0.35 0.50

## NET_PROFIT 0.15 0.20 0.34 0.27

## PROFIT_PERCENTAGE NET_PROFIT

## COST_PRICE -0.18 0.15

## SELLING_PRICE 0.00 0.20

## QUANTITY_DEMANDED -0.35 0.34

## PROFIT 0.50 0.27

## PROFIT_PERCENTAGE 1.00 0.14

## NET_PROFIT 0.14 1.00

PLOTTING CORRELATION MATRIX

#Heat Map

ggcorrplot(r)

PLOTTING THE SORTED LOWER TRIANGLE

ggcorrplot(r,

hc.order = TRUE,

type = "lower",

lab = TRUE)

Previous project

Next project

Let's Connect:

Quick Links