#
# Unit 8 - Data frames 
#
# Appendix: The pipeline operator 
# 

# Preliminary note: 
# (1) Somewhat advanced material. 
#     It is presented here because it is commonly 
#     used in forums and tutorials. 
# (2) You yourself can do without it. 
#     It is essentially an optional 
#     way of writing code in a certain 
#     specific context. 
# (3) A brief introduction; a "teaser". 
# (4) Watch this only if you feel comfortable 
#     with your programming. It includes 
#     a bit of new material aside from 
#     the pipeline operator. 

# Take a look at the following code 
gender <- c(1, 2, 1, 2, 1, 1, 1, 2)
fac <- factor(gender)
print(fac)

# The same output can be produced by a single statement in which
# the calls are nested: the vector is built, converted to a factor
# and printed without storing anything on the way
# (assuming we do not need gender and fac afterwards).
print(factor(c(1, 2, 1, 2, 1, 1, 1, 2)))
 
# R code can express the same chain of calls in yet another way:
# with the %>% (pipe) operator, which hands the value on its left
# to the function call on its right.

library(dplyr)
#  library(magrittr)
# Read top to bottom: build the vector, turn it into a factor, print it.
c(1, 2, 1, 2, 1, 1, 1, 2) %>%
  factor() %>%
  print()

# The operator %>% is useful,
# or at least commonly used, 
# instead of "nesting" calls to functions defined
# in the libraries where the operator is defined,
# e.g. dplyr. 

# Loading data for the example.
# NOTE(review): setwd() hard-codes a machine-specific folder ("c:/R") and
# changes the working directory for everything that runs afterwards, so
# the script only works as-is where that folder holds employees.csv.
setwd("c:/R")
# The relative file name is resolved against the working directory set above.
employees <- read.csv("employees.csv")
print(employees)

# dplyr provides the three functions used in the example that follows.
library(dplyr)
# Evaluating a function name without parentheses yields the function
# object itself (at the console this displays it); nothing is called here.
mutate
group_by
tally

# Problem: 
# Let us define four salary groups: 
# A - salary < 5000
# B - 5000 <= salary < 10000
# C - 10000 <= salary < 15000
# D - 15000 <= salary < 20000
# how many observations 
# are there in each rank group? 

# Phase 1 - defining a categorical variable (factor)
# cut() places each SALARY in one of the intervals delimited by `breaks`
# and names the interval with the matching label.
# right = FALSE makes every interval closed on the left and open on the
# right, i.e. A = [0, 5000), B = [5000, 10000), C = [10000, 15000),
# D = [15000, 20000), so a salary of exactly 5000 is ranked B.
# (The default, right = TRUE, would give (0, 5000], (5000, 10000], ...
# and rank that salary A.) A salary outside [0, 20000) gets RANK = NA.
newEmployees <- mutate(.data = employees,
                       RANK = cut(SALARY,
                                  breaks = seq(0, 20000, 5000),
                                  labels = c("A", "B", "C", "D"),
                                  right = FALSE))
print(newEmployees)

# Phase 2 - producing a grouping object
# The rows are unchanged; the result only records that later operations
# should be carried out separately for each value of RANK.
rankGrouping <- group_by(.data = newEmployees, RANK)
print(rankGrouping)


# Phase 3 - counting the observations in each group
countByRank <- tally(rankGrouping)
print(countByRank)


# All together:
# Notice that each return value is used
# as an argument passed to a function
# as the data essential for carrying out
# the next phase.
newEmployees <- mutate(.data = employees,
                       RANK = cut(SALARY,
                                  breaks = seq(0, 20000, 5000),
                                  labels = c("A", "B", "C", "D"),
                                  right = FALSE))
rankGrouping <- group_by(.data = newEmployees, RANK)
countByRank <- tally(rankGrouping)

# Nesting
# The same three phases written as one expression: the innermost call
# (mutate) runs first and the outermost call (tally) runs last.
countByRank <- tally(group_by(.data = mutate(.data = employees,
                                            RANK = cut(SALARY,
                                                       breaks = seq(0, 20000, 5000),
                                                       labels = c("A", "B", "C", "D"),
                                                       right = FALSE)),
                              RANK))
print(countByRank)

# A clearer code, using %>%
# Each step receives the result of the previous step as its first
# argument, so the phases appear in the order in which they are executed.
# The result is stored in countByRank, the object printed below.
countByRank <- employees %>%
  mutate(RANK = cut(SALARY,
                    breaks = seq(0, 20000, 5000),
                    labels = c("A", "B", "C", "D"),
                    right = FALSE)) %>%
  group_by(RANK) %>%
  tally()
print(countByRank)