#
# Unit 8 - Data frames
#
# Appendix: The pipeline operator
#
# Preliminary notes:
# (1) Somewhat advanced material.
#     Presented here because it is commonly
#     used in forums and tutorials.
# (2) You can do without it.
#     It is essentially an optional
#     way of writing code in a certain
#     specific context.
# (3) A brief introduction; a "teaser".
# (4) Watch this only if you feel comfortable
#     with your programming. It includes
#     a bit of new material aside from
#     the pipeline operator.
#

# Take a look at the following code
gender <- c(1, 2, 1, 2, 1, 1, 1, 2)
fac <- factor(gender)
print(fac)

# As you know, we can write the above
# more briefly as follows
# (assuming we do not need gender and fac):
print(factor(c(1, 2, 1, 2, 1, 1, 1, 2)))

# In R we can write the above in another way,
# using the %>% operator.
# Attaching dplyr makes %>% available;
# library(magrittr) would work as well.
library(dplyr)

c(1, 2, 1, 2, 1, 1, 1, 2) %>%
  factor() %>%
  print()

# The operator %>% is useful,
# or at least commonly used,
# instead of "nesting" calls to functions defined
# in the libraries where the operator is defined,
# e.g. dplyr.

# Loading data for the example.
# NOTE: the folder below is machine-specific; point it at the
# folder that holds employees.csv on your computer.
setwd("c:/R")
employees <- read.csv("employees.csv")
print(employees)

# dplyr functions used below:
#   mutate(), group_by(), tally()

# Problem:
# Let us define four salary groups:
# A -          salary <  5000
# B -  5000 <= salary < 10000
# C - 10000 <= salary < 15000
# D - 15000 <= salary < 20000
# How many observations
# are there in each rank group?

# Phase 1 - defining a categorical variable (factor)
# right = FALSE makes every interval closed on the left and open
# on the right, e.g. [5000, 10000), exactly as in the group
# definitions above. With the default (right = TRUE) a salary of
# exactly 5000 would fall into group A instead of group B.
newEmployees <- mutate(
  .data = employees,
  RANK = cut(SALARY,
             breaks = seq(0, 20000, 5000),
             labels = c("A", "B", "C", "D"),
             right = FALSE)
)
print(newEmployees)

# Phase 2 - producing a grouping object
rankGrouping <- group_by(.data = newEmployees, RANK)
print(rankGrouping)

# Phase 3 - counting the observations in each group
countByRank <- tally(rankGrouping)
print(countByRank)

# All together:
# Notice that each return value is used
# as an argument passed to a function,
# as the data essential for carrying out
# the next phase.
newEmployees <- mutate(
  .data = employees,
  RANK = cut(SALARY,
             breaks = seq(0, 20000, 5000),
             labels = c("A", "B", "C", "D"),
             right = FALSE)
)
rankGrouping <- group_by(.data = newEmployees, RANK)
countByRank <- tally(rankGrouping)

# Nesting
countByRank <- tally(
  group_by(
    .data = mutate(
      .data = employees,
      RANK = cut(SALARY,
                 breaks = seq(0, 20000, 5000),
                 labels = c("A", "B", "C", "D"),
                 right = FALSE)
    ),
    RANK
  )
)
print(countByRank)

# A clearer code, using %>%
# The result of the whole chain is the count table, so it is
# stored in countByRank - the object printed on the next line.
countByRank <- employees %>%
  mutate(RANK = cut(SALARY,
                    breaks = seq(0, 20000, 5000),
                    labels = c("A", "B", "C", "D"),
                    right = FALSE)) %>%
  group_by(RANK) %>%
  tally()
print(countByRank)