#
# Unit 8 - Data frames
#
# Adding columns and rows,
# Recoding
#

# Loading data ----
setwd("c:/R")
employees <- read.csv(file = "employees.csv")
print(employees)

# Adding a column ----
# Goal: a BONUS column in which every employee's bonus equals EDUC * 100.
# More generally: how is a column added to a data frame?
# Two possible ways are shown in this unit.

# First way: the $ operator, this time on the left-hand side
# of the assignment operator.
employees$BONUS <- 100
print(employees)

# An important observation: there was no need to supply a vector of
# length nrow(employees). R recycled the single value to fill the
# whole column.
print(length(employees$BONUS))

# Compare with the same kind of manipulation on plain vectors, where
# the result lives in a separate vector rather than inside a data frame.
gender <- c("M", "F", "M", "F")
educ <- c(12, 15, 13, 11)
bonus <- 100 * educ

# The assigned value may be the result of an operation, e.g. an
# arithmetical one. Assigning to BONUS again simply replaces the values
# of the (by now) existing column.
employees <- read.csv(file = "employees.csv")
employees$BONUS <- 100 * 5
print(employees)

# There is no problem in using one of the data frame's own variables
# in the calculation.
employees <- read.csv(file = "employees.csv")
employees$BONUS <- 100 * employees$EDUC
print(employees)

# In all of the cases above the data frame itself was changed.
# Things are different when we do the same with dplyr's mutate().
library(dplyr)

# Second way: dplyr's mutate() ----
# mutate() returns a new data frame; `employees` itself is not reassigned
# here, so the second print shows it without a BONUS column.
employees <- read.csv("employees.csv")
newEmployees <- mutate(.data = employees, BONUS = EDUC * 100)
print(newEmployees)
print(employees)

# Converting column type ----
# Examine this code. Each example reloads the file so that it starts
# from the original column types.

# (1) Replace GENDER with its character version.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$GENDER <- as.character(employees$GENDER)
print(employees)
print(class(employees$GENDER))

# (2) Keep GENDER as is and store the character version in a new column.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$CHARGENDER <- as.character(employees$GENDER)
print(employees)
print(class(employees$CHARGENDER))

# (3) Replace GENDER with a factor.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$GENDER <- factor(employees$GENDER)
print(employees)
print(employees$GENDER)
print(class(employees$GENDER))

# (4) Keep GENDER as is and store the factor version in a new column.
# The new column is named after what it holds (a factor), and it is the
# new column - not the untouched GENDER - that is inspected afterwards.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$FACTORGENDER <- factor(employees$GENDER)
print(employees)
print(employees$FACTORGENDER)
print(class(employees$FACTORGENDER))

# Inserting values according to conditions ----
# What if we would like to set a differential bonus:
# - EDUC * 100, unless
# - if EDUC < 13, NO BONUS
# - if EDUC > 19, EDUC * 1000
# Basically the procedure is identical to the one employed at the
# vector level, save for the need to equate lengths.
# We thus begin with the "default" and continue from there.
employees <- read.csv("employees.csv")
employees$BONUS <- employees$EDUC * 100
print(employees)

# - if EDUC < 13, NO BONUS
# The [ ] operator again, now to the left of the assignment operator.
employees$BONUS[employees$EDUC < 13] <- 0
print(employees)

# From here on we are REPLACING rather than INSERTING.
# - if EDUC > 19, EDUC * 1000
# Both sides must cover the same rows: the right-hand side is indexed
# with the same selector as the left-hand side. Assigning the full
# employees$EDUC * 1000 vector into the subset would misalign the values
# and trigger a "number of items to replace" warning.
# which() drops NA comparisons, which are not allowed as an index when
# more than one value is assigned.
high_educ <- which(employees$EDUC > 19)
employees$BONUS[high_educ] <- employees$EDUC[high_educ] * 1000
print(employees)

# Now let us look at this:
# Produce the variable LEVEL.
# It has two values:
# - "A", for EDUC > 12
# - "B", otherwise
employees <- read.csv("employees.csv")
employees$LEVEL <- "A"
employees$LEVEL[employees$EDUC <= 12] <- "B"
print(employees)

# We can do the same using the ifelse() function.
employees <- read.csv("employees.csv")
employees$LEVEL <- ifelse(employees$EDUC > 12, "A", "B")
print(employees)

# Inserting rows ----
# What happens if we state an index which does not exist?
# Remember this! It will be important when learning how to
# iteratively construct a data frame.
# NOTE(review): the indices 11 and 12 presumably assume that
# employees.csv holds 10 rows - confirm against the data file.
#
# The new row is given as a list(), not with c(): c() would coerce the
# mixed values to character and thereby turn every numeric column of the
# data frame into a character column. A list supplies one value per
# column and keeps each column's type.
employees <- read.csv("employees.csv")
print(employees)
employees[11, ] <- list(7777, "Yasmin", "Lee", 1, 15, 1000.72)
print(employees)

# Same insertion, one index further away.
employees <- read.csv("employees.csv")
print(employees)
employees[12, ] <- list(7777, "Yasmin", "Lee", 1, 15, 1000.72)
print(employees)

# Recoding ----
# Finally inspect this problem:
# Produce the variable RANK.
# It has four values, according to ranges in SALARY:
# - (0,5000]      --- "A"
# - (5000,10000]  --- "B"
# - (10000,15000] --- "C"
# - (15000,20000] --- "D"
# Premise: all salaries are less than or equal to 20000.
#
# Start from the top rank and overwrite downwards: every later
# assignment covers a narrower salary range than the previous one.
employees <- read.csv("employees.csv")
employees$RANK <- "D"
employees$RANK[employees$SALARY <= 15000] <- "C"
employees$RANK[employees$SALARY <= 10000] <- "B"
employees$RANK[employees$SALARY <= 5000] <- "A"
print(employees)

# Note that the variable must exist before it can be assigned values
# by conditions. The statement below is a deliberate demonstration:
# RANK does not exist yet, so depending on the data it either fails
# with a "replacement has ... rows" error or leaves NA in the rows
# that did not match.
employees <- read.csv("employees.csv")
employees$RANK[employees$SALARY <= 15000] <- "C"

# The above is also achievable using cut().
# By default the intervals are open on the left and closed on the
# right, which matches the ranges listed above. The result is a factor.
employees <- read.csv("employees.csv")
employees$RANK <- cut(
  employees$SALARY,
  breaks = c(0, 5000, 10000, 15000, 20000),
  labels = c("A", "B", "C", "D")
)
print(employees)
print(class(employees$RANK))
print(employees$RANK)
# str() prints by itself and returns NULL; wrapping it in print()
# would only add a stray "NULL" line.
str(employees$RANK)

# Using seq() for producing the boundaries vector.
# cut() with break points generated by seq() instead of being typed
# out: seq(0, 20000, 5000) yields 0, 5000, 10000, 15000, 20000.
employees <- read.csv("employees.csv")
employees$RANK <- cut(
  employees$SALARY,
  breaks = seq(0, 20000, 5000),
  labels = c("A", "B", "C", "D")
)
print(employees)
print(seq(0, 20000, 5000))

# Finally, how would we carry out this recoding:
# Produce the variable EDUCRANK.
# It has three values, according to the value of EDUC:
# - 1, 2, 9                       --- "A"
# - 3, 5, 7, 10, 11, 12, 13, 17   --- "B"
# - all other                     --- "C"
# Premise: EDUC's values are less than 25.
#
# The values within a group are not contiguous, so ranges (and cut())
# do not help; %in% tests membership in an arbitrary set of values.
# Start from the catch-all "C" and then overwrite the two listed groups.
employees <- read.csv("employees.csv")
employees$EDUCRANK <- "C"
employees$EDUCRANK[employees$EDUC %in% c(1, 2, 9)] <- "A"
employees$EDUCRANK[
  employees$EDUC %in% c(3, 5, 7, 10, 11, 12, 13, 17)
] <- "B"
print(employees)