#
# Unit 8 - Data frames 
#
# Adding columns and rows, 
# Recoding 
#

# Loading data
# The script was written to run from c:/R. Switch there only when that
# folder exists: setwd() on a missing folder raises an error and would
# abort the whole script, whereas skipping it lets the script run from
# any working directory that already contains employees.csv.
if (dir.exists("c:/R")) {
  setwd("c:/R")
}
employees <- read.csv("employees.csv")
print(employees)

# What if we would like
# to add a Bonus column:
# for each employee the bonus
# equals EDUC * 100.
# In general - how do we
# add a column to a data frame?
# We will now see two possible ways.

# The first - with the $ operator;
# this time, however, it appears to the left
# of the assignment operator.
# Here every row receives the same constant value, 100.
employees$BONUS <- 100 
print(employees)

# An important observation:
# We did not have to assign a vector
# of nrow(employees) length.
# R itself expanded the single value into a column of this length,
# as the printed length confirms.
print(length(employees$BONUS))

# Contrast this with a similar manipulation
# on the vector level: there, the derived vector (bonus)
# takes its length from the vector it is computed from (educ).
educ <- c(12, 15, 13, 11)
gender <- c("M", "F", "M", "F")
bonus <- 100 * educ


# The value may be a result of
# an operation, e.g. an arithmetical one.
# Note: the data frame is re-read first, so BONUS is created
# afresh here. Without the re-read, the very same assignment
# would replace the values of the already existing column BONUS.
employees <- read.csv("employees.csv")
employees$BONUS <- 100 * 5 
print(employees)

# No problem in using one of the data frame's
# own variables in the calculation:
# the result is computed element by element from EDUC.
employees <- read.csv("employees.csv")
employees$BONUS <- employees$EDUC * 100
print(employees)

# Every assignment shown so far modified the
# employees data frame itself.
# dplyr's mutate() behaves differently: it returns a
# new data frame with the extra column and leaves its
# input as it was. Printing both shows that newEmployees
# has BONUS while employees does not.
library(dplyr)
employees <- read.csv("employees.csv")
newEmployees <- employees %>% mutate(BONUS = EDUC * 100)
print(newEmployees)
print(employees)


# Converting column type
# Each example re-reads the file, prints the class of GENDER as loaded,
# performs one conversion, and prints the class of the converted column.

# 1. Replace GENDER in place with its character version.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$GENDER <- as.character(employees$GENDER)
print(employees)
print(class(employees$GENDER))

# 2. Keep GENDER and store the character version in a new column.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$CHARGENDER <- as.character(employees$GENDER)
print(employees)
print(class(employees$CHARGENDER))

# 3. Replace GENDER in place with a factor.
#    Printing the column on its own also lists the factor's levels.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$GENDER <- factor(employees$GENDER)
print(employees)
print(employees$GENDER)
print(class(employees$GENDER))

# 4. Keep GENDER and store the factor in a new column.
#    The new column is named after what it holds, and it is the column
#    inspected afterwards, so the output shows the factor rather than
#    the untouched GENDER.
employees <- read.csv("employees.csv")
print(class(employees$GENDER))
employees$FACTORGENDER <- factor(employees$GENDER)
print(employees)
print(employees$FACTORGENDER)
print(class(employees$FACTORGENDER))

# Now let us look at how to
# insert values to the new column
# according to conditions.

# What if we would like to set
# a differential bonus:
# - EDUC * 100, unless
# - if EDUC < 13, NO BONUS
# - if EDUC > 19, EDUC * 1000
# Basically the procedure here
# is identical to that employed
# on the vector level, save for
# the need to equate lengths.
# We thus begin with the "default" and
# continue from there.
employees <- read.csv("employees.csv")
employees$BONUS <- employees$EDUC * 100
print(employees)

# - if EDUC < 13, NO BONUS
# the [ ] operator again,
# now to the left of the
# assignment operator.
# A single value on the right is fine: it is used for
# every selected row.
employees$BONUS[employees$EDUC < 13] <- 0
print(employees)

# From here on, we are doing REPLACING
# rather than INSERTING.
# - if EDUC > 19, EDUC * 1000
# Here the right-hand side is a vector, so its length must equal
# the number of selected rows. The same condition is therefore applied
# to EDUC on the right; using the whole EDUC column would pair the
# selected rows with the wrong employees' values (and raise a warning).
high_educ <- employees$EDUC > 19
employees$BONUS[high_educ] <- employees$EDUC[high_educ] * 1000
print(employees)


# Now let us look at this:
# Produce the variable LEVEL.
# It has two values:
# - "A", for EDUC > 12
# - "B", otherwise
# First approach: fill the whole column with "A", then
# overwrite the rows that fail the condition with "B".
employees <- read.csv("employees.csv")
employees$LEVEL <- "A"
employees$LEVEL[employees$EDUC <= 12] <- "B"
print(employees)

# We can do the same
# using the ifelse function:
# it chooses "A" or "B" row by row according to the test.
# (The two approaches agree as long as EDUC has no missing values;
# for a missing EDUC the first keeps "A", whereas ifelse yields NA.)
employees <- read.csv("employees.csv")
employees$LEVEL <- ifelse(employees$EDUC > 12, "A", "B")
print(employees)


# Inserting Rows


# What happens if we state
# an index which does not exist?
# Remember this! It will be important
# when learning how to iteratively construct
# a data frame.
#
# The new row is given as a list(), not with c(): c() would coerce all
# six values to character, and assigning that row would silently turn
# every column of the data frame into character. A list keeps each value
# in its own type.
# NOTE(review): the six values are assumed to follow the column order of
# employees.csv - confirm against the file.

# Assigning to the index right after the last row appends one row.
employees <- read.csv("employees.csv")
print(employees)
employees[11, ] <- list(7777, "Yasmin", "Lee", 1, 15, 1000.72)
print(employees)

# Assigning to an index further away also works: the rows in between
# are created as well and filled with NA (presumably row 11 here, if the
# file has 10 rows as the first example suggests).
employees <- read.csv("employees.csv")
print(employees)
employees[12, ] <- list(7777, "Yasmin", "Lee", 1, 15, 1000.72)
print(employees)


# Recoding

# Finally inspect this problem:
# Produce the variable RANK.
# It has four values, according to
# ranges in SALARY:
# - (0,5000]      --- "A"
# - (5000,10000]  --- "B"
# - (10000,15000] --- "C"
# - (15000,20000] --- "D"
# Premise: all salaries are less
# than or equal to 20000.
#
# The order of the statements matters: we start with the top rank as
# the default and then go down through the thresholds. Each condition
# has only an upper bound, so it also matches all lower salaries; the
# later, narrower assignments overwrite the earlier ones and leave
# every salary with the rank of its own range.
employees <- read.csv("employees.csv")
employees$RANK <- "D"
employees$RANK[employees$SALARY <= 15000] <- "C"
employees$RANK[employees$SALARY <= 10000] <- "B"
employees$RANK[employees$SALARY <= 5000] <- "A"
print(employees)


# Note what happens when the variable does not exist
# before it is assigned values by conditions:
# R creates the column, but only the rows satisfying the condition
# receive a value - all the other rows are left as NA.
# This is why the recoding above begins by giving the
# whole column a default value.
employees <- read.csv("employees.csv")
employees$RANK[employees$SALARY <= 15000] <- "C"
print(employees)


# The above is also achievable using cut().
# breaks lists the boundaries of the ranges and labels names the
# four intervals between them; each interval is open on the left
# and closed on the right, matching the ranges (0,5000] ... above.
# Note that the result is a factor, not a character vector.
employees <- read.csv("employees.csv")
employees$RANK <- cut(employees$SALARY,
                      breaks = c(0, 5000, 10000, 15000, 20000),
                      labels = c("A", "B", "C", "D"))
print(employees)
print(class(employees$RANK))
print(employees$RANK)
# str() prints its description itself and returns NULL, so it is
# called directly; wrapping it in print() would add a stray "NULL".
str(employees$RANK)


# The boundaries vector need not be typed out by hand:
# seq() produces the same evenly spaced break points.
employees <- read.csv("employees.csv")
employees$RANK <- cut(
  employees$SALARY,
  breaks = seq(from = 0, to = 20000, by = 5000),
  labels = c("A", "B", "C", "D")
)
print(employees)

# The break points generated by seq():
print(seq(from = 0, to = 20000, by = 5000))

# Finally, how would we carry out
# this recoding:
# Produce the variable EDUCRANK.
# It has three values, according to
# the value of EDUC:
# - 1, 2, 9  --- "A"
# - 3, 5, 7, 10, 11, 12, 13, 17 --- "B"
# - all other   --- "C"
# Premise: EDUC's values are less than 25.
#
# The values of each group are scattered rather than forming a
# contiguous range, so cut() does not fit here. Instead, %in% tests
# whether each EDUC value belongs to a given set of values.
# As before, we begin with the default ("all other") and then
# overwrite the rows belonging to the two listed sets.
employees <- read.csv("employees.csv")
employees$EDUCRANK <- "C"
employees$EDUCRANK[employees$EDUC %in% c(1, 2, 9)] <- "A"
employees$EDUCRANK[employees$EDUC %in% c(3, 5, 7, 10, 11, 12, 13, 17)] <- "B"
print(employees)