#
# Unit 8 - Data frames
#
# Filtering a data frame
#
# This script demonstrates two ways of keeping only the rows of a data
# frame that satisfy a logical condition: the base-R `[` operator and
# dplyr's filter().

# dplyr is needed for the filter() examples in the second half of the
# script; it is loaded once, up front, so a missing package fails early.
library(dplyr)

# Loading data ----
# The file is addressed by its full path instead of calling setwd(), so
# the working directory of the R session is left untouched.
employees <- read.csv(file.path("c:/R", "employees.csv"))
print(employees)

# Tasks:
#   - Find the salaries of men.
#   - Find the salaries of men with more than 12 years of education.
#   - Find the maximum salary of women.
#
# For carrying out all the above, we have to (at least) filter the
# original data frame:
#   - Leave only observations where GENDER == 1
#   - Leave only observations where GENDER == 2
#   - Leave only observations where GENDER == 2 and EDUC > 12
#
# NOTE(review): the coding GENDER 1 = men, 2 = women is assumed from the
# variable name `womenSalary` used below - confirm against the codebook.
#
# We will now see several methods for doing that.

# The [ ] operator ----
# Stating the data frame inside the brackets is essential; so is the
# comma. The part before the comma selects rows, the (empty) part after
# it means "all columns".
# Note: rows for which the condition is NA are returned by `[` as rows
# filled with NA, whereas dplyr's filter() (below) drops them.
filteredDF <- employees[employees$GENDER == 2, ]
print(filteredDF)

womenSalary <- filteredDF$SALARY
print(womenSalary)

# The original data frame remains unchanged.
print(employees)

# Do not forget the comma (and perhaps indicate selected columns after
# it). Without the comma, R treats the logical vector as a COLUMN
# selector, which typically fails with "undefined columns selected".
# The call is wrapped in try() so that this deliberate mistake is shown
# without aborting the rest of the script when it is source()d.
try(filteredDF <- employees[employees$GENDER == 2])

# Composite logical conditions ----
# Breaking the line is not necessary; it is done here only for making
# the syntax clearer. `&` and `|` are the element-wise (vectorized)
# logical operators, which is what row filtering requires.
filteredDF <- employees[employees$GENDER == 2 &
                          employees$EDUC > 12, ]
print(filteredDF)

filteredDF <- employees[employees$EDUC < 13 |
                          employees$EDUC > 14, ]
print(filteredDF)

# Another method: dplyr's filter() ----
# Because the data frame is passed through the .data parameter, column
# names can be written bare (GENDER instead of employees$GENDER), which
# makes the instruction shorter.
# Also - no comma here.
filteredDF <- filter(.data = employees,
                     GENDER == 2 & EDUC > 12)
print(filteredDF)

filteredDF <- filter(.data = employees,
                     EDUC < 13 | EDUC > 14)
print(filteredDF)

# Which observations will be included in the data frame produced here?
filteredDF <- filter(.data = employees,
                     !(GENDER == 2 & EDUC > 12))
print(filteredDF)

# See also: subset()