#
# Unit 8 - Data frames 
#
# Filtering a data frame
#


# Load the employees data set
# NOTE(review): setwd() changes the working directory
# for the whole R session and assumes c:/R exists;
# consider using a project-relative path instead.
setwd("c:/R")
employees <- read.csv(file = "employees.csv")
print(employees)


# Find the salaries of men. 
# Find the salaries of men
#          with more than 12 years of education. 
# Find the maximum salary of women. 

# For carrying out all the above, 
# we have to (at least) filter 
# the original data frame:
# - Leave only observations where GENDER == 1
# - Leave only observations where GENDER == 2
# - Leave only observations where GENDER == 1
#                                 and EDUC > 12

# We will now see several methods 
# for doing that. 

# Filtering with the [ ] operator
# Inside the brackets the column has to be
# qualified with the data frame name
# (employees$GENDER). The comma is required:
# the condition before it selects rows, and
# the empty slot after it keeps all columns.
filteredDF <- employees[employees$GENDER == 2, ]
print(filteredDF)

womenSalary <- filteredDF$SALARY
print(womenSalary)

# Filtering returns a new data frame;
# the original one is left unchanged.
print(employees)

# Do not forget the comma
# (and, if needed, indicate the columns
# to select after it).
# Without the comma, R treats the logical
# vector as a *column* selector instead of
# a row filter, which typically fails with
# "undefined columns selected".
# The faulty call is wrapped in try() so the
# error message is still shown, but the script
# keeps running when it is sourced as a whole;
# on error filteredDF keeps its previous value.
try(filteredDF <- employees[employees$GENDER == 2])


# Composite logical conditions

# An expression inside the brackets may span
# several lines; the layout used below is
# purely for readability.

# Rows that satisfy both conditions (element-wise AND)
filteredDF <- employees[
  employees$GENDER == 2 & employees$EDUC > 12,
]
print(filteredDF)

# Rows that satisfy at least one condition (element-wise OR)
filteredDF <- employees[
  employees$EDUC < 13 | employees$EDUC > 14,
]
print(filteredDF)


# Another method: dplyr's filter()

# filter() receives the data frame as its
# first argument (.data), so the columns can
# be written by bare name, without employees$.
# Also - no trailing comma is needed here.
# The package only has to be attached once.
library(dplyr)

filteredDF <- filter(employees, GENDER == 2 & EDUC > 12)
print(filteredDF)

filteredDF <- filter(employees, EDUC < 13 | EDUC > 14)
print(filteredDF)

# Which observations will be
# included in the data frame
# produced here?
filteredDF <- filter(employees, !(GENDER == 2 & EDUC > 12))
print(filteredDF)

# See also: subset()