#
# Unit 8 - Data frames
#
# Categorical variables:
# A first acquaintance with factors
#

# Loading data ----
# The data directory is kept in a variable and joined with file.path()
# rather than calling setwd(), so the script does not change the
# session's working directory as a side effect.
dataDir <- "c:/R"

# stringsAsFactors = TRUE is spelled out because its default changed to
# FALSE in R 4.0.0; the demonstrations below rely on character columns
# being read in as factors.
employees <- read.csv(
  file.path(dataDir, "employees.csv"),
  stringsAsFactors = TRUE
)
print(employees)

# No surprise.
print(class(employees$GENDER))

# Surprise indeed! A text column was read in as a factor.
print(class(employees$FNAME))

# A factor is a data structure whose purpose is to store
# one-dimensional collections made up of a limited number of
# unique values, and to represent categorical variables,
# e.g. gender.
print(employees$GENDER)

# LNAME is not a categorical variable, why?
print(employees$LNAME)
print(class(employees$LNAME))

# Converting a vector into a factor ----
# One common way: the function factor().
# Premise: the conversion is made when the vector represents
# a categorical variable.
intGender <- employees$GENDER
print(intGender)

facGender <- factor(employees$GENDER)
print(class(facGender))
print(facGender)

# Setting the factor's labels: value 1 is shown as "Female",
# value 2 as "Male".
facGender <- factor(
  employees$GENDER,
  levels = c(1, 2),
  labels = c("Female", "Male")
)
print(facGender)

# A small demonstration of one scenario in which using factors
# is preferable to using plain vectors ----

# Bar plot based on employees$GENDER
intGender <- employees$GENDER
barplot(height = table(intGender), col = c(4, 2))

# Based on employees$GENDER, factorized (bars carry the labels)
barplot(height = table(facGender), col = c(4, 2))

# A glimpse of the reasons to prefer factors over integer vectors:
# (1) Clearer presentation of data.
# (2) Essential in some statistical procedures.
# (3) More efficient in terms of memory use.
# (4) Some functions return factors.
# (5) Using factors it is possible to display character vectors
#     in a non-alphabetical order.

# Further observations on factors ----

# Fetching a factor's levels: the function levels()
facGenderLevels <- levels(facGender)
print(facGenderLevels)
print(class(facGenderLevels))

# You can convert a factor back into a vector, e.g. with the
# function as.integer().
# Note: as.integer() returns the internal level codes (1, 2, ...),
# not necessarily the original values. They coincide here provided
# GENDER is coded 1/2 (as the levels = c(1, 2) call above suggests);
# in general use as.integer(as.character(f)) to recover the
# original numbers.

# Example 1: factor without labels
facGender <- factor(employees$GENDER)
print(facGender)
intGender <- as.integer(facGender)
print(intGender)
print(class(intGender))

# Example 2: factor with labels
facGender <- factor(
  employees$GENDER,
  levels = c(1, 2),
  labels = c("Female", "Male")
)
print(facGender)
intGender <- as.integer(facGender)
print(intGender)

# Note that you can "slice" a factor.
# The result is itself a factor.
facGender <- factor(employees$GENDER)
partOfFacGender <- facGender[2:5]
print(partOfFacGender)
print(class(partOfFacGender))

facGender <- factor(employees$GENDER)
print(facGender)
partOfFacGender <- facGender[1:2]
print(partOfFacGender)

# You cannot assign a value that is not one of the factor's levels:
# R issues a warning and stores NA instead (3 is not a level here,
# assuming GENDER is coded 1/2).
facGender <- factor(employees$GENDER)
facGender[2] <- 3
print(facGender)

# Factors are not meant to be used in calculations:
# arithmetic on a factor gives a warning and a vector of NA.
print(facGender + 5)

# ... and you cannot use (unordered) factors in comparison expressions:
# `>` yields NA with a warning, and if (NA) is an error.
# tryCatch() reports the error so the script still runs to completion.
tryCatch(
  {
    if (facGender[2] > 5) {
      print("something")
    }
  },
  error = function(e) {
    message("Comparison failed: ", conditionMessage(e))
  }
)

# See also:
# - gl() - for producing regular series of factors
# - ordered factors

# Why does R no longer convert character columns to factors by
# default? Since R 4.0.0 the default is stringsAsFactors = FALSE
# (hence the explicit argument in read.csv() above); see
# https://cran.r-project.org/doc/manuals/r-release/NEWS.html