#
# Unit 8 - data frames
#
# Practice - Part I
#
# Basic data-frame handling, practised on the data stored in stress.csv.

# (0) Load the file stress.csv into a data frame.
print(getwd())
# NOTE(review): the hard-coded working directory only exists on the author's
# machine. Switch to it only when it is present; otherwise stress.csv is read
# from the current working directory.
if (dir.exists("c:/R")) {
  setwd("c:/R")
} else {
  message("Directory c:/R not found; reading stress.csv from ", getwd())
}
stress <- read.csv("stress.csv")
print(class(stress))
# View() opens a GUI data viewer, so it is only called in interactive sessions.
if (interactive()) {
  View(stress)
}

# (1) Print column names.
# Expected result:
# [1] "ID"      "GENDER"  "AGE"     "FACULTY" "INCOME"
print(names(stress))

# (2) Print stress, first three rows.
# Expected result:
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3 20347.80
# 2  2      2  26       2 11054.30
# 3  3      1  26       2 16578.54
print(head(stress, n = 3))

# (3) How many rows? How many columns?
# Expected result:
# [1] 100   5
print(dim(stress))

# (4) Print the GENDER column.
# Expected result:
#  [1] 1 2 1 1 2 1 2 2 1 2 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1
# [37] 1 2 2 1 2 2 1 2 1 2 2 2 2 2 1 1 1 2 1 1 2 2 1 2 2 2 2 1 2 1 2 1 1 1 2 1
# [73] 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 1 2 1 2 1 1 1 2 2 1 2 2
print(stress$GENDER)

# (5) Produce a data frame by combining the GENDER column with the vector
#     1000, 1001, 1002, ...
#     Use the function data.frame().
# Head of the expected result:
#   X1000..1000...length.stress.GENDER....1. stress.GENDER
# 1                                     1000             1
# 2                                     1001             2
# 3                                     1002             1
# The arguments are deliberately passed as unnamed expressions: data.frame()
# derives the column names shown above from the expression text.
newDF <- data.frame(1000:(1000 + length(stress$GENDER) - 1), stress$GENDER)
print(head(newDF, n = 3))

# (6) Provide proper names to the columns of the new data frame.
# Head of the expected result:
#     ID GENDER
# 1 1000      1
# 2 1001      2
# 3 1002      1
names(newDF) <- c("ID", "GENDER")
print(head(newDF, n = 3))

# (7) Produce the data frame again, now using cbind().
# Head of the expected result:
#     X1 X2
# 1 1000  1
# 2 1001  2
# 3 1002  1
# cbind() on two unnamed expressions yields a matrix without column names;
# data.frame() then assigns the default names X1 and X2.
binded <- cbind(1000:(1000 + length(stress$GENDER) - 1), stress$GENDER)
print(head(binded, n = 3))
newDF <- data.frame(binded)
print(head(newDF, n = 3))

# (8) Rename the columns of the data frame to "ID" and "GENDER".
#     Do not change the original data frame.
# (8, continued) Head of the expected result:
#     ID GENDER
# 1 1000      1
# 2 1001      2
# 3 1002      1
library(dplyr)
# rename() takes new_name = old_name pairs and returns a modified copy.
newDF <- rename(.data = newDF, ID = X1, GENDER = X2)
print(head(newDF, n = 3))

# (9) Replace the row indices of the original stress data frame with the
#     vector 1000, 1001, ....
# Head of the expected result:
#      ID GENDER AGE FACULTY   INCOME
# 1000  1      1  22       3 20347.80
# 1001  2      2  26       2 11054.30
# 1002  3      1  26       2 16578.54
# seq_len() keeps the sequence correct for any number of rows.
rownames(stress) <- seq_len(nrow(stress)) + 999L
print(head(stress, n = 3))
# Assigning NULL restores the default row names 1, 2, 3, ...
rownames(stress) <- NULL
print(head(stress, n = 3))

# (10) Print the INCOME of the student in the second row.
# Expected result:
# [1] 11054.3
print(head(stress, n = 5))
print(stress[2, 5])
print(stress[2, "INCOME"])
# Column names are case-sensitive: there is no column "income", so this
# lookup raises "undefined columns selected". try() reports the error
# without aborting the rest of the script.
try(print(stress[2, "income"]))
print(stress[2, c("INCOME")])

# (11) Produce a data frame including GENDER, FACULTY and INCOME
#      for rows 25-50.
# Head of the expected result:
#    GENDER FACULTY   INCOME
# 25      1       3 17243.07
# 26      1       2 17345.99
# 27      2       2 14265.55
# 28      2       3 17976.83
# 29      2       3 20000.15
# Base R: select rows and columns in a single subsetting step.
newDF <- stress[25:50, c("GENDER", "FACULTY", "INCOME")]
print(head(newDF, n = 5))
# dplyr: choose the columns with select(), then subset the rows.
newDF <- select(.data = stress, GENDER, FACULTY, INCOME)
newDF <- newDF[25:50, ]

# (12) Produce a data frame including GENDER, FACULTY and INCOME
#      for the 75th row till the last.
# Head of the expected result:
#    GENDER FACULTY   INCOME
# 75      2       3 10187.99
# 76      2       2 20605.09
# 77      2       1 22896.27
# 78      2       1 19258.29
# 79      2       1 17052.33
newDF <- select(.data = stress, GENDER, FACULTY:INCOME)
# Select by row position instead of a hard-coded tail length, so the result
# starts at row 75 whatever the size of the file (and is empty when the data
# frame has fewer than 75 rows).
newDF <- newDF[seq_len(nrow(newDF)) >= 75, ]
print(head(newDF, n = 5))

# (13) On the basis of the stress data frame,
#      produce 3 samples of size 7 with replacement.
# (13, continued) Possible result (a row drawn twice gets a ".1" suffix):
#      ID GENDER AGE FACULTY   INCOME
# 11   11      1  26       2  8157.84
# 12   12      1  21       3 14728.57
# 72   72      1  20       3 18098.63
# 31   31      1  24       3 18298.45
# 31.1 31      1  24       3 18298.45
# 89   89      1  28       3 21156.43
# 88   88      1  24       1 20233.58
#    ID GENDER AGE FACULTY   INCOME
# 28 28      2  28       3 17976.83
# 74 74      2  25       2 19965.21
# 79 79      2  22       1 17052.33
# 69 69      1  22       1 14906.47
# 17 17      1  25       1  8867.20
# 11 11      1  26       2  8157.84
# 35 35      1  24       1 18911.21
#    ID GENDER AGE FACULTY   INCOME
# 73 73      2  22       3 18960.88
# 16 16      1  24       1 11815.51
# 4   4      1  23       1  8971.88
# 59 59      1  23       2 18109.81
# 19 19      2  21       1 16502.91
# 20 20      1  20       2 14133.87
# 98 98      1  24       2 15141.64
for (i in 1:3) {
  # seq_len() avoids the 1:0 trap of 1:nrow() on an empty data frame.
  inds <- sample(seq_len(nrow(stress)), size = 7, replace = TRUE)
  sampleDF <- stress[inds, ]
  print(sampleDF)
}

# (14) How many female students (GENDER == 2)?
# Answer: 57
# Summing a logical vector counts its TRUE values; na.rm = TRUE keeps rows
# with a missing value from being counted.
print(sum(stress$GENDER == 2, na.rm = TRUE))

# (15) How many students study in FACULTY 2 and are over 25 years old?
# Answer: 8
# The membership mask is computed once and reused in (16) and (17).
in_group <- stress$FACULTY == 2 & stress$AGE > 25
print(sum(in_group, na.rm = TRUE))

# (16) Produce a data frame including the information
#      about all students in this group.
# Expected result:
#    ID GENDER AGE FACULTY   INCOME
# 2   2      2  26       2 11054.30
# 3   3      1  26       2 16578.54
# 11 11      1  26       2  8157.84
# 27 27      2  26       2 14265.55
# 47 47      2  28       2 12733.87
# 61 61      2  26       2 19241.29
# 68 68      1  26       2 21300.50
# 76 76      2  28       2 20605.09
# Base R: which() drops rows where the condition is NA.
newDF <- stress[which(in_group), ]
# dplyr: filter() evaluates its conditions inside the data frame, so the
# columns are referred to by bare name.
newDF <- filter(.data = stress, FACULTY == 2 & AGE > 25)

# (17) How many students DO NOT belong to the aforementioned group?
# Answer: 92
print(sum(!in_group, na.rm = TRUE))

# (18) Change the age of the first student to NA.
stress <- read.csv("stress.csv")
print(head(stress))
stress[1, "AGE"] <- NA
print(head(stress, n = 1))

# (19) Replace ID with 1000:1099.
# Head of the expected result:
# (19, continued) Head of the expected result:
#     ID GENDER AGE FACULTY   INCOME
# 1 1000      1  22       3 20347.80
# 2 1001      2  26       2 11054.30
# 3 1002      1  26       2 16578.54
# 4 1003      1  23       1  8971.88
# 5 1004      2  24       3 10489.32
# 6 1005      1  27       1  8120.38
stress <- read.csv("stress.csv")
# Derive the IDs from the number of rows instead of hard-coding 1000:1099.
stress$ID <- seq_len(nrow(stress)) + 999L
print(head(stress))

# (20) Change all INCOMEs of students under 26 to NA.
# Head of the expected result:
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3       NA
# 2  2      2  26       2 11054.30
# 3  3      1  26       2 16578.54
# 4  4      1  23       1       NA
# 5  5      2  24       3       NA
# 6  6      1  27       1  8120.38
stress <- read.csv("stress.csv")
stress$INCOME[stress$AGE < 26] <- NA
print(head(stress))

# (21) Double the income for female students.
# Head of the expected result:
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3 20347.80
# 2  2      2  26       2 22108.60
# 3  3      1  26       2 16578.54
# 4  4      1  23       1  8971.88
# 5  5      2  24       3 20978.64
# 6  6      1  27       1  8120.38
stress <- read.csv("stress.csv")
# Multiplying by GENDER doubles exactly the rows coded 2 and leaves the rows
# coded 1 unchanged (this relies on GENDER holding only the codes 1 and 2).
stress$INCOME <- stress$INCOME * stress$GENDER
print(head(stress))

# (22) Do the same only for students under 26.
# Head of the expected result:
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3 20347.80
# 2  2      2  26       2 11054.30
# 3  3      1  26       2 16578.54
# 4  4      1  23       1  8971.88
# 5  5      2  24       3 20978.64
# 6  6      1  27       1  8120.38
stress <- read.csv("stress.csv")
stress$INCOME[stress$AGE < 26] <-
  stress$INCOME[stress$AGE < 26] * stress$GENDER[stress$AGE < 26]
print(head(stress))

# (23) Factorize FACULTY as follows:
#      1 - Humanities
#      2 - Social Sciences
#      3 - Science
# Head of the expected result:
# [1] Science         Social Sciences Social Sciences Humanities
# [5] Science         Humanities
facultyFactor <- factor(
  stress$FACULTY,
  levels = 1:3,
  labels = c("Humanities", "Social Sciences", "Science")
)
print(facultyFactor)
print(head(facultyFactor))

# (24) Add the variable YEAR, holding the year 2023 for all participants.
# Head of the expected result:
#   ID GENDER AGE FACULTY   INCOME YEAR
# 1  1      1  22       3 20347.80 2023
# 2  2      2  26       2 11054.30 2023
# 3  3      1  26       2 16578.54 2023
# 4  4      1  23       1  8971.88 2023
# 5  5      2  24       3 20978.64 2023
# 6  6      1  27       1  8120.38 2023
# Base R: the single value is recycled over all rows.
stress$YEAR <- 2023
print(head(stress))
# dplyr: the same result with mutate(), starting from a fresh copy.
library(dplyr)
stress <- read.csv("stress.csv")
stress <- mutate(.data = stress, YEAR = 2023)

# (25) Add the variable ageGroup:
#      1 - age under 26
#      2 - age above 25 and less than 29
#      3 - age above 28
# Head of the expected result:
#   ID GENDER AGE FACULTY   INCOME ageGroup
# 1  1      1  22       3 20347.80        1
# 2  2      2  26       2 11054.30        2
# 3  3      1  26       2 16578.54        2
# 4  4      1  23       1  8971.88        1
# 5  5      2  24       3 10489.32        1
# 6  6      1  27       1  8120.38        2
# Manual version: start everyone in group 3, then overwrite the lower ranges.
stress <- read.csv("stress.csv")
stress$ageGroup <- 3
stress$ageGroup[stress$AGE < 29 & stress$AGE > 25] <- 2
stress$ageGroup[stress$AGE < 26] <- 1
print(head(stress))

# cut() version. right = FALSE makes the intervals [0, 26), [26, 29) and
# [29, 120), so that 26 falls in g2 and 29 in g3 as defined above. With the
# default right-closed intervals, 26 would land in g1 and 29 in g2.
stress <- read.csv("stress.csv")
stress$ageGroup <- cut(
  x = stress$AGE,
  breaks = c(0, 26, 29, 120),
  labels = c("g1", "g2", "g3"),
  right = FALSE
)
print(class(stress$ageGroup))

# (26) Produce the following factor:
#      name   - ageCat
#      levels - "A" for ages 22, 29
#             - "B" for ages 24 to 27
#             - "C" for all other ages
# Head of the expected result:
#   ID GENDER AGE FACULTY   INCOME ageCat
# 1  1      1  22       3 20347.80      A
# 2  2      2  26       2 11054.30      B
# 3  3      1  26       2 16578.54      B
# 4  4      1  23       1  8971.88      C
# 5  5      2  24       3 10489.32      B
# 6  6      1  27       1  8120.38      B
stress <- read.csv("stress.csv")
# Build the labels as a character vector first; "C" is the fallback category.
age_cat <- rep("C", nrow(stress))
age_cat[stress$AGE %in% c(22, 29)] <- "A"
age_cat[stress$AGE %in% 24:27] <- "B"
# The task asks for a factor, so convert explicitly with a fixed level order.
stress$ageCat <- factor(age_cat, levels = c("A", "B", "C"))
print(head(stress))

# (27) Sort as follows:
#      - GENDER  - ascending
#      - FACULTY - descending
#      - AGE     - ascending
# Head of the expected result:
#    ID GENDER AGE FACULTY   INCOME ageCat
# 1  72      1  20       3 18098.63      C
# 2  12      1  21       3 14728.57      C
# 3   1      1  22       3 20347.80      A
# 4  15      1  22       3 11380.81      A
# 5  64      1  22       3 14289.76      A
# 6  31      1  24       3 18298.45      B
# 7  66      1  25       3 15139.98      B
# 8  36      1  26       3 17104.95      B
# 9  70      1  26       3 14477.03      B
# 10 55      1  27       3 21017.69      B
# 11 25      1  28       3 17243.07      C
# 12 89      1  28       3 21156.43      C
# 13 20      1  20       2 14133.87      C
# 14 26      1  20       2 17345.99      C
# 15 52      1  20       2  8275.27      C
# 16 40      1  21       2 22925.07      C
# 17 56      1  21       2 17110.01      C
# 18 51      1  22       2 20266.76      A
# 19 53      1  22       2 11348.09      A
# 20 93      1  22       2 12781.95      A
stress <- arrange(.data = stress, GENDER, desc(FACULTY), AGE)
print(head(stress, n = 20))