#
# Unit 8 - data frames 
#
# Practice - Part I 
#

# (0) load the file stress.csv into a data frame 
# Show the current working directory, then switch to the course folder.
print(getwd())
# NOTE(review): hard-coded, machine-specific path. It is kept because the
# relative read.csv("stress.csv") calls in this script resolve against it.
setwd("c:/R")
stress <- read.csv("stress.csv")
print(class(stress))
# View() opens a data viewer, which is only useful in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(stress)
}

# (1) Print column names 
# expected result: 
# [1] "ID"      "GENDER"  "AGE"     "FACULTY" "INCOME"

# The column names of the data frame, as a character vector.
print(colnames(stress))

# (2) Print stress, first three rows. 
# Expected result 
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3 20347.80
# 2  2      2  26       2 11054.30
# 3  3      1  26       2 16578.54

# head() with a count of 3 keeps only the top three rows.
print(head(stress, 3))

# (3) How many rows? How many columns?
# Expected result: 
# [1] 100   5

# Row count first, column count second.
print(c(nrow(stress), ncol(stress)))

# (4) Print GENDER column 
#     Expected result: 
# [1] 1 2 1 1 2 1 2 2 1 2 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 2 2 2 1 1 2 2 2 1 1
# [37] 1 2 2 1 2 2 1 2 1 2 2 2 2 2 1 1 1 2 1 1 2 2 1 2 2 2 2 1 2 1 2 1 1 1 2 1
# [73] 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 1 2 1 2 1 1 1 2 2 1 2 2

# Extract the GENDER column as a plain vector and print it.
print(stress[["GENDER"]])

# (5) Produce a data frame by combining 
#    GENDER column with the vector 
#    1000, 1001, 1002, ... 
#    Use the function data.frame() 
#    Head of the expected result
#   X1000..1000...length.stress.GENDER....1. stress.GENDER
# 1                                     1000             1
# 2                                     1001             2
# 3                                     1002             1

# The first argument builds the ID vector inline: it runs from 1000 up to
# 1000 + (number of GENDER values) - 1, i.e. one ID per row of stress.
# Both arguments are unnamed expressions, so data.frame() derives the
# column names from the expression text; the next exercise replaces them.
newDF <- data.frame(1000:(1000+length(stress$GENDER)-1), stress$GENDER)
print(head(newDF, n = 3))

# (6) Provide proper names to the columns 
#    of the new data frame. 
#    Head of the expected result: 
#     ID GENDER
# 1 1000      1
# 2 1001      2
# 3 1002      1

# Overwrite the auto-generated column names with meaningful ones.
colnames(newDF) <- c("ID", "GENDER")
print(head(newDF, 3))

# (7) Produce the data frame again, now using cbind(). 
#     Head of the expected result:
#     X1 X2
# 1 1000  1
# 2 1001  2
# 3 1002  1

# cbind() on two plain vectors produces a matrix. Neither argument is a
# bare variable name, so the matrix columns stay unnamed.
binded <- cbind(
  1000:(1000 + length(stress$GENDER) - 1),
  stress[["GENDER"]]
)
print(head(binded, 3))

# Converting the unnamed matrix with data.frame() yields columns X1 and X2.
newDF <- data.frame(binded)
print(head(newDF, 3))

# (8) Rename the columns of the data frame 
#     to "ID" and "GENDER". 
#     Do not change the original data frame. 
#     Head of the expected result:
#     ID GENDER
# 1 1000      1
# 2 1001      2
# 3 1002      1

library(dplyr)
# rename() takes new_name = old_name pairs and returns a modified copy,
# which is assigned back to newDF.
newDF <- newDF %>%
  rename(ID = X1, GENDER = X2)
print(head(newDF, 3))

# (9) Replace row indices of the 
#     original stress data frame
#     with the vector 1000, 1001, .... 
#     Head of the Expected result. 
#      ID GENDER AGE FACULTY   INCOME
# 1000  1      1  22       3 20347.80
# 1001  2      2  26       2 11054.30
# 1002  3      1  26       2 16578.54

# Label the rows 1000, 1001, ... (one label per row) and show the result.
row.names(stress) <- 1000:(1000 + nrow(stress) - 1)
print(head(stress, 3))

# Assigning NULL discards the custom labels and restores 1, 2, 3, ...
row.names(stress) <- NULL
print(head(stress, 3))
                          

# (10) Print the INCOME of the student in the 
#    second row. 
#    Expected result: 
# [1] 11054.3

# Show the first rows for orientation.
print(head(stress, n = 5))

# Read INCOME of the second row: by column position, then by column name.
print(stress[2, 5])
print(stress[2, "INCOME"])

# Column names are case-sensitive: "income" does not exist, so this lookup
# raises an error. Catch it and print the message so the demonstration
# does not abort the rest of the script.
tryCatch(
  print(stress[2, "income"]),
  error = function(e) print(conditionMessage(e))
)

# A one-element character vector selects the column just like the string.
print(stress[2, c("INCOME")])

# (11) Produce a data frame including 
#      GENDER, FACULTY and INCOME 
#      for rows 25-50. 
#      Head of the expected result: 
#    GENDER FACULTY   INCOME
# 25      1       3 17243.07
# 26      1       2 17345.99
# 27      2       2 14265.55
# 28      2       3 17976.83
# 29      2       3 20000.15

# Base R: rows 25 to 50 of the three requested columns in one indexing step.
newDF <- stress[25:50, c("GENDER", "FACULTY", "INCOME")]
print(head(newDF, 5))

# dplyr: pick the columns with select(), then slice out the same rows.
newDF <- select(stress, GENDER, FACULTY, INCOME)[25:50, ]


# (12) Produce a data frame including 
#      GENDER, FACULTY and INCOME 
#      from the 75th row to the last. 
#      Head of the expected result: 
#    GENDER FACULTY   INCOME
# 75      2       3 10187.99
# 76      2       2 20605.09
# 77      2       1 22896.27
# 78      2       1 19258.29
# 79      2       1 17052.33

# Keep GENDER plus the column range FACULTY through INCOME.
newDF <- select(.data = stress,
                GENDER, FACULTY:INCOME)
# A negative n tells tail() to drop the first 74 rows, i.e. keep row 75
# through the last row. Unlike a fixed n = 26, this does not assume the
# table has exactly 100 rows.
newDF <- tail(newDF, n = -74)
print(head(newDF, n = 5))


# (13) On the basis of the stress data frame, 
#      Produce 3 samples of size 7 with replacement. 
#      Possible result 
#      ID GENDER AGE FACULTY   INCOME
# 11   11      1  26       2  8157.84
# 12   12      1  21       3 14728.57
# 72   72      1  20       3 18098.63
# 31   31      1  24       3 18298.45
# 31.1 31      1  24       3 18298.45
# 89   89      1  28       3 21156.43
# 88   88      1  24       1 20233.58
#    ID GENDER AGE FACULTY   INCOME
# 28 28      2  28       3 17976.83
# 74 74      2  25       2 19965.21
# 79 79      2  22       1 17052.33
# 69 69      1  22       1 14906.47
# 17 17      1  25       1  8867.20
# 11 11      1  26       2  8157.84
# 35 35      1  24       1 18911.21
#    ID GENDER AGE FACULTY   INCOME
# 73 73      2  22       3 18960.88
# 16 16      1  24       1 11815.51
# 4   4      1  23       1  8971.88
# 59 59      1  23       2 18109.81
# 19 19      2  21       1 16502.91
# 20 20      1  20       2 14133.87
# 98 98      1  24       2 15141.64

# Draw three samples of 7 rows each, with replacement, and print each one.
for (i in seq_len(3)) {
  # seq_len() rather than 1:nrow(): for an empty data frame 1:0 would
  # offer the row numbers 1 and 0 to sample(), silently producing NA rows.
  inds <- sample(seq_len(nrow(stress)),
                 size = 7,
                 replace = TRUE)
  # A row drawn more than once gets a suffixed row name such as 31.1.
  sampleDF <- stress[inds, ]
  print(sampleDF)
}

# (14) How many female students (GENDER == 2)? 
#      Answer: 57
# Summing a logical vector counts its TRUE values. na.rm = TRUE leaves out
# rows with a missing GENDER, which subset-then-length() would count as hits.
print(sum(stress$GENDER == 2, na.rm = TRUE))


# (15) How many students study in 
#      FACULTY 2 and are over 25 years old?
#      Answer: 8 

# Count rows meeting both conditions. na.rm = TRUE leaves out rows where
# the combined condition is NA, which subset-then-length() would count.
print(sum(stress$FACULTY == 2 & stress$AGE > 25, na.rm = TRUE))


# (16) Produce a data frame including the information
#      about all students in this group. 
#      Expected result: 
#    ID GENDER AGE FACULTY   INCOME
# 2   2      2  26       2 11054.30
# 3   3      1  26       2 16578.54
# 11 11      1  26       2  8157.84
# 27 27      2  26       2 14265.55
# 47 47      2  28       2 12733.87
# 61 61      2  26       2 19241.29
# 68 68      1  26       2 21300.50
# 76 76      2  28       2 20605.09


# Base R: which() converts the logical mask to row positions and drops NA
# entries, so rows with a missing FACULTY or AGE do not appear as all-NA rows.
newDF <- stress[which(stress$FACULTY == 2 & stress$AGE > 25), ]

# dplyr: inside filter() columns are referenced by bare name; repeating
# the data frame with stress$ is unnecessary and breaks on grouped data.
newDF <- filter(.data = stress, FACULTY == 2 & AGE > 25)



# (17) How many students DO NOT belong to 
#    the aforementioned group? 
#    Answer: 92
# Everyone outside the group: total rows minus the rows inside the group.
print(nrow(stress) - sum(stress$FACULTY == 2 & stress$AGE > 25, na.rm = TRUE))


# (18) Change the age of the first student, 
#    to NA. 
# Reload the file so changes made by earlier exercises do not carry over.
stress <- read.csv("stress.csv")
print(head(stress))
# Blank out AGE in the first row, then show just that row.
stress$AGE[1] <- NA
print(head(stress, 1))


# (19) Replace ID with 1000:1099. 
#      Head of the expected result. 
#     ID GENDER AGE FACULTY   INCOME
# 1 1000      1  22       3 20347.80
# 2 1001      2  26       2 11054.30
# 3 1002      1  26       2 16578.54
# 4 1003      1  23       1  8971.88
# 5 1004      2  24       3 10489.32
# 6 1005      1  27       1  8120.38


# Reload the original data.
stress <- read.csv("stress.csv")
# IDs 1000, 1001, ... with exactly one per row. Deriving the length from
# nrow() avoids the hard-coded 1000:1099, which fails unless the file has
# exactly 100 rows.
stress$ID <- seq_len(nrow(stress)) + 999L
print(head(stress))



# (20) Change the INCOME of all students under 26
#     to NA. 
#     Head of the expected result: 
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3       NA
# 2  2      2  26       2 11054.30
# 3  3      1  26       2 16578.54
# 4  4      1  23       1       NA
# 5  5      2  24       3       NA
# 6  6      1  27       1  8120.38

# Reload the original data.
stress <- read.csv("stress.csv")
# replace() returns INCOME with NA written wherever AGE is below 26.
stress$INCOME <- replace(stress$INCOME, stress$AGE < 26, NA)
print(head(stress, 6))


# (21) Double the income for female students. 
#      Head of the expected result: 
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3 20347.80
# 2  2      2  26       2 22108.60
# 3  3      1  26       2 16578.54
# 4  4      1  23       1  8971.88
# 5  5      2  24       3 20978.64
# 6  6      1  27       1  8120.38

# Reload the original data.
stress <- read.csv("stress.csv")
# Double INCOME explicitly for female students (GENDER == 2). Multiplying
# INCOME by the GENDER code gives the same numbers only because the codes
# happen to be 1 and 2, and it turns INCOME into NA where GENDER is missing.
female_rows <- which(stress$GENDER == 2)
stress$INCOME[female_rows] <- stress$INCOME[female_rows] * 2
print(head(stress))

# (22) Do the same only for students under 26. 
#      Head of the expected result: 
#   ID GENDER AGE FACULTY   INCOME
# 1  1      1  22       3 20347.80
# 2  2      2  26       2 11054.30
# 3  3      1  26       2 16578.54
# 4  4      1  23       1  8971.88
# 5  5      2  24       3 20978.64
# 6  6      1  27       1  8120.38

# Reload the original data.
stress <- read.csv("stress.csv")
# Double INCOME only for female students (GENDER == 2) younger than 26.
# which() keeps just the rows where the condition is TRUE, so a missing AGE
# or GENDER leaves the row untouched instead of breaking the assignment.
young_female_rows <- which(stress$AGE < 26 & stress$GENDER == 2)
stress$INCOME[young_female_rows] <- stress$INCOME[young_female_rows] * 2
print(head(stress))

# (23) Factorize FACULTY as follows: 
#      1 - Humanities 
#      2 - Social Sciences 
#      3 - Science 
#     Head of the expected result: 
# [1] Science          Social Sciences Social Sciences Humanities      
# [5] Science          Humanities 


# Map the numeric FACULTY codes 1, 2, 3 to descriptive labels, in that order.
facultyFactor <- factor(
  stress[["FACULTY"]],
  levels = seq_len(3),
  labels = c("Humanities", "Social Sciences", "Science")
)
# Print the whole factor first, then only its first six values.
print(facultyFactor)
print(head(facultyFactor, 6))



# (24) Add the variable YEAR, 
#      holding the year 2023 for all participants 
#      Head of the expected result: 
#   ID GENDER AGE FACULTY   INCOME YEAR
# 1  1      1  22       3 20347.80 2023
# 2  2      2  26       2 11054.30 2023
# 3  3      1  26       2 16578.54 2023
# 4  4      1  23       1  8971.88 2023
# 5  5      2  24       3 20978.64 2023
# 6  6      1  27       1  8120.38 2023



# Base R: a single value assigned to a new column is repeated for every row.
stress[["YEAR"]] <- 2023
print(head(stress, 6))

# dplyr: reload the data and add the same constant column with mutate().
library(dplyr)
stress <- read.csv("stress.csv") %>%
  mutate(YEAR = 2023)

# (25) Add the variable ageGroup: 
#      1 - age under 26 
#      2 - age above 25 and less than 29 
#      3 - age above 28 
#     Head of the expected result: 
#   ID GENDER AGE FACULTY   INCOME ageGroup
# 1  1      1  22       3 20347.80        1
# 2  2      2  26       2 11054.30        2
# 3  3      1  26       2 16578.54        2
# 4  4      1  23       1  8971.88        1
# 5  5      2  24       3 10489.32        1
# 6  6      1  27       1  8120.38        2

# Solution 1: start everyone in group 3, then overwrite the two younger
# groups using logical masks on AGE.
stress <- read.csv("stress.csv")
stress$ageGroup <- 3
stress$ageGroup[stress$AGE < 29 & stress$AGE > 25] <- 2
stress$ageGroup[stress$AGE < 26] <- 1
print(head(stress))

# Solution 2: cut() bins AGE into a factor. right = FALSE makes the
# intervals closed on the left: [0, 26), [26, 29), [29, 120). With the
# default right = TRUE, age 26 would fall into g1 and age 29 into g2,
# contradicting the group definitions above.
stress <- read.csv("stress.csv")
stress$ageGroup <- cut(x = stress$AGE,
                       breaks = c(0, 26, 29, 120),
                       right = FALSE,
                       labels = c("g1", "g2", "g3"))

# Wrapped in print() so the class is also shown when the script is sourced.
print(class(stress$ageGroup))



# (26) Produce the following factor:
#      name - ageCat
#      levels - "A" for ages 22, 29
#             - "B" for ages 24 to 27 
#             - "C" for all other ages 
#      Head of the expected result: 
#   ID GENDER AGE FACULTY   INCOME ageCat
# 1  1      1  22       3 20347.80      A
# 2  2      2  26       2 11054.30      B
# 3  3      1  26       2 16578.54      B
# 4  4      1  23       1  8971.88      C
# 5  5      2  24       3 10489.32      B
# 6  6      1  27       1  8120.38      B

# Reload the original data.
stress <- read.csv("stress.csv")
# Default every row to "C", then overwrite the two explicit age sets.
stress$ageCat <- "C"
stress$ageCat[stress$AGE %in% c(22, 29)] <- "A"
stress$ageCat[stress$AGE %in% 24:27] <- "B"
# The exercise asks for a factor; the assignments above only create a
# character column, so convert it with an explicit level order.
stress$ageCat <- factor(stress$ageCat, levels = c("A", "B", "C"))
print(head(stress))


# (27) Sort as follows: 
#      - GENDER - ascending 
#      - FACULTY - descending 
#      - AGE - ascending 
#      Head of the expected result: 
#    ID GENDER AGE FACULTY   INCOME ageCat
# 1  72      1  20       3 18098.63      C
# 2  12      1  21       3 14728.57      C
# 3   1      1  22       3 20347.80      A
# 4  15      1  22       3 11380.81      A
# 5  64      1  22       3 14289.76      A
# 6  31      1  24       3 18298.45      B
# 7  66      1  25       3 15139.98      B
# 8  36      1  26       3 17104.95      B
# 9  70      1  26       3 14477.03      B
# 10 55      1  27       3 21017.69      B
# 11 25      1  28       3 17243.07      C
# 12 89      1  28       3 21156.43      C
# 13 20      1  20       2 14133.87      C
# 14 26      1  20       2 17345.99      C
# 15 52      1  20       2  8275.27      C
# 16 40      1  21       2 22925.07      C
# 17 56      1  21       2 17110.01      C
# 18 51      1  22       2 20266.76      A
# 19 53      1  22       2 11348.09      A
# 20 93      1  22       2 12781.95      A

# Sort by GENDER ascending, then FACULTY descending (via desc()), then AGE
# ascending.
stress <- arrange(.data = stress,
                  GENDER, desc(FACULTY), AGE)
# Wrapped in print() like the other exercises: a bare head() call shows
# nothing when the script is run with source().
print(head(stress, n = 20))