#
# Unit 8 - data frames 
#
# Practice II
#

# Preliminary notes: 
# - Material 
# - Practice in writing functions and other previous material 
# - Somewhat more advanced in terms of complexity
# - Use studied material 
# - If possible, watch this in one sitting


# Set the working directory. The read.csv()/write.csv() calls below use
# relative file names, so they are resolved against this directory.
setwd("c:/R")

# (1) Write the function myTranspose. 
#      The function has three parameters: 
#      - df - a data frame
#      - colnames - a vector of column names in df
#      - rowInds - a range of row indices in df 
#      The function "slices" df such that 
#      it includes only the columns whose names
#      appear in colnames, and only the rows 
#      whose indices are included in rowInds, 
#      and then transposes
#      the resulting data frame, and returns 
#      the transposed result. 
# Exercise placeholder: the `?` marks where the body should be written and is
# not valid R on its own. A worked solution is defined further down.
myTranspose <- function(df, colnames, rowInds) {
  ? 
}


# Try myTranspose on rows 1-5 and columns 2-4 of the stress data.
stress <- read.csv("stress.csv")
print(head(stress)) 
# Use the names of columns 2 to 4 as the columns to keep.
myColnames <- names(stress)[2:4]
print(myColnames)
myRows <- 1:5
result <- myTranspose(stress, myColnames, myRows)
print(result)


# The untransposed slice, for comparison
# (presumably GENDER, AGE and FACULTY are columns 2-4 of stress.csv - verify).
print(stress[1:5,c("GENDER", "AGE", "FACULTY")])

# Slice a data frame by rows and columns, then transpose the slice.
#
# Parameters:
#   df       - a data frame
#   colnames - names of the columns to keep
#   rowInds  - indices of the rows to keep
# Returns:
#   The transposed slice. t() applied to a data frame yields a matrix with
#   one row per selected column and one column per selected row.
myTranspose <- function(df, colnames, rowInds) {
  # drop = FALSE keeps the slice two-dimensional even when only one column
  # is requested; otherwise the slice collapses to a plain vector and the
  # column name is lost in the transposed result.
  slicedDF <- df[rowInds, colnames, drop = FALSE]
  return (t(slicedDF))
}






# (2) Write the function colsWithVal. 
#    The function has two parameters: 
#    - df - a data frame 
#    - val - any value 
#    The function produces a vector of all indices
#    of columns which include the value val, and 
#    returns the result. 
# Exercise placeholder: `???` marks where the body should be written and is
# not valid R on its own. A worked solution is defined further down.
colsWithVal <- function(df, val) {
  ???
}
  
# Plant NA in two columns, then ask which columns contain NA.
stress <- read.csv("stress.csv")
stress$AGE[2] <- NA
stress$ID[4] <- NA
print(head(stress))
print(colsWithVal(stress, NA))

# Indexing with a single bracket and one index (stress[3]) keeps the
# data frame class rather than extracting the column itself.
print(class(stress[3]))


# Find the columns of a data frame that contain a given value.
#
# Parameters:
#   df  - a data frame
#   val - the value to look for (NA is supported, since %in% matches NA)
# Returns:
#   A vector with the indices of all columns that include val,
#   or NULL when no column includes it.
colsWithVal <- function(df, val) {
  inds <- c()
  # seq_len() yields an empty sequence for a data frame without columns,
  # whereas 1:ncol(df) would iterate over 1 and 0 and fail.
  for (ind in seq_len(ncol(df))) {
    if (val %in% df[, ind]) {
      inds <- c(inds, ind)
    }
  }
  return (inds)
}







# (3) write the function: filterByTwoVars. 
#    The function has three parameters, as follows: 
#    df - a data frame 
#    vars - a vector of names of two columns in df 
#    vals - a vector of two values 
#    The function returns a data frame including 
#    all rows in df where the value of the first column 
#    in vars equals the first value in vals, and the value of 
#    the second column in vars equals the second value in vals. 
# Exercise placeholder: `???` marks where the body should be written and is
# not valid R on its own. A worked solution is defined further down.
filterByTwoVars <- function(df, vars, vals) {
  ???
}

# Keep the rows where GENDER is 1 and AGE is 28.
stress <- read.csv("stress.csv")
result <- filterByTwoVars(stress, c("GENDER", "AGE"), c(1, 28))
print(result)

# The same filter written directly with a logical mask, for comparison.
print(stress[stress$GENDER == 1 & stress$AGE == 28,])

# Keep the rows of a data frame in which two columns equal two given values.
#
# Parameters:
#   df   - a data frame
#   vars - a vector identifying two columns of df (names or positions)
#   vals - a vector of two values, matched to vars by position
# Returns:
#   The rows of df where column vars[1] equals vals[1] and
#   column vars[2] equals vals[2].
filterByTwoVars <- function(df, vars, vals) {
  matches <- df[, vars[1]] == vals[1] & df[, vars[2]] == vals[2]
  # which() drops NA entries of the mask; indexing with an NA in a logical
  # mask would otherwise add rows consisting only of NA to the result.
  return (df[which(matches), ])
}



# What would this code print? 
stress <- read.csv("stress.csv")
print(head(stress))
# Here vars holds column positions (2 and 4) instead of column names.
result <- filterByTwoVars(stress, c(2, 4), c(1, 2))
print(result)



# (4) (a) Write the function isEqual. 
#        The function receives two data frames 
#        and an index row to each of these data frames
#        and checks if the rows are equal in terms  
#        of length and contents: if yes, it returns TRUE, 
#        and if no, it returns FALSE. 
# Exercise placeholder: `???` marks where the body should be written and is
# not valid R on its own. A worked solution is defined further down.
isEqual <- function(df1, ind1, df2, ind2) {
  ??? 
} 

# Two overlapping slices: row 3 of the first and row 1 of the second
# both come from row 3 of stress.
stress <- read.csv("stress.csv")
idAndGender1 <- stress[1:3, 1:2]
print(idAndGender1)
idAndGender2 <- stress[3:5, 1:2]
print(idAndGender2)
print(isEqual(idAndGender1, 3, idAndGender2, 1))


# Look at the two rows, and at what == gives when applied to whole rows.
print(idAndGender1[3,])
print(idAndGender2[1,])
print(idAndGender1[3,] == idAndGender2[1,])

# == on vectors compares element by element and returns a logical vector.
print(c(1, 2) == c(1, 2))


# Check whether a row of one data frame equals a row of another.
#
# Parameters:
#   df1, df2   - the two data frames
#   ind1, ind2 - the row index to compare in df1 and in df2, respectively
# Returns:
#   TRUE when both data frames have the same number of columns and the two
#   rows hold equal values in every column; FALSE otherwise. Two missing
#   values (NA) in the same column count as equal.
isEqual <- function(df1, ind1, df2, ind2) {
  if (ncol(df1) != ncol(df2)) {
    return (FALSE)
  }
  # seq_len() gives an empty loop when there are no columns;
  # 1:ncol(df1) would iterate over 1 and 0 in that case.
  for (ind in seq_len(ncol(df1))) {
    left <- df1[ind1, ind]
    right <- df2[ind2, ind]
    # Comparing with NA yields NA, which if() cannot evaluate,
    # so missing values are handled explicitly.
    if (is.na(left) || is.na(right)) {
      if (!(is.na(left) && is.na(right))) {
        return (FALSE)
      }
    } else if (left != right) {
      return (FALSE)
    }
  }
  return (TRUE)
} 


# (b) Write the function rowInDF. 
#        The function receives two data frames 
#        and a row index into the second data frame. 
#        It checks if the first data frame includes a 
#        row whose contents are equal to the contents 
#        of the given row in the second data frame: 
#        if yes, it returns TRUE, 
#        and if no, it returns FALSE. 
# Exercise placeholder: `???` marks where the body should be written and is
# not valid R on its own. A worked solution is defined further down.
rowInDF <- function(df1, df2, df2Ind) {
    ???
} 

# Row 1 of idAndGender2 is row 3 of stress, which idAndGender1 also holds;
# row 2 of idAndGender2 is row 4 of stress, which idAndGender1 does not take.
stress <- read.csv("stress.csv")
idAndGender1 <- stress[1:3, 1:2]
print(idAndGender1)
idAndGender2 <- stress[3:5, 1:2]
print(idAndGender2)
print(rowInDF(idAndGender1, idAndGender2, 1))
print(rowInDF(idAndGender1, idAndGender2, 2))



# Check whether a data frame contains a row equal to a given row of another.
#
# Parameters:
#   df1    - the data frame to search in
#   df2    - the data frame holding the row to look for
#   df2Ind - the index of that row in df2
# Returns:
#   TRUE when some row of df1 equals row df2Ind of df2 (as decided by
#   isEqual); FALSE otherwise, including when df1 has no rows.
rowInDF <- function(df1, df2, df2Ind) {
  # seq_len() gives an empty loop for a data frame without rows;
  # 1:nrow(df1) would visit the non-existent rows 1 and 0 instead.
  for (i in seq_len(nrow(df1))) {
    if (isEqual(df1, i, df2, df2Ind)) {
      return (TRUE)
    }
  }
  return (FALSE)
} 



# What happens when the data frame being searched is empty?
stress <- read.csv("stress.csv")
idAndGender1 <- stress[1:3, 1:2]
print(idAndGender1)
# data.frame() with no arguments creates a data frame with no rows or columns.
newDF <- data.frame() 
print(newDF)
print(newDF[1,])

print(rowInDF(newDF, idAndGender1, 1))




#     (c) Write the function withoutDuplicates. 
#         The function has one parameter, df, a data frame. 
#         The function produces a new data frame 
#         in which no two rows have the same content. 
# Exercise placeholder: `???` marks where the body should be written and is
# not valid R on its own. Worked solutions are defined further down.
withoutDuplicates <- function(df) {
  ???
}

# Build a small data frame and append two extra rows to it with rbind(),
# then remove duplicate rows.
# NOTE(review): the appended rows are presumably meant to repeat existing
# rows of stress.csv - verify against the data.
stress <- read.csv("stress.csv")
idAndGender <- stress[1:3, 1:2]
print(idAndGender)
idAndGender <- rbind(idAndGender, c(3, 1))
print(idAndGender)
idAndGender <- rbind(idAndGender, c(1, 1))
print(idAndGender)
result <- withoutDuplicates(idAndGender)
print(result)

# Remove duplicate rows from a data frame (version 1: start from row 1).
#
# Parameters:
#   df - a data frame
# Returns:
#   A data frame holding the first occurrence of every distinct row of df,
#   in the original order. An empty df is returned unchanged.
withoutDuplicates <- function(df) {
  # Without this guard df[1, ] would produce a row of NA for an empty input.
  if (nrow(df) == 0) {
    return (df)
  }
  # drop = FALSE keeps one-column data frames from collapsing to a vector.
  noDuplicatesDF <- df[1, , drop = FALSE]
  if (nrow(df) > 1) {
    for (i in 2:nrow(df)) {
      # Append row i only if no equal row has been kept so far.
      if (!rowInDF(noDuplicatesDF, df, i)) {
        noDuplicatesDF <- rbind(noDuplicatesDF, df[i, , drop = FALSE])
      }
    }
  }
  return (noDuplicatesDF)
}



# Remove duplicate rows from a data frame (version 2: start from an
# empty data frame, so the first row needs no special treatment).
#
# Parameters:
#   df - a data frame
# Returns:
#   A data frame holding the first occurrence of every distinct row of df,
#   in the original order. An empty df is returned unchanged.
withoutDuplicates <- function(df) {
  if (nrow(df) == 0) {
    return (df)
  }
  noDuplicatesDF <- data.frame() 
  # seq_len() is safe for any row count, unlike 1:nrow(df).
  for (i in seq_len(nrow(df))) {
    if (!rowInDF(noDuplicatesDF, df, i)) {
      # drop = FALSE keeps one-column data frames from collapsing to a vector.
      noDuplicatesDF <- rbind(noDuplicatesDF, df[i, , drop = FALSE])
    }
  }
  return (noDuplicatesDF)
}


# (5) (a) Read the data frame stressWide. 
#      Then turn it into long format. 
#       Cond1 - black and white, happy, 
#       cond2 - black and white, sad
#       cond3 - black and white, neutral
#       Cond4 - color, happy, 
#       cond5 - color, sad
#       cond6 - color, neutral
# Load the wide-format data and look at its first rows.
stressWide <- read.csv("stressWide.csv")
print(head(stressWide))

# Expected result 
#   ID  Kind    Mood Stress
# 1  1    bw   happy  47.04
# 2  1    bw     sad  33.60
# 3  1    bw neutral  95.20
# 4  1 color   happy  81.19
# 5  1 color     sad  70.90
# 6  1 color neutral   6.92



# Solution: rename the measurement columns, then pivot to long format.
library(tidyr)
stressWide <- read.csv("stressWide.csv")
print(head(stressWide))
# Rename the columns from position 6 onward to <Kind>_<Mood>, so that both
# factor values can later be split out of the column name.
newColNames <- c("bw_happy","bw_sad","bw_neutral","color_happy","color_sad","color_neutral")
names(stressWide)[6:ncol(stressWide)] <- newColNames
print(names(stressWide))
# Keep only column 1 (presumably the ID - verify) and the renamed columns.
stressWide <- stressWide[c(1, 6:ncol(stressWide))]
print(head(stressWide))
# names_pattern has two capture groups, one per entry of names_to: the part
# before the underscore becomes Kind, the part after it becomes Mood.
# data.frame() converts the pivot_longer() result into a plain data frame.
stressLong <- data.frame(pivot_longer(stressWide, 
                                      cols = bw_happy:color_neutral,
                                      names_to = c("Kind", "Mood"),
                                      names_pattern = "(.*)_(.*)",
                                      values_to = "Stress"))
print(head(stressLong, n = 14))


# (b) Write the function wideToLongMeans. 
#          The function has the following parameters: 
#          df - a data frame in wide format, two "factors" 
#          colnums - a range of columns of the measured data
#          varnames - a vector of names for 
#                     the new "factor" variables
#          varname - the name for 
#                    the new measured data variable
#          The function converts df into a long format, 
#          using the arguments provided to the other parameters, 
#          and then "shrinks" the long format data frame 
#          such that it does not include the ID variables, 
#          and each pair of "factor" values appears in 
#          one row only with the mean of the measured data. 
#          You may use the function mean(). 
#          Do not use other built-in functions. 
# Exercise placeholder for wideToLongMeans; a worked solution is defined
# further down. The explicit body matters: after a bare function header R
# would take the next statement in the file as the function body.
wideToLongMeans <- function(df, colnums, varnames, varname) {
  stop("wideToLongMeans is not implemented yet")
}

# Prepare the wide data and call wideToLongMeans on it.
stressWide <- read.csv("stressWide.csv")
newColNames <- c("bw_happy","bw_sad","bw_neutral","color_happy","color_sad","color_neutral")
names(stressWide)[6:ncol(stressWide)] <- newColNames
stressWide <- stressWide[c(1, 6:ncol(stressWide))]
print(head(stressWide))
# After the trimming above, the measurement columns are columns 2 to 7.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
meansDF <- wideToLongMeans(stressWide, myColnums, myVarnames, "Stress")
print(meansDF)


# Let us begin from the "middle", namely suppose that 
# we have the long-format data frame. 
# Now follow these middle steps. 


# (1) Produce separate data frames for each pair 
#     of values of the factor variables, e.g.
#      ID  Kind    Mood Stress
# 6     1 color neutral   6.92
# 12    2 color neutral  37.65
# 18    3 color neutral  96.16
#
#       ID  Kind Mood Stress
# 5     1 color  sad  70.90
# 11    2 color  sad  83.50
# 17    3 color  sad  32.66
  
# Step 1: print the factor columns of the first length(myColnums) rows of
# stressLong, one row at a time.
# NOTE(review): this presumably relies on the long data listing every
# (Kind, Mood) pair once in its first rows - confirm.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
for (i in 1:length(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  print(pairAsDF)
}

print(head(stressLong, n = 8))


# Step 2: for each pair, keep the rows of stressLong that carry that pair.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
for (i in 1:length(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  # Turn the one-row data frame into a plain vector of the two values.
  currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
  # Note this code: does it remind you of something? 
  filteredDF <- stressLong [(stressLong$Kind == currentPair[1] & 
                             stressLong$Mood == currentPair[2]),]
  print(filteredDF)
}

# Keep the rows of a data frame in which two columns equal two given values.
#
# Parameters:
#   df   - a data frame
#   vars - a vector identifying two columns of df (names or positions)
#   vals - a vector of two values, matched to vars by position
# Returns:
#   The rows of df where column vars[1] equals vals[1] and
#   column vars[2] equals vals[2].
filterByTwoVars <- function(df, vars, vals) {
  matches <- df[, vars[1]] == vals[1] & df[, vars[2]] == vals[2]
  # which() drops NA entries of the mask; indexing with an NA in a logical
  # mask would otherwise add rows consisting only of NA to the result.
  return (df[which(matches), ])
}

# Step 3: the same loop, with the filtering delegated to filterByTwoVars.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
for (i in seq_along(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
  # Filter by the factor column names. Passing myColnums here would only work
  # by coincidence, because positions 2 and 3 of stressLong happen to be the
  # Kind and Mood columns.
  filteredDF <- filterByTwoVars(stressLong, myVarnames, currentPair) 
  print(filteredDF)
}

# Reminder of how mean() works on a numeric vector.
print(mean(c(2, 6, 1)))

# Then revise the code so as it produces the 
# requested data frame, not in a function 

















# Step 4: collect the mean Stress of every (Kind, Mood) pair in a data frame.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
# One row per pair, three columns: the two factor values and the mean.
meansDF <- data.frame(matrix(NA, ncol=3, nrow = length(myColnums))  )
print(meansDF)
for (i in seq_along(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
  # Filter by the factor column names. Passing myColnums here would only work
  # by coincidence, because positions 2 and 3 of stressLong happen to be the
  # Kind and Mood columns.
  filteredDF <- filterByTwoVars(stressLong, myVarnames, currentPair)   
  pairMean <- mean(filteredDF$Stress) 
  meansDF[i,1:2] <- currentPair
  meansDF[i,3] <- pairMean
}
print(meansDF)
names(meansDF) <- c(myVarnames, "Mean")
print(meansDF)


# Finally, write the function. 
















# Convert a wide-format data frame with two "factors" to long format and
# reduce it to one row per pair of factor values, holding the mean of the
# measured data for that pair.
#
# Parameters:
#   df           - a data frame in wide format
#   colnums      - the positions of the measured-data columns in df
#   varnames     - a vector with the names of the two new factor variables
#   varname      - the name of the new measured-data variable
#   namesPattern - regular expression with two capture groups that splits a
#                  measured column name into the two factor values;
#                  defaults to "<first>_<second>"
# Returns:
#   A data frame with length(colnums) rows and three columns: the two factor
#   variables (named after varnames) and "Mean".
# Relies on filterByTwoVars, defined earlier in this file, and on the tidyr
# package being installed.
wideToLongMeans <- function(df, colnums, varnames, varname,
                            namesPattern = "(.*)_(.*)") {
  # Pivot the data frame that was passed in, not a global one.
  dfLong <- data.frame(tidyr::pivot_longer(df,
                                           cols = tidyr::all_of(colnums),
                                           names_to = varnames,
                                           names_pattern = namesPattern,
                                           values_to = varname))
  meansDF <- data.frame(matrix(NA, ncol = 3, nrow = length(colnums)))
  # The first length(colnums) rows of dfLong come from the first row of df,
  # one per measured column, so they cover each pair of factor values once.
  for (i in seq_along(colnums)) {
    pairAsDF <- dfLong[i, varnames]
    currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
    # Filter the long data built above, not a global long data frame.
    filteredDF <- filterByTwoVars(dfLong, varnames, currentPair)
    pairMean <- mean(filteredDF[, varname])
    meansDF[i, 1:2] <- currentPair
    meansDF[i, 3] <- pairMean
  }
  names(meansDF) <- c(varnames, "Mean")
  return (meansDF)
}

# Prepare the wide data once more and run the finished function on it.
stressWide <- read.csv("stressWide.csv")
newColNames <- c("bw_happy","bw_sad","bw_neutral","color_happy","color_sad","color_neutral")
names(stressWide)[6:ncol(stressWide)] <- newColNames
stressWide <- stressWide[c(1, 6:ncol(stressWide))]
print(head(stressWide))
# After the trimming above, the measurement columns are columns 2 to 7.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
meansDF <- wideToLongMeans(stressWide, myColnums, myVarnames, "Stress")
print(meansDF)


# (6) Save the latter produced data frame 
#     in a .csv file. 
# First attempt: write.csv() also writes the row names by default, so the
# file read back has an extra leading column.
write.csv(meansDF, file = "meansDF.csv")
recentMeansDF <- read.csv("meansDF.csv")
print(recentMeansDF)

# Second attempt: row.names = FALSE leaves the row names out of the file.
write.csv(meansDF, file = "meansDF.csv", row.names = FALSE)
recentMeansDF <- read.csv("meansDF.csv")
print(recentMeansDF)