#
# Unit 8 - data frames
#
# Practice II
#
# Preliminary notes:
# - Material
#   - Practice in writing functions and other previous material
#   - Somewhat more advanced in terms of complexity
# - Use studied material
# - If possible, watch this in one stroke

setwd("c:/R")

# (1) Write the function myTranspose.
# The function has three parameters:
# - df - a data frame
# - colnames - a vector of column names in df
# - rowInds - a range of row indices in df
# The function "slices" df such that it includes only the columns whose
# names appear in colnames, and only the rows whose indices are included
# in rowInds, then transposes the resulting slice and returns the result.

# Exercise stub: the body is intentionally left open (returns NULL as is).
myTranspose <- function(df, colnames, rowInds) {
  # ?
}

stress <- read.csv("stress.csv")
print(head(stress))
myColnames <- names(stress)[2:4]
print(myColnames)
myRows <- 1:5
result <- myTranspose(stress, myColnames, myRows)
print(result)

print(stress[1:5, c("GENDER", "AGE", "FACULTY")])

# Solution.
# Returns the transpose of the slice df[rowInds, colnames]. t() applied to a
# data frame yields a matrix with one row per selected column.
myTranspose <- function(df, colnames, rowInds) {
  # drop = FALSE keeps the slice a data frame even when a single column is
  # selected, so the transposed row is still labelled with the column name.
  slicedDF <- df[rowInds, colnames, drop = FALSE]
  t(slicedDF)
}

# (2) Write the function colsWithVal.
# The function has two parameters:
# - df - a data frame
# - val - any value
# The function produces a vector of all indices
# of columns which include the value val, and
# returns the result.

# Exercise stub: the body is intentionally left open (returns NULL as is).
colsWithVal <- function(df, val) {
  # ???
}

stress <- read.csv("stress.csv")
stress$AGE[2] <- NA
stress$ID[4] <- NA
print(head(stress))
print(colsWithVal(stress, NA))

print(class(stress[3]))

# Solution.
# Returns the indices of the columns of df that contain val
# (NULL when no column contains it). %in% also matches NA values.
colsWithVal <- function(df, val) {
  inds <- c()
  # seq_len() gives an empty sequence for a data frame without columns,
  # whereas 1:ncol(df) would iterate over c(1, 0).
  for (ind in seq_len(ncol(df))) {
    if (val %in% df[[ind]]) {
      inds <- c(inds, ind)
    }
  }
  inds
}

# (3) Write the function filterByTwoVars.
# The function has three parameters, as follows:
# df - a data frame
# vars - a vector of names of two columns in df
# vals - a vector of two values
# The function returns a data frame including
# all rows in df where the value of the first column
# in vars equals the first value in vals, and the value of
# the second column in vars equals the second value in vals.

# Exercise stub: the body is intentionally left open (returns NULL as is).
filterByTwoVars <- function(df, vars, vals) {
  # ???
}

stress <- read.csv("stress.csv")
result <- filterByTwoVars(stress, c("GENDER", "AGE"), c(1, 28))
print(result)

print(stress[stress$GENDER == 1 & stress$AGE == 28, ])

# Solution.
# vars may hold column names or column positions; vals holds the value
# required in the first and in the second of these columns respectively.
filterByTwoVars <- function(df, vars, vals) {
  matches <- df[[vars[1]]] == vals[1] & df[[vars[2]]] == vals[2]
  # which() drops comparisons that evaluate to NA, so rows with a missing
  # value in a compared column are excluded instead of appearing as all-NA
  # rows; drop = FALSE keeps the result a data frame.
  df[which(matches), , drop = FALSE]
}

# What would this code print?
stress <- read.csv("stress.csv")
print(head(stress))
result <- filterByTwoVars(stress, c(2, 4), c(1, 2))
print(result)

# (4) (a) Write the function isEqual.
# The function receives two data frames
# and a row index into each of these data frames,
# and checks if the rows are equal in terms
# of length and contents: if yes, it returns TRUE,
# and if no, it returns FALSE.

# Exercise stub: the body is intentionally left open (returns NULL as is).
isEqual <- function(df1, ind1, df2, ind2) {
  # ???
}

stress <- read.csv("stress.csv")
idAndGender1 <- stress[1:3, 1:2]
print(idAndGender1)
idAndGender2 <- stress[3:5, 1:2]
print(idAndGender2)
print(isEqual(idAndGender1, 3, idAndGender2, 1))

print(idAndGender1[3, ])
print(idAndGender2[1, ])
print(idAndGender1[3, ] == idAndGender2[1, ])
print(c(1, 2) == c(1, 2))

# Solution.
# Compares row ind1 of df1 with row ind2 of df2, cell by cell.
# Returns FALSE when the column counts differ or any pair of cells differs,
# and TRUE otherwise. Two missing cells are considered equal.
isEqual <- function(df1, ind1, df2, ind2) {
  if (ncol(df1) != ncol(df2)) {
    return(FALSE)
  }
  for (ind in seq_len(ncol(df1))) {
    left <- df1[ind1, ind]
    right <- df2[ind2, ind]
    # `!=` yields NA when either side is missing, which if() cannot handle,
    # so missing cells are compared explicitly.
    if (is.na(left) || is.na(right)) {
      if (is.na(left) != is.na(right)) {
        return(FALSE)
      }
    } else if (left != right) {
      return(FALSE)
    }
  }
  TRUE
}

# (b) Write the function rowInDF.
# The function receives two data frames
# and a row index into the second data frame.
# It checks if the first data frame includes a
# row whose contents are equal to the contents
# of the given row in the second data frame:
# if yes, it returns TRUE,
# and if no, it returns FALSE.
# Exercise stub: the body is intentionally left open (returns NULL as is).
rowInDF <- function(df1, df2, df2Ind) {
  # ???
}

stress <- read.csv("stress.csv")
idAndGender1 <- stress[1:3, 1:2]
print(idAndGender1)
idAndGender2 <- stress[3:5, 1:2]
print(idAndGender2)
print(rowInDF(idAndGender1, idAndGender2, 1))
print(rowInDF(idAndGender1, idAndGender2, 2))

# Solution.
# Returns TRUE when some row of df1 equals row df2Ind of df2 (as judged by
# isEqual), and FALSE otherwise - in particular when df1 has no rows.
rowInDF <- function(df1, df2, df2Ind) {
  # seq_len() skips the loop for an empty df1; 1:nrow(df1) would visit the
  # non-existent rows 1 and 0.
  for (i in seq_len(nrow(df1))) {
    if (isEqual(df1, i, df2, df2Ind)) {
      return(TRUE)
    }
  }
  FALSE
}

stress <- read.csv("stress.csv")
idAndGender1 <- stress[1:3, 1:2]
print(idAndGender1)
newDF <- data.frame()
print(newDF)
print(newDF[1, ])
print(rowInDF(newDF, idAndGender1, 1))

# (c) Write the function withoutDuplicates.
# The function has one parameter, df, a data frame.
# The function produces a new data frame
# in which no two rows have the same content.

# Exercise stub: the body is intentionally left open (returns NULL as is).
withoutDuplicates <- function(df) {
  # ???
}

stress <- read.csv("stress.csv")
idAndGender <- stress[1:3, 1:2]
print(idAndGender)
idAndGender <- rbind(idAndGender, c(3, 1))
print(idAndGender)
idAndGender <- rbind(idAndGender, c(1, 1))
print(idAndGender)
result <- withoutDuplicates(idAndGender)
print(result)

# Solution, first version: start from the first row of df and append every
# later row that is not yet present in the result.
withoutDuplicates <- function(df) {
  # An empty data frame has no first row to start from; df[1, ] would
  # fabricate a row of NAs.
  if (nrow(df) == 0) {
    return(df)
  }
  # drop = FALSE keeps single-column slices as data frames.
  noDuplicatesDF <- df[1, , drop = FALSE]
  # seq_len(n)[-1] is 2..n, and empty when df has a single row.
  for (i in seq_len(nrow(df))[-1]) {
    if (!rowInDF(noDuplicatesDF, df, i)) {
      noDuplicatesDF <- rbind(noDuplicatesDF, df[i, , drop = FALSE])
    }
  }
  noDuplicatesDF
}

# Solution, second version: start from an empty data frame, relying on
# rowInDF returning FALSE for a data frame without rows. This definition
# replaces the first version.
withoutDuplicates <- function(df) {
  noDuplicatesDF <- data.frame()
  for (i in seq_len(nrow(df))) {
    if (!rowInDF(noDuplicatesDF, df, i)) {
      noDuplicatesDF <- rbind(noDuplicatesDF, df[i, , drop = FALSE])
    }
  }
  noDuplicatesDF
}

# (5) (a) Read the data frame stressWide.
# Then turn it into long format.
# Cond1 - black and white, happy
# Cond2 - black and white, sad
# Cond3 - black and white, neutral
# Cond4 - color, happy
# Cond5 - color, sad
# Cond6 - color, neutral

stressWide <- read.csv("stressWide.csv")
print(head(stressWide))

# Expected result:
#   ID  Kind    Mood Stress
# 1  1    bw   happy  47.04
# 2  1    bw     sad  33.60
# 3  1    bw neutral  95.20
# 4  1 color   happy  81.19
# 5  1 color     sad  70.90
# 6  1 color neutral   6.92

library(tidyr)
stressWide <- read.csv("stressWide.csv")
print(head(stressWide))
# Rename the six condition columns (column 6 onwards) to "<kind>_<mood>" so
# that the two factors can be split out of the column names below.
newColNames <- c(
  "bw_happy", "bw_sad", "bw_neutral",
  "color_happy", "color_sad", "color_neutral"
)
names(stressWide)[6:ncol(stressWide)] <- newColNames
print(names(stressWide))
# Keep only the first column and the six condition columns.
stressWide <- stressWide[c(1, 6:ncol(stressWide))]
print(head(stressWide))
# names_pattern splits each column name at the underscore into Kind and Mood.
stressLong <- data.frame(pivot_longer(
  stressWide,
  cols = bw_happy:color_neutral,
  names_to = c("Kind", "Mood"),
  names_pattern = "(.*)_(.*)",
  values_to = "Stress"
))
print(head(stressLong, n = 14))

# (b) Write the function wideToLongMeans.
# The function has the following parameters:
# df - a data frame in wide format, two "factors"
# colnums - a range of columns of the measured data
# varnames - a vector of names for
# the new "factor" variables
# varname - the name of
# the new measured data variable
# The function converts df into a long format,
# using the arguments provided to the other parameters,
# and then "shrinks" the long format data frame
# such that it does not include the ID variables,
# and each pair of "factor" values appears in
# one row only with the mean of the measured data.
# You may use the function mean().
# Do not use other built-in functions.
# Exercise stub: the body is intentionally left open (returns NULL as is).
# The explicit braces matter: without a body, the statement that follows the
# header would be parsed as the body of the function.
wideToLongMeans <- function(df, colnums, varnames, varname) {
  # ???
}

stressWide <- read.csv("stressWide.csv")
newColNames <- c(
  "bw_happy", "bw_sad", "bw_neutral",
  "color_happy", "color_sad", "color_neutral"
)
names(stressWide)[6:ncol(stressWide)] <- newColNames
stressWide <- stressWide[c(1, 6:ncol(stressWide))]
print(head(stressWide))
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
meansDF <- wideToLongMeans(stressWide, myColnums, myVarnames, "Stress")
print(meansDF)

# Let us begin from the "middle", namely suppose that
# we have the long-format data frame.
# Now follow these middle steps.
# (1) Produce separate data frames for each pair
# of values of the factor variables, e.g.
#    ID  Kind    Mood Stress
# 6   1 color neutral   6.92
# 12  2 color neutral  37.65
# 18  3 color neutral  96.16
#
#    ID  Kind    Mood Stress
# 5   1 color     sad  70.90
# 11  2 color     sad  83.50
# 17  3 color     sad  32.66

# The first length(myColnums) rows of stressLong hold one row per
# (Kind, Mood) pair, so they are used to enumerate the pairs.
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
for (i in seq_along(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  print(pairAsDF)
}

print(head(stressLong, n = 8))

myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
for (i in seq_along(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
  # Note this code: does it remind you of something?
  filteredDF <- stressLong[(stressLong$Kind == currentPair[1] &
    stressLong$Mood == currentPair[2]), ]
  print(filteredDF)
}

# Returns the rows of df whose column vars[1] equals vals[1] and whose
# column vars[2] equals vals[2]; vars may hold column names or positions.
filterByTwoVars <- function(df, vars, vals) {
  matches <- df[[vars[1]]] == vals[1] & df[[vars[2]]] == vals[2]
  # which() drops comparisons that evaluate to NA, so rows with a missing
  # value in a compared column are excluded instead of appearing as all-NA
  # rows; drop = FALSE keeps the result a data frame.
  df[which(matches), , drop = FALSE]
}

myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
for (i in seq_along(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
  # Filter on the factor columns by name (myVarnames), not on myColnums,
  # which holds column positions of the wide data frame.
  filteredDF <- filterByTwoVars(stressLong, myVarnames, currentPair)
  print(filteredDF)
}

print(mean(c(2, 6, 1)))

# Then revise the code so that it produces the
# requested data frame, not in a function.

myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
# One row per (Kind, Mood) pair: two factor columns plus the mean.
meansDF <- data.frame(matrix(NA, ncol = 3, nrow = length(myColnums)))
print(meansDF)
for (i in seq_along(myColnums)) {
  pairAsDF <- stressLong[i, myVarnames]
  currentPair <- c(pairAsDF[1, 1], pairAsDF[1, 2])
  filteredDF <- filterByTwoVars(stressLong, myVarnames, currentPair)
  pairMean <- mean(filteredDF$Stress)
  meansDF[i, 1:2] <- currentPair
  meansDF[i, 3] <- pairMean
}
print(meansDF)
names(meansDF) <- c(myVarnames, "Mean")
print(meansDF)

# Finally, write the function.
# Converts the wide-format data frame df to long format and returns one row
# per pair of factor values together with the mean of the measured variable.
#
# Parameters:
#   df       - a data frame in wide format; the measured columns are named
#              "<factor1>_<factor2>"
#   colnums  - positions of the measured columns in df
#   varnames - names of the two new factor columns
#   varname  - name of the new measured-data column
# Returns a data frame with the columns varnames[1], varnames[2] and "Mean".
# Relies on filterByTwoVars() being defined and on the tidyr package being
# installed.
wideToLongMeans <- function(df, colnums, varnames, varname) {
  # Reshape the df argument itself (not a global data frame). all_of() marks
  # colnums as an external vector of column positions for the selection.
  dfLong <- data.frame(tidyr::pivot_longer(
    df,
    cols = tidyr::all_of(colnums),
    names_to = varnames,
    names_pattern = "(.*)_(.*)",
    values_to = varname
  ))

  # pivot_longer() emits the measured columns of the first wide row first,
  # so the first length(colnums) long rows hold one row per factor pair.
  meansDF <- data.frame(matrix(NA, ncol = 3, nrow = length(colnums)))
  for (i in seq_along(colnums)) {
    currentPair <- c(dfLong[i, varnames[1]], dfLong[i, varnames[2]])
    filteredDF <- filterByTwoVars(dfLong, varnames, currentPair)
    meansDF[i, 1:2] <- currentPair
    meansDF[i, 3] <- mean(filteredDF[[varname]])
  }
  names(meansDF) <- c(varnames, "Mean")
  meansDF
}

stressWide <- read.csv("stressWide.csv")
newColNames <- c(
  "bw_happy", "bw_sad", "bw_neutral",
  "color_happy", "color_sad", "color_neutral"
)
names(stressWide)[6:ncol(stressWide)] <- newColNames
stressWide <- stressWide[c(1, 6:ncol(stressWide))]
print(head(stressWide))
myColnums <- c(2:7)
myVarnames <- c("Kind", "Mood")
meansDF <- wideToLongMeans(stressWide, myColnums, myVarnames, "Stress")
print(meansDF)

# (6) Save the latter produced data frame
# in a .csv file.

# First attempt: the default also writes the row names as an extra column.
write.csv(meansDF, file = "meansDF.csv")
recentMeansDF <- read.csv("meansDF.csv")
print(recentMeansDF)

# Second attempt: row.names = FALSE leaves the row names out of the file.
write.csv(meansDF, file = "meansDF.csv", row.names = FALSE)
recentMeansDF <- read.csv("meansDF.csv")
print(recentMeansDF)