# Vectors have a linear structure. Most of our datasets are rectangular.

# Different ways to import data ----
# NOTE: the assignments to `d` below are alternative ways to load the same
# data; run only the one that matches your setup.

# In RStudio, use the "Import Dataset" button on the right, then
# rename the data frame (unless you like the default name).
d <- john_data

# Other methods allow more flexibility.
# Use the "file.choose" command to pick the file interactively.
d <- read.csv(file.choose(), stringsAsFactors = FALSE, as.is = TRUE,
              strip.white = TRUE)

# Load in the data yourself through the console.
d <- read.csv("c:\\directoryname\\filename.csv") # pc
d <- read.csv("~/Documents/Postdoc/Teaching/Intro to R/data files/john_data.csv") # mac

# Look at the data to make sure it makes sense.
names(d)    # look at the column names
rownames(d) # defaults to numbers if no names are assigned
head(d)     # look at the first six lines
tail(d)     # look at the last six lines

# Change the column names.
names(d) <- c("Lineage", "Time", "DT", "Ploidy", "Enviro") # all at once
names(d)[5] <- "Env"                                       # just the fifth

# As it reads your data, R will classify your variables into types:
# Columns with only numbers are made into numeric or integer variables.
# Columns with non-numeric characters are made into factors unless otherwise
# specified ("stringsAsFactors=FALSE" in the read command). Since R 4.0 the
# default is stringsAsFactors = FALSE, so such columns stay character unless
# you explicitly ask for factors.
str(d) # check how R has classified the variables

# Check on R's classification of just one variable.
class(d$Ploidy) # integer, character, numeric, etc.
is.numeric(d$Ploidy)   # TRUE or FALSE
is.character(d$Ploidy) # TRUE or FALSE
is.integer(d$Ploidy)   # TRUE or FALSE

# Convert between variable types.
d$Ploidy <- as.character(d$Ploidy)
d$Ploidy <- as.factor(d$Ploidy)
# Go through as.character() first: as.numeric() applied directly to a factor
# returns the internal level codes, not the original numbers.
d$Ploidy <- as.numeric(as.character(d$Ploidy))
d$DT <- as.numeric(as.character(d$DT))

# Useful data frame functions and operations ----
ncol(d)   # number of columns
nrow(d)   # number of rows
length(d) # number of variables (NOT the length of the columns = nrow)

# Access aspects of a data frame ----
# Vectors can be referred to by name.
d$Env      # the fifth (environment) vector
d$E        # the fifth (environment) vector, found by partial matching
           # (none of the other columns start with 'E')
d[, "Env"] # the fifth vector

# Variables can also be accessed using square brackets. Integers before the
# comma refer to rows, integers after the comma indicate columns:
# mydata[rows, columns]
d[, 5]          # the fifth vector
d[1, ]          # the first row
d[1:10, ]       # the first ten rows
d[1, 3]         # the third element in the first row
d[, 3][1]       # the third element in the first row
d$DT[1]         # also the third element in the first row (DT is column 3)
d[1:4, c(3, 5)] # the third and fifth elements in the first four rows

# Change values within a data frame.
d[, 3][1] <- 2

# Create a new variable based on properties of an existing variable.
# Start from an all-NA column so rows that match none of the rules below
# (e.g. a ploidy of exactly 2.5 or 3.5) are explicitly NA.
d$PloidyFac <- rep(NA_character_, nrow(d))
d$PloidyFac[d$Ploidy > 3.5] <- "tet"
d$PloidyFac[d$Ploidy < 3.5 & d$Ploidy > 2.5] <- "trip"
# The lower bound is spelled out: a bare `& d$Ploidy` would only be coerced to
# logical (TRUE for any non-zero value) instead of comparing against a number.
d$PloidyFac[d$Ploidy < 2.5 & d$Ploidy > 0] <- "dip"
is.factor(d$PloidyFac) # still a character vector at this point
d$PloidyFac <- as.factor(d$PloidyFac)
str(d)

# Combine two vectors together.
d$timeEnv <- paste(d$Time, d$Env, sep = ".")

# Saving this for later (since I use "d" a lot).
john <- d

# Combine vectors into a data frame.
# Vectors need to be the same length, or have a length that divides evenly
# into the longest one (shorter vectors, like `loc` below, are recycled).
tlo <- c("TLO1", "TLO2", "TLO3", "TLO4", "TLO5", "TLO7", "TLO8", "TLO9",
         "TLO10")
clade <- c("alpha", "beta", "alpha", "gamma", "gamma", "gamma", "alpha",
           "alpha", "alpha")
chr <- c("R", "R", "1", "1", "2", "3", "3", "4", "4")
express <- c(10.2, 0.8, 0.9, 0.2, 0.3, 0.000001, 1.3, 2, 0.1)
loc <- "subtel"
tlos <- data.frame(tlo, clade, chr, express, loc, stringsAsFactors = FALSE)

# Add a row.
# The new row is built as a one-row data frame, not a character vector:
# rbind()-ing c("TLO34", "alpha", "1", "1", "internal") would turn the numeric
# `express` column into character, which breaks numeric sorting and log().
tlo34 <- data.frame(tlo = "TLO34", clade = "alpha", chr = "1", express = 1,
                    loc = "internal", stringsAsFactors = FALSE)
tlos <- rbind(tlos, tlo34)
str(tlos) # express should still be num

# Add a column.
env <- "YPD"
tlos <- cbind(tlos, env) # can also do tlos$env <- "YPD"

################################################################################
################################################################################
# 1) Oops, TLO8 is a gamma, not an alpha. Correct that.
# 2) Make a new vector that contains the log of expression values.
# 3) We also collected expression data in fluconazole. Add the information
#    below into the existing dataframe without adding new columns (multiple
#    ways to do this).
# expression in fluconazole:
# TLO1 - 3
# TLO2 - 0.4
# TLO3 - 2.4
# TLO4 - 0.9
# TLO5 - 1.1
# TLO7 - 0.23
# TLO8 - 0.42
# TLO9 - 4
# TLO10 - 100
# TLO34 - 43.2
################################################################################
################################################################################

# Sort and order.
tlos.x <- tlos[order(tlos$clade), ]               # by one column
tlos.x <- tlos[order(tlos$clade, tlos$express), ] # by two columns

# Obtain a subset of a dataframe using logical statements.
atlo <- subset(tlos, clade == "alpha")
bgtlo <- subset(tlos, clade != "alpha")
# NOTE(review): the name suggests "alpha on chromosome R", but the filter
# selects chromosome 4 -- confirm which one is intended.
aRtlo <- subset(tlos, clade == "alpha" & chr == "4")

# Write/save a data frame to a text file.
# Include "row.names=FALSE" if you don't want the row names or numbers of the
# data frame included in the first column.
write.csv(d, file = 'c:\\directoryname\\filename.csv', row.names = FALSE) # PC
write.csv(d, file = "/directoryname/filename.csv", row.names = FALSE) # Mac