# Project overview:
# 1. Read the data from a file
# 2. Clean the dataset if necessary, check for missing values
# 3. Get summary of the data
# 4. Use R graphics to explore each variable in the dataset
# 5. Perform statistical analysis

# Load readr and dplyr packages
library(readr)
library(dplyr)

#-------------------------------------
#
# Step 1: Importing the dataset
#
#-------------------------------------

# Use read_csv() function from package readr to import the data from the
# following path:
#   http://scv.bu.edu/classes/BI594/cancer.csv
# Name the dataframe - cancer
cancer <- read_csv("http://scv.bu.edu/classes/BI594/cancer.csv")

#-------------------------------------
#
# Step 2: Data initial exploration
#
#-------------------------------------

# Use str() and summary() functions to explore the dataframe:
#   How many rows and columns are there?
#   What are the names of the columns?
#   What are the types of the columns (numeric, character, etc.)?
#   How many missing values are there in each column of the data frame?
str(cancer)
summary(cancer)

#-------------------------------------
#
# Step 3: Data cleaning/preparation
#
#-------------------------------------

# NCCTG Lung Cancer Data:
# Survival in patients with advanced lung cancer from the North Central Cancer
# Treatment Group. Performance scores rate how well the patient can perform
# usual daily activities.
#
# Column descriptions:
#   inst:      Institution code
#   time:      Survival time in days
#   status:    censoring status 1=censored, 2=dead
#   age:       Age in years
#   sex:       Male=1 Female=2
#   ph.ecog:   ECOG performance score (0=good 5=dead)
#   ph.karno:  Karnofsky performance score (bad=0-good=100) rated by physician
#   pat.karno: Karnofsky performance score as rated by patient
#   meal.cal:  Calories consumed at meals
#   wt.loss:   Weight loss in last six months

# 1. Select all columns except "inst";
# 2. Remove all rows that contain missing values in ph.ecog, ph.karno or
#    pat.karno
# 3. Create a new column "months" - survival time in months (by dividing time
#    by 30); use round() function to round the result to the nearest integer.
# 4. Sort the dataset by age in reverse order
# 5. Save the result in a new dataframe "cancer.clean"
#
# Hint: You might want to use select(), filter(), mutate() and arrange()
# functions from dplyr package
cancer.clean <- cancer %>%
  select(-inst) %>%
  filter(!is.na(ph.ecog) & !is.na(ph.karno) & !is.na(pat.karno)) %>%
  mutate(months = round(time / 30)) %>%
  arrange(desc(age))

# Using head() and summary() functions check your new dataframe cancer.clean
head(cancer.clean)
summary(cancer.clean)

#-------------------------------------
#
# Step 4: Data mining
#
#-------------------------------------

# Using table() function explore how many male and female patients are there
table(cancer.clean$sex)

# Using group_by() and summarise() functions from dplyr package compute mean
# values of time and age for male and female
cancer.clean %>%
  group_by(sex) %>%
  summarise(ave.time = mean(time), ave.age = mean(age))

# What is the range of the variable age in this dataset?
range(cancer.clean$age)

#-------------------------------------
#
# Step 5: Data exploration using graphics
#
#-------------------------------------

# Load library ggplot2
library(ggplot2)

# Using either base R graphics functions or ggplot2 package construct a
# scatterplot: time ~ age
plot(time ~ age, data = cancer.clean)
ggplot(cancer.clean, aes(age, time)) +
  geom_point()

# Improve the graph:
# 1. provide title
# 2. change y axis label to be "time (days)"
# 3. color the points based on status

# Using base R functions
plot(
  time ~ age,
  data = cancer.clean,
  ylab = "time (days)",
  col = status,
  pch = 19
)
title(" Survival time vs Age of the patient")

# Using ggplot2: status is converted to a factor so that it is mapped to a
# discrete colour scale; labs() supplies the title and the y axis label asked
# for above.
ggplot(cancer.clean, aes(age, time, color = as.factor(status))) +
  geom_point() +
  labs(
    title = "Survival time vs Age of the patient",
    y = "time (days)",
    color = "status"
  )

# Use function table to build a cross-tabulation (contingency) table for
# variables sex and status
# Save the table in a variable tbl
tbl <- table(cancer.clean$sex, cancer.clean$status)

# Use rownames() and colnames() functions to provide meaningful labels for
# this table
# Columns correspond to status, rows - to sex
# (labels follow the coding listed above: sex 1 = Male, 2 = Female;
#  status 1 = Censored, 2 = Dead)
rownames(tbl) <- c("Male", "Female")
colnames(tbl) <- c("Censored", "Dead")
tbl

# Using base R graphics function barplot() construct a side by side barplot
# status vs sex
barplot(tbl, beside = TRUE)

# Improve the graph:
# 1. specify a title
# 2. Use 2 different colors for female and male bars
# 3. Add a legend to explain the colors
b <- barplot(
  tbl,
  main = "Survival status for men and women",
  col = c("blue", "red"),
  beside = TRUE
)
legend(
  "topleft",
  col = c("blue", "red"),
  pch = 15,
  legend = c("men", "women")
)

#-------------------------------------
#
# Step 6: Statistical analysis
#
#-------------------------------------

# Using lm() function compute regression analysis using
# time as the dependent variable and age, sex and ph.ecog as independent
# Fit a model to the lung cancer data set
# Save your model in a variable lmfit
# Use summary() function to determine which variable(s) in this model is/are
# statistically significant
# NOTE(review): the model is fit on the raw `cancer` data frame, while the
# other analysis steps use `cancer.clean` - confirm this is intended.
lmfit <- lm(time ~ age + sex + ph.ecog, data = cancer)
summary(lmfit)

# Use function table to build a cross-tabulation (contingency) table for
# variables sex and status
# Save the table in a variable tbl
tbl <- table(cancer.clean$sex, cancer.clean$status)

# Use rownames() and colnames() functions to provide meaningful labels for
# this table
# Columns correspond to status, rows - to sex
rownames(tbl) <- c("Male", "Female")
colnames(tbl) <- c("Censored", "Dead")
tbl

# Use chisq.test() function on the above table to perform Pearson's
# Chi-squared Test (correct = FALSE turns off the continuity correction)
chisq.test(tbl, correct = FALSE)

# Install (if necessary) and load package epitools
# Use riskratio function from epitools package to compute risk ratio
# install.packages("epitools")
library(epitools)
riskratio(tbl, method = "wald")