# Project overview:

# 1. Read the data from a file
# 2. Clean the dataset if necessary, check for missing values
# 3. Get summary of the data
# 4. Use R graphics to explore each variable in the dataset
# 5. Perform statistical analysis

#Load readr and dplyr packages
library(readr)
library(dplyr)

#-------------------------------------
#
# Step 1: Importing the dataset
#
#-------------------------------------

# Import the data set with read_csv() (package readr) from
# http://scv.bu.edu/classes/BI594/cancer.csv
# and store the resulting data frame under the name `cancer`.

cancer <- read_csv(file = "http://scv.bu.edu/classes/BI594/cancer.csv")


#-------------------------------------
#
# Step 2: Data initial exploration
#
#-------------------------------------

# Inspect the imported data frame with str() and summary() to answer:
#   - How many rows and columns are there?
#   - What are the names of the columns?
#   - What is the type of each column (numeric, character, etc.)?
#   - How many missing values are there in each column?
str(object = cancer)
summary(object = cancer)


#-------------------------------------
#
# Step 3: Data cleaning/preparation
#
#-------------------------------------


# NCCTG Lung Cancer Data:
#   Survival in patients with advanced lung cancer from the North Central Cancer Treatment Group. 
#   Performance scores rate how well the patient can perform usual daily activities. 

# Column descriptions:
#   inst:  Institution code
#   time:	Survival time in days
#   status:	censoring status 1=censored, 2=dead
#   age:	Age in years
#   sex:	Male=1 Female=2
#   ph.ecog:	ECOG performance score (0=good 5=dead)
#   ph.karno:	Karnofsky performance score (bad=0-good=100) rated by physician
#   pat.karno:	Karnofsky performance score as rated by patient
#   meal.cal:	Calories consumed at meals
#   wt.loss:	Weight loss in last six months


# 1. Select all columns except "inst"; 
# 2. Remove all rows that contain missing values in ph.ecog, ph.karno or pat.karno
# 3. Create a new column "months" - survival time in months (by dividing time by 30); use round() function to round the result to the nearest integer.
# 4. Sort the dataset by age in reverse order
# 5. Save the result in a new dataframe "cancer.clean"
#
# Hint: You might want to use select(), filter(), mutate() and arrange() functions from dplyr package


# Build the cleaned data frame in one pipeline:
#   - drop the institution code column,
#   - keep only rows where none of the three performance scores is missing,
#   - add survival time in months (time / 30, rounded to an integer),
#   - order the rows from oldest to youngest patient.
cancer.clean <- cancer %>%
  select(-inst) %>%
  filter(!(is.na(ph.ecog) | is.na(ph.karno) | is.na(pat.karno))) %>%
  mutate(months = round(time / 30)) %>%
  arrange(desc(age))

# Check the new data frame: first six rows and per-column summary
head(cancer.clean, n = 6)
summary(cancer.clean)


#-------------------------------------
#
# Step 4: Data mining
#
#-------------------------------------

# Count the patients in each sex category (1 = male, 2 = female)
table(cancer.clean[["sex"]])


# Mean survival time and mean age per sex, using group_by() and summarise()
# from the dplyr package
cancer.clean %>%
  group_by(sex) %>%
  summarise(
    ave.time = mean(time),
    ave.age = mean(age)
  )

# Smallest and largest age in the cleaned dataset
range(cancer.clean[["age"]])

#-------------------------------------
#
# Step 5: Data exploration using graphics
#
#-------------------------------------
# Load library ggplot2
library(ggplot2)


# Scatterplot of survival time against age (time ~ age), drawn twice:
# first with base R graphics, then with ggplot2.

plot(time ~ age, data = cancer.clean)

ggplot(data = cancer.clean, mapping = aes(x = age, y = time)) +
  geom_point()

# Improve the graph:
#   1. provide a title
#   2. change the y axis label to "time (days)"
#   3. color the points based on status

# Using base R functions
# (pch = 19 draws solid circles so the point colors are clearly visible)
plot(time ~ age,
     data = cancer.clean,
     main = " Survival time vs Age of the patient",
     ylab = "time (days)",
     col = status,
     pch = 19)

# Using ggplot2:
# status is converted to a factor so it gets a discrete color scale;
# labs() supplies the title, the y axis label and a readable legend title,
# matching the three improvements listed above.

ggplot(cancer.clean, aes(x = age, y = time, color = as.factor(status))) +
  geom_point() +
  labs(
    title = "Survival time vs Age of the patient",
    y = "time (days)",
    color = "status"
  )


# Build a cross-tabulation (contingency table) of sex against status with
# table() and keep it in the variable tbl
tbl <- table(cancer.clean[["sex"]], cancer.clean[["status"]])

# Give the table meaningful labels with rownames() and colnames():
# rows correspond to sex (1 = Male, 2 = Female),
# columns correspond to status (1 = Censored, 2 = Dead)
rownames(tbl) <- c("Male", "Female")
colnames(tbl) <- c("Censored", "Dead")
tbl

# Side-by-side bar plot of status vs sex with the base R function barplot()
barplot(height = tbl, beside = TRUE)

# Improve the graph:
# 1. specify a title
# 2. use two different colors for the male and female bars
# 3. add a legend to explain the colors
b <- barplot(
  tbl,
  beside = TRUE,
  col = c("blue", "red"),
  main = "Survival status for men and women"
)
legend(
  "topleft",
  legend = c("men", "women"),
  col = c("blue", "red"),
  pch = 15
)

#-------------------------------------
#
# Step 6: Statistical analysis
#
#-------------------------------------

# Linear regression with lm():
#   dependent variable:    time
#   independent variables: age, sex and ph.ecog
# The fitted model is stored in lmfit; summary() shows which variable(s)
# in the model is/are statistically significant.
# NOTE(review): the model is fit on the raw `cancer` data frame rather than
# `cancer.clean`, unlike the rest of the analysis - confirm this is intended.
lmfit <- lm(
  formula = time ~ age + sex + ph.ecog,
  data = cancer
)
summary(lmfit)


# Build a cross-tabulation (contingency table) of sex against status with
# table() and keep it in the variable tbl
tbl <- table(cancer.clean[["sex"]], cancer.clean[["status"]])

# Give the table meaningful labels with rownames() and colnames():
# rows correspond to sex, columns correspond to status
rownames(tbl) <- c("Male", "Female")
colnames(tbl) <- c("Censored", "Dead")
tbl

# Pearson's Chi-squared test on the table above
# (correct = FALSE: no continuity correction)
chisq.test(x = tbl, correct = FALSE)

# Install (if necessary) and load the package epitools, then compute the
# risk ratio for the sex x status table with its riskratio() function
# install.packages("epitools")
library(epitools)
epitools::riskratio(tbl, method = "wald")