# Project overview:

# 1. Read the data from a file
# 2. Clean the dataset if necessary, check for missing values
# 3. Get summary of the data
# 4. Use R graphics to explore each variable in the dataset
# 5. Perform statistical analysis

# Load the readr and dplyr packages.
# readr provides read_csv() (used in Step 1); dplyr provides select(), filter(),
# mutate(), arrange(), group_by(), summarise() and the %>% pipe (used in Steps 3-4).
library(readr)
library(dplyr)

#-------------------------------------
#
# Step 1: Importing the dataset
#
#-------------------------------------

# Use read_csv() function from package readr to import the data from the following path
# http://scv.bu.edu/classes/BI594/cancer.csv
# Name the dataframe  - cancer
[ place your code here ]


#-------------------------------------
#
# Step 2: Data initial exploration
#
#-------------------------------------

# Use str() and summary() functions to explore the dataframe:
# How many rows and columns are there?
# What are the names of the columns?
# What are the types of the columns ( numeric, character, etc.)?
# How many missing values are there in each column of the data frame?
[ place your code here ]


#-------------------------------------
#
# Step 3: Data cleaning/preparation
#
#-------------------------------------


# NCCTG Lung Cancer Data:
#   Survival in patients with advanced lung cancer from the North Central Cancer Treatment Group. 
#   Performance scores rate how well the patient can perform usual daily activities. 

# Column descriptions:
#   inst:  Institution code
#   time:	Survival time in days
#   status:	censoring status 1=censored, 2=dead
#   age:	Age in years
#   sex:	Male=1 Female=2
#   ph.ecog:	ECOG performance score (0=good 5=dead)
#   ph.karno:	Karnofsky performance score (bad=0-good=100) rated by physician
#   pat.karno:	Karnofsky performance score as rated by patient
#   meal.cal:	Calories consumed at meals
#   wt.loss:	Weight loss in last six months


# 1. Select all columns except "inst"; 
# 2. Remove all rows that contain missing values in ph.ecog, ph.karno or pat.karno
# 3. Create a new column "months" - survival time in months (by dividing time by 30); use round() function to round the result to the nearest integer.
# 4. Sort the dataset by age in reverse order
# 5. Save the result in a new dataframe "cancer.clean"
#
# Hint: You might want to use select(), filter(), mutate() and arrange() functions from dplyr package


cancer.clean <- cancer %>%
[ place your code here ]

# Using the head() and summary() functions, check your new dataframe cancer.clean
[ place your code here ]


#-------------------------------------
#
# Step 4: Data mining
#
#-------------------------------------

# Using table() function explore how many male and female patients are there
[ place your code here ]


# Using the group_by() and summarise() functions from the dplyr package, compute mean values of time and age for male and female patients
[ place your code here ]


# What is the range of the variable age in this dataset?
[ place your code here ]

#-------------------------------------
#
# Step 5: Data exploration using graphics
#
#-------------------------------------
# Load the ggplot2 package (needed only if you choose ggplot2 over base R graphics below)
library(ggplot2)


# Using either base R graphics functions or ggplot2 package construct a scatterplot:
# time ~ age

[ place your code here ]

# Improve the graph:
#   1. provide title
#   2. change y axis label to be "time (days)" 
#   3. color the points based on status

[ place your code here ]

# Use the table() function to build a contingency table (cross-tabulation) for variables sex and status
# Save the table in a variable tbl
[ place your code here ]


# Use the rownames() and colnames() functions to provide meaningful labels for this table
# Columns correspond to status, rows - to sex
[ place your code here ]

# Using the base R graphics function barplot(), construct a side-by-side barplot of status vs sex
[ place your code here ]

# Improve the graph:
# 1. specify a title
# 2. Use 2 different colors for female and male bars
# 3. Add a legend to explain the colors
[ place your code here ]

#-------------------------------------
#
# Step 6: Statistical analysis
#
#-------------------------------------

# Using the lm() function, perform a regression analysis with
# time as the dependent variable and age, sex and ph.ecog as independent variables
# Fit the model to the lung cancer data set
# Save your model in a variable lmfit
# Use the summary() function to determine which variable(s) in this model is/are statistically significant
[ place your code here ]


# Use the table() function to build a contingency table (cross-tabulation) for variables sex and status
# Save the table in a variable tbl
[ place your code here ]


# Use the rownames() and colnames() functions to provide meaningful labels for this table
# Columns correspond to status, rows - to sex
[ place your code here ]

# Use chisq.test() function on the above table to perform Pearson's Chi-squared Test
[ place your code here ]


# Install (if necessary) and load the epitools package
# Use the riskratio() function from the epitools package to compute the risk ratio
# install.packages("epitools")
library(epitools)
[ place your code here ]