#--------------------------------------
#
# Introduction to R (Part 3): PROJECT
# PiBS Professional Skillset
#
#-------------------------------------

#load packages we need 
library(tidyverse)


# Load the course dataset: fetch the CSV from the class web server and
# parse it into the data frame `df` that every later step works on.
df <- readr::read_csv(file = "http://rcs.bu.edu/classes/FC764/med_data.csv")


# ---------------------
# Explore dataframe
# ---------------------

# Display first 6 records (head of the file)


# Using glimpse() or str() function explore the dataframe


# What kind of variables are there?


# How many observations are there


# What are the column names:


# Print basic per-column statistics for the dataframe with summary()
# (for numeric columns this includes min, quartiles, mean, max and NA count)
df %>% summary()

# From the summary() output, find the columns that have missing values.

# Look at the maximum value of the CIGPDAY column. 
# Is this value possible?
# This is probably not a real value but rather a code for a missing value...


#-------------------------------
# Missing Data handling
#-------------------------------

# Are there any missing data in this dataset?
# (use either the summary() or the anyNA() function to find out)


# Check how many missing values we have in column TOTCHOL
# Note: Use sum() function together with is.na()


# How many values equal to 999 do we have in CIGPDAY column?


# Let's mark all values in CIGPDAY that are equal to 999 as NA.
# 999 is treated here as a code for "missing", not as a real measurement.
# na_if() is the dplyr helper for exactly this recode: it keeps the column's
# type and attributes (which ifelse() does not guarantee) and leaves values
# that are already NA untouched.
df <- df %>%
  mutate(CIGPDAY = na_if(CIGPDAY, 999))


# Let's run the summary function again


# How many observations are in this dataset? How many missing values? 
# Is it OK to remove missing observations?
# Always be careful with removing missing values!!!
# See if there are any patterns of missing data
# use na.omit() to remove all missing values and assign the result back to df


#-------------------------------
# Exploration of categorical variables
#-------------------------------

# Which columns might contain categorical variables?
# Use unique() function to see unique values in those columns

# Use table() function to find how many observations we have for each gender

# Use table() function to find how many people in this group have diabetes


#-------------------------------
# For each gender compute average value (mean) of cigarettes per day
#-------------------------------

# TEMPLATE: group_by() and summarise() are left empty on purpose and must be
# completed before the result is meaningful:
#   - put the gender column inside group_by()
#   - put a named mean of CIGPDAY inside summarise()
# NOTE(review): the gender column name is not shown in this script -- look it
# up with names(df). If CIGPDAY still contains NA values at this point, mean()
# returns NA unless na.rm = TRUE is passed.
df %>%
  group_by() %>%
  summarise()


#-------------------------------
# For each gender and for each group - with diabetes and without, 
# compute minimum, maximum and average value of total cholesterol
#-------------------------------


#-------------------------------------
# Graphics exploration of the dataset
#-------------------------------------

# Display a scatter plot of the SYSBP and DIABP variables and overlay a
# linear regression line.
# TEMPLATE: aes() is left empty on purpose -- map SYSBP and DIABP to the
# x and y aesthetics inside aes() before running this plot.
# geom_point() draws one point per observation; geom_smooth(method="lm")
# adds the line fitted by a linear model.
ggplot(df, aes()) +
  geom_point() + 
  geom_smooth(method="lm")

  
# Display histogram for the variable AGE


# Is the default bin width optimal? Change it to 5 years. Is it better?


#------------------------
# running basic analysis
#------------------------

# Use cor() function to calculate correlation between SYSBP and DIABP


# Use the t.test() function to test the hypothesis that women and men smoke
# the same number of cigarettes per day.
# Use "formula" notation:
#       dependent_variable  ~ independent_variable
# TEMPLATE: the two "..." below are placeholders and must be replaced before
# this line is run: the left side is the measured outcome (presumably
# CIGPDAY), the right side is the two-level grouping column (gender).
# NOTE(review): the gender column name is not shown in this script -- look it
# up with names(df).

t.test( ... ~ ... , data=df)
# What is the p-value?
# What is the 95% confidence interval? Does it contain zero?


# Use the lm() function to compute a linear regression between Age and
# Total Cholesterol.
# Use "formula" notation:
#       dependent_variable  ~ independent_variable
# TEMPLATE: the two "..." below are placeholders and must be replaced with
# column names before this line is run.
# NOTE(review): the columns are presumably AGE and TOTCHOL; decide which one
# is the response (left of ~) and which is the predictor (right of ~).
# Save the fitted model in the variable lm.res
lm.res <- lm( ... ~ ... , data = df  )

# Apply function summary() to lm.res to see the output of the linear regression model