#--------------------------------------
#
# Introduction to R (Part 3): PROJECT
# PiBS Professional Skillset
#
#--------------------------------------

# Load the packages we need
library(tidyverse)

# Read input dataset
df <- read_csv("http://rcs.bu.edu/classes/FC764/med_data.csv")

# ---------------------
# Explore dataframe
# ---------------------

# Display the first 6 records (head of the file)

# Using the glimpse() or str() function, explore the dataframe
# What kind of variables are there?
# How many observations are there?
# What are the column names?

# Using the summary() function, display basic statistics for the dataframe
summary(df)

# From the output of summary(), find the columns that have missing values.
# Look at the maximum value of the CIGPDAY column.
# Is this value possible?
# This is probably not a real value but rather a code for a missing value...

#-------------------------------
# Missing Data handling
#-------------------------------

# Are there any missing data in this dataset?
# (use either the summary() or anyNA() function to find out)

# Check how many missing values we have in column TOTCHOL
# Note: Use the sum() function together with is.na()

# How many values equal to 999 do we have in the CIGPDAY column?

# Let's mark all values in CIGPDAY that are equal to 999 as NA.
# Rows where CIGPDAY is already NA stay NA; all other values are kept as-is.
df <- df %>%
  mutate(CIGPDAY = ifelse(CIGPDAY == 999, NA, CIGPDAY))

# Let's run the summary function again

# How many observations are in this dataset? How many missing values?
# Is it OK to remove missing observations?
# Always be careful with removing missing values!!!
# See if there are any patterns of missing data

# Use na.omit() to remove all missing values and assign the result back to df

#-------------------------------
# Exploration of categorical variables
#-------------------------------

# Which columns might contain categorical variables?
# Use the unique() function to see unique values in those columns

# Use the table() function to find how many observations we have for each gender

# Use the table() function to find how many people in this group have diabetes

#-------------------------------
# For each gender compute average value (mean) of cigarettes per day
#-------------------------------

# Template: put the grouping column inside group_by() and the mean
# computation inside summarise() before running.
df %>%
  group_by() %>%
  summarise()

#-------------------------------
# For each gender and for each group - with diabetes and without,
# compute minimum, maximum and average value of total cholesterol
#-------------------------------

#-------------------------------------
# Graphics exploration of the dataset
#-------------------------------------

# Display a scatter plot of the SYSBP and DIABP variables and draw a linear
# regression line.
# Template: supply the x and y aesthetics inside aes() before running.
ggplot(df, aes()) +
  geom_point() +
  geom_smooth(method = "lm")

# Display a histogram for the variable AGE
# Is the default value for the bin width optimal?
# Change it to be 5 years. Is it better?

#------------------------
# Running basic analysis
#------------------------

# Use the cor() function to calculate the correlation between SYSBP and DIABP

# Use the t.test() function to test the hypothesis that the number of
# cigarettes per day smoked by women and men is the same.
# Use "formula" notation:
#   dependent_variable ~ independent_variable
# Template: replace each `...` with a column name before running.
t.test(... ~ ..., data = df)

# What is the p-value?
# What is the 95% confidence interval? Does it contain zero?

# Use the lm() function to compute a linear regression between Age and
# Total Cholesterol.
# Use "formula" notation:
#   dependent_variable ~ independent_variable
# Save results in a variable lm.res
# Template: replace each `...` with a column name before running.
lm.res <- lm(... ~ ..., data = df)

# Apply the summary() function to lm.res to see the output of the linear
# regression model