# # Part 1: Python Basics 


# Basic Data Types and Operations
# Two integers shared by the arithmetic and comparison examples below.
a, b = 10, 5

print("Addition:", a + b)
print("Subtraction:", a - b)
print("Multiplication:", a * b)
# "/" is true division, so the result is a float even for two integers.
print("Division:", a / b)


# If-else statement
# Same test as "a > b", written from the other side with the branches swapped.
if a <= b:
    print("a is less than or equal to b")
else:
    print("a is greater than b")


# Functions
def greet(name):
    """Build a greeting for *name*.

    Args:
        name: Value placed into the greeting. It is interpolated with an
            f-string, so any object that can be formatted is accepted.

    Returns:
        The string ``Hello, <name>!``.
    """
    greeting = f"Hello, {name}!"
    return greeting


# Call the function and print the string it returns.
print(greet("PhD Students"))


# Importing a module
import math

# Functions that live in a module are reached through the "module." prefix.
sqrt_of_16 = math.sqrt(16)
print("Square root of 16:", sqrt_of_16)


# # Part 2: Fundamental Python Data Structures:
# 1. Lists
# 2. Tuples
# 3. Dictionaries
# 4. Sets


## 1. Lists: ordered, mutable collections
genes = [
    "Gene1",
    "Gene2",
    "Gene3",
]
# print() separates its arguments with a single space.
print("List of genes:", genes)

# The same output produced with an f-string, a syntax you will often see.
print(f"List of genes: {genes}")


# Lists can be modified in place; here one more element is added to the end.
genes.extend(["Gene4"])
print(f"Updated list: {genes}")


## 2. Tuples: ordered immutable collections
# The commas make the tuple; the surrounding parentheses are optional here.
nucleotide_counts = 10, 15, 8, 12
print(f"Nucleotide counts (A, T, G, C): {nucleotide_counts}")


# 3. Dictionaries: key-value pairs
student = {
    "name": "John",
    "age": 25,
    "courses": ["Math", "Science"],
}
print("Student Dictionary:", student)

# Overwrite the value stored under an existing key.
student.update({"age": 26})
print("Updated Student Dictionary:", student)


# Another example: gene name -> expression value
gene_expression = {
    "Gene1": 5.4,
    "Gene2": 3.2,
    "Gene3": 4.7,
}
print(f"Gene expression dictionary: {gene_expression}")
# A single value is looked up by its key with square brackets.
print(f"Expression of Gene2: {gene_expression['Gene2']}")


# Sets: Unordered, unique collections
unique_bases = {"A", "T", "G", "C"}
print(f"Unique bases: {unique_bases}")


# Python sets are an efficient way to remove duplicate values from a
# collection such as a list, and they support common set operations like
# unions and intersections.


# Define a list with many (non-unique) elements
x = [1, 5, 8, 2, 5, 1, 3, 8, 2, 1]
# Converting the list to a set keeps a single copy of each value.
unique_values = set(x)
print(f"Unique values in x: {unique_values}")


# How to find out the type of a Python object
# type() is called on a set, a dict and a function; sep="\n" puts each
# result on its own line.
print(type(unique_bases), type(student), type(greet), sep="\n")


# # Part 3: Working with Data


# Data manipulation using pandas
import pandas as pd
import numpy as np


# Example dataset: Gene expression
# A plain dict that maps each column name to its list of values.
data = {
    "Gene": ["Gene1", "Gene2", "Gene3"],
    "Control": [2.3, 2.5, 1.8],
    "Treated": [3.5, 2.8, 2.1],
}

# Show the object, then what kind of object was just created (one per line).
print(data, type(data), sep="\n")


# Convert the dict to a DataFrame; the dict keys become the column names.
df = pd.DataFrame(data)
print("DataFrame:", df, sep="\n")


# Check the type of the new object
print(type(df))


# Add a new column for log2 fold change: log2 of Treated divided by Control,
# computed row by row.
fold_change = df["Treated"].div(df["Control"])
df["Log2_FC"] = np.log2(fold_change)
print("\nUpdated DataFrame with Log2 Fold Change:", df, sep="\n")


# Load the dataset
file_path = "medical_data.csv"  # Update the path if needed
df = pd.read_csv(file_path)


# Display the first few rows of the dataset
print("=== First 5 Rows of the Dataset ===")
first_rows = df.head(5)
print(first_rows)


# Display summary statistics
# include="all" summarises every column, not only the numeric ones.
print("\n=== Summary Statistics ===")
summary = df.describe(include="all")
print(summary)


# Check for missing values
# Unlike R, Python has no dedicated symbol for missing values; pandas marks
# them as NaN. isna() flags those entries and sum() counts them per column.
print("\n=== Missing Values ===")
missing_per_column = df.isna().sum()
print(missing_per_column)


# Hmmm....
# It looks like pandas interpreted the word "None" in the input dataset as an
# indicator of a "missing value" ("None" is one of read_csv's default NA
# strings).
# In this case it looks like "None" is a valid value.
# Re-read the dataset with the default NA strings switched off
# (keep_default_na=False) so that only empty fields (na_values=[""]) count as
# missing.
# NOTE: do not use na_filter=False for this. It disables missing-value
# detection altogether, so na_values is ignored and empty cells would be kept
# as empty strings instead of NaN.
df = pd.read_csv(file_path, keep_default_na=False, na_values=[""])

# Missing values per column after the re-read.
print(df.isnull().sum())


# Basic Data Exploration
print("\n=== Basic Data Exploration ===")
# df.shape is a (rows, columns) pair.
n_rows, n_cols = df.shape
print(f"Number of Rows: {n_rows}")
print(f"Number of Columns: {n_cols}")
column_names = df.columns.tolist()
print(f"Column Names: {column_names}")


# Filter data: Patients with systolic blood pressure > 140
print("\n=== Patients with Systolic Blood Pressure > 140 ===")
# Boolean mask with one True/False per row, then keep only the True rows.
is_high_bp = df["Blood_Pressure_Systolic"].gt(140)
high_bp = df.loc[is_high_bp]
print(high_bp.head(5))


# Sort the data: Top 10 patients by cholesterol level
print("\n=== Top 10 Patients by Cholesterol Level ===")
# Highest cholesterol first, then keep the first ten rows.
by_cholesterol = df.sort_values(by="Cholesterol_mg_dl", ascending=False)
top_cholesterol = by_cholesterol.head(10)
print(top_cholesterol)


# Select specific columns
basic_columns = ['Patient_ID', 'Age', 'Height_cm', 'Weight_kg']
df2 = df.loc[:, basic_columns]
print(df2.head(5))


# Create a new column: BMI (Body Mass Index)
print("\n=== Adding BMI Column ===")
# BMI = weight in kg divided by the square of height in metres (height is
# stored in cm, hence the division by 100), rounded to two decimals.
height_m = df["Height_cm"] / 100
df["BMI"] = (df["Weight_kg"] / (height_m ** 2)).round(2)
bmi_preview = df[["Patient_ID", "Height_cm", "Weight_kg", "BMI"]]
print(bmi_preview.head(5))


# Save filtered data: Patients with diabetes into a new CSV
print("\n=== Saving Patients with Diabetes to 'diabetes_patients.csv' ===")
# Keep only the rows whose Diabetes column equals "Yes".
has_diabetes = df["Diabetes"].eq("Yes")
diabetes_patients = df.loc[has_diabetes]
# index=False leaves the DataFrame's row index out of the file.
diabetes_patients.to_csv("diabetes_patients.csv", index=False)
print("File 'diabetes_patients.csv' saved successfully.")


# Summarize categorical variables: Counts for chronic conditions
print("\n=== Counts for Chronic Conditions ===")
# Number of rows for each distinct value of the column.
chronic_condition_column = df["Chronic_Condition"]
chronic_conditions_counts = chronic_condition_column.value_counts()
print(chronic_conditions_counts)


# Grouping and Aggregation: Average weight by smoking history
print("\n=== Average Weight by Smoking History ===")
# Split the rows by Smoking_History, then average Weight_kg inside each group.
weight_by_smoking = df.groupby("Smoking_History")["Weight_kg"]
avg_weight_by_smoking = weight_by_smoking.mean()
print(avg_weight_by_smoking)


# # Part 4: Graphics

# Visualization using Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns


# 1. Age Distribution (Histogram)
# Create the figure and its single axes explicitly, then draw on the axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df["Age"], bins=15, color="skyblue", edgecolor="black")
ax.set_title("Age Distribution", fontsize=16)
ax.set_xlabel("Age", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
# Dashed horizontal grid lines only.
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()


# 2. Scatter Plot: Weight vs. Height with BMI Coloring
fig, ax = plt.subplots(figsize=(8, 6))
# Each point is coloured by that row's BMI value using the viridis colormap.
scatter = ax.scatter(
    df["Height_cm"],
    df["Weight_kg"],
    c=df["BMI"],
    cmap="viridis",
    edgecolor="k",
    alpha=0.7,
)
# The colorbar shows how colours map to BMI values.
fig.colorbar(scatter, ax=ax, label="BMI")
ax.set_title("Weight vs. Height with BMI", fontsize=16)
ax.set_xlabel("Height (cm)", fontsize=12)
ax.set_ylabel("Weight (kg)", fontsize=12)
ax.grid(alpha=0.5)
plt.show()


# 3. Box Plot: Cholesterol Levels by Smoking History
# NOTE(review): recent seaborn releases (0.13+) emit a FutureWarning when
# `palette` is passed without `hue` -- confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=df,
    x="Smoking_History",
    y="Cholesterol_mg_dl",
    palette="pastel",
    ax=ax,
)
ax.set_title("Cholesterol Levels by Smoking History", fontsize=16)
ax.set_xlabel("Smoking History", fontsize=12)
ax.set_ylabel("Cholesterol (mg/dL)", fontsize=12)
# Dashed horizontal grid lines only.
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()


# In[ ]:


# 4. Bar Plot: Counts of Chronic Conditions
# NOTE(review): recent seaborn releases (0.13+) emit a FutureWarning when
# `palette` is passed without `hue` -- confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 6))
# Bars are ordered by the index of value_counts() for the column.
condition_order = df["Chronic_Condition"].value_counts().index
sns.countplot(
    data=df,
    x="Chronic_Condition",
    palette="Set2",
    order=condition_order,
    ax=ax,
)
ax.set_title("Counts of Chronic Conditions", fontsize=16)
ax.set_xlabel("Chronic Condition", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
# Rotate the category labels and right-align them.
plt.xticks(rotation=45, ha="right")
plt.show()


# 5. Correlation Heatmap
fig, ax = plt.subplots(figsize=(10, 8))
# Only numeric columns take part in the pairwise correlation matrix.
numeric_columns = df.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()
# annot/fmt print each coefficient in its cell with two decimals.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=ax,
)
ax.set_title("Correlation Heatmap", fontsize=16)
plt.show()