# # Part 1: Python Basics

# Basic Data Types and Operations
a = 10
b = 5
print("Addition:", a + b)
print("Subtraction:", a - b)
print("Multiplication:", a * b)
print("Division:", a / b)

# If-else statement
if a > b:
    print("a is greater than b")
else:
    print("a is less than or equal to b")


# Functions
def greet(name):
    """Return the greeting "Hello, <name>!" for the given name."""
    return f"Hello, {name}!"


print(greet("PhD Students"))

# Importing a module
import math

print("Square root of 16:", math.sqrt(16))


# # Part 2: Fundamental Python Data Structures:
# 1. Lists
# 2. Tuples
# 3. Dictionaries
# 4. Sets

## 1. Lists: ordered, mutable collections
genes = ["Gene1", "Gene2", "Gene3"]
print("List of genes:", genes)

# Sometimes you may see the following (f-string) syntax used with print
print(f"List of genes: {genes}")

# Lists can be modified; for example, we can add an element to the list above
genes.append("Gene4")
print(f"Updated list: {genes}")

## 2. Tuples: ordered, immutable collections
nucleotide_counts = (10, 15, 8, 12)
print(f"Nucleotide counts (A, T, G, C): {nucleotide_counts}")

## 3. Dictionaries: key-value pairs
student = {"name": "John", "age": 25, "courses": ["Math", "Science"]}
print("Student Dictionary:", student)
student["age"] = 26  # assigning to an existing key overwrites its value
print("Updated Student Dictionary:", student)

# Another example:
gene_expression = {"Gene1": 5.4, "Gene2": 3.2, "Gene3": 4.7}
print(f"Gene expression dictionary: {gene_expression}")
print(f"Expression of Gene2: {gene_expression['Gene2']}")

## 4. Sets: unordered collections of unique elements
unique_bases = {"A", "T", "G", "C"}
print(f"Unique bases: {unique_bases}")

# Python sets are very useful for efficiently removing duplicate values from a
# collection such as a list, and for performing common mathematical operations
# such as unions and intersections.

# Define a list with many (non-unique) elements
x = [1, 5, 8, 2, 5, 1, 3, 8, 2, 1]
print(f"Unique values in x: {set(x)}")

# How to find out the type of a Python object
print(type(unique_bases))
print(type(student))
print(type(greet))


# # Part 3: Working with Data

# Data manipulation using pandas
import pandas as pd
import numpy as np

# Example dataset: Gene expression
data = {
    "Gene": ["Gene1", "Gene2", "Gene3"],
    "Control": [2.3, 2.5, 1.8],
    "Treated": [3.5, 2.8, 2.1],
}
print(data)

# Let's check what kind of object we just created
print(type(data))

# Let's convert this object to a DataFrame:
df = pd.DataFrame(data)
print("DataFrame:")
print(df)

# Let's check the type of this object
print(type(df))

# Let's add another column to this data frame:
# Add a new column for log2 fold change
df["Log2_FC"] = np.log2(df["Treated"] / df["Control"])
print("\nUpdated DataFrame with Log2 Fold Change:")
print(df)

# Load the dataset (from here on, `df` refers to the medical dataset)
file_path = "medical_data.csv"  # Update the path if needed
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("=== First 5 Rows of the Dataset ===")
print(df.head())

# Display summary statistics
print("\n=== Summary Statistics ===")
print(df.describe(include="all"))

# Check for missing values
# Unlike R, Python does not have a special symbol for missing values;
# pandas marks missing values as NaN
print("\n=== Missing Values ===")
print(df.isnull().sum())

# Hmmm....
# It looks like pandas interpreted the word "None" in the input dataset as an
# indicator of a "missing value".
# In this case it looks like "None" is a valid value.
# Let's reread the dataset and indicate that "None" should not be considered
# a missing value.
# keep_default_na=False turns off the built-in list of NA markers (which
# includes "None"), while na_values=[""] still reports truly empty fields as
# missing. Do not use na_filter=False here: it disables missing-value
# detection altogether and causes na_values to be ignored.
df = pd.read_csv(file_path, keep_default_na=False, na_values=[""])
print(df.isnull().sum())

# Basic Data Exploration
print("\n=== Basic Data Exploration ===")
print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
print(f"Column Names: {df.columns.tolist()}")

# Filter data: Patients with systolic blood pressure > 140
print("\n=== Patients with Systolic Blood Pressure > 140 ===")
high_bp = df[df["Blood_Pressure_Systolic"] > 140]
print(high_bp.head())

# Sort the data: Top 10 patients by cholesterol level
print("\n=== Top 10 Patients by Cholesterol Level ===")
top_cholesterol = df.sort_values(by="Cholesterol_mg_dl", ascending=False).head(10)
print(top_cholesterol)

# Select specific columns
df2 = df[["Patient_ID", "Age", "Height_cm", "Weight_kg"]]
print(df2.head())

# Create a new column: BMI (Body Mass Index)
# BMI = weight / height^2, with the height converted from cm to m
print("\n=== Adding BMI Column ===")
df["BMI"] = (df["Weight_kg"] / ((df["Height_cm"] / 100) ** 2)).round(2)
print(df[["Patient_ID", "Height_cm", "Weight_kg", "BMI"]].head())

# Save filtered data: Patients with diabetes into a new CSV
print("\n=== Saving Patients with Diabetes to 'diabetes_patients.csv' ===")
diabetes_patients = df[df["Diabetes"] == "Yes"]
diabetes_patients.to_csv("diabetes_patients.csv", index=False)
print("File 'diabetes_patients.csv' saved successfully.")

# Summarize categorical variables: Counts for chronic conditions
print("\n=== Counts for Chronic Conditions ===")
chronic_conditions_counts = df["Chronic_Condition"].value_counts()
print(chronic_conditions_counts)

# Grouping and Aggregation: Average weight by smoking history
print("\n=== Average Weight by Smoking History ===")
avg_weight_by_smoking = df.groupby("Smoking_History")["Weight_kg"].mean()
print(avg_weight_by_smoking)


# # Part 4: Graphics

# Visualization using Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Age Distribution (Histogram)
plt.figure(figsize=(8, 6))
plt.hist(df["Age"], bins=15, color="skyblue", edgecolor="black")
plt.title("Age Distribution", fontsize=16)
plt.xlabel("Age", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# 2. Scatter Plot: Weight vs. Height with BMI Coloring
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    df["Height_cm"],
    df["Weight_kg"],
    c=df["BMI"],
    cmap="viridis",
    edgecolor="k",
    alpha=0.7,
)
plt.colorbar(scatter, label="BMI")
plt.title("Weight vs. Height with BMI", fontsize=16)
plt.xlabel("Height (cm)", fontsize=12)
plt.ylabel("Weight (kg)", fontsize=12)
plt.grid(alpha=0.5)
plt.show()

# 3. Box Plot: Cholesterol Levels by Smoking History
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x="Smoking_History", y="Cholesterol_mg_dl", palette="pastel")
plt.title("Cholesterol Levels by Smoking History", fontsize=16)
plt.xlabel("Smoking History", fontsize=12)
plt.ylabel("Cholesterol (mg/dL)", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# 4. Bar Plot: Counts of Chronic Conditions
# The bars are ordered by the index of value_counts()
plt.figure(figsize=(8, 6))
sns.countplot(
    data=df,
    x="Chronic_Condition",
    palette="Set2",
    order=df["Chronic_Condition"].value_counts().index,
)
plt.title("Counts of Chronic Conditions", fontsize=16)
plt.xlabel("Chronic Condition", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.show()

# 5. Correlation Heatmap
# Only numeric columns are passed to corr()
plt.figure(figsize=(10, 8))
correlation_matrix = df.select_dtypes(include="number").corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Heatmap", fontsize=16)
plt.show()