CAP 5771 Spring 25

Logo

This is the web page for Introduction to Data Science at the University of Florida.

Descriptive Statistics Utilities

Download the code

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from datetime import datetime

def describe_data(df):
    """Print a per-column summary of a DataFrame and return its shape.

    For every column the dtype is printed, followed by a summary chosen by
    dtype:

    * numeric: min, max, mean (two decimals) and median;
    * categorical, object or string: number of distinct values, plus the
      count of each value when there are at most 10 of them;
    * datetime: the date range and the number of distinct dates;
    * anything else: the distinct values, when there are at most 10.

    Args:
        df: pandas DataFrame to describe.

    Returns:
        Tuple ``(num_rows, num_columns)`` taken from ``df.shape``.
    """
    print("***Describing the data:***")
    num_rows, num_columns = df.shape
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}")

    print("\nColumn details:")
    for column in df.columns:
        col_data = df[column]
        col_dtype = col_data.dtype
        print(f"\nColumn: {column}, Type: {col_dtype}")

        # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype,
        # which emits a warning on recent pandas releases. String dtypes are
        # treated like object columns so text data gets the same summary
        # whichever dtype pandas chose to store it in.
        is_category_like = (
            isinstance(col_dtype, pd.CategoricalDtype)
            or col_dtype == 'object'
            or pd.api.types.is_string_dtype(col_dtype)
        )

        if pd.api.types.is_numeric_dtype(col_data):
            print(f"  Min: {col_data.min()}")
            print(f"  Max: {col_data.max()}")
            print(f"  Mean: {col_data.mean():.2f}")
            print(f"  Median: {col_data.median()}")
        elif is_category_like:
            num_categories = col_data.nunique()
            print(f"  Number of categories: {num_categories}")
            # Only list the individual counts for low-cardinality columns so
            # the report stays readable.
            if num_categories <= 10:
                print("  Counts per category:")
                for category, count in col_data.value_counts().items():
                    print(f"    {category}: {count}")
        elif pd.api.types.is_datetime64_any_dtype(col_data):
            print(f"  Date Range: {col_data.min()} to {col_data.max()}")
            print(f"  Number of unique dates: {col_data.nunique()}")
        else:
            unique_vals = col_data.unique()
            if len(unique_vals) <= 10:
                print("  Unique values:")
                for val in unique_vals:
                    print(f"    {val}")

    return num_rows, num_columns

def count_nulls(df):
    """Report missing values per column and per row, and save summary plots.

    Prints the null count of every column, the index labels of the rows that
    hold the highest number of nulls, and the percentage of rows containing
    at least one null. A histogram and a box plot of the nulls-per-row
    distribution are then saved to
    ``Images/Null_distributions_<YYYYmmdd_HHMMSS>.png``.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. When ``df`` has no rows, only the per-column counts are printed
        and no figure is written.
    """
    print("Describing Nulls in the data:")

    null_counts_columns = df.isnull().sum()
    print("Null counts per variable:")
    print(null_counts_columns)

    total_rows = len(df)
    if total_rows == 0:
        # With no rows the per-row maximum is NaN and the percentage is 0/0,
        # so there is nothing meaningful to report or plot.
        print("\nThe DataFrame has no rows; skipping the per-row null summary.")
        return

    null_counts_rows = df.isnull().sum(axis=1)
    max_nulls = int(null_counts_rows.max())
    rows_with_most_nulls = null_counts_rows[null_counts_rows == max_nulls].index.tolist()

    rows_with_any_nulls = (null_counts_rows > 0).sum()
    percentage_with_nulls = (rows_with_any_nulls / total_rows) * 100

    print(f"\nRows with the highest number of nulls ({max_nulls} nulls):")
    print(rows_with_most_nulls)
    print(f"Percentage of rows with any nulls: {percentage_with_nulls:.2f}%")

    directory = "Images"
    # savefig does not create missing directories.
    os.makedirs(directory, exist_ok=True)

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), gridspec_kw={'width_ratios': [3, 1]})

    # The counts are integers, so use one unit-wide bar per value. The former
    # bins=max_nulls failed when no row had nulls (bins=0) and otherwise used
    # one bin too few for the max_nulls + 1 possible values.
    sns.histplot(null_counts_rows, discrete=True, kde=False, color='blue', ax=ax1)
    ax1.set_title('Histogram of Nulls Per Row')
    ax1.set_xlabel('Number of Nulls')
    ax1.set_ylabel('Frequency of Rows')
    ax1.grid(True)

    # Write the row count above each bar.
    for p in ax1.patches:
        ax1.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='bottom', color='black', xytext=(0, 5), textcoords='offset points')

    sns.boxplot(y=null_counts_rows, color='green', ax=ax2)
    ax2.set_title('Box Plot of Nulls Per Row')
    ax2.set_ylabel('Number of Nulls')

    # Timestamped name so repeated runs do not overwrite earlier figures.
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{directory}/Null_distributions_{current_time}.png"
    fig.savefig(filename)
    plt.close(fig)
    print(f"Saved histogram and boxplot as: {filename}")


# def describe_numeric(df, ls_int = None):
#     print("***Reporting on Numeric variables:***")
#     numeric_vars = df.select_dtypes(include=['int64', 'float64'])
#     descriptions = numeric_vars.describe()
#     print(descriptions)

#     for column in numeric_vars:
#         data = numeric_vars[column].dropna()
#         if data.empty:
#             print(f"No data available for histogram of {column} after removing NaNs.")
#             continue
#         fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), gridspec_kw={'width_ratios': [3, 1]})
        
#         if column in ls_int:
#             print("Bin is 1 for:", column)
#             bin_width = 1
#         else:
#             bin_width = None  # Let Seaborn determine for floats
        
#         # Pass bin_width to sns.histplot()
#         sns.histplot(data, ax=ax1, color='blue', alpha=0.7, kde=False, binwidth=bin_width, element='bars', stat='count')
#         bin_width = ax1.patches[0].get_width() if ax1.patches else 0
#         ax1.set_title(f'Histogram of {column}')
#         ax1.set_xlabel(f"{column} (Bin width: {bin_width:.2f})")
#         ax1.set_ylabel('Frequency')
#         ax1.grid(True)

#         sns.boxplot(y=data, ax=ax2, color='green')
#         ax2.set_title(f'Box Plot of {column}')
#         ax2.set_ylabel('Values')
#         ax2.set_xlabel('Box plot')

#         plt.tight_layout()  
        
#         filename = f"Images/Numeric/{column}.png"
#         plt.savefig(filename, format='png', dpi=300)
#         plt.close(fig)  
#         print(f"{filename} has been saved")


def plot_correlations(df, target_var):
    """Save scatter plots of a numeric target against every other numeric column.

    Columns of dtype int32, int64, float32 or float64 are selected. One
    scatter plot per non-target column is drawn on a grid three plots wide,
    with the target on the y axis, and the figure is saved to
    ``Images/Numeric/correlations_<target_var>.png``.

    Args:
        df: pandas DataFrame holding the data.
        target_var: Name of the numeric column plotted on every y axis.

    Returns:
        None. A message is printed and nothing is saved when ``target_var``
        is not one of the selected numeric columns, or when it is the only
        one.
    """
    numeric_vars = df.select_dtypes(include=['int64', 'float64', 'float32', 'int32'])

    if target_var not in numeric_vars:
        print(f"The target variable '{target_var}' is not in the DataFrame or is not numeric.")
        return

    other_vars = [col for col in numeric_vars.columns if col != target_var]
    if not other_vars:
        # A grid with zero rows cannot be created, so stop before plotting.
        print(f"No other numeric variables to plot against '{target_var}'.")
        return

    n_cols = 3
    n_rows = (len(other_vars) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False keeps `axes` two-dimensional whatever the grid size.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * 5, n_rows * 5), squeeze=False)
    fig.suptitle('Scatter Plots of ' + target_var + ' with Other Numerical Variables', fontsize=16, y=1.02)

    ax = axes.ravel()

    for i, var in enumerate(other_vars):
        sns.scatterplot(x=numeric_vars[var], y=numeric_vars[target_var], ax=ax[i], alpha=0.6)
        ax[i].set_xlabel(var)
        ax[i].set_ylabel(target_var)
        ax[i].grid(True)

    # Hide the unused cells of the last grid row.
    for unused_ax in ax[len(other_vars):]:
        unused_ax.axis('off')

    fig.tight_layout()

    output_dir = "Images/Numeric"
    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    # f-string so the target name is actually substituted into the file name.
    filename = f"{output_dir}/correlations_{target_var}.png"
    # The title sits above the axes area (y=1.02); a tight bounding box keeps
    # it inside the saved image.
    fig.savefig(filename, bbox_inches="tight")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"{filename} has been saved")


# df[[col1, col2]].plot(x=col1, y=col2, kind="line")



def describe_numeric2(df, ls_int=None):
    """Print summary statistics and save a histogram/box plot per numeric column.

    Columns of dtype int32, int64, float32 or float64 are selected, their
    ``describe()`` table is printed, and for each column with at least one
    non-null value a figure holding a histogram and a box plot is saved to
    ``Images/Numeric/<column>.png``.

    Args:
        df: pandas DataFrame holding the data.
        ls_int: Optional collection of column names to treat as
            integer-valued; their histograms use one unit-wide bar per
            integer. ``None`` means no column is treated that way.

    Returns:
        None. A message is printed and nothing is saved when no numeric
        column is found.
    """
    print("***Reporting on Numeric variables:***")
    # Same dtype list as plot_correlations so both helpers cover the same columns.
    numeric_vars = df.select_dtypes(include=['int64', 'float64', 'float32', 'int32'])

    if numeric_vars.shape[1] == 0:
        # describe() raises ValueError on a frame without columns.
        print("No numeric variables found in the DataFrame.")
        return

    print(numeric_vars.describe())

    integer_columns = () if ls_int is None else ls_int
    output_dir = "Images/Numeric"

    for column in numeric_vars:
        data = numeric_vars[column].dropna()
        if data.empty:
            print(f"No data available for histogram of {column} after removing NaNs.")
            continue

        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), gridspec_kw={'width_ratios': [3, 1]})

        if column in integer_columns:
            print(f"Setting bin width = 1 for: {column}")
            # discrete=True draws one unit-wide bar centred on each integer.
            # Explicit edges from min to max would put the two largest values
            # in the same (closed) last bin.
            sns.histplot(data, ax=ax1, color='blue', alpha=0.7, kde=False, discrete=True, element='bars', stat='count')
            bin_label = 1
        else:
            sns.histplot(data, ax=ax1, color='blue', alpha=0.7, kde=False, element='bars', stat='count')
            # Read back the width seaborn chose and round it for the label.
            bin_width = ax1.patches[0].get_width() if ax1.patches else 0
            bin_label = f"{bin_width:.3g}" if bin_width else 'auto'

        ax1.set_title(f'Histogram of {column}')
        ax1.set_xlabel(f"{column} (Bin width: {bin_label})")
        ax1.set_ylabel('Frequency')
        ax1.grid(True)

        sns.boxplot(y=data, ax=ax2, color='green')
        ax2.set_title(f'Box Plot of {column}')
        ax2.set_ylabel('Values')
        ax2.set_xlabel('Box plot')

        fig.tight_layout()

        # Created lazily so nothing is written when every column is skipped;
        # savefig does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)
        filename = f"{output_dir}/{column}.png"
        fig.savefig(filename, format='png', dpi=300)
        plt.close(fig)
        print(f"{filename} has been saved")