This is the web page for Introduction to Data Science at the University of Florida.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from datetime import datetime
def describe_data(df):
    """Print a per-column summary of ``df`` and return its shape.

    For every column the dtype is printed, followed by a dtype-dependent
    summary:

    * numeric columns: min, max, mean (two decimals) and median;
    * categorical, object and string columns: the number of distinct values
      and, when there are at most 10 of them, the count of each value;
    * datetime columns: the date range and the number of unique dates;
    * anything else: the unique values, when there are at most 10 of them.

    Args:
        df: The pandas DataFrame to describe.

    Returns:
        A ``(num_rows, num_columns)`` tuple taken from ``df.shape``.
    """
    print("***Describing the data:***")
    num_rows, num_columns = df.shape
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}")
    print("\nColumn details:")

    for column, col_data in df.items():
        col_dtype = col_data.dtype
        print(f"\nColumn: {column}, Type: {col_dtype}")

        # is_categorical_dtype() is deprecated in recent pandas releases, so
        # the dtype is checked directly.  String dtypes are included so text
        # columns are summarised as categories even when they are not stored
        # with the generic ``object`` dtype.
        is_text_like = (
            isinstance(col_dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(col_dtype)
            or pd.api.types.is_string_dtype(col_dtype)
        )

        if pd.api.types.is_numeric_dtype(col_data):
            print(f" Min: {col_data.min()}")
            print(f" Max: {col_data.max()}")
            print(f" Mean: {col_data.mean():.2f}")
            print(f" Median: {col_data.median()}")
        elif is_text_like:
            num_categories = col_data.nunique()
            print(f" Number of categories: {num_categories}")
            # Only list the individual counts when the output stays short.
            if num_categories <= 10:
                print(" Counts per category:")
                for index, value in col_data.value_counts().items():
                    print(f" {index}: {value}")
        elif pd.api.types.is_datetime64_any_dtype(col_data):
            print(f" Date Range: {col_data.min()} to {col_data.max()}")
            print(f" Number of unique dates: {col_data.nunique()}")
        else:
            unique_vals = col_data.unique()
            if len(unique_vals) <= 10:
                print(" Unique values:")
                for val in unique_vals:
                    print(f" {val}")

    return num_rows, num_columns
def count_nulls(df):
    """Report missing values in ``df`` and plot the distribution of nulls per row.

    Prints the null count of every column, the index labels of the rows
    holding the highest number of nulls, and the percentage of rows that
    contain at least one null.  A figure with a histogram and a box plot of
    the per-row null counts is then saved as
    ``Images/Null_distributions_<timestamp>.png``.

    When ``df`` has no rows only the per-column counts are printed and no
    figure is produced.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None.
    """
    print("Describing Nulls in the data:")
    null_counts_columns = df.isnull().sum()
    print("Null counts per variable:")
    print(null_counts_columns)

    total_rows = len(df)
    if total_rows == 0:
        # With no rows the maximum is NaN and the percentage is 0/0, so there
        # is nothing meaningful to report or plot.
        print("\nThe DataFrame has no rows; skipping the per-row null report.")
        return

    null_counts_rows = df.isnull().sum(axis=1)
    max_nulls = null_counts_rows.max()
    rows_with_most_nulls = null_counts_rows[null_counts_rows == max_nulls].index.tolist()
    rows_with_any_nulls = (null_counts_rows > 0).sum()
    percentage_with_nulls = (rows_with_any_nulls / total_rows) * 100

    print(f"\nRows with the highest number of nulls ({max_nulls} nulls):")
    print(rows_with_most_nulls)
    print(f"Percentage of rows with any nulls: {percentage_with_nulls:.2f}%")

    directory = "Images"
    # savefig() does not create missing directories.
    os.makedirs(directory, exist_ok=True)

    fig, (ax1, ax2) = plt.subplots(
        nrows=1, ncols=2, figsize=(15, 5), gridspec_kw={'width_ratios': [3, 1]}
    )

    # discrete=True gives one unit-wide bar per integer count.  Passing
    # bins=max_nulls failed when no row had any null (zero bins) and
    # otherwise squeezed max_nulls + 1 distinct values into max_nulls bins.
    sns.histplot(null_counts_rows, discrete=True, kde=False, color='blue', ax=ax1)
    ax1.set_title('Histogram of Nulls Per Row')
    ax1.set_xlabel('Number of Nulls')
    ax1.set_ylabel('Frequency of Rows')
    ax1.grid(True)
    # Write each bar's row count just above the bar.
    for p in ax1.patches:
        ax1.annotate(
            f'{int(p.get_height())}',
            (p.get_x() + p.get_width() / 2., p.get_height()),
            ha='center', va='bottom', color='black',
            xytext=(0, 5), textcoords='offset points',
        )

    sns.boxplot(y=null_counts_rows, color='green', ax=ax2)
    ax2.set_title('Box Plot of Nulls Per Row')
    ax2.set_ylabel('Number of Nulls')

    # The timestamp keeps repeated runs from overwriting earlier figures.
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{directory}/Null_distributions_{current_time}.png"
    fig.savefig(filename)
    plt.close(fig)
    print(f"Saved histogram and boxplot as: {filename}")
# def describe_numeric(df, ls_int = None):
# print("***Reporting on Numeric variables:***")
# numeric_vars = df.select_dtypes(include=['int64', 'float64'])
# descriptions = numeric_vars.describe()
# print(descriptions)
# for column in numeric_vars:
# data = numeric_vars[column].dropna()
# if data.empty:
# print(f"No data available for histogram of {column} after removing NaNs.")
# continue
# fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), gridspec_kw={'width_ratios': [3, 1]})
# if column in ls_int:
# print("Bin is 1 for:", column)
# bin_width = 1
# else:
# bin_width = None # Let Seaborn determine for floats
# # Pass bin_width to sns.histplot()
# sns.histplot(data, ax=ax1, color='blue', alpha=0.7, kde=False, binwidth=bin_width, element='bars', stat='count')
# bin_width = ax1.patches[0].get_width() if ax1.patches else 0
# ax1.set_title(f'Histogram of {column}')
# ax1.set_xlabel(f"{column} (Bin width: {bin_width:.2f})")
# ax1.set_ylabel('Frequency')
# ax1.grid(True)
# sns.boxplot(y=data, ax=ax2, color='green')
# ax2.set_title(f'Box Plot of {column}')
# ax2.set_ylabel('Values')
# ax2.set_xlabel('Box plot')
# plt.tight_layout()
# filename = f"Images/Numeric/{column}.png"
# plt.savefig(filename, format='png', dpi=300)
# plt.close(fig)
# print(f"{filename} has been saved")
def plot_correlations(df, target_var):
    """Save a grid of scatter plots of ``target_var`` against the other numeric columns.

    Columns with dtype int64, float64, float32 or int32 are considered.  One
    scatter plot per remaining numeric column is drawn on a three-column grid
    (that column on the x axis, ``target_var`` on the y axis) and the figure
    is written to ``Images/Numeric/correlations_<target_var>.png``.

    A message is printed and nothing is plotted when ``target_var`` is not
    one of the selected numeric columns, or when it is the only one.

    Args:
        df: The pandas DataFrame holding the data.
        target_var: Name of the numeric column plotted on every y axis.

    Returns:
        None.
    """
    numeric_vars = df.select_dtypes(include=['int64', 'float64', 'float32', 'int32'])
    if target_var not in numeric_vars:
        print(f"The target variable '{target_var}' is not in the DataFrame or is not numeric.")
        return

    other_vars = [col for col in numeric_vars.columns if col != target_var]
    if not other_vars:
        # A grid with zero rows cannot be created, so stop before plotting.
        print(f"No other numeric variables to plot against '{target_var}'.")
        return

    n_cols = 3
    n_rows = (len(other_vars) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of axes, whatever the grid size.
    fig, axes = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(n_cols * 5, n_rows * 5), squeeze=False
    )
    fig.suptitle('Scatter Plots of ' + target_var + ' with Other Numerical Variables', fontsize=16, y=1.02)
    ax = axes.ravel()

    for i, var in enumerate(other_vars):
        sns.scatterplot(x=numeric_vars[var], y=numeric_vars[target_var], ax=ax[i], alpha=0.6)
        ax[i].set_xlabel(var)
        ax[i].set_ylabel(target_var)
        ax[i].grid(True)

    # Hide the grid cells left over after the last variable.
    for unused_ax in ax[len(other_vars):]:
        unused_ax.axis('off')

    fig.tight_layout()

    # savefig() does not create missing directories.
    os.makedirs("Images/Numeric", exist_ok=True)
    # The f prefix was missing, so every call wrote the same literal
    # "correlations_{target_var}.png" file.
    filename = f"Images/Numeric/correlations_{target_var}.png"
    # The title sits above the canvas (y=1.02); a tight bounding box keeps it
    # inside the saved image.
    fig.savefig(filename, bbox_inches="tight")
    plt.close(fig)
    print(f"{filename} has been saved")
# df[[col1, col2]].plot(x=col1, y=col2, kind="line")
def describe_numeric2(df, ls_int=None):
    """Print summary statistics and save a histogram/box plot per numeric column.

    Columns with dtype int64, float64, float32 or int32 are reported (the
    same selection ``plot_correlations`` uses).  ``DataFrame.describe()`` is
    printed for them, then for each column a figure holding a histogram and
    a box plot of its non-null values is written to
    ``Images/Numeric/<column>.png``.  Columns with no non-null values are
    skipped with a message.

    Args:
        df: The pandas DataFrame holding the data.
        ls_int: Names of columns whose histogram should use one unit-wide
            bar per integer value.  ``None`` (the default) means no column
            is treated that way.

    Returns:
        None.
    """
    print("***Reporting on Numeric variables:***")
    numeric_vars = df.select_dtypes(include=['int64', 'float64', 'float32', 'int32'])
    if numeric_vars.shape[1] == 0:
        # describe() raises on a DataFrame without columns.
        print("No numeric columns found; nothing to report.")
        return

    descriptions = numeric_vars.describe()
    print(descriptions)

    integer_columns = () if ls_int is None else ls_int

    # savefig() does not create missing directories.
    os.makedirs("Images/Numeric", exist_ok=True)

    for column in numeric_vars:
        data = numeric_vars[column].dropna()
        if data.empty:
            print(f"No data available for histogram of {column} after removing NaNs.")
            continue

        fig, (ax1, ax2) = plt.subplots(
            nrows=1, ncols=2, figsize=(12, 4), gridspec_kw={'width_ratios': [3, 1]}
        )

        if column in integer_columns:
            print(f"Setting bin width = 1 for: {column}")
            # discrete=True centres a unit-wide bar on every integer value.
            # Explicit edges range(min, max + 1) merged the largest value
            # into the previous bar and left no bins when min == max.
            sns.histplot(data, ax=ax1, color='blue', alpha=0.7, kde=False,
                         discrete=True, element='bars', stat='count')
            bin_width = 1
        else:
            sns.histplot(data, ax=ax1, color='blue', alpha=0.7, kde=False,
                         element='bars', stat='count')
            # Read back the width seaborn picked so it can be shown in the label.
            bin_width = ax1.patches[0].get_width() if ax1.patches else 0

        # Three significant digits keep the axis label readable for float widths.
        width_label = f"{bin_width:.3g}" if bin_width else 'auto'
        ax1.set_title(f'Histogram of {column}')
        ax1.set_xlabel(f"{column} (Bin width: {width_label})")
        ax1.set_ylabel('Frequency')
        ax1.grid(True)

        sns.boxplot(y=data, ax=ax2, color='green')
        ax2.set_title(f'Box Plot of {column}')
        ax2.set_ylabel('Values')
        ax2.set_xlabel('Box plot')

        fig.tight_layout()
        filename = f"Images/Numeric/{column}.png"
        fig.savefig(filename, format='png', dpi=300)
        plt.close(fig)
        print(f"{filename} has been saved")