Back to NotebooksBack
DownloadRegressionGraph
Python
import pandas as pd
import numpy as np
from IPython.display import display
import os
# Define file paths
source_file_path = r"\Regression.xlsx"
result_file_path = r"\RegressionGraph_Results.xlsx"
# Verify paths exist
if not os.path.exists(source_file_path):
print(f"Warning: Source file not found at {source_file_path}")
else:
print(f"Source file found: {source_file_path}")
# Create results directory if it doesn't exist
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)
Python
# Read and display the source data
try:
df = pd.read_excel(source_file_path)
print(f"Data shape: {df.shape}")
print(f"\nFirst few rows:")
display(df.head(10))
except Exception as e:
print(f"Error reading file: {e}")Python
# Sum rainfall data by month, year, and ID for both sheets
try:
# Get all sheet names
excel_file = pd.ExcelFile(source_file_path)
sheet_names = excel_file.sheet_names
print(f"Available sheets: {sheet_names}\n")
all_summaries = {}
# Process each sheet
for sheet_name in sheet_names:
print(f"Processing sheet: {sheet_name}")
df_sheet = pd.read_excel(source_file_path, sheet_name=sheet_name)
print(f" Shape: {df_sheet.shape}")
print(f" First few rows and columns:")
display(df_sheet.iloc[:5, :5])
# The structure is:
# - Column A (index 0): Date
# - Columns B-FJ (index 1-164): Station IDs (column headers are the IDs)
# - Rows 2-8400: Rainfall data
# Get date column (first column)
date_col = df_sheet.columns[0]
# Get ID columns (all columns except the first one)
id_columns = df_sheet.columns[1:]
print(f" Date column: {date_col}")
print(f" Number of stations (ID columns): {len(id_columns)}")
print(f" Sample IDs: {list(id_columns[:5])}\n")
# Prepare data for unpivoting
df_process = df_sheet.copy()
# Convert date column to datetime
df_process[date_col] = pd.to_datetime(df_process[date_col], errors='coerce')
# Unpivot: convert from wide format (stations as columns) to long format
df_melted = df_process.melt(id_vars=[date_col], var_name='Station_ID', value_name='Rainfall')
# Convert rainfall to numeric
df_melted['Rainfall'] = pd.to_numeric(df_melted['Rainfall'], errors='coerce')
# Extract year and month
df_melted['Year'] = df_melted[date_col].dt.year
df_melted['Month'] = df_melted[date_col].dt.month
# Group by Year, Month, and Station_ID, then sum rainfall
summary = df_melted.groupby(['Year', 'Month', 'Station_ID'])['Rainfall'].sum().reset_index()
summary.columns = ['Year', 'Month', 'Station_ID', 'Total_Rainfall']
# Sort by Year, Month, Station_ID
summary = summary.sort_values(['Year', 'Month', 'Station_ID']).reset_index(drop=True)
all_summaries[sheet_name] = summary
print(f" Summary shape: {summary.shape}")
print(f" Sample of summary data:\n")
display(summary.head(15))
print()
# Save all summaries to Excel with multiple sheets
if all_summaries:
with pd.ExcelWriter(result_file_path, engine='openpyxl') as writer:
for sheet_name, summary_df in all_summaries.items():
# Limit sheet name to 31 characters for Excel
excel_sheet_name = f"{sheet_name[:20]}_Summary"[:31]
summary_df.to_excel(writer, sheet_name=excel_sheet_name, index=False)
print(f"\n✓ Summary file saved to: {result_file_path}")
print(f"Total rows in summaries:")
for sheet_name, summary_df in all_summaries.items():
print(f" {sheet_name}: {len(summary_df)} rows")
else:
print("\nNo summaries created - could not process sheets")
except Exception as e:
print(f"Error: {e}")
import traceback
traceback.print_exc()Python
# Create regression graphs for each month (all data points within 2001-2023)
import matplotlib.pyplot as plt
import numpy as np
import os
# Load result file to get both BMKG and GPM summaries
result_file_path_read = r"\Phyton\Processed\RegressionGraph_Results.xlsx"
# Define output folder for saving graphs
output_folder = r"\Phyton\Processed\RegressionGraphs"
# Create output folder if it doesn't exist
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"Created output folder: {output_folder}\n")
try:
# Read summaries from result file
bmkg_summary = pd.read_excel(result_file_path_read, sheet_name='BMKG 01to23_Summary')
gpm_summary = pd.read_excel(result_file_path_read, sheet_name='GPM 01to23_Summary')
print(f"BMKG Summary shape: {bmkg_summary.shape}")
print(f"GPM Summary shape: {gpm_summary.shape}\n")
# Get unique months
months = sorted(bmkg_summary['Month'].unique())
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
# For each month, plot all data points (Year, Station_ID pairs)
for month in months:
# Get BMKG data for this month
bmkg_month = bmkg_summary[bmkg_summary['Month'] == month].copy()
gpm_month = gpm_summary[gpm_summary['Month'] == month].copy()
# Merge by Year and Station_ID to get paired data
merged = pd.merge(
bmkg_month[['Year', 'Station_ID', 'Total_Rainfall']].rename(columns={'Total_Rainfall': 'BMKG_Rainfall'}),
gpm_month[['Year', 'Station_ID', 'Total_Rainfall']].rename(columns={'Total_Rainfall': 'GPM_Rainfall'}),
on=['Year', 'Station_ID'],
how='inner'
)
if len(merged) == 0:
print(f"No matching data for month {month}")
continue
print(f"Month {month} ({month_names[month-1]}): {len(merged)} data points")
# Prepare X (BMKG) and Y (GPM) data
X = merged['BMKG_Rainfall'].values
y = merged['GPM_Rainfall'].values
# Create single figure
fig, ax = plt.subplots(figsize=(12, 8))
fig.suptitle(f'{month_names[month-1]}',
fontsize=16, fontweight='bold')
# Plot scatter points
ax.scatter(X, y, alpha=0.6, s=100, color='grey', edgecolors='black', linewidth=0.8)
# ===== REGRESSION WITH INTERCEPT =====
z_with = np.polyfit(X, y, 1)
p_with = np.poly1d(z_with)
x_line = np.linspace(0, 1750, 100) # Extend to full plot range
# Calculate R-squared and other metrics
y_pred = p_with(X)
ss_res = np.sum((y - y_pred)**2)
ss_tot = np.sum((y - np.mean(y))**2)
r2_with = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0
# Calculate correlation and RMSE
corr_with = np.corrcoef(X, y)[0, 1]
rmse_with = np.sqrt(ss_res / len(X))
# Plot line with intercept
ax.plot(x_line, p_with(x_line), 'k-', linewidth=2.5)
# Add label near the line (positioned to avoid overlap)
x_label_pos = 900 # Position for solid line label
y_label_pos = p_with(x_label_pos)
ax.text(x_label_pos, y_label_pos + 120,
f'y = {z_with[0]:.4f}x + {z_with[1]:.4f}\nR² = {r2_with:.4f}',
fontsize=9, bbox=dict(boxstyle='round,pad=0.5', facecolor='white',
edgecolor='black', alpha=0.9), verticalalignment='bottom')
# ===== REGRESSION WITHOUT INTERCEPT =====
slope_no_intercept = np.sum(X * y) / np.sum(X * X)
y_pred_no = slope_no_intercept * X
ss_res_no = np.sum((y - y_pred_no)**2)
ss_tot_no = np.sum(y**2)
r2_without = 1 - (ss_res_no / ss_tot_no) if ss_tot_no != 0 else 0
rmse_without = np.sqrt(ss_res_no / len(X))
x_line_no = np.linspace(0, 1750, 100) # Extend to full plot range
y_line_no = slope_no_intercept * x_line_no
# Plot line without intercept
ax.plot(x_line_no, y_line_no, 'k--', linewidth=2.5)
# Add label near the line (positioned to avoid overlap)
x_label_pos_no = 1300 # Position for dashed line label (different from solid)
y_label_pos_no = slope_no_intercept * x_label_pos_no
ax.text(x_label_pos_no, y_label_pos_no + 80,
f'y = {slope_no_intercept:.4f}x\nR² = {r2_without:.4f}',
fontsize=9, bbox=dict(boxstyle='round,pad=0.5', facecolor='white',
edgecolor='black', alpha=0.9, linestyle='--'), verticalalignment='bottom')
# Configure axes
ax.set_xlabel('BMKG (mm/month)', fontsize=13, fontweight='bold')
ax.set_ylabel('GPM (mm/month)', fontsize=13, fontweight='bold')
ax.set_xlim(0, 1750)
ax.set_ylim(0, 1750)
ax.grid(True, alpha=0.1, linestyle='--')
plt.tight_layout()
plt.show()
# Print summary statistics
print(f" With Intercept:")
print(f" Equation: y = {z_with[0]:.6f}x + {z_with[1]:.6f}")
print(f" R² = {r2_with:.6f}, Correlation = {corr_with:.6f}, RMSE = {rmse_with:.4f}")
print(f" Without Intercept:")
print(f" Equation: y = {slope_no_intercept:.6f}x")
print(f" R² = {r2_without:.6f}, RMSE = {rmse_without:.4f}\n")
print("✓ All regression graphs completed!")
except FileNotFoundError:
print(f"Error: Result file not found at {result_file_path_read}")
except Exception as e:
print(f"Error: {e}")
import traceback
traceback.print_exc()