RegressionGraph

Python

import pandas as pd
import numpy as np
from IPython.display import display
import os

# Define file paths
source_file_path = r"\Regression.xlsx"
result_file_path = r"\RegressionGraph_Results.xlsx"

# Verify paths exist
if not os.path.exists(source_file_path):
    print(f"Warning: Source file not found at {source_file_path}")
else:
    print(f"Source file found: {source_file_path}")

# Create results directory if it doesn't exist
os.makedirs(os.path.dirname(result_file_path), exist_ok=True)

Python

# Read and display the source data
try:
    df = pd.read_excel(source_file_path)
    print(f"Data shape: {df.shape}")
    print(f"\nFirst few rows:")
    display(df.head(10))
except Exception as e:
    print(f"Error reading file: {e}")

Python

# Sum rainfall data by month, year, and ID for both sheets
try:
    # Get all sheet names
    excel_file = pd.ExcelFile(source_file_path)
    sheet_names = excel_file.sheet_names
    print(f"Available sheets: {sheet_names}\n")
    
    all_summaries = {}
    
    # Process each sheet
    for sheet_name in sheet_names:
        print(f"Processing sheet: {sheet_name}")
        df_sheet = pd.read_excel(source_file_path, sheet_name=sheet_name)
        print(f"  Shape: {df_sheet.shape}")
        print(f"  First few rows and columns:")
        display(df_sheet.iloc[:5, :5])
        
        # The structure is:
        # - Column A (index 0): Date
        # - Columns B-FJ (index 1-164): Station IDs (column headers are the IDs)
        # - Rows 2-8400: Rainfall data
        
        # Get date column (first column)
        date_col = df_sheet.columns[0]
        
        # Get ID columns (all columns except the first one)
        id_columns = df_sheet.columns[1:]
        
        print(f"  Date column: {date_col}")
        print(f"  Number of stations (ID columns): {len(id_columns)}")
        print(f"  Sample IDs: {list(id_columns[:5])}\n")
        
        # Prepare data for unpivoting
        df_process = df_sheet.copy()
        
        # Convert date column to datetime
        df_process[date_col] = pd.to_datetime(df_process[date_col], errors='coerce')
        
        # Unpivot: convert from wide format (stations as columns) to long format
        df_melted = df_process.melt(id_vars=[date_col], var_name='Station_ID', value_name='Rainfall')
        
        # Convert rainfall to numeric
        df_melted['Rainfall'] = pd.to_numeric(df_melted['Rainfall'], errors='coerce')
        
        # Extract year and month
        df_melted['Year'] = df_melted[date_col].dt.year
        df_melted['Month'] = df_melted[date_col].dt.month
        
        # Group by Year, Month, and Station_ID, then sum rainfall
        summary = df_melted.groupby(['Year', 'Month', 'Station_ID'])['Rainfall'].sum().reset_index()
        summary.columns = ['Year', 'Month', 'Station_ID', 'Total_Rainfall']
        
        # Sort by Year, Month, Station_ID
        summary = summary.sort_values(['Year', 'Month', 'Station_ID']).reset_index(drop=True)
        
        all_summaries[sheet_name] = summary
        print(f"  Summary shape: {summary.shape}")
        print(f"  Sample of summary data:\n")
        display(summary.head(15))
        print()
    
    # Save all summaries to Excel with multiple sheets
    if all_summaries:
        with pd.ExcelWriter(result_file_path, engine='openpyxl') as writer:
            for sheet_name, summary_df in all_summaries.items():
                # Limit sheet name to 31 characters for Excel
                excel_sheet_name = f"{sheet_name[:20]}_Summary"[:31]
                summary_df.to_excel(writer, sheet_name=excel_sheet_name, index=False)
        
        print(f"\n✓ Summary file saved to: {result_file_path}")
        print(f"Total rows in summaries:")
        for sheet_name, summary_df in all_summaries.items():
            print(f"  {sheet_name}: {len(summary_df)} rows")
    else:
        print("\nNo summaries created - could not process sheets")
        
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

Python

# Create regression graphs for each month (all data points within 2001-2023)
import matplotlib.pyplot as plt
import numpy as np
import os

# Load result file to get both BMKG and GPM summaries
result_file_path_read = r"\Phyton\Processed\RegressionGraph_Results.xlsx"

# Define output folder for saving graphs
output_folder = r"\Phyton\Processed\RegressionGraphs"

# Create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Created output folder: {output_folder}\n")

try:
    # Read summaries from result file
    bmkg_summary = pd.read_excel(result_file_path_read, sheet_name='BMKG 01to23_Summary')
    gpm_summary = pd.read_excel(result_file_path_read, sheet_name='GPM 01to23_Summary')
    
    print(f"BMKG Summary shape: {bmkg_summary.shape}")
    print(f"GPM Summary shape: {gpm_summary.shape}\n")
    
    # Get unique months
    months = sorted(bmkg_summary['Month'].unique())
    month_names = ['January', 'February', 'March', 'April', 'May', 'June', 
                   'July', 'August', 'September', 'October', 'November', 'December']
    
    # For each month, plot all data points (Year, Station_ID pairs)
    for month in months:
        # Get BMKG data for this month
        bmkg_month = bmkg_summary[bmkg_summary['Month'] == month].copy()
        gpm_month = gpm_summary[gpm_summary['Month'] == month].copy()
        
        # Merge by Year and Station_ID to get paired data
        merged = pd.merge(
            bmkg_month[['Year', 'Station_ID', 'Total_Rainfall']].rename(columns={'Total_Rainfall': 'BMKG_Rainfall'}),
            gpm_month[['Year', 'Station_ID', 'Total_Rainfall']].rename(columns={'Total_Rainfall': 'GPM_Rainfall'}),
            on=['Year', 'Station_ID'],
            how='inner'
        )
        
        if len(merged) == 0:
            print(f"No matching data for month {month}")
            continue
        
        print(f"Month {month} ({month_names[month-1]}): {len(merged)} data points")
        
        # Prepare X (BMKG) and Y (GPM) data
        X = merged['BMKG_Rainfall'].values
        y = merged['GPM_Rainfall'].values
        
        # Create single figure
        fig, ax = plt.subplots(figsize=(12, 8))
        fig.suptitle(f'{month_names[month-1]}', 
                     fontsize=16, fontweight='bold')
        
        # Plot scatter points
        ax.scatter(X, y, alpha=0.6, s=100, color='grey', edgecolors='black', linewidth=0.8)
        
        # ===== REGRESSION WITH INTERCEPT =====
        z_with = np.polyfit(X, y, 1)
        p_with = np.poly1d(z_with)
        x_line = np.linspace(0, 1750, 100)  # Extend to full plot range
        
        # Calculate R-squared and other metrics
        y_pred = p_with(X)
        ss_res = np.sum((y - y_pred)**2)
        ss_tot = np.sum((y - np.mean(y))**2)
        r2_with = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0
        
        # Calculate correlation and RMSE
        corr_with = np.corrcoef(X, y)[0, 1]
        rmse_with = np.sqrt(ss_res / len(X))
        
        # Plot line with intercept
        ax.plot(x_line, p_with(x_line), 'k-', linewidth=2.5)
        
        # Add label near the line (positioned to avoid overlap)
        x_label_pos = 900  # Position for solid line label
        y_label_pos = p_with(x_label_pos)
        ax.text(x_label_pos, y_label_pos + 120, 
                f'y = {z_with[0]:.4f}x + {z_with[1]:.4f}\nR² = {r2_with:.4f}',
                fontsize=9, bbox=dict(boxstyle='round,pad=0.5', facecolor='white', 
                edgecolor='black', alpha=0.9), verticalalignment='bottom')
        
        # ===== REGRESSION WITHOUT INTERCEPT =====
        slope_no_intercept = np.sum(X * y) / np.sum(X * X)
        
        y_pred_no = slope_no_intercept * X
        ss_res_no = np.sum((y - y_pred_no)**2)
        ss_tot_no = np.sum(y**2)
        r2_without = 1 - (ss_res_no / ss_tot_no) if ss_tot_no != 0 else 0
        rmse_without = np.sqrt(ss_res_no / len(X))
        
        x_line_no = np.linspace(0, 1750, 100)  # Extend to full plot range
        y_line_no = slope_no_intercept * x_line_no
        
        # Plot line without intercept
        ax.plot(x_line_no, y_line_no, 'k--', linewidth=2.5)
        
        # Add label near the line (positioned to avoid overlap)
        x_label_pos_no = 1300  # Position for dashed line label (different from solid)
        y_label_pos_no = slope_no_intercept * x_label_pos_no
        ax.text(x_label_pos_no, y_label_pos_no + 80, 
                f'y = {slope_no_intercept:.4f}x\nR² = {r2_without:.4f}',
                fontsize=9, bbox=dict(boxstyle='round,pad=0.5', facecolor='white', 
                edgecolor='black', alpha=0.9, linestyle='--'), verticalalignment='bottom')
        
        # Configure axes
        ax.set_xlabel('BMKG (mm/month)', fontsize=13, fontweight='bold')
        ax.set_ylabel('GPM (mm/month)', fontsize=13, fontweight='bold')
        ax.set_xlim(0, 1750)
        ax.set_ylim(0, 1750)
        ax.grid(True, alpha=0.1, linestyle='--')
        
        plt.tight_layout()
        plt.show()
        
        # Print summary statistics
        print(f"  With Intercept:")
        print(f"    Equation: y = {z_with[0]:.6f}x + {z_with[1]:.6f}")
        print(f"    R² = {r2_with:.6f}, Correlation = {corr_with:.6f}, RMSE = {rmse_with:.4f}")
        print(f"  Without Intercept:")
        print(f"    Equation: y = {slope_no_intercept:.6f}x")
        print(f"    R² = {r2_without:.6f}, RMSE = {rmse_without:.4f}\n")
    
    print("✓ All regression graphs completed!")
    
except FileNotFoundError:
    print(f"Error: Result file not found at {result_file_path_read}")
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()