Source code for bsrn.visualization.availability

"""
Visualization of BSRN archive file availability.
"""

import os
import re
import warnings
import pandas as pd
import numpy as np
from datetime import datetime
from plotnine import (
    ggplot, aes, geom_tile, scale_fill_cmap,
    theme, element_text, element_blank, element_line,
    labs, scale_x_continuous, scale_y_discrete,
    coord_equal, theme_minimal
)
from bsrn.io.retrieval import get_bsrn_file_inventory


[docs] def plot_bsrn_availability( stations, username, password, start_year=1992, end_year=None, output_file=None, title=None ): """ Plot BSRN archive file availability from the BSRN FTP inventory. Parameters ---------- stations : str or list One or more station abbreviations (e.g., 'PAY' or ['PAY', 'NYA']). username : str BSRN FTP username. password : str BSRN FTP password. start_year : int, default 1992 First year to include in the visualization. end_year : int, optional Last year to include. If omitted, the current calendar year is used. output_file : str, optional Path to the output file (e.g., 'availability.pdf'). title : str, optional Plot title. If None (default), no title is drawn. Returns ------- fig : plotnine.ggplot.ggplot The generated availability heatmap figure. """ if isinstance(stations, str): stations = [stations] stations = [s.upper() for s in stations] stations.sort() # Sort stations alphabetically if end_year is None: end_year = datetime.now().year years = list(range(start_year, end_year + 1)) # Fetch data from FTP print(f"Searching BSRN FTP for stations: {', '.join(stations)}...") inventory = get_bsrn_file_inventory(stations, username, password) if not inventory or all(len(files) == 0 for files in inventory.values()): warnings.warn( "BSRN FTP connection failed or returned no data. Plot may be empty. " "Check credentials (username/password) and network.", UserWarning, stacklevel=2, ) # Pattern: STNMMYY.dat.gz or STNMMYY.001 pattern = re.compile(r"([A-Z]{3})(\d{2})(\d{2})\.(?:dat\.gz|\d{3}).*", re.IGNORECASE) # Dimensions for square cells; width 160 mm width_inch = 160 / 25.4 num_cols = len(years) num_rows = 12 if len(stations) == 1 else len(stations) # Estimate usable width for the plot (subtracting space for Y-axis labels) usable_width = width_inch - 0.8 cell_size = usable_width / num_cols # Total height scales with number of rows + overhead for title and legend total_height_inch = (cell_size * num_rows) + 1.0 # Data collection plot_data = [] if len(stations) == 1: stn = stations[0] month_names = ['J', 'F', 'M', 'A', 'M', 'J', 'J', 'A', 'S', 'O', 'N', 'D'] df = pd.DataFrame(0, index=range(1, 13), columns=years) for filename in inventory.get(stn, []): basename = os.path.basename(filename) match = pattern.match(basename) if match: stn_match, mm_str, yy_str = match.groups() month, yy = int(mm_str), int(yy_str) # BSRN started 1991: yy>=91 -> 1900s year = (1900 + yy) if yy >= 91 else (2000 + yy) if year in df.columns and month in df.index: df.at[month, year] = 1 # Reset and melt for plotnine df = df.reset_index().rename(columns={'index': 'Month'}) df_melt = df.melt(id_vars=['Month'], var_name='Year', value_name='Availability') # Use string representations of month integers to keep uniqueness month_strs = [str(m) for m in range(1, 13)] df_melt['Month'] = pd.Categorical(df_melt['Month'].astype(str), categories=reversed(month_strs), ordered=True) df_melt['Year'] = df_melt['Year'].astype(int) # Create mapping dictionary for ggplot axis labels month_labels = {str(m): name for m, name in zip(range(1, 13), month_names)} _labs = {"y": f"Station: {stn}", "x": "Year"} if title is not None: _labs["title"] = title p = ( ggplot(df_melt, aes(x='Year', y='Month', fill='Availability')) + geom_tile(color='white', size=0.5) + scale_fill_cmap(cmap_name='viridis', name="Availability") + labs(**_labs) + scale_y_discrete(labels=lambda items: [month_labels.get(x, x) for x in items]) ) else: df = pd.DataFrame(0, index=stations, columns=years) for stn in stations: for filename in inventory.get(stn, []): basename = os.path.basename(filename) match = pattern.match(basename) if match: stn_match, mm_str, yy_str = match.groups() yy = int(yy_str) year = (1900 + yy) if yy >= 91 else (2000 + yy) if year in df.columns: df.at[stn, year] += 1 df = df.reset_index().rename(columns={'index': 'Station'}) df_melt = df.melt(id_vars=['Station'], var_name='Year', value_name='Months_Available') df_melt['Station'] = pd.Categorical(df_melt['Station'], categories=reversed(stations), ordered=True) df_melt['Year'] = df_melt['Year'].astype(int) _labs = {"y": "Stations", "x": "Year"} if title is not None: _labs["title"] = title p = ( ggplot(df_melt, aes(x='Year', y='Station', fill='Months_Available')) + geom_tile(color='white', size=0.5) + scale_fill_cmap(cmap_name='viridis', name="Months Available") + labs(**_labs) + scale_y_discrete() ) year_breaks = [y for i, y in enumerate(years) if i % 2 == 0] if len(years) > 20 else years p = p + scale_x_continuous(breaks=year_breaks) + coord_equal() p = p + theme_minimal() + theme( text=element_text(family='Times New Roman', size=9), axis_title=element_text(size=9), axis_text=element_text(size=9), plot_title=element_text(size=9, margin={'b': 1}), legend_title=element_text(size=9), legend_text=element_text(size=9), legend_position="bottom", legend_key_width=100, legend_key_height=5, legend_margin=-12, legend_box_spacing=0, axis_title_x=element_text(size=9, margin={"t": 1, "b": 2}), plot_margin_top=0, plot_margin_right=0, plot_margin_bottom=0, plot_margin_left=0, panel_grid=element_line(color="white", size=0.5), figure_size=(width_inch, total_height_inch), axis_text_x=element_text(rotation=45, hjust=1) ) if output_file: p.save(output_file, dpi=300) return p