# Source code for bsrn.io.mcclear

"""
CAMS McClear HTTP retrieval helpers.
CAMS McClear HTTP 下载辅助函数。
"""

import io
import pandas as pd
import requests
from bsrn.constants import MCCLEAR_INTEGRATED_COLUMNS, MCCLEAR_VARIABLE_MAP, MCCLEAR_API_HOST


def _parse_mcclear(raw_or_buffer):
    """
    Parse SoDa McClear CSV into the project DataFrame (used by ``_download_mcclear`` only).
    将 SoDa McClear CSV 解析为项目 DataFrame（仅由 ``_download_mcclear`` 使用）。

    Parameters
    ----------
    raw_or_buffer : str or file-like
        Raw CAMS text or readable text buffer.
        CAMS 原始文本或可读文本缓冲区。

    Returns
    -------
    data : pd.DataFrame
        Parsed time-series data with UTC index for sub-daily resolutions.
        解析后的时间序列数据；亚日尺度为 UTC 索引。

    Raises
    ------
    ValueError
        If the McClear header line is missing or the payload is invalid.
        缺少 McClear 表头或载荷无效时。

    References
    ----------
    .. [1] CAMS McClear service info. (n.d.). SoDa.
       http://www.soda-pro.com/web-services/radiation/cams-mcclear/info
    """
    if isinstance(raw_or_buffer, str):
        fbuf = io.StringIO(raw_or_buffer)
    else:
        fbuf = raw_or_buffer

    # Read metadata header lines until column names line / 读取元数据表头直到列名行
    while True:
        line = fbuf.readline()
        if not line:
            raise ValueError("Invalid McClear payload: header not found. / 无法找到表头。")
        line = line.rstrip("\n")
        if line.startswith("# Observation period"):
            names = line.lstrip("# ").split(";")
            break

    data = pd.read_csv(fbuf, sep=";", comment="#", header=None, names=names)
    
    # Interval bounds from first column / 从首列解析观测时段起止
    obs_period = data["Observation period"].str.split("/")
    # Using the first part of the period (start-time) for floor-style labeling. 
    # 使用时段的第一部分（起始时间）进行向下对齐（floor）风格的标记。
    data.index = pd.to_datetime(obs_period.str[0], utc=True)

    # Convert Wh/m^2 to W/m^2 using interval duration / 依据时间区间长度将 Wh/m^2 转换为 W/m^2
    integrated_cols = [c for c in MCCLEAR_INTEGRATED_COLUMNS if c in data.columns]
    time_delta = pd.to_datetime(obs_period.str[1]) - pd.to_datetime(obs_period.str[0])
    hours = time_delta.dt.total_seconds() / 3600.0
    data[integrated_cols] = data[integrated_cols].divide(hours.tolist(), axis="rows")

    data.index.name = None
    data = data.rename(columns=MCCLEAR_VARIABLE_MAP)
    return data


def _download_mcclear(latitude, longitude, start, end, email, elev=None,
                      timeout=30):
    """
    Download and parse CAMS McClear from SoDa (used by ``fetch_mcclear`` only).
    从 SoDa 下载并解析 CAMS McClear（仅由 ``fetch_mcclear`` 调用）。

    Parameters
    ----------
    latitude : float
        Latitude in decimal degrees. [degrees]
        十进制度纬度。[度]
    longitude : float
        Longitude in decimal degrees. [degrees]
        十进制度经度。[度]
    start : datetime.datetime or pandas.Timestamp
        Start date (inclusive) of requested period.
        请求时间段的起始日期（含）。
    end : datetime.datetime or pandas.Timestamp
        End date (inclusive) of requested period.
        请求时间段的结束日期（含）。
    email : str
        SoDa account email.
        SoDa 账户邮箱。
    elev : float, optional
        Station elevation. [m] If None, use SoDa default terrain lookup.
        站点海拔高度。[米] 若为 None 则使用 SoDa 默认地形高程。
    timeout : int, default 30
        HTTP request timeout in seconds.
        HTTP 请求超时时间（秒）。

    Returns
    -------
    data : pd.DataFrame
        Parsed McClear data.
        解析后的 McClear 数据。

    Raises
    ------
    ValueError
        If the request starts before 2004-01-01 or the response is not valid CSV.
        起始日期早于 2004-01-01，或响应非有效 CSV 时。
    requests.Timeout
        If the HTTP request exceeds *timeout*.
        HTTP 请求超过 *timeout* 时。
    requests.HTTPError
        If SoDa returns a non-success status after ``raise_for_status``.
        SoDa 返回非成功 HTTP 状态时。

    References
    ----------
    .. [1] Lefèvre, M., Oumbe, A., Blanc, P., Espinar, B., Gschwind, B., Qu, Z.,
       et al. (2013). McClear: A new model estimating downwelling solar
       radiation at ground level in clear-sky conditions. Atmospheric
       Measurement Techniques, 6(9), 2403–2418.
    .. [2] Gschwind, B., Wald, L., Blanc, P., Lefèvre, M., Schroedter-Homscheidt, M.,
       & Arola, A. (2019). Improving the McClear model estimating the downwelling
       solar radiation at ground level in cloud-free conditions – McClear-v3.
       Meteorologische Zeitschrift, 28(2).
    """
    if elev is None:
        elev = -999

    # McClear availability: service is defined from 2004-01-01 onward.
    # McClear 可用性：服务从 2004-01-01 起提供。
    start_ts = pd.Timestamp(start)
    if start_ts.tzinfo is not None:
        start_cmp = start_ts.tz_convert("UTC").tz_localize(None)
    else:
        start_cmp = start_ts
    if start_cmp < pd.Timestamp("2004-01-01"):
        raise ValueError(
            "McClear data are only available from 2004-01-01 onward. / "
            "McClear 数据仅在 2004-01-01 之后可用。"
        )

    # Format dates and username for SoDa request / 为 SoDa 请求格式化日期和用户名
    end_ts = pd.Timestamp(end)
    if start_ts.tzinfo is not None:
        start_str = start_ts.tz_convert("UTC").strftime("%Y-%m-%d")
    else:
        start_str = start_ts.strftime("%Y-%m-%d")
    if end_ts.tzinfo is not None:
        end_str = end_ts.tz_convert("UTC").strftime("%Y-%m-%d")
    else:
        end_str = end_ts.strftime("%Y-%m-%d")
    email_encoded = email.replace("@", "%2540")

    # Build WPS DataInputs payload for McClear (1‑min, UT) / 构建 McClear 的 WPS DataInputs 载荷（1 分钟、UT）
    data_inputs_dict = {
        "latitude": latitude,
        "longitude": longitude,
        "altitude": elev,
        "date_begin": start_str,
        "date_end": end_str,
        "time_ref": "UT",
        "summarization": "PT01M",
        "username": email_encoded,
        "verbose": "false",
    }
    data_inputs = ";".join([f"{key}={value}" for key, value in data_inputs_dict.items()])
    params = {
        "Service": "WPS",
        "Request": "Execute",
        "Identifier": "get_mcclear",
        "version": "1.0.0",
        "RawDataOutput": "irradiation",
    }

    # Use the same HTTPS endpoint and request pattern as pvlib.iotools.get_cams,
    # with the host defined in project constants.
    # 使用与 pvlib.iotools.get_cams 相同的 HTTPS 端点和请求格式，主机名由项目常量统一管理。
    base_url = f"https://{MCCLEAR_API_HOST}/service/wps"

    try:
        res = requests.get(
            base_url + "?DataInputs=" + data_inputs,
            params=params,
            timeout=timeout,
        )
    except requests.Timeout as exc:
        raise requests.Timeout(
            f"McClear request timed out for {base_url}: {exc}"
        ) from exc

    # If an error occurs on the server side, CAMS returns a PyWPS-style XML/HTML
    # with ows:ExceptionText; bubble that up for easier debugging.
    # 服务器端出错时，CAMS 会返回包含 ows:ExceptionText 的 PyWPS 风格 XML/HTML，将其拼接到错误信息中便于调试。
    if not res.ok:
        text = res.text or ""
        if "ows:ExceptionText" in text:
            try:
                errors = text.split("ows:ExceptionText")[1][1:-2]
            except Exception:
                errors = text
            res.reason = f"{res.reason}: <{errors}>"
        res.raise_for_status()

    # Successful responses are CSV; parse directly from memory.
    # 成功响应为 CSV；直接在内存中解析。
    fbuf = io.StringIO(res.content.decode("utf-8"))
    data = _parse_mcclear(fbuf)
    return data



# [docs]
def fetch_mcclear(index, latitude, longitude, elev, email, timeout=30):
    """
    Retrieve McClear data and align it to a target DatetimeIndex.

    The date range spanned by ``index`` is downloaded through
    ``_download_mcclear`` and the result is reindexed onto ``index``.
    Timestamps in ``index`` that have no exact match in the downloaded data
    yield NaN rows.

    Parameters
    ----------
    index : pd.DatetimeIndex
        Target time index. A timezone-naive index is interpreted as UTC;
        a timezone-aware index is matched by instant. The returned frame
        carries ``index`` unchanged.
    latitude : float
        Latitude in decimal degrees. [degrees]
    longitude : float
        Longitude in decimal degrees. [degrees]
    elev : float
        Site elevation. [m] Forwarded unchanged to ``_download_mcclear``.
    email : str
        SoDa account email.
    timeout : int, default 30
        HTTP request timeout in seconds.

    Returns
    -------
    aligned : pd.DataFrame
        McClear data reindexed to ``index``; contains at least
        ``ghi_clear``, ``bni_clear`` and ``dhi_clear``.

    Raises
    ------
    ValueError
        If ``index`` is not a DatetimeIndex, holds no valid timestamp,
        required McClear columns are missing, or the downloaded frame does
        not have a DatetimeIndex.
    requests.Timeout
        Propagated from :func:`_download_mcclear` when the HTTP call times out.
    requests.HTTPError
        Propagated from :func:`_download_mcclear` on HTTP failure.
    """
    if not isinstance(index, pd.DatetimeIndex):
        raise ValueError(
            "index must be a pandas DatetimeIndex. / index 必须是 pandas DatetimeIndex。"
        )

    # Inclusive date range covered by the target index. min()/max() return NaT
    # for an empty or all-NaT index; reject that here instead of letting NaT
    # reach the downloader, where it fails with an unrelated error.
    start = index.min()
    end = index.max()
    if pd.isna(start) or pd.isna(end):
        raise ValueError(
            "index must contain at least one valid timestamp. / "
            "index 必须至少包含一个有效时间戳。"
        )

    data = _download_mcclear(
        latitude=latitude,
        longitude=longitude,
        start=start,
        end=end,
        email=email,
        elev=elev,
        timeout=timeout,
    )

    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError(
            "McClear data index must be DatetimeIndex. / McClear 数据索引必须为 DatetimeIndex。"
        )

    # Work on a copy and make sure the downloaded index is expressed in UTC.
    data = data.copy()
    if data.index.tz is None:
        data.index = data.index.tz_localize("UTC")
    else:
        data.index = data.index.tz_convert("UTC")

    required_cols = {"ghi_clear", "bni_clear", "dhi_clear"}
    missing = required_cols - set(data.columns)
    if missing:
        raise ValueError(
            f"McClear data missing required columns: {sorted(missing)}"
        )

    # Look rows up in UTC. A tz-naive target cannot be matched against the
    # tz-aware McClear index, so it is read as UTC, the same convention the
    # downloader applies to naive request dates.
    if index.tz is None:
        lookup = index.tz_localize("UTC")
    else:
        lookup = index.tz_convert("UTC")

    aligned = data.reindex(lookup)
    # Hand back the caller's own index (original timezone or naive-ness) so
    # the result lines up with the caller's data.
    aligned.index = index
    return aligned