ranczo-energy-usage-scrapers/app_impl.py
from __future__ import annotations

from datetime import datetime, date, timedelta
from zoneinfo import ZoneInfo
from typing import Optional

from logging_utils import function_logger
from utils.time_helpers import WARSAW_TZ, UTC
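
# Assumed definitions of the imported helpers (utils.time_helpers is not part
# of this file; this is a sketch of what the names are expected to hold):
#   WARSAW_TZ = ZoneInfo("Europe/Warsaw")
#   UTC = timezone.utc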


def local_midnight(d: date, tz: ZoneInfo) -> datetime:
    """Return local midnight (tz-aware) for a given calendar date."""
    return datetime(d.year, d.month, d.day, 0, 0, tzinfo=tz)


def end_of_business_day_utc(business_day: date, tz: ZoneInfo) -> datetime:
    """
    End of the business day [start, end) expressed in UTC.
    For day D this is local midnight of D+1 converted to UTC.
    Correct across 23h/25h DST days.
    """
    end_local = local_midnight(business_day + timedelta(days=1), tz=tz)
    return end_local.astimezone(UTC)
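
# Worked example: on the 23-hour spring-forward day in Europe/Warsaw
# (2025-03-30, clocks jump 02:00 CET -> 03:00 CEST), local midnight of
# 2025-03-31 is 00:00 CEST = 22:00 UTC on 2025-03-30, so:
#   end_of_business_day_utc(date(2025, 3, 30), ZoneInfo("Europe/Warsaw"))
#   == datetime(2025, 3, 30, 22, 0, tzinfo=UTC)
# i.e. exactly 23 hours after the day's local start (2025-03-29 23:00 UTC).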


def is_day_fully_ingested(scraper, business_day: date, tz: ZoneInfo) -> bool:
    """
    Decide if 'business_day' is fully present in the DB based on scraper.last_entry().
    Assumes that a fully ingested day ends at local midnight of D+1 in the DB.
    """
    log = function_logger()
    last: Optional[datetime] = scraper.last_entry()
    if not last:
        log.info("No last entry found for scraper")
        return False
    log.debug(f"Last entry: {last}")
    # Comparing tz-aware datetimes is safe regardless of the session time zone,
    # but 'last' must carry tzinfo for the comparison to be meaningful.
    assert last.tzinfo is not None, "last must be tz-aware"
    needed_end_utc = end_of_business_day_utc(business_day, tz)
    return last >= needed_end_utc
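
# Illustrative sketch (an assumption, not defined in this file): the `scraper`
# arguments are duck-typed and only need to expose this interface:
#
#   class ScraperLike(Protocol):
#       def last_entry(self) -> Optional[datetime]: ...   # latest timestamp in DB
#       def ingest_day(self, day: date) -> int: ...       # rows written for `day`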


def maybe_ingest_for_next_day(
    scraper,
    now_local: datetime,
    tz: ZoneInfo,
) -> int:
    """
    If the publication time for tomorrow has passed (with grace) and tomorrow is
    not in the DB yet, call ingest_day(tomorrow). Return the number of rows
    affected by the ingestion.
    """
    log = function_logger()
    log.debug("Start")
    assert now_local.tzinfo is not None, "now_local must be tz-aware (Europe/Warsaw)."
    today = now_local.date()
    # Publication happens today for the next business day
    target_day = today + timedelta(days=1)
    log.info(f"Target day = {target_day}")
    if is_day_fully_ingested(scraper, target_day, tz):
        log.info("Publication already happened, we have the data, skipping")
        return 0
    # Ingest the next business day
    log.info(f"Target day is NOT fully ingested, trying to read {target_day}")
    ingested_rows = scraper.ingest_day(target_day)
    log.debug(f"Ingested rows = {ingested_rows}")
    return ingested_rows


def hourly_tick(
    scraper,
) -> int:
    """
    Run this once per hour.
    - Skips ticks before the publication window opens (11:00 local time).
    - Attempts ingestion for the next business day after each publication window.
    Returns the number of rows affected by the ingestion.
    """
    log = function_logger()
    now_local = datetime.now(WARSAW_TZ)
    log.info(f"Hourly tick started for {now_local}")
    if now_local.hour < 11:
        log.info("Hourly tick skipped, too early for update")
        return 0
    # Try to ingest the next business day after each configured window
    affected_next = maybe_ingest_for_next_day(scraper, now_local, WARSAW_TZ)
    log.info(f"Hourly tick ended for {now_local} with {affected_next} affected rows")
    return affected_next
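
# Usage sketch (hypothetical wiring; the scraper class lives elsewhere in this
# repo and its name below is an assumption, not the actual entry point):
#
#   import time
#   from scrapers import EnergyScraper  # hypothetical import path
#
#   scraper = EnergyScraper()
#   while True:
#       hourly_tick(scraper)
#       time.sleep(3600)  # once per hour, as the docstring requires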