ranczo-energy-price-scrapers/EnergyPriceScraper.py
2025-08-28 14:15:42 +02:00

145 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, date, timedelta
from typing import Iterable, List, Tuple, Dict, Any, Optional
from zoneinfo import ZoneInfo
import json
import psycopg
import time as _time
# Default time zone of this module; it is the default value of
# EnergyPriceScraperBase.tz.
WAW = ZoneInfo("Europe/Warsaw")
# One parameter tuple for UPSERT_SQL, in column order:
# (ts_start, ts_end, price_pln_net, provider, kind, side, buyer, seller, source_meta).
IntervalRow = Tuple[datetime, datetime, float, str, str, str, str, str, str] # see _rows_to_upsert
# Insert-or-update statement for pricing.energy_prices with nine positional
# parameters. On a conflict on (ts_start, ts_end, provider, kind, side) it
# overwrites price_pln_net, buyer and seller, concatenates the stored
# source_meta with the incoming one using jsonb `||` (a NULL on either side is
# treated as '{}'), and sets inserted_at to now().
UPSERT_SQL = """
INSERT INTO pricing.energy_prices
(ts_start, ts_end, price_pln_net, provider, kind, side, buyer, seller, source_meta)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT (ts_start, ts_end, provider, kind, side)
DO UPDATE SET
price_pln_net = EXCLUDED.price_pln_net,
buyer = EXCLUDED.buyer,
seller = EXCLUDED.seller,
source_meta = COALESCE(pricing.energy_prices.source_meta, '{}'::jsonb)
|| COALESCE(EXCLUDED.source_meta, '{}'::jsonb),
inserted_at = now();
"""
@dataclass
class EnergyPriceScraperBase:
    """Base class for market price scrapers (collect -> normalize -> UPSERT).

    A subclass implements ``fetch_day``; this class converts the fetched points
    into parameter tuples for ``UPSERT_SQL`` and writes them with retries.
    Supply either ``dsn`` (a connection is opened on first use) or an already
    open ``conn``.
    """

    dsn: Optional[str] = None
    conn: Optional[psycopg.Connection] = None
    tz: ZoneInfo = WAW
    period: timedelta = timedelta(hours=1)

    # Identifiers -- OVERRIDE in the subclass.
    # NOTE(review): these names are annotated, so @dataclass turns them into
    # __init__ fields with the defaults below. A subclass that merely assigns
    # `PROVIDER = "PSE"` (no annotation, no @dataclass of its own) would have the
    # value reset to the base default by the generated __init__ -- confirm how
    # the subclasses override them.
    PROVIDER: str = ""  # e.g. 'PSE' / 'instrat' / 'PSTRYK'
    KIND: str = ""  # e.g. 'rce' / 'fixing_I' / 'market_price'
    SIDE: str = "buy"  # 'buy' | 'sell'
    BUYER: str = "end_user"
    SELLER: str = "market_index"

    # throttling/retry
    max_retries: int = 3
    backoff_sec: float = 1.0

    # ---------- public API ----------
    def ingest_day(self, business_day: date) -> int:
        """Fetch and store one whole local day [00:00, 24:00).

        Returns the number of upserted rows.
        """
        points = self.fetch_day(business_day)  # [(start, end, price, meta_dict), ...]
        rows = self._rows_to_upsert(points)
        return self._upsert(rows)

    def ingest_range(self, start_day: date, end_day: date) -> int:
        """Backfill every local day in [start_day, end_day], both ends inclusive.

        Returns the total number of upserted rows.
        """
        total = 0
        day = start_day
        while day <= end_day:
            total += self.ingest_day(day)
            day += timedelta(days=1)
        return total

    def fetch_day(self, business_day: date) -> List[Tuple[datetime, datetime, float, Dict[str, Any]]]:
        """Implement in the subclass; return the points as NET PLN/kWh.

        Each point is ``(ts_start, ts_end, price, meta_dict)``.
        """
        raise NotImplementedError

    # ---------- helpers ----------
    def _ensure_conn(self) -> psycopg.Connection:
        """Return the current connection, opening one from ``dsn`` if needed.

        Raises:
            RuntimeError: neither ``conn`` nor ``dsn`` was provided.
        """
        if self.conn is not None:
            return self.conn
        if not self.dsn:
            raise RuntimeError("Podaj dsn= lub conn= dla PriceScraperBase")
        self.conn = psycopg.connect(self.dsn)
        return self.conn

    def _rows_to_upsert(self, points: Iterable[Tuple[datetime, datetime, float, Dict[str, Any]]]) -> List[IntervalRow]:
        """Turn fetched points into parameter tuples for ``UPSERT_SQL``.

        Naive timestamps are stamped with ``self.tz``; aware ones are converted
        to it. ``meta`` is serialized to a JSON string (``{}`` when empty/None).
        """
        rows: List[IntervalRow] = []
        for ts_start, ts_end, price_pln_kwh_net, meta in points:
            rows.append((
                self._localize(ts_start),
                self._localize(ts_end),
                float(price_pln_kwh_net),
                self.PROVIDER, self.KIND, self.SIDE, self.BUYER, self.SELLER,
                json.dumps(meta or {}),
            ))
        return rows

    def _localize(self, ts: datetime) -> datetime:
        """Return ``ts`` expressed in ``self.tz`` (naive input is assumed local)."""
        if ts.tzinfo is None:
            return ts.replace(tzinfo=self.tz)
        return ts.astimezone(self.tz)

    def _upsert(self, rows: List[IntervalRow]) -> int:
        """Write ``rows`` with ``UPSERT_SQL`` and commit, retrying on failure.

        At least one attempt is always made, even if ``max_retries`` is zero or
        negative, so rows are never dropped silently. Between attempts the
        method sleeps ``backoff_sec * attempt`` seconds. The last failure is
        re-raised unchanged.

        Returns the number of rows written (0 for an empty list).
        """
        if not rows:
            return 0
        attempts = max(1, self.max_retries)
        attempt = 1
        while True:
            conn = None
            try:
                conn = self._ensure_conn()
                with conn.cursor() as cur:
                    cur.executemany(UPSERT_SQL, rows)
                conn.commit()
                return len(rows)
            except Exception:
                # Without this the next attempt would run inside the aborted
                # transaction and fail again regardless of the original cause.
                self._recover_connection(conn)
                if attempt >= attempts:
                    raise
            _time.sleep(self.backoff_sec * attempt)
            attempt += 1

    def _recover_connection(self, conn: Optional[psycopg.Connection]) -> None:
        """Leave the scraper in a state where the next attempt can succeed."""
        if conn is None:
            return
        try:
            conn.rollback()
        except Exception:
            # Rollback on a dead connection fails too; the caller re-raises the
            # original error, which is the one worth reporting.
            pass
        # A closed connection can only be replaced when we know how to reconnect.
        if getattr(conn, "closed", False) and self.dsn and self.conn is conn:
            self.conn = None

    # small helper for building the hours of a day
    def _day_range(self, d: date) -> Tuple[datetime, datetime]:
        """Return (midnight of ``d``, midnight of the next day), both in ``self.tz``."""
        start = datetime(d.year, d.month, d.day, 0, 0, tzinfo=self.tz)
        return start, start + timedelta(days=1)
def rows_from_series(series_pln_per_kwh: pd.Series, provider: str, kind: str,
                     period: timedelta = timedelta(hours=1), meta: dict | None = None,
                     tz: Optional[Any] = None):
    """
    Convert a price series (index = period start) into a list of row tuples.

    Args:
        series_pln_per_kwh: prices indexed by the start of each period. A naive
            index is localized to the target zone; an aware one is converted.
        provider: value placed in the ``provider`` position of every row.
        kind: value placed in the ``kind`` position of every row.
        period: length of one interval; ``ts_end = ts_start + period``.
        meta: optional metadata, serialized once to JSON (``{}`` when None/empty).
        tz: target time zone; defaults to the module constant ``WAW``.

    Returns:
        A list of ``(ts_start, ts_end, price, provider, kind, meta_json)``
        tuples, one per non-NaN value; an empty list for an empty series.
        The rows carry six fields (no side/buyer/seller), whereas ``UPSERT_SQL``
        binds nine parameters.
    """
    if series_pln_per_kwh.empty:
        return []

    # The previous code referenced an undefined name here and raised NameError
    # for every non-empty series.
    target_tz = WAW if tz is None else tz

    s = series_pln_per_kwh.copy()
    idx = s.index
    if getattr(idx, "tz", None) is None:
        # Naive index: interpret as local wall time; spring-forward gaps are
        # shifted forward and autumn repeats are inferred from ordering.
        s.index = idx.tz_localize(target_tz, nonexistent="shift_forward", ambiguous="infer")
    else:
        s = s.tz_convert(target_tz)

    meta_json = json.dumps(meta or {})
    return [
        (ts_start, ts_start + period, float(price), provider, kind, meta_json)
        for ts_start, price in s.dropna().items()
    ]
def upsert_energy_prices(conn, rows, *, side="buy", buyer="end_user", seller="market_index"):
    """
    Write ``rows`` to the database with ``UPSERT_SQL`` and commit.

    Args:
        conn: open database connection providing ``cursor()``, ``commit()``
            and ``rollback()``.
        rows: iterable of tuples in one of two shapes:
            ``(ts_start, ts_end, price_pln_net, provider, kind, source_meta_json)``
            or the full nine-field form
            ``(ts_start, ts_end, price_pln_net, provider, kind, side, buyer,
            seller, source_meta_json)``.
        side, buyer, seller: values inserted into six-field rows so they match
            the nine placeholders of ``UPSERT_SQL``; ignored for nine-field rows.

    Raises:
        ValueError: a row has neither six nor nine fields.

    Any database error is re-raised after the transaction has been rolled back.
    """
    params = []
    for row in rows or ():
        row = tuple(row)
        if len(row) == 6:
            # The short form lacks side/buyer/seller; executing it as-is fails
            # because the statement binds nine parameters.
            ts_start, ts_end, price, provider, kind, meta_json = row
            row = (ts_start, ts_end, price, provider, kind, side, buyer, seller, meta_json)
        elif len(row) != 9:
            raise ValueError(f"expected a row with 6 or 9 fields, got {len(row)}")
        params.append(row)

    if not params:
        return

    try:
        with conn.cursor() as cur:
            cur.executemany(UPSERT_SQL, params)
        conn.commit()
    except Exception:
        # Do not hand the caller a connection stuck in an aborted transaction.
        conn.rollback()
        raise
# Script entry point: runs only when the file is executed directly.
# NOTE(review): setup_db is neither defined nor imported in the visible part of
# this file, so this call looks like it would raise NameError -- confirm where
# it is supposed to come from.
if __name__ == "__main__":
    conn = setup_db()