"""Base utilities for scraping Polish energy-market prices into pricing.energy_prices.

Provides EnergyPriceScraperBase (fetch -> normalize -> UPSERT) plus standalone
helpers for turning a pandas price series into upsert rows.
"""
from __future__ import annotations

import json
import time as _time
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from typing import Any, Dict, Iterable, List, Optional, Tuple
from zoneinfo import ZoneInfo

import pandas as pd
import psycopg
WAW = ZoneInfo("Europe/Warsaw")
|
||
IntervalRow = Tuple[datetime, datetime, float, str, str, str, str, str, str] # patrz _rows_to_upsert
|
||
|
||
UPSERT_SQL = """
|
||
INSERT INTO pricing.energy_prices
|
||
(ts_start, ts_end, price_pln_net, provider, kind, side, buyer, seller, source_meta)
|
||
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
|
||
ON CONFLICT (ts_start, ts_end, provider, kind, side)
|
||
DO UPDATE SET
|
||
price_pln_net = EXCLUDED.price_pln_net,
|
||
buyer = EXCLUDED.buyer,
|
||
seller = EXCLUDED.seller,
|
||
source_meta = COALESCE(pricing.energy_prices.source_meta, '{}'::jsonb)
|
||
|| COALESCE(EXCLUDED.source_meta, '{}'::jsonb),
|
||
inserted_at = now();
|
||
"""
|
||
|
||
@dataclass
|
||
class EnergyPriceScraperBase:
|
||
"""Bazowa klasa dla scraperów rynkowych (zbieranie → normalizacja → UPSERT)."""
|
||
dsn: Optional[str] = None
|
||
conn: Optional[psycopg.Connection] = None
|
||
tz: ZoneInfo = WAW
|
||
period: timedelta = timedelta(hours=1)
|
||
# identyfikatory – NADPISZ w podklasie:
|
||
PROVIDER: str = "" # np. 'PSE' / 'instrat' / 'PSTRYK'
|
||
KIND: str = "" # np. 'rce' / 'fixing_I' / 'market_price'
|
||
SIDE: str = "buy" # 'buy'|'sell'
|
||
BUYER: str = "end_user"
|
||
SELLER: str = "market_index"
|
||
|
||
# throttling/retry
|
||
max_retries: int = 3
|
||
backoff_sec: float = 1.0
|
||
|
||
# ---------- public API ----------
|
||
def ingest_day(self, business_day: date) -> int:
|
||
"""Pobiera i zapisuje całą dobę [00:00, 24:00) lokalnie. Zwraca liczbę upsertowanych wierszy."""
|
||
points = self.fetch_day(business_day) # [(start,end,price,meta_dict), ...]
|
||
rows = self._rows_to_upsert(points)
|
||
return self._upsert(rows)
|
||
|
||
def ingest_range(self, start_day: date, end_day: date) -> int:
|
||
"""Backfill: [start_day, end_day] po dniach lokalnych."""
|
||
total = 0
|
||
d = start_day
|
||
while d <= end_day:
|
||
total += self.ingest_day(d)
|
||
d = d + timedelta(days=1)
|
||
return total
|
||
|
||
def fetch_day(self, business_day: date) -> List[Tuple[datetime, datetime, float, Dict[str, Any]]]:
|
||
"""Zaimplementuj w podklasie. Zwracaj listę punktów z NETTO PLN/kWh."""
|
||
raise NotImplementedError
|
||
|
||
# ---------- helpers ----------
|
||
def _ensure_conn(self) -> psycopg.Connection:
|
||
if self.conn:
|
||
return self.conn
|
||
if not self.dsn:
|
||
raise RuntimeError("Podaj dsn= lub conn= dla PriceScraperBase")
|
||
self.conn = psycopg.connect(self.dsn)
|
||
return self.conn
|
||
|
||
def _rows_to_upsert(self, points: Iterable[Tuple[datetime, datetime, float, Dict[str, Any]]]) -> List[IntervalRow]:
|
||
rows: List[IntervalRow] = []
|
||
for ts_start, ts_end, price_pln_kwh_net, meta in points:
|
||
# sanity: TZ
|
||
if ts_start.tzinfo is None: ts_start = ts_start.replace(tzinfo=self.tz)
|
||
else: ts_start = ts_start.astimezone(self.tz)
|
||
if ts_end.tzinfo is None: ts_end = ts_end.replace(tzinfo=self.tz)
|
||
else: ts_end = ts_end.astimezone(self.tz)
|
||
rows.append((
|
||
ts_start, ts_end, float(price_pln_kwh_net),
|
||
self.PROVIDER, self.KIND, self.SIDE, self.BUYER, self.SELLER,
|
||
json.dumps(meta or {})
|
||
))
|
||
return rows
|
||
|
||
def _upsert(self, rows: List[IntervalRow]) -> int:
|
||
if not rows: return 0
|
||
for attempt in range(1, self.max_retries + 1):
|
||
try:
|
||
with self._ensure_conn().cursor() as cur:
|
||
cur.executemany(UPSERT_SQL, rows)
|
||
self._ensure_conn().commit()
|
||
return len(rows)
|
||
except Exception:
|
||
if attempt >= self.max_retries:
|
||
raise
|
||
_time.sleep(self.backoff_sec * attempt)
|
||
return 0
|
||
|
||
# małe ułatwienie do budowy godzin z doby
|
||
def _day_range(self, d: date) -> Tuple[datetime, datetime]:
|
||
start = datetime(d.year, d.month, d.day, 0, 0, tzinfo=self.tz)
|
||
return start, start + timedelta(days=1)
def rows_from_series(series_pln_per_kwh: pd.Series, provider: str, kind: str,
|
||
period: timedelta = timedelta(hours=1), meta: dict | None = None):
|
||
"""
|
||
Zamienia serię godzinową (index = start okresu, tz-aware) na listę wierszy dla upsertu.
|
||
"""
|
||
if series_pln_per_kwh.empty:
|
||
return []
|
||
|
||
s = series_pln_per_kwh.copy()
|
||
idx = s.index
|
||
if getattr(idx, "tz", None) is None:
|
||
idx = idx.tz_localize(TZ, nonexistent="shift_forward", ambiguous="infer")
|
||
s.index = idx
|
||
else:
|
||
s = s.tz_convert(TZ)
|
||
|
||
meta_json = json.dumps(meta or {})
|
||
rows = []
|
||
for ts_start, price in s.dropna().items():
|
||
ts_end = ts_start + period
|
||
rows.append((ts_start, ts_end, float(price), provider, kind, meta_json))
|
||
return rows
def upsert_energy_prices(conn, rows):
|
||
"""
|
||
rows: iterable krotek:
|
||
(ts_start, ts_end, price_pln_net, provider, kind, source_meta_json)
|
||
"""
|
||
if not rows:
|
||
return
|
||
with conn.cursor() as cur:
|
||
cur.executemany(UPSERT_SQL, rows)
|
||
conn.commit()
if __name__ == "__main__":
|
||
conn = setup_db() |