Add data scrapers

This commit is contained in:
Bartosz Wieczorek 2025-08-28 14:15:42 +02:00
parent 2aec71ce49
commit 01f4f569f4
11 changed files with 377 additions and 119 deletions

View File

@ -1,4 +1,3 @@
# DistributionProvider/TauronG13sProvider.py
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from datetime import time, datetime from datetime import time, datetime

View File

@ -1,4 +1,3 @@
# EnergyPriceProvider/RDNProvider.py
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timedelta from datetime import datetime, timedelta

View File

@ -1,4 +1,3 @@
# EnergyPriceProvider/TauronG13Provider.py
from __future__ import annotations from __future__ import annotations
from datetime import time, datetime from datetime import time, datetime
from EnergyPrice import EnergyPriceBase from EnergyPrice import EnergyPriceBase

145
EnergyPriceScraper.py Normal file
View File

@ -0,0 +1,145 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, date, timedelta
from typing import Iterable, List, Tuple, Dict, Any, Optional
from zoneinfo import ZoneInfo
import json
import psycopg
import time as _time
WAW = ZoneInfo("Europe/Warsaw")  # all market data is keyed to Polish local time

# Row shape sent to the DB — see _rows_to_upsert:
# (ts_start, ts_end, price_pln_net, provider, kind, side, buyer, seller, source_meta_json)
IntervalRow = Tuple[datetime, datetime, float, str, str, str, str, str, str]

# Idempotent insert: a conflicting row (same interval/provider/kind/side) gets
# its price refreshed and its source_meta JSON merged with the new payload.
UPSERT_SQL = """
INSERT INTO pricing.energy_prices
(ts_start, ts_end, price_pln_net, provider, kind, side, buyer, seller, source_meta)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT (ts_start, ts_end, provider, kind, side)
DO UPDATE SET
price_pln_net = EXCLUDED.price_pln_net,
buyer = EXCLUDED.buyer,
seller = EXCLUDED.seller,
source_meta = COALESCE(pricing.energy_prices.source_meta, '{}'::jsonb)
|| COALESCE(EXCLUDED.source_meta, '{}'::jsonb),
inserted_at = now();
"""


@dataclass
class EnergyPriceScraperBase:
    """Base class for market price scrapers (fetch -> normalize -> UPSERT).

    Subclasses implement :meth:`fetch_day` and override the identifier class
    attributes; this base handles timezone normalization, batching and the
    retried UPSERT into ``pricing.energy_prices``.
    """

    # Give either a DSN or an already-open connection; the connection is
    # created lazily from the DSN on first use.
    dsn: Optional[str] = None
    conn: Optional[psycopg.Connection] = None
    tz: ZoneInfo = WAW                      # zone used to interpret naive timestamps
    period: timedelta = timedelta(hours=1)  # default interval length
    # identifiers — OVERRIDE in the subclass:
    PROVIDER: str = ""   # e.g. 'PSE' / 'instrat' / 'PSTRYK'
    KIND: str = ""       # e.g. 'rce' / 'fixing_I' / 'market_price'
    SIDE: str = "buy"    # 'buy' | 'sell'
    BUYER: str = "end_user"
    SELLER: str = "market_index"
    # throttling / retry
    max_retries: int = 3
    backoff_sec: float = 1.0

    # ---------- public API ----------
    def ingest_day(self, business_day: date) -> int:
        """Fetch and store one full local day [00:00, 24:00). Returns the number of upserted rows."""
        points = self.fetch_day(business_day)  # [(start, end, price, meta_dict), ...]
        rows = self._rows_to_upsert(points)
        return self._upsert(rows)

    def ingest_range(self, start_day: date, end_day: date) -> int:
        """Backfill the inclusive local-day range [start_day, end_day]."""
        total = 0
        d = start_day
        while d <= end_day:
            total += self.ingest_day(d)
            d = d + timedelta(days=1)
        return total

    def fetch_day(self, business_day: date) -> List[Tuple[datetime, datetime, float, Dict[str, Any]]]:
        """Implement in the subclass. Return points priced in NET PLN/kWh."""
        raise NotImplementedError

    # ---------- helpers ----------
    def _ensure_conn(self) -> psycopg.Connection:
        """Return the open connection, creating one from ``dsn`` if needed.

        Raises:
            RuntimeError: when neither ``dsn`` nor ``conn`` was provided.
        """
        if self.conn:
            return self.conn
        if not self.dsn:
            # Bug fix: the message referred to the old class name "PriceScraperBase".
            raise RuntimeError("Podaj dsn= lub conn= dla EnergyPriceScraperBase")
        self.conn = psycopg.connect(self.dsn)
        return self.conn

    def _rows_to_upsert(self, points: Iterable[Tuple[datetime, datetime, float, Dict[str, Any]]]) -> List[IntervalRow]:
        """Normalize fetched points into DB rows (tz-aware timestamps, JSON meta)."""
        rows: List[IntervalRow] = []
        for ts_start, ts_end, price_pln_kwh_net, meta in points:
            # Sanity: naive timestamps are assumed to be in self.tz; aware ones
            # are converted so every stored interval lives in the same zone.
            if ts_start.tzinfo is None:
                ts_start = ts_start.replace(tzinfo=self.tz)
            else:
                ts_start = ts_start.astimezone(self.tz)
            if ts_end.tzinfo is None:
                ts_end = ts_end.replace(tzinfo=self.tz)
            else:
                ts_end = ts_end.astimezone(self.tz)
            rows.append((
                ts_start, ts_end, float(price_pln_kwh_net),
                self.PROVIDER, self.KIND, self.SIDE, self.BUYER, self.SELLER,
                json.dumps(meta or {})
            ))
        return rows

    def _upsert(self, rows: List[IntervalRow]) -> int:
        """Upsert ``rows`` with linear-backoff retries. Returns the row count.

        Raises the last exception when all ``max_retries`` attempts fail.
        """
        if not rows:
            return 0
        for attempt in range(1, self.max_retries + 1):
            try:
                with self._ensure_conn().cursor() as cur:
                    cur.executemany(UPSERT_SQL, rows)
                self._ensure_conn().commit()
                return len(rows)
            except Exception:
                # Bug fix: after a failed statement the connection is left in an
                # aborted transaction; without a rollback every retry would fail
                # with InFailedSqlTransaction.
                if self.conn is not None:
                    try:
                        self.conn.rollback()
                    except Exception:
                        pass  # best effort — the retry/raise below still applies
                if attempt >= self.max_retries:
                    raise
                _time.sleep(self.backoff_sec * attempt)
        return 0

    # small convenience for building a day's hour range
    def _day_range(self, d: date) -> Tuple[datetime, datetime]:
        """Return (local midnight of ``d``, local midnight of the next day)."""
        start = datetime(d.year, d.month, d.day, 0, 0, tzinfo=self.tz)
        return start, start + timedelta(days=1)
def rows_from_series(series_pln_per_kwh: pd.Series, provider: str, kind: str,
                     period: timedelta = timedelta(hours=1), meta: dict | None = None):
    """
    Convert an hourly price series (index = period start) into rows for the
    upsert. A naive index is assumed to be Warsaw wall-clock time; a tz-aware
    index is converted to Warsaw time.

    NOTE(review): this emits 6-tuples (ts_start, ts_end, price, provider,
    kind, meta_json) while UPSERT_SQL above declares 9 placeholders — confirm
    which statement these rows are meant to feed.
    """
    if series_pln_per_kwh.empty:
        return []
    # Bug fix: the original referenced an undefined name ``TZ`` (left over from
    # the deleted EnergyScrapper module, which defined it); this module's zone
    # is Europe/Warsaw (same value as the module-level WAW constant).
    warsaw = ZoneInfo("Europe/Warsaw")
    s = series_pln_per_kwh.copy()
    idx = s.index
    if getattr(idx, "tz", None) is None:
        idx = idx.tz_localize(warsaw, nonexistent="shift_forward", ambiguous="infer")
        s.index = idx
    else:
        s = s.tz_convert(warsaw)
    meta_json = json.dumps(meta or {})
    rows = []
    for ts_start, price in s.dropna().items():
        ts_end = ts_start + period
        rows.append((ts_start, ts_end, float(price), provider, kind, meta_json))
    return rows
def upsert_energy_prices(conn, rows):
    """
    Bulk-upsert price rows into pricing.energy_prices and commit.

    rows: iterable of tuples:
        (ts_start, ts_end, price_pln_net, provider, kind, source_meta_json)

    NOTE(review): UPSERT_SQL above declares 9 placeholders while the tuple
    layout documented here has 6 fields — confirm the intended row shape.
    """
    if not rows:
        return
    cursor = conn.cursor()
    try:
        cursor.executemany(UPSERT_SQL, rows)
    finally:
        cursor.close()
    conn.commit()
if __name__ == "__main__":
    # Bug fix: setup_db() was referenced but never defined or imported in this
    # module (it lives in EnergyPriceScraperFactory, which itself imports this
    # module) — import locally to avoid a circular import at module load.
    from EnergyPriceScraperFactory import setup_db

    conn = setup_db()

View File

@ -0,0 +1,48 @@
# EnergyPriceScraperFactory.py
from __future__ import annotations
import importlib
from typing import Any, cast, Type
from EnergyPriceScraper import EnergyPriceScraperBase
import os
import psycopg
# Connection settings, overridable via the standard PG* environment variables.
# NOTE(review): a real-looking password is committed as the fallback default —
# rotate this credential and load it only from the environment / secret store.
DB_HOST = os.getenv("PGHOST", "192.168.30.10")
DB_PORT = int(os.getenv("PGPORT", "5432"))
DB_NAME = os.getenv("PGDATABASE", "postgres")
DB_USER = os.getenv("PGUSER", "energy_ingest")
DB_PASS = os.getenv("PGPASSWORD", "2f1rLCa03mQrbmlCbD6envk")


def setup_db():
    """Open and return a new psycopg 3 connection using the DB_* settings."""
    # psycopg 3
    conn = psycopg.connect(
        host=DB_HOST, port=DB_PORT, dbname=DB_NAME, user=DB_USER, password=DB_PASS
    )
    return conn
def create(name: str, /, **kwargs: Any) -> EnergyPriceScraperBase:
    """
    Instantiate a scraper class by naming convention.

    Convention:
        module: Scraper.<Name>Scraper
        class:  <Name>Scraper
    Example: create("TauronG13", rates={...})

    Doc fix: the docstring previously said ``class: <Name>Provider`` while the
    code has always resolved ``<Name>Scraper``.

    Raises:
        ValueError: when the module or class cannot be found.
        TypeError: when the resolved class is not an EnergyPriceScraperBase.
    """
    # Keep only [A-Za-z0-9_] so arbitrary input cannot steer the dynamic import
    # to unexpected modules (e.g. via dots or path separators).
    safe = "".join(ch for ch in name if ch.isalnum() or ch == "_")
    module_name = f"Scraper.{safe}Scraper"
    class_name = f"{safe}Scraper"
    try:
        mod = importlib.import_module(module_name)
    except ModuleNotFoundError as e:
        raise ValueError(f"Scraper module not found: {module_name}") from e
    try:
        cls = getattr(mod, class_name)
    except AttributeError as e:
        raise ValueError(f"Scraper class not found: {class_name} in {module_name}") from e
    # Bug fix: the message referred to the nonexistent class "PriceScraperBase".
    if not issubclass(cls, EnergyPriceScraperBase):
        raise TypeError(f"{class_name} must inherit EnergyPriceScraperBase")
    ScraperCls = cast(Type[EnergyPriceScraperBase], cls)
    return ScraperCls(**kwargs)  # type: ignore[arg-type]

View File

@ -1,72 +0,0 @@
import os
from datetime import timedelta
import json
import psycopg
import pandas as pd
import zoneinfo
# Timezone used for all price timestamps (Polish market data).
TZ = zoneinfo.ZoneInfo("Europe/Warsaw")
# Connection settings, overridable via the standard PG* environment variables.
# NOTE(review): a real-looking password is committed as the fallback default —
# rotate this credential (it also appears in EnergyPriceScraperFactory.py).
DB_HOST = os.getenv("PGHOST", "192.168.30.10")
DB_PORT = int(os.getenv("PGPORT", "5432"))
DB_NAME = os.getenv("PGDATABASE", "postgres")
DB_USER = os.getenv("PGUSER", "energy_ingest")
DB_PASS = os.getenv("PGPASSWORD", "2f1rLCa03mQrbmlCbD6envk")
# Idempotent insert: a conflicting row (same interval/provider/kind/side) gets
# its price refreshed and its source_meta JSON merged with the new payload.
UPSERT_SQL = """
INSERT INTO pricing.energy_prices
(ts_start, ts_end, price_pln_net, provider, kind, side, buyer, seller, source_meta)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (ts_start, ts_end, provider, kind, side)
DO UPDATE SET
price_pln_net = EXCLUDED.price_pln_net,
buyer = EXCLUDED.buyer,
seller = EXCLUDED.seller,
source_meta = COALESCE(pricing.energy_prices.source_meta, '{}'::jsonb)
|| COALESCE(EXCLUDED.source_meta, '{}'::jsonb),
inserted_at = now();
"""
def setup_db():
    """Open and return a new psycopg 3 connection using the DB_* settings."""
    # psycopg 3
    conn = psycopg.connect(
        host=DB_HOST, port=DB_PORT, dbname=DB_NAME, user=DB_USER, password=DB_PASS
    )
    return conn
def rows_from_series(series_pln_per_kwh: pd.Series, provider: str, kind: str,
                     period: timedelta = timedelta(hours=1), meta: dict | None = None):
    """
    Convert an hourly price series (index = period start, tz-aware) into rows
    for the upsert. A naive index is localized to Warsaw time.

    NOTE(review): emits 6-tuples (ts_start, ts_end, price, provider, kind,
    meta_json) while UPSERT_SQL above declares 9 placeholders — confirm the
    row shape expected by upsert_energy_prices.
    """
    if series_pln_per_kwh.empty:
        return []
    s = series_pln_per_kwh.copy()
    idx = s.index
    if getattr(idx, "tz", None) is None:
        # Naive index: assume Warsaw wall-clock time; "infer" resolves the
        # duplicated autumn DST hour from ordering.
        idx = idx.tz_localize(TZ, nonexistent="shift_forward", ambiguous="infer")
        s.index = idx
    else:
        s = s.tz_convert(TZ)
    meta_json = json.dumps(meta or {})
    rows = []
    for ts_start, price in s.dropna().items():
        ts_end = ts_start + period
        rows.append((ts_start, ts_end, float(price), provider, kind, meta_json))
    return rows
def upsert_energy_prices(conn, rows):
    """
    Bulk-upsert price rows into pricing.energy_prices and commit.

    rows: iterable of tuples:
        (ts_start, ts_end, price_pln_net, provider, kind, source_meta_json)
    """
    if not rows:
        return
    with conn.cursor() as cur:
        cur.executemany(UPSERT_SQL, rows)
    conn.commit()


if __name__ == "__main__":
    # Manual entry point: only opens a DB connection (the ingest calls were
    # commented out below in the original module).
    conn = setup_db()

View File

@ -0,0 +1,78 @@
from __future__ import annotations
from datetime import datetime, timedelta, date
from typing import List, Tuple, Dict, Any
import pandas as pd
from EnergyPriceScraper import EnergyPriceScraperBase
from utils.time_helpers import WARSAW_TZ
class InstratRDN_CSVScraper(EnergyPriceScraperBase):
    """
    RDN (day-ahead) scraper fed from an Instrat CSV export.

    Expected CSV layout:
        date,fixing_i_price,fixing_i_volume,fixing_ii_price,fixing_ii_volume
        01.01.2016 00:00,108.27,2565.10,108.55,89.10
    Prices in the CSV are PLN/MWh and are converted to net PLN/kWh.
    """
    PROVIDER = "instrat"
    KIND = "fixing_I"
    SIDE = "buy"
    BUYER = "end_user"
    SELLER = "market_index"

    def __init__(self, path: str, **kwargs):
        super().__init__(**kwargs)
        # Bug fix: load_instrat_csv() previously returned None while stashing
        # the frame on self.out, so self.data was always None. It now returns
        # the frame; both attribute names are kept for backward compatibility.
        self.out = self.load_instrat_csv(path)
        self.data = self.out

    def load_instrat_csv(self, path: str) -> pd.DataFrame:
        """Read the Instrat CSV and return an hourly, tz-aware price frame.

        Raises:
            RuntimeError: when no price could be parsed (wrong separator or
                number format in the CSV).
        """
        # Bug fix: pd.read_csv(date_parser=...) was removed in pandas 2.0;
        # parse the Polish 'dd.mm.YYYY HH:MM' timestamps explicitly instead.
        df = pd.read_csv(path)
        parsed = pd.to_datetime(df["date"], format="%d.%m.%Y %H:%M")
        # tz_localize handles DST edges explicitly ("infer" resolves the
        # duplicated autumn hour from ordering), unlike the old per-value
        # replace(tzinfo=...) which stamped both duplicates identically.
        df["date"] = parsed.dt.tz_localize(
            WARSAW_TZ, nonexistent="shift_forward", ambiguous="infer"
        )
        fi_pln_kwh = (df["fixing_i_price"] / 1000.0).round(4)    # PLN/MWh -> PLN/kWh
        fii_pln_kwh = (df["fixing_ii_price"] / 1000.0).round(4)  # PLN/MWh -> PLN/kWh
        out = pd.DataFrame({
            "fixing_i_pln_kwh": fi_pln_kwh.values,
            "fixing_ii_pln_kwh": fii_pln_kwh.values,
            "fixing_i_volume": pd.to_numeric(df.get("fixing_i_volume"), errors="coerce").values,
            "fixing_ii_volume": pd.to_numeric(df.get("fixing_ii_volume"), errors="coerce").values,
        }, index=df["date"]).sort_index()
        # sanity check — make sure the conversion did not come out empty
        if out[["fixing_i_pln_kwh", "fixing_ii_pln_kwh"]].notna().sum().sum() == 0:
            raise RuntimeError("Brak cen po przeliczeniu — sprawdź separator/format liczb w CSV.")
        return out

    def fetch_day(self, business_day: date) -> List[Tuple[datetime, datetime, float, Dict[str, Any]]]:
        """Return (start, end, net PLN/kWh, meta) points for one local day.

        The fixing column is chosen from self.KIND (defaults to Fixing I).
        """
        if getattr(self, "out", None) is None:
            raise RuntimeError("Brak danych: najpierw wczytaj CSV i zbuduj self.out")
        # pick the column according to KIND (default: Fixing I)
        kind = getattr(self, "KIND", "fixing_I")
        kind_norm = str(kind).replace(" ", "_").lower()
        if "fixing_ii" in kind_norm:
            col = "fixing_ii_pln_kwh"
            fixing_tag = "II"
        else:
            col = "fixing_i_pln_kwh"
            fixing_tag = "I"
        # local-day window [midnight, next midnight)
        day_start = datetime(business_day.year, business_day.month, business_day.day, 0, 0, tzinfo=self.tz)
        day_end = day_start + timedelta(days=1)
        # filter and emit points
        df_day = self.out.loc[(self.out.index >= day_start) & (self.out.index < day_end)]
        if col not in df_day.columns:
            raise KeyError(f"Kolumna '{col}' nie istnieje w self.out")
        points: List[Tuple[datetime, datetime, float, Dict[str, Any]]] = []
        for ts, price in df_day[col].dropna().items():
            ts_end = ts + getattr(self, "period", timedelta(hours=1))
            points.append((ts.to_pydatetime(), ts_end.to_pydatetime(), float(price),
                           {"source": "instrat_csv", "unit": "PLN/kWh", "fixing": fixing_tag, "taxes_included": False}))
        return points

46
Scraper/PSE_RCEScraper.py Normal file
View File

@ -0,0 +1,46 @@
from __future__ import annotations
from datetime import datetime, timedelta, date
from typing import List, Tuple, Dict, Any
import requests
from EnergyPriceScraper import EnergyPriceScraperBase, WAW
class PSE_RCEScraper(EnergyPriceScraperBase):
    """
    Hourly PSE RCE (PLN) values for a given business day.

    The API publishes PLN/MWh; values are divided by 1000 and returned as
    net PLN/kWh.
    """
    PROVIDER = "PSE"
    KIND = "rce"
    SIDE = "sell"
    # NOTE(review): "reteiler" looks like a typo for "retailer"; kept verbatim
    # because the value is persisted to the DB — confirm before renaming.
    BUYER = "reteiler"  # the supplier settling the prosumer
    SELLER = "prosumer"
    api_url: str = "https://api.raporty.pse.pl/api/rce-pln"
    session: requests.Session

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.session = requests.Session()
        self.session.headers.update({"accept": "application/json"})

    def fetch_day(self, business_day: date) -> List[Tuple[datetime, datetime, float, Dict[str, Any]]]:
        """Query the RCE v2 endpoint for one business day and normalize the result."""
        # RCE v2: filter by business_date, select rce_pln,dtime,period
        query = {
            "$select": "rce_pln,dtime,period",
            "$filter": f"business_date eq '{business_day:%Y-%m-%d}'",
        }
        response = self.session.get(self.api_url, params=query, timeout=30)
        response.raise_for_status()
        points: List[Tuple[datetime, datetime, float, Dict[str, Any]]] = []
        for record in response.json().get("value", []):
            # dtime is ISO-formatted; period is in minutes (typically 60)
            start = datetime.fromisoformat(record["dtime"]).astimezone(WAW)
            minutes = int(record.get("period", 60))
            net_pln_kwh = float(record["rce_pln"]) / 1000.0  # PLN/MWh -> net PLN/kWh
            points.append((start, start + timedelta(minutes=minutes),
                           net_pln_kwh, {"source": "PSE_RCE_v2"}))
        return points

49
Scraper/PstrykScraper.py Normal file
View File

@ -0,0 +1,49 @@
from __future__ import annotations
from datetime import datetime, timedelta, date
from typing import List, Tuple, Dict, Any, Optional
import os
import requests
from EnergyPriceScraper import EnergyPriceScraperBase, WAW
class PstrykScraper(EnergyPriceScraperBase):
    """
    Template scraper for prices published by the supplier (Pstryk).

    Assumes a Bearer token in the PSTRYK_TOKEN env var and the API base URL
    in PSTRYK_API_BASE, e.g.:
        PSTRYK_API_BASE=https://api.pstryk.example.com
    Example endpoint: GET /prices?date=YYYY-MM-DD
        -> [{"ts":"2025-08-27T00:00:00+02:00","net_pln_kwh":0.44}, ...]
    """
    PROVIDER = "PSTRYK"
    KIND = "market_price"
    SIDE = "buy"
    BUYER = "end_user"
    SELLER = "PSTRYK"
    api_base: str
    token: str

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_base = os.getenv("PSTRYK_API_BASE", "").rstrip("/")
        self.token = os.getenv("PSTRYK_TOKEN", "")
        if not (self.api_base and self.token):
            raise RuntimeError("Ustaw PSTRYK_API_BASE i PSTRYK_TOKEN w środowisku.")
        session = requests.Session()
        session.headers.update({
            "accept": "application/json",
            "authorization": f"Bearer {self.token}",
            "user-agent": "energy-scraper/1.0",
        })
        self.session = session

    def fetch_day(self, business_day: date) -> List[Tuple[datetime, datetime, float, Dict[str, Any]]]:
        """Fetch one day of supplier prices; returns (start, end, net PLN/kWh, meta) points."""
        response = self.session.get(
            f"{self.api_base}/prices",
            params={"date": f"{business_day:%Y-%m-%d}"},
            timeout=30,
        )
        response.raise_for_status()
        points: List[Tuple[datetime, datetime, float, Dict[str, Any]]] = []
        for entry in response.json():
            start = datetime.fromisoformat(entry["ts"]).astimezone(WAW)
            points.append((start, start + self.period,
                           float(entry["net_pln_kwh"]), {"source": "pstryk_api"}))
        return points

1
Scraper/__init__.py Normal file
View File

@ -0,0 +1 @@
# (left intentionally empty; factory imports modules dynamically)

54
main.py
View File

@ -1,56 +1,22 @@
# pip install pandas python-dateutil
import pandas as pd
from datetime import datetime, timedelta from datetime import datetime, timedelta
import zoneinfo
TZ = zoneinfo.ZoneInfo("Europe/Warsaw")
import DistributionCostFactory import DistributionCostFactory
from EnergyPriceProvider import DynamicPricesProvider from EnergyPriceProvider import DynamicPricesProvider
from plot_cost_breakdown import plot_stacked_with_negatives from plot_cost_breakdown import plot_stacked_with_negatives
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
def load_instrat_csv(path: str) -> pd.DataFrame: import EnergyPriceScraperFactory
"""
Wczytuje CSV Instrat z format
date,fixing_i_price,fixing_i_volume,fixing_ii_price,fixing_ii_volume
01.01.2016 00:00,108.27,2565.10,108.55,89.10
"""
# 1) Wczytanie z autodetekcją polskiego formatu
dateparse = lambda x: datetime.strptime(x, '%d.%m.%Y %H:%M').replace(tzinfo=TZ)
df = pd.read_csv(path, parse_dates=['date'], date_parser=dateparse)
fi_pln_kwh = (df["fixing_i_price"] / 1000.0).round(4)
fii_pln_kwh = (df["fixing_ii_price"] / 1000.0).round(4)
out = pd.DataFrame({
"fixing_i_pln_kwh": fi_pln_kwh.values,
"fixing_ii_pln_kwh": fii_pln_kwh.values,
"fixing_i_volume": pd.to_numeric(df.get("fixing_i_volume"), errors="coerce").values,
"fixing_ii_volume": pd.to_numeric(df.get("fixing_ii_volume"), errors="coerce").values,
}, index=df["date"]).sort_index()
# sanity check — nie wyszło pusto
if out[["fixing_i_pln_kwh", "fixing_ii_pln_kwh"]].notna().sum().sum() == 0:
raise RuntimeError("Brak cen po przeliczeniu — sprawdź separator/format liczb w CSV.")
return out
import EnergyScrapper as es
if __name__ == "__main__": if __name__ == "__main__":
path = "electricity_prices_day_ahead_hourly_all.csv" path = "electricity_prices_day_ahead_hourly_all.csv"
scraper = EnergyPriceScraperFactory.create("InstratRDN_CSV", conn=EnergyPriceScraperFactory.setup_db(), path=path)
day = scraper.fetch_day(datetime(2025, 6, 27, 0, 0))
print(day)
# conn = es.setup_db()
# df = load_instrat_csv(path) # df = load_instrat_csv(path)
# netto
# df[["fixing_i_pln_kwh","fixing_ii_pln_kwh"]].to_csv(
# "tge_fixings_pln_kwh.csv", index_label="timestamp", float_format="%.2f"
# )
# print("OK: zapisano tge_fixings_pln_kwh_pl.csv oraz tge_fixings_pln_kwh.csv")
conn = es.setup_db()
# s = df["fixing_i_pln_kwh"] # s = df["fixing_i_pln_kwh"]
# rows1 = es.rows_from_series(s, # rows1 = es.rows_from_series(s,
# provider="instrat", # provider="instrat",
@ -58,8 +24,8 @@ if __name__ == "__main__":
# meta={"type":"RDN", "unit":"PLN/kWh","source":"csv_export", "taxes_included":False} # meta={"type":"RDN", "unit":"PLN/kWh","source":"csv_export", "taxes_included":False}
# ) # )
# es.upsert_energy_prices(conn, rows1) # es.upsert_energy_prices(conn, rows1)
# # #
# s = df["fixing_ii_pln_kwh"] # # s = df["fixing_ii_pln_kwh"]
# rows1 = es.rows_from_series(s, # rows1 = es.rows_from_series(s,
# provider="instrat", # provider="instrat",
# kind="fixing II", # kind="fixing II",