import asyncio
import json
import logging

import httpx
from sqlalchemy import text

from app.database import AsyncSessionLocal

# MB 2.0 logging configuration
logger = logging.getLogger("Data-Loader-Master")
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')


class VehicleDataLoader:
    """Download open vehicle datasets and upsert them into vehicle.reference_lookup.

    Pipeline per source: fetch_json -> map_source_data -> save_to_db.
    """

    # Verified, working sources
    SOURCES = {
        "car-data-kohut": "https://raw.githubusercontent.com/DanielKohut/car-data/master/car_data.json",
        "car-list-matth": "https://raw.githubusercontent.com/matthlavacka/car-list/master/car-list.json"
    }

    # Tag written into the `specs` JSON for the brand/model-list style sources.
    # Both sources share the exact same JSON shape; only this tag differs.
    _SPECS_TAG = {
        "car-data-kohut": "kohut",
        "car-list-matth": "matthlavacka",
    }

    @staticmethod
    def normalize_name(name: str) -> str:
        """Basic text cleanup: uppercase, strip whitespace, unify common synonyms.

        Returns "" for falsy input (None, empty string).
        """
        if not name:
            return ""
        n = str(name).upper().strip()
        # Unify common synonyms (exact-match only)
        synonyms = {"VW": "VOLKSWAGEN", "MERCEDES": "MERCEDES-BENZ", "ALFA": "ALFA ROMEO"}
        return synonyms.get(n, n)

    async def fetch_json(self, url: str):
        """Safe download with timeout handling.

        Returns the parsed JSON payload, or None on any HTTP / network failure
        (failures are logged, never raised).
        """
        async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
            try:
                resp = await client.get(url)
                if resp.status_code == 200:
                    return resp.json()
                logger.error(f"❌ Letöltési hiba ({resp.status_code}): {url}")
            except Exception as e:
                logger.error(f"🚨 Kommunikációs hiba: {url} -> {e}")
        return None

    def map_source_data(self, source_name, raw_data):
        """Mapping layer: translate a source's JSON structure onto the unified
        vehicle.reference_lookup schema.

        Returns a list of row dicts (make, model, year, specs, source, source_id).
        Mapping errors are logged and yield whatever was collected so far.
        """
        unified_entries = []
        try:
            if source_name in self._SPECS_TAG:
                # Structure: list[{"brand": "...", "models": ["...", ...]}]
                # Hoisted out of the loop: the specs blob is identical per source.
                specs = json.dumps({"source": self._SPECS_TAG[source_name]})
                for brand_item in raw_data:
                    make = self.normalize_name(brand_item.get("brand"))
                    for model in brand_item.get("models", []):
                        unified_entries.append({
                            "make": make,
                            "model": self.normalize_name(model),
                            "year": None,
                            "specs": specs,
                            "source": source_name,
                            "source_id": None
                        })
            elif source_name == "os-vehicle-db":
                # Structure: list[{"make": "...", "model": "...", "year": ...}]
                for item in raw_data:
                    unified_entries.append({
                        "make": self.normalize_name(item.get("make")),
                        "model": self.normalize_name(item.get("model")),
                        "year": item.get("year"),
                        "specs": json.dumps(item),
                        "source": source_name,
                        "source_id": str(item.get("id", ""))
                    })
        except Exception as e:
            logger.error(f"⚠️ Mapping hiba a(z) {source_name} feldolgozásakor: {e}")
        return unified_entries

    async def save_to_db(self, entries):
        """Batch upsert into the database.

        Sends rows in 1000-row chunks via executemany (one driver round-trip per
        chunk instead of one per row) and commits after each chunk. On error the
        current transaction is rolled back and the error is logged, not raised.
        """
        if not entries:
            return
        async with AsyncSessionLocal() as db:
            stmt = text("""
                INSERT INTO vehicle.reference_lookup (make, model, year, specs, source, source_id)
                VALUES (:make, :model, :year, :specs, :source, :source_id)
                ON CONFLICT ON CONSTRAINT _ref_lookup_uc
                DO UPDATE SET specs = EXCLUDED.specs, updated_at = NOW()
            """)
            try:
                # Send in 1000-row chunks; passing the list of parameter dicts
                # triggers SQLAlchemy's executemany path.
                for i in range(0, len(entries), 1000):
                    batch = entries[i:i + 1000]
                    await db.execute(stmt, batch)
                    await db.commit()
                    logger.info(f"💾 Mentve: {min(i + 1000, len(entries))} / {len(entries)}")
            except Exception as e:
                await db.rollback()
                logger.error(f"🚨 Adatbázis hiba: {e}")

    async def run_sync(self):
        """Entry point: download, map and store every configured source."""
        logger.info("🚀 Data Loader (Fast Lane Support) elindul...")
        for name, url in self.SOURCES.items():
            raw_json = await self.fetch_json(url)
            if raw_json:
                logger.info(f"🧹 {name} feldolgozása...")
                unified = self.map_source_data(name, raw_json)
                await self.save_to_db(unified)
        logger.info("🏁 Minden forrás szinkronizálva.")


if __name__ == "__main__":
    asyncio.run(VehicleDataLoader().run_sync())