import asyncio
import logging
import random
import json
import re
import sys
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal

# --- LOGGING CONFIGURATION ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR-v1.2] %(message)s')
logger = logging.getLogger("R3")

# --- CONFIGURATION PARAMETERS ---
MAX_RETRY_LIMIT = 3  # Max attempts per vehicle before it is flagged for manual review


class R3DataMiner:
    """Worker that claims vehicle pages from the crawler queue, scrapes the
    spec table from each page and persists the result into the external
    reference library. Designed to run concurrently with other workers
    (queue claiming uses FOR UPDATE SKIP LOCKED)."""

    def clean_key(self, key):
        """Normalize a raw spec-table header into a short, capitalized key.

        Keeps only the last comma-separated segment, strips question
        phrasing ("What is the ...", "How much ...", "How many ...") and
        anything after a '?'.
        """
        if "," in key:
            key = key.split(",")[-1]
        key = (key.replace("What is the ", "")
                  .replace("How much ", "")
                  .replace("How many ", ""))
        return key.split("?")[0].strip().capitalize()

    def _parse_specs(self, soup, url):
        """Extract the specification dict from a parsed auto-data.net page.

        Returns the populated data dict, or None when no make or no
        specification rows were found (the caller treats None as a failed
        scrape and schedules a retry).
        """
        data = {"make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url}

        for row in soup.find_all('tr'):
            th, td = row.find('th'), row.find('td')
            if not th or not td:
                continue
            k_raw, v = th.get_text(strip=True), td.get_text(strip=True)
            k_low = k_raw.lower()
            if k_low == "brand":
                data["make"] = v
            elif k_low == "model":
                data["model"] = v
            elif k_low == "generation":
                data["generation"] = v
            elif k_low == "modification":
                data["modification"] = v
            elif "start of production" in k_low:
                m = re.search(r'(\d{4})', v)
                data["year_from"] = int(m.group(1)) if m else None
            elif k_low == "power":
                hp = re.search(r'(\d+)\s*Hp', v, re.I)
                if hp:
                    # hp -> kW via 1 kW ~= 1.36 hp, truncated to int
                    data["power_kw"] = int(int(hp.group(1)) / 1.36)
            elif "displacement" in k_low:
                cc = re.search(r'(\d+)\s*cm3', v)
                if cc:
                    data["engine_cc"] = int(cc.group(1))
            # Every row (including the ones handled above) is also kept
            # verbatim in the generic specifications map.
            data["specifications"][self.clean_key(k_raw)] = v

        if not data["make"] or not data["specifications"]:
            return None
        return data

    async def scrape_specs(self, context, url):
        """Open *url* in a fresh page and parse its spec table.

        Returns the extracted dict, or None on any failure (logged).
        The page is always closed, even on error.
        """
        page = await context.new_page()
        try:
            # Random delay to stay under the site's bot protection radar
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            return self._parse_specs(soup, url)
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()

    async def run(self):
        """Main worker loop.

        Repeatedly claims one 'engine'-level queue item (pending or
        previously errored, under the retry limit), scrapes it, and records
        the outcome: completed / error with retry bump / manual_review_needed
        once MAX_RETRY_LIMIT is reached. Exits when the queue is drained.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
            while True:
                target = None
                async with AsyncSessionLocal() as db:
                    try:
                        # FIX: priority_score was removed — the column does not
                        # exist in the crawler_queue table.
                        # FOR UPDATE SKIP LOCKED lets multiple workers claim
                        # rows concurrently without double-processing.
                        res = await db.execute(text("""
                            UPDATE vehicle.auto_data_crawler_queue
                            SET status = 'processing'
                            WHERE id = (
                                SELECT id FROM vehicle.auto_data_crawler_queue
                                WHERE level = 'engine'
                                  AND status IN ('pending', 'error')
                                  AND retry_count < :limit
                                ORDER BY id ASC
                                LIMIT 1
                                FOR UPDATE SKIP LOCKED
                            )
                            RETURNING id, url, name, retry_count
                        """), {"limit": MAX_RETRY_LIMIT})
                        target = res.fetchone()
                        await db.commit()
                    except Exception as e:
                        logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                        await asyncio.sleep(5)
                        continue
                if not target:
                    logger.info("🏁 Minden feladat elvégezve. Leállás.")
                    break

                t_id, t_url, t_name, t_retry = target
                if t_retry is None:
                    # Rows created before the retry_count column got a default
                    t_retry = 0
                logger.info(f"🚀 [{t_retry + 1}/{MAX_RETRY_LIMIT}] Dolgozom: {t_name}")
                data = await self.scrape_specs(context, t_url)

                async with AsyncSessionLocal() as db:
                    if data and data["make"]:
                        # Upsert keyed on source_url: re-scrapes refresh the
                        # specifications and the last_scraped_at timestamp.
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                        """), {
                            "make": data["make"], "model": data["model"],
                            "gen": data["generation"], "mod": data["modification"],
                            "y": data["year_from"], "p": data["power_kw"],
                            "e": data["engine_cc"],
                            "s": json.dumps(data["specifications"]),
                            "u": data["source_url"]
                        })
                        await db.execute(
                            text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"),
                            {"id": t_id})
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                    else:
                        new_retry = t_retry + 1
                        if new_retry >= MAX_RETRY_LIMIT:
                            # Exhausted all attempts: park the row for a human.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed', retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 3 próbálkozás után', updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manual_review_needed")
                        else:
                            # Mark as 'error' so the claim query picks it up again.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error', retry_count = :rc, updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.warning(f"⚠️ Sikertelen próbálkozás ({new_retry}/{MAX_RETRY_LIMIT}): {t_name}")
                    await db.commit()
            await browser.close()


if __name__ == "__main__":
    miner = R3DataMiner()
    try:
        asyncio.run(miner.run())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")