#!/usr/bin/env python3
"""R5-SENTINEL: vehicle spec harvester.

Pulls one un-enriched row at a time from vehicle.vehicle_model_definitions,
scrapes the matching spec sheet from ultimatespecs.com with Playwright, and
writes the parsed numbers back — marking the row 'published' on success or
'research_failed_empty' when the scrape yields too little data.
"""
import asyncio
import json
import logging
import random
import re
import urllib.parse

from playwright.async_api import async_playwright
from sqlalchemy import text

from app.database import AsyncSessionLocal

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] [R5-SENTINEL] %(message)s'
)
logger = logging.getLogger("R5")

# Maps the (lower-cased) spec labels scraped from the site to DB column names.
# NOTE(review): "num. of seats" -> seats is mapped here, but the publish
# UPDATE in run() never writes a seats column — confirm whether that column
# exists and should be persisted.
COLUMN_MAPPING = {
    "horsepower": "power_kw",
    "engine displacement": "engine_capacity",
    "maximum torque": "torque_nm",
    "top speed": "max_speed",
    "acceleration 0 to 100 km/h": "acceleration_0_100",
    "curb weight": "curb_weight",
    "wheelbase": "wheelbase",
    "num. of seats": "seats"
}


class R5Harvester:
    """Scrapes car spec sheets and enriches the vehicle definitions table."""

    def __init__(self):
        # Plain desktop-Chrome UA so the scraper looks like a regular browser.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )

    def clean_number(self, val: str, key: str = "") -> int:
        """Extract the first integer from a raw spec string.

        Power strings containing "hp"/"kW" prefer the kW figure when present.
        Otherwise spaces, commas and dots are stripped before digit
        extraction, so decimal values collapse into one integer
        (e.g. "9.5 s" -> 95).
        NOTE(review): that collapse looks wrong for acceleration_0_100 —
        confirm the column's expected unit/scale. `key` is currently unused
        but kept for interface compatibility.

        Returns 0 for empty, placeholder ("-") or unparsable input.
        """
        if not val or val == "-":
            return 0
        try:
            lowered = val.lower()
            if "hp" in lowered or "kw" in lowered:
                kw_match = re.search(r'(\d+)\s*kw', lowered)
                if kw_match:
                    return int(kw_match.group(1))
            digits = re.findall(
                r'\d+', val.replace(' ', '').replace(',', '').replace('.', '')
            )
            return int(digits[0]) if digits else 0
        except (AttributeError, ValueError, IndexError):
            # Was a bare `except:` — narrowed so genuine bugs (and
            # KeyboardInterrupt/SystemExit) are no longer swallowed.
            return 0

    async def scrape_car_details(self, page, make, model, year):
        """Locate the spec page for make/model/year and return {label: value}.

        Returns None when the search, navigation or parsing fails.
        """
        try:
            # 1. On-site search for "<model> <year>" within the given brand.
            search_url = (
                f"https://www.ultimatespecs.com/index.php?"
                f"brand={urllib.parse.quote(make)}"
                f"&q={urllib.parse.quote(model + ' ' + str(year))}"
            )
            logger.info(f"🔍 Keresés indítása...")
            await page.goto(search_url, wait_until="networkidle", timeout=30000)

            # 2. Grab the first spec-sheet link's URL instead of clicking it.
            #    (Loose selector to cope with result pages of many matches.)
            link_element = await page.wait_for_selector(
                "a[href*='/car-specs/']", timeout=15000
            )
            if not link_element:
                # Defensive: wait_for_selector normally raises on timeout.
                return None
            href = await link_element.get_attribute("href")
            target_url = (
                href if href.startswith("http")
                else f"https://www.ultimatespecs.com{href}"
            )

            # 3. Direct jump to the spec sheet — bypasses interstitial ads.
            logger.info(f"🚀 Közvetlen ugrás az adatlapra: {target_url}")
            await page.goto(target_url, wait_until="networkidle", timeout=30000)

            # 4. Parse every spec table on the page inside the browser context.
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                            let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                            if(t && v) {
                                let k = t.innerText.replace(':','').trim().toLowerCase();
                                let val = v.innerText.trim();
                                if(k && val && val !== "-") results[k] = val;
                            }
                        });
                    });
                    return results;
                }
            """)
            return full_specs
        except Exception as e:
            logger.error(f"❌ Scrape hiba: {str(e)[:100]}...")
            return None

    async def run(self):
        """Main loop: process pending rows one at a time until the queue empties."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=self.user_agent)
                page = await context.new_page()

                while True:
                    async with AsyncSessionLocal() as db:
                        # Highest-priority row still missing engine power.
                        query = text("""
                            SELECT id, make, marketing_name, year_from
                            FROM vehicle.vehicle_model_definitions
                            WHERE (power_kw IS NULL OR power_kw = 0)
                              AND status IN ('manual_review_needed', 'pending', 'enrich_ready')
                            ORDER BY priority_score DESC
                            LIMIT 1
                        """)
                        target = (await db.execute(query)).fetchone()
                        if not target:
                            logger.info("✨ Pipeline üres.")
                            break

                        t_id, make, model, year = target
                        logger.info(f"🚜 Feldolgozás: {make} {model} ({year})")

                        web_data = await self.scrape_car_details(page, make, model, year)

                        # Fewer than 5 scraped fields counts as a failed run.
                        if not web_data or len(web_data) < 5:
                            logger.warning(f"⚠️ Sikertelen gyűjtés, státusz: research_failed_empty")
                            await db.execute(
                                text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"),
                                {"id": t_id},
                            )
                            await db.commit()
                            continue

                        updates = {
                            col: self.clean_number(web_data.get(k))
                            for k, col in COLUMN_MAPPING.items()
                        }

                        # Zero power_kw means the parse failed; don't publish.
                        if updates.get('power_kw', 0) > 0:
                            await db.execute(text("""
                                UPDATE vehicle.vehicle_model_definitions SET
                                    power_kw = :power_kw,
                                    engine_capacity = :engine_capacity,
                                    torque_nm = :torque_nm,
                                    max_speed = :max_speed,
                                    acceleration_0_100 = :acceleration_0_100,
                                    curb_weight = :curb_weight,
                                    wheelbase = :wheelbase,
                                    specifications = specifications || :full_json,
                                    status = 'published',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {**updates, "id": t_id, "full_json": json.dumps(web_data)})
                            await db.commit()
                            logger.info(f"✅ PUBLIKÁLVA: {make} {model} ({updates['power_kw']} kW)")
                        else:
                            await db.execute(
                                text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"),
                                {"id": t_id},
                            )
                            await db.commit()

                    # Randomised delay between rows to stay polite to the site.
                    await asyncio.sleep(random.uniform(3, 6))
            finally:
                # Was unconditional after the loop; finally guarantees the
                # browser process dies even if the loop raises.
                await browser.close()


if __name__ == "__main__":
    harvester = R5Harvester()
    asyncio.run(harvester.run())