import asyncio
import json
import logging
import random
import re

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text

from app.database import AsyncSessionLocal

logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-EXTRACTOR] %(message)s')
logger = logging.getLogger("R4")

# Compiled once at module level; scrape_engine matches these on every table row.
YEAR_RE = re.compile(r'(\d{4})')
HP_RE = re.compile(r'(\d+)\s*Hp', re.I)
CC_RE = re.compile(r'(\d+)\s*cm3')

# Metric-hp -> kW divisor used by the original pipeline.
# NOTE(review): exact factor is 1.3596 (1 hp = 0.7355 kW); 1.36 is a close
# approximation kept here so stored values stay identical.
HP_TO_KW_DIVISOR = 1.36


class FinalExtractor:
    """Queue-driven scraper for auto-data.net engine specification pages.

    Pulls pending 'engine' rows from vehicle.auto_data_crawler_queue,
    scrapes each page's <tr>-based spec table and upserts the normalized
    result into vehicle.external_reference_library.
    """

    def __init__(self):
        # Caps concurrent page scrapes (politeness / anti-bot safety).
        self.semaphore = asyncio.Semaphore(2)

    def clean_key(self, key):
        """Normalize a spec-table header into a short specification key.

        Drops any text before the last comma, strips question phrasing
        ("What is the ...?", "How much ...?", "How many ...?") and
        capitalizes the result.
        """
        if "," in key:
            key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        key = key.split("?")[0].strip()
        return key.capitalize()

    async def scrape_engine(self, context, url):
        """Open *url* in a fresh page and parse the specification table.

        Returns a dict with the well-known fields (make, model, generation,
        modification, year range, power_kw, engine_cc), a generic
        "specifications" mapping of every table row, and the source URL —
        or None on any scraping error. The page is always closed.
        """
        page = await context.new_page()
        try:
            # Randomized delay between requests to avoid bot detection.
            await asyncio.sleep(random.uniform(3, 6))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            data = {
                "make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "year_to": None,
                "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url,
            }

            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td:
                    continue
                raw_k = th.get_text(strip=True)
                val = td.get_text(strip=True)
                k_low = raw_k.lower()

                if k_low == "brand":
                    data["make"] = val
                elif k_low == "model":
                    data["model"] = val
                elif k_low == "generation":
                    data["generation"] = val
                elif k_low == "modification":
                    data["modification"] = val
                elif "start of production" in k_low:
                    m = YEAR_RE.search(val)
                    if m:
                        data["year_from"] = int(m.group(1))
                elif "end of production" in k_low:
                    m = YEAR_RE.search(val)
                    if m:
                        data["year_to"] = int(m.group(1))
                elif k_low == "power":
                    hp_m = HP_RE.search(val)
                    if hp_m:
                        # Metric hp -> kW, truncated toward zero as before.
                        data["power_kw"] = int(int(hp_m.group(1)) / HP_TO_KW_DIVISOR)
                elif "displacement" in k_low:
                    cc_m = CC_RE.search(val)
                    if cc_m:
                        data["engine_cc"] = int(cc_m.group(1))

                # Every row also lands in the generic specifications blob.
                clean_k = self.clean_key(raw_k)
                if clean_k and val:
                    data["specifications"][clean_k] = val

            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()

    async def save_to_library(self, data):
        """Upsert one scraped record into vehicle.external_reference_library.

        Skips empty results (no data, or no make parsed). On DB failure the
        transaction is rolled back and the error is logged; the caller is
        not interrupted.
        """
        if not data or not data["make"]:
            return
        async with AsyncSessionLocal() as db:
            try:
                await db.execute(text("""
                    INSERT INTO vehicle.external_reference_library
                        (source_name, make, model, generation, modification,
                         year_from, year_to, power_kw, engine_cc, specifications, source_url)
                    VALUES ('auto-data.net', :make, :model, :gen, :mod,
                            :y_f, :y_t, :p_kw, :e_cc, :specs, :url)
                    ON CONFLICT (source_url) DO UPDATE SET
                        specifications = EXCLUDED.specifications,
                        last_scraped_at = NOW();
                """), {
                    "make": data["make"], "model": data["model"],
                    "gen": data["generation"], "mod": data["modification"],
                    "y_f": data["year_from"], "y_t": data["year_to"],
                    "p_kw": data["power_kw"], "e_cc": data["engine_cc"],
                    "specs": json.dumps(data["specifications"]),
                    "url": data["source_url"],
                })
                await db.commit()
                logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} ({data['power_kw']} kW)")
            except Exception as e:
                # FIX: roll back the failed transaction so the session is not
                # left in an aborted state before it is closed.
                await db.rollback()
                logger.error(f"DB Hiba: {e}")

    async def run(self):
        """Main worker loop.

        Atomically claims one pending 'engine' queue row at a time
        (FOR UPDATE SKIP LOCKED, so multiple workers can cooperate),
        scrapes it, persists the result, then marks the row
        'completed' or 'error'. Sleeps 60s whenever the queue is empty.
        """
        logger.info("🚀 R4 Adatbányász indítása...")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent="Mozilla/5.0...")
            while True:
                # Claim exactly one pending row and mark it 'processing'.
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE level = 'engine' AND status = 'pending'
                            ORDER BY id ASC LIMIT 1
                            FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()

                if not target:
                    logger.info("🏁 Nincs több feldolgozandó motoradat. Alvás 60mp...")
                    await asyncio.sleep(60)
                    continue

                t_id, t_url, t_name = target
                async with self.semaphore:
                    data = await self.scrape_engine(context, t_url)

                if data:
                    await self.save_to_library(data)
                    new_status = 'completed'
                else:
                    new_status = 'error'

                # Record the outcome on the queue row.
                async with AsyncSessionLocal() as db:
                    await db.execute(
                        text("UPDATE vehicle.auto_data_crawler_queue SET status = :s WHERE id = :id"),
                        {"s": new_status, "id": t_id},
                    )
                    await db.commit()


if __name__ == "__main__":
    asyncio.run(FinalExtractor().run())