import asyncio
import logging
import random
import re

from playwright.async_api import async_playwright
from sqlalchemy import text

from app.database import AsyncSessionLocal

# --- LOGGING CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R1-RECOVERY] %(message)s'
)
logger = logging.getLogger("R1")


async def analyze_and_extract_links(page, current_url, current_level):
    """Classify the anchors on the current page into crawl-queue levels.

    Rationale: intelligent link classification. Fix: motorcyclespecs
    (.htm and /model/) support added.

    Args:
        page: Playwright page already navigated to ``current_url``.
        current_url: URL of the page being analyzed (used to filter
            self-referencing brand links on motorcyclespecs).
        current_level: Queue level of the item being processed. Currently
            unused; kept for interface compatibility with callers.

    Returns:
        list[dict]: dicts with keys ``name``, ``url`` and ``level``
        ('engine' for spec/detail pages, 'model' for listing pages).
    """
    found_links = []

    # Extract every anchor's visible text and absolute href in one pass.
    hrefs = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    logger.info(f"🔎 Oldal elemzése: {len(hrefs)} link található összesen.")

    for link in hrefs:
        url = link['url']
        name = link['name']
        if not name or len(name) < 2:
            continue
        if re.search(r'[^\x00-\x7F]+', name):
            continue  # Language shield: skip non-ASCII anchor text.

        # 1. AUTOEVOLUTION
        if "autoevolution.com/moto/" in url:
            if url.endswith(".html") and "#" not in url:
                found_links.append({'name': name, 'url': url, 'level': 'engine'})
            elif url.count('/') >= 5:
                found_links.append({'name': name, 'url': url, 'level': 'model'})

        # 2. BIKEZ
        elif "bikez.com" in url:
            if "/motorcycles/" in url:
                found_links.append({'name': name, 'url': url, 'level': 'engine'})
            elif "/models/" in url:
                found_links.append({'name': name, 'url': url, 'level': 'model'})

        # 3. MOTORCYCLESPECS (critical fix!)
        elif "motorcyclespecs.co.za" in url:
            # A link containing /model/ and ending in .htm(l) is a spec sheet.
            if "/model/" in url and (".htm" in url or ".html" in url):
                found_links.append({'name': name, 'url': url, 'level': 'engine'})
            # On a brand page, further listing pages appear under /bikes/.
            elif "/bikes/" in url and name.lower() not in current_url.lower():
                found_links.append({'name': name, 'url': url, 'level': 'model'})

    return found_links


async def main():
    """Main control loop.

    Repeatedly claims one brand-level bike task from the crawler queue
    (using FOR UPDATE SKIP LOCKED so concurrent workers never grab the
    same row), crawls the page with Playwright, and enqueues every
    discovered link as a new 'pending' queue item.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        logger.info("🤖 R1 Recovery Scout elindult...")

        while True:
            target = None
            async with AsyncSessionLocal() as db:
                try:
                    # Claim a task (brand or model level). 'error' and
                    # 'completed_empty' rows are deliberately retried.
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE (status = 'pending' OR status = 'error' OR status = 'completed_empty')
                              AND level = 'brand' AND category = 'bike'
                            ORDER BY id ASC
                            LIMIT 1
                            FOR UPDATE SKIP LOCKED
                        )
                        RETURNING id, url, name, level
                    """))
                    target = res.fetchone()
                    await db.commit()
                except Exception as e:
                    logger.error(f"❌ DB Hiba: {e}")
                    await db.rollback()

            if not target:
                logger.info("🏁 Nincs több feladat. Alvás 30mp...")
                await asyncio.sleep(30)
                continue

            t_id, t_url, t_name, t_level = target
            page = await context.new_page()
            try:
                logger.info(f"🚀 [{t_level}] {t_name} felderítése -> {t_url}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)  # Give client-side JavaScript time to run.

                links = await analyze_and_extract_links(page, t_url, t_level)

                async with AsyncSessionLocal() as db:
                    if links:
                        for link in links:
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": t_id})
                        logger.info(f"✅ Siker: {t_name} -> {len(links)} új link mentve.")
                    else:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed_empty' WHERE id = :id"), {"id": t_id})
                        logger.warning(f"⚠️ Üres: {t_name} oldalon nem találtam motorokat.")
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ Hiba: {t_name} -> {e}")
                # BUG FIX: previously a failed item was left in 'processing'
                # forever and never retried. Mark it 'error' so the claim
                # query (which accepts 'error') can pick it up again.
                try:
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"), {"id": t_id})
                        await db.commit()
                except Exception as db_e:
                    logger.error(f"❌ DB Hiba: {db_e}")
            finally:
                await page.close()

            await asyncio.sleep(random.uniform(3, 5))

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())