átlagos kiegészítések jó sok
This commit is contained in:
137
backend/app/workers/vehicle/R1_model_scout.py
Normal file
137
backend/app/workers/vehicle/R1_model_scout.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- LOGGING CONFIGURATION ---
# Root logging is set up at import time: INFO level, and every record carries
# the "[R1-RECOVERY]" tag right after the timestamp.
logging.basicConfig(
    format="%(asctime)s [R1-RECOVERY] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("R1")  # logger shared by the functions in this module
|
||||
|
||||
def _classify_link(url, name, current_url_lower):
    """Return the queue level ('engine' or 'model') for one link, or None to skip it."""
    # 1. AUTOEVOLUTION
    if "autoevolution.com/moto/" in url:
        if url.endswith(".html") and "#" not in url:
            return 'engine'
        if url.count('/') >= 5:
            return 'model'
        return None

    # 2. BIKEZ
    if "bikez.com" in url:
        if "/motorcycles/" in url:
            return 'engine'
        if "/models/" in url:
            return 'model'
        return None

    # 3. MOTORCYCLESPECS
    if "motorcyclespecs.co.za" in url:
        # A "/model/" link that carries an .htm extension is a data sheet.
        # Checking ".htm" alone is enough: it is a substring of ".html".
        if "/model/" in url and ".htm" in url:
            return 'engine'
        # On a brand page, further listings are followed unless the link text
        # already occurs in the URL of the page being analysed.
        if "/bikes/" in url and name.lower() not in current_url_lower:
            return 'model'

    return None


async def analyze_and_extract_links(page, current_url, current_level):
    """Classify the anchors of ``page`` into follow-up crawl targets.

    Every ``<a>`` element is read through ``page.eval_on_selector_all`` and
    its URL is matched against the patterns of the three supported sites
    (autoevolution.com, bikez.com, motorcyclespecs.co.za).

    Args:
        page: Object exposing an awaitable
            ``eval_on_selector_all(selector, expression)``.
        current_url: URL of the page being analysed; used to skip
            motorcyclespecs "/bikes/" links whose text already occurs in it.
        current_level: Queue level of the analysed page. Not used by the
            classification; kept so existing callers keep working.

    Returns:
        A list of ``{'name', 'url', 'level'}`` dicts in page order, where
        ``level`` is ``'engine'`` or ``'model'``. Each URL appears at most
        once (the first occurrence wins).
    """
    # Extract the text and the resolved href of every anchor on the page.
    hrefs = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    logger.info(f"🔎 Oldal elemzése: {len(hrefs)} link található összesen.")

    current_url_lower = current_url.lower()  # loop invariant, computed once
    found_links = []
    seen_urls = set()

    for link in hrefs:
        url = link.get('url') or ''
        name = link.get('name') or ''

        if len(name) < 2:
            continue
        if not name.isascii():  # language shield: skip non-ASCII link texts
            continue
        if url in seen_urls:
            # The same target is often linked several times on one page;
            # returning it once avoids redundant work in the caller.
            continue

        level = _classify_link(url, name, current_url_lower)
        if level is None:
            continue

        seen_urls.add(url)
        found_links.append({'name': name, 'url': url, 'level': level})

    return found_links
|
||||
|
||||
async def _claim_next_brand(after_id):
    """Claim the next bike brand row whose id is greater than ``after_id``.

    The row is switched to 'processing' in the same statement that selects it
    (``FOR UPDATE SKIP LOCKED``). Returns the ``(id, url, name, level)`` row,
    or ``None`` when no candidate is left. Database errors are re-raised after
    a rollback.
    """
    async with AsyncSessionLocal() as db:
        try:
            res = await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                WHERE id = (
                    SELECT id FROM vehicle.auto_data_crawler_queue
                    WHERE (status = 'pending' OR status = 'error' OR status = 'completed_empty')
                    AND level = 'brand'
                    AND category = 'bike'
                    AND id > :after_id
                    ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                ) RETURNING id, url, name, level
            """), {"after_id": after_id})
            target = res.fetchone()
            await db.commit()
            return target
        except Exception:
            await db.rollback()
            raise


async def _store_scout_result(target_id, target_name, links):
    """Queue the discovered links and record the outcome on the brand row."""
    async with AsyncSessionLocal() as db:
        if links:
            # One executemany call instead of one round trip per link.
            await db.execute(
                text("""
                    INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                    VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                    ON CONFLICT (url) DO NOTHING
                """),
                [
                    {"url": link['url'], "level": link['level'], "p_id": target_id, "name": link['name']}
                    for link in links
                ],
            )
            await db.execute(
                text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"),
                {"id": target_id},
            )
            logger.info(f"✅ Siker: {target_name} -> {len(links)} új link mentve.")
        else:
            await db.execute(
                text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed_empty' WHERE id = :id"),
                {"id": target_id},
            )
            logger.warning(f"⚠️ Üres: {target_name} oldalon nem találtam motorokat.")

        await db.commit()


async def _mark_target_failed(target_id):
    """Best-effort: flag the row as 'error' so that a later pass retries it."""
    try:
        async with AsyncSessionLocal() as db:
            await db.execute(
                text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"),
                {"id": target_id},
            )
            await db.commit()
    except Exception as e:
        # Never let bookkeeping failures take the worker loop down.
        logger.error(f"❌ DB Hiba: {e}")


async def main():
    """Run the main control loop of the scout.

    Repeatedly claims a bike brand row from ``vehicle.auto_data_crawler_queue``,
    opens its URL in a headless Chromium page, classifies the links found
    there and queues them as new 'pending' rows. The loop never returns on its
    own; it sleeps 30 seconds whenever a pass over the queue is exhausted.

    Rows are visited in ascending id order within a pass. Without that cursor
    a 'completed_empty' (or failing) row with the lowest id would be claimed
    again on every iteration and starve all later rows.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )

            logger.info("🤖 R1 Recovery Scout elindult...")

            # Id of the last row handled in the current pass.
            # NOTE(review): assumes queue ids are positive integers — confirm.
            last_id = 0

            while True:
                # Pick up the next task.
                try:
                    target = await _claim_next_brand(last_id)
                except Exception as e:
                    logger.error(f"❌ DB Hiba: {e}")
                    await asyncio.sleep(30)
                    continue

                if not target:
                    logger.info("🏁 Nincs több feladat. Alvás 30mp...")
                    await asyncio.sleep(30)
                    last_id = 0  # start a fresh pass over the queue
                    continue

                t_id, t_url, t_name, t_level = target
                last_id = t_id
                page = None

                try:
                    page = await context.new_page()
                    logger.info(f"🚀 [{t_level}] {t_name} felderítése -> {t_url}")
                    await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                    await asyncio.sleep(2)  # give the page's JavaScript time to run

                    links = await analyze_and_extract_links(page, t_url, t_level)
                    await _store_scout_result(t_id, t_name, links)
                except Exception as e:
                    logger.error(f"❌ Hiba: {t_name} -> {e}")
                    # Do not leave the row stuck in 'processing'.
                    await _mark_target_failed(t_id)
                finally:
                    if page is not None:
                        await page.close()
                    # Randomised pause between two page visits.
                    await asyncio.sleep(random.uniform(3, 5))
        finally:
            # Reached when the loop is left by an error or cancellation.
            await browser.close()
|
||||
|
||||
# Script entry point: start the scout loop on a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user