átlagos kiegészítések jó sok

This commit is contained in:
Roo
2026-03-22 11:02:05 +00:00
parent f53e0b53df
commit 5d44339f21
249 changed files with 20922 additions and 2253 deletions

View File

@@ -0,0 +1,137 @@
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
# Configure the root logger for INFO and above; every record carries a
# timestamp and the [R1-RECOVERY] tag.
logging.basicConfig(format='%(asctime)s [R1-RECOVERY] %(message)s', level=logging.INFO)
# Module-wide logger, registered under the name "R1".
logger = logging.getLogger("R1")
def _classify_link(name, url, current_url):
    """Return the queue level for one link, or None when it should be skipped.

    Args:
        name: Visible text of the anchor.
        url: Absolute href of the anchor.
        current_url: URL of the page the anchor was found on.

    Returns:
        'engine' for a link classified as a data sheet, 'model' for a link
        classified as a further listing page, or None for anything else.
    """
    # 1. AUTOEVOLUTION
    if "autoevolution.com/moto/" in url:
        if url.endswith(".html") and "#" not in url:
            return 'engine'
        if url.count('/') >= 5:
            return 'model'
        return None
    # 2. BIKEZ
    if "bikez.com" in url:
        if "/motorcycles/" in url:
            return 'engine'
        if "/models/" in url:
            return 'model'
        return None
    # 3. MOTORCYCLESPECS
    if "motorcyclespecs.co.za" in url:
        # A /model/ link with an .htm or .html extension is a data sheet
        # (".htm" is a substring of ".html", so one check covers both).
        if "/model/" in url and ".htm" in url:
            return 'engine'
        # On a brand page, /bikes/ links lead to further listings; links whose
        # text already occurs in the current URL are skipped.
        if "/bikes/" in url and name.lower() not in current_url.lower():
            return 'model'
    return None


async def analyze_and_extract_links(page, current_url, current_level):
    """Collect and classify the crawlable links of the currently loaded page.

    Every <a> element is read from the page, filtered, and classified by
    site-specific URL rules (autoevolution, bikez, motorcyclespecs).

    Args:
        page: Object exposing an awaitable ``eval_on_selector_all(selector,
            expression)``; the caller passes a Playwright page.
        current_url: URL of the page being analysed.
        current_level: Queue level of the page being analysed. Currently
            unused; kept for interface compatibility.

    Returns:
        A list of ``{'name', 'url', 'level'}`` dicts in page order, where
        level is 'engine' or 'model'. Each URL appears at most once: the
        first accepted occurrence wins.
    """
    # Extract the text and href of every anchor on the page.
    hrefs = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    logger.info(f"🔎 Oldal elemzése: {len(hrefs)} link található összesen.")

    found_links = []
    seen_urls = set()
    for link in hrefs:
        url = link['url']
        name = link['name']
        # Skip anchors with no meaningful label.
        if not name or len(name) < 2:
            continue
        # Language shield: only pure-ASCII labels are accepted.
        if not name.isascii():
            continue
        # The same target is often linked several times on one page (menu,
        # body, footer); queueing it once is enough.
        if url in seen_urls:
            continue
        level = _classify_link(name, url, current_url)
        if level is None:
            continue
        seen_urls.add(url)
        found_links.append({'name': name, 'url': url, 'level': level})
    return found_links
async def _claim_next_brand(after_id):
    """Claim the next eligible brand-level bike task whose id is above after_id.

    The row is switched to 'processing' in the same statement that selects it
    (FOR UPDATE SKIP LOCKED), so the claim is a single round trip.

    Args:
        after_id: Only rows with ``id > after_id`` are considered.

    Returns:
        The claimed row as ``(id, url, name, level)``, or None when no row
        qualifies or the claim failed (the failure is logged).
    """
    async with AsyncSessionLocal() as db:
        try:
            # Only brand-level rows are picked up here; rows previously
            # marked 'error' or 'completed_empty' are eligible for a retry.
            res = await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                WHERE id = (
                    SELECT id FROM vehicle.auto_data_crawler_queue
                    WHERE (status = 'pending' OR status = 'error' OR status = 'completed_empty')
                    AND level = 'brand'
                    AND category = 'bike'
                    AND id > :after_id
                    ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                ) RETURNING id, url, name, level
            """), {"after_id": after_id})
            target = res.fetchone()
            await db.commit()
            return target
        except Exception as e:
            logger.error(f"❌ DB Hiba: {e}")
            await db.rollback()
            return None


async def _store_result(t_id, t_name, links):
    """Queue the discovered links and record the outcome on the parent task.

    With links: each is inserted as a 'pending' child (existing URLs are left
    alone) and the parent becomes 'completed'. Without links: the parent
    becomes 'completed_empty'. Everything is committed as one transaction.
    """
    async with AsyncSessionLocal() as db:
        if links:
            for link in links:
                await db.execute(text("""
                    INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                    VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                    ON CONFLICT (url) DO NOTHING
                """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
            await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": t_id})
            logger.info(f"✅ Siker: {t_name} -> {len(links)} új link mentve.")
        else:
            await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed_empty' WHERE id = :id"), {"id": t_id})
            logger.warning(f"⚠️ Üres: {t_name} oldalon nem találtam motorokat.")
        await db.commit()


async def _mark_task_failed(t_id):
    """Set a failed task to 'error' so it is retried instead of left in 'processing'.

    A database failure here is logged and swallowed: it must not take down
    the worker loop.
    """
    try:
        async with AsyncSessionLocal() as db:
            await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"), {"id": t_id})
            await db.commit()
    except Exception as e:
        logger.error(f"❌ DB Hiba: {e}")


async def _scout_target(context, target):
    """Load one claimed task's page, extract its links and persist the outcome.

    Any failure while loading, analysing or storing is logged and the task is
    marked 'error'; the page is closed in every case.
    """
    t_id, t_url, t_name, t_level = target
    page = None
    try:
        page = await context.new_page()
        logger.info(f"🚀 [{t_level}] {t_name} felderítése -> {t_url}")
        await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
        await asyncio.sleep(2)  # give the page's JavaScript time to run
        links = await analyze_and_extract_links(page, t_url, t_level)
        await _store_result(t_id, t_name, links)
    except Exception as e:
        logger.error(f"❌ Hiba: {t_name} -> {e}")
        await _mark_task_failed(t_id)
    finally:
        if page is not None:
            await page.close()


async def main():
    """Run the main control loop of the R1 recovery scout.

    Launches headless Chromium and then loops forever: claim the next
    brand-level bike task, crawl it, store the result, pause 3-5 seconds.

    The queue is scanned in ascending id order with a cursor, so each eligible
    row is attempted at most once per pass. Without the cursor, a row that
    stays 'completed_empty' or 'error' would be re-claimed immediately
    (lowest id first) and starve every row behind it. When a pass finds
    nothing more to claim, the worker sleeps 30 seconds and starts a new pass
    from the beginning, which is when such rows get their retry.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
            logger.info("🤖 R1 Recovery Scout elindult...")
            # NOTE(review): the cursor starts at 0, which assumes queue ids
            # are positive (serial/identity) - confirm against the schema.
            last_id = 0
            while True:
                target = await _claim_next_brand(last_id)
                if not target:
                    logger.info("🏁 Nincs több feladat. Alvás 30mp...")
                    await asyncio.sleep(30)
                    last_id = 0  # begin a fresh pass over the queue
                    continue
                last_id = target[0]
                await _scout_target(context, target)
                # Randomised pause between targets.
                await asyncio.sleep(random.uniform(3, 5))
        finally:
            # Reached on cancellation or an unexpected error; the loop itself
            # never exits normally.
            await browser.close()
# Script entry point: start the scout loop only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())