átlagos kiegészítések, jó sok

This commit is contained in:
Roo
2026-03-22 11:02:05 +00:00
parent f53e0b53df
commit 5d44339f21
249 changed files with 20922 additions and 2253 deletions

View File

@@ -0,0 +1,59 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
import asyncio, logging
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Worker-wide logging: every message is tagged [BIKE-R0] for easy grepping.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [BIKE-R0] %(message)s')
logger = logging.getLogger("R0")

# Seed sources for the level-0 brand-discovery pass.
SOURCES = [
    {
        "name": "AutoEvolution",
        "url": "https://www.autoevolution.com/moto/",
        # More robust selector hint for the brand links (several layout variants).
        # NOTE(review): run_r0 currently collects all /moto/ anchors and does
        # not read this "selector" key — confirm whether it is still needed.
        "selector": ".brand a, .all-brands a, .moto-brand a",
        "category": "bike"
    }
]
async def run_r0():
    """Level-0 seeding pass: collect motorcycle brand links from each source
    and enqueue them as 'brand' rows in vehicle.auto_data_crawler_queue.

    Duplicates are ignored via ON CONFLICT DO NOTHING. FIX: the reported
    count now reflects rows actually inserted (CursorResult.rowcount) instead
    of every attempted insert, so the success log no longer overstates results.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0"
        )
        async with AsyncSessionLocal() as db:
            for src in SOURCES:
                page = await context.new_page()
                try:
                    logger.info(f"Márkák kinyerése: {src['name']}...")
                    await page.goto(src['url'], wait_until="networkidle", timeout=60000)
                    # Collect every /moto/ anchor; the per-source "selector"
                    # is a fallback hint and is intentionally not used here.
                    links = await page.eval_on_selector_all(
                        "a[href*='/moto/']",
                        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
                    )
                    # Keep only clean brand links (e.g. .../moto/aprilia/):
                    # they contain exactly 5 slashes (https:// + domain + moto
                    # + brand + trailing /) and never end in .html (spec pages).
                    brand_links = [
                        l for l in links
                        if l['url'].count('/') == 5 and not l['url'].endswith('.html')
                    ]
                    count = 0
                    for link in brand_links:
                        if len(link['name']) < 2:
                            continue  # skip empty/icon-only anchors
                        res = await db.execute(text("""
                            INSERT INTO vehicle.auto_data_crawler_queue (url, level, name, status, category)
                            VALUES (:url, 'brand', :name, 'pending', 'bike')
                            ON CONFLICT (url) DO NOTHING
                        """), {"url": link['url'], "name": link['name']})
                        # FIX: ON CONFLICT DO NOTHING leaves rowcount == 0 for
                        # duplicates, so only genuinely new rows are counted.
                        count += res.rowcount
                    await db.commit()
                    logger.info(f"✅ [{src['name']}] kész: {count} márkát találtam.")
                except Exception as e:
                    logger.error(f"❌ Hiba: {e}")
                finally:
                    await page.close()
        await browser.close()
# Entry point: run the one-shot brand-discovery pass when executed directly.
if __name__ == "__main__":
    asyncio.run(run_r0())

View File

@@ -0,0 +1,171 @@
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
# Keep verbose logging for every event; messages are tagged [BIKE-R1-AUTOEVO].
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [BIKE-R1-AUTOEVO] %(message)s'
)
logger = logging.getLogger("R1")
async def analyze_and_extract_links(page, current_url):
    """Classify AutoEvolution links found on *page* into crawl levels.

    Returns a list of {'name', 'url', 'level'} dicts, where level is
    'engine' (spec pages ending in .html) or 'model' (deeper non-.html
    paths). Keeps the Language Shield, the noise filtering and the
    anchor-stripping fix.
    """
    found_links = []
    # Collect every candidate link for analysis with the given selector.
    hrefs = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    junk_keywords = [
        'privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising',
        'about us', 'copyright', 'login', 'registration'
    ]
    for link in hrefs:
        # --- ANCHOR AND QUERY-STRING CLEANUP ---
        # This was the earlier bug: strip the '#...' fragment (and '?...'
        # params) but keep the link itself for the checks below.
        raw_url = link['url'].split('#')[0].split('?')[0].rstrip('/')
        name = link['name']
        # --- 1. LANGUAGE SHIELD & NOISE FILTERING ---
        if not name or len(name) < 2:
            continue
        # ASCII-only names (rejects Greek/Cyrillic/accented link texts).
        if re.search(r'[^\x00-\x7F]+', name):
            continue
        # Drop navigational junk (privacy/cookie/login links etc.).
        if any(junk in name.lower() for junk in junk_keywords):
            continue
        # --- 2. AUTOEVOLUTION DEPTH LOGIC ---
        if "autoevolution.com/moto/" in raw_url:
            # Skip self-references and the /moto landing page (back-navigation).
            if raw_url == current_url.rstrip('/') or raw_url.endswith('/moto'):
                continue
            # Branch between levels based on the URL structure.
            path_segments = raw_url.strip('/').split('/')
            # Paths ending in .html are technical spec pages (ENGINE level).
            if raw_url.endswith(".html"):
                found_links.append({'name': name, 'url': raw_url, 'level': 'engine'})
            # At least 6 segments without .html means a sub-model or
            # generation page (MODEL level).
            elif len(path_segments) >= 6:
                found_links.append({'name': name, 'url': raw_url, 'level': 'model'})
    return found_links
async def get_next_task(db):
    """Claim the next pending AutoEvolution bike task, preferring 'brand' rows.

    Atomically flips the row's status to 'processing' and returns
    (id, url, name, level), or None when the queue is drained.
    FOR UPDATE SKIP LOCKED makes concurrent workers safe: each row is
    claimed by at most one worker.
    """
    query = text("""
        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
        WHERE id = (
            SELECT id FROM vehicle.auto_data_crawler_queue
            WHERE status = 'pending'
            AND category = 'bike'
            AND url LIKE '%autoevolution.com%'
            AND level IN ('brand', 'model')
            ORDER BY
                CASE WHEN level = 'brand' THEN 0 ELSE 1 END ASC,
                id ASC
            LIMIT 1 FOR UPDATE SKIP LOCKED
        ) RETURNING id, url, name, level
    """)
    res = await db.execute(query)
    return res.fetchone()
async def main():
    """Main worker loop with full error handling and transaction safety.

    Repeatedly claims a task, scrapes its page, enqueues the discovered
    links, and marks the task 'completed' — or 'error' with the exception
    message when anything fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        logger.info("🤖 R1 AutoEvolution Specialist elindult...")
        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    target = await get_next_task(db)
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ Adatbázis hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue
            if not target:
                # Queue drained: idle instead of exiting so new work is picked up.
                logger.info("🏁 Nincs több AutoEvolution feladat. Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            t_id, t_url, t_name, t_level = target
            page = await context.new_page()
            try:
                logger.info(f"🚀 Felderítés ({t_level}): {t_name} -> {t_url}")
                # domcontentloaded is faster; give the in-page JS a moment after.
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(random.uniform(2, 3))
                links = await analyze_and_extract_links(page, t_url)
                async with AsyncSessionLocal() as db:
                    try:
                        new_links_count = 0
                        for link in links:
                            # Save every discovered variant to the queue.
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
                            # NOTE(review): this counts attempted inserts; rows
                            # skipped by ON CONFLICT are still counted.
                            new_links_count += 1
                        # Close the task.
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"{t_name} kész. Talált AutoEvolution linkek: {new_links_count}")
                    except Exception as inner_db_error:
                        # Roll the partial batch back, then let the outer
                        # handler mark the task as errored.
                        await db.rollback()
                        logger.error(f"❌ Belső mentési hiba: {inner_db_error}")
                        raise inner_db_error
            except Exception as e:
                logger.error(f"❌ Kritikus hiba a navigáció során: {t_name} -> {e}")
                # Persist the failure so the row can be retried or inspected.
                async with AsyncSessionLocal() as db:
                    await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error', error_msg = :msg, updated_at = NOW() WHERE id = :id"),
                                     {"msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                # Throttle between tasks to avoid getting banned.
                await asyncio.sleep(random.uniform(3, 5))
        # NOTE(review): unreachable — the while-loop above never breaks.
        await browser.close()
# Entry point: run the worker loop until interrupted (Ctrl+C).
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")

View File

@@ -0,0 +1,173 @@
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING ---
# Verbose stream logging; messages are tagged [R2-BIKE-DEPTH].
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R2-BIKE-DEPTH] %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("R2")
async def get_page_safe(page, url):
    """Navigate *page* to *url* with human-like pacing and a Cloudflare check.

    Raises on navigation failure or when a bot-protection interstitial is
    detected; both cases are logged before the exception propagates.
    Returns the (navigated) page on success.
    """
    # Randomized pause first, mimicking a human visitor.
    await asyncio.sleep(random.uniform(4, 7))
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        title = await page.title()
        # Cloudflare interstitials expose themselves through the page title.
        if any(marker in title for marker in ("Just a moment", "Cloudflare")):
            logger.error(f"Bot védelem észlelve: {url}")
            raise Exception("Bot védelem (CF) megállította a robotot.")
    except Exception as e:
        logger.error(f"Hiba az oldal betöltésekor: {url} -> {e}")
        raise
    return page
async def extract_scoped_links(page, p_id, current_url):
    """DEPTH EXPLORATION: extract Generation -> Engine variant links.

    Scope-Lock: only links that stay within the current brand are followed.
    Returns the number of queue inserts it attempted (rows skipped by
    ON CONFLICT are still counted).
    """
    # Derive the brand slug from the URL for the scope-lock
    # (segment 4 of e.g. https: / '' / www.autoevolution.com / moto / <brand>).
    path_segments = current_url.strip('/').split('/')
    if len(path_segments) < 5:
        return 0
    brand_anchor = path_segments[4]
    hrefs = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    junk = ['privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising', 'login', 'about', 'copyright']
    found_count = 0
    async with AsyncSessionLocal() as db:
        for link in hrefs:
            # CLEANUP: strip the anchor/query so we see the real spec-page URL.
            clean_url = link['url'].split('#')[0].split('?')[0].rstrip('/')
            name = link['name'].replace('\n', ' ').strip()
            # Basic filters: empty, non-ASCII or junk-keyword names.
            if not name or len(name) < 2: continue
            if re.search(r'[^\x00-\x7F]+', name): continue
            if any(k in name.lower() for k in junk): continue
            # SCOPE LOCK: only pass links belonging to the current brand.
            # NOTE(review): brand_anchor keeps its original case while the URL
            # is lowercased — a mixed-case slug would never match; confirm.
            if brand_anchor not in clean_url.lower():
                continue
            # Navigation filter: shallow index/brand-list pages are dropped.
            if any(x in clean_url for x in ['-brand-', 'allbrands', 'en/brands', '/moto/']):
                if clean_url.count('/') < 5: continue
            # Avoid self-references.
            if clean_url == current_url.rstrip('/'):
                continue
            # Level classification: .html pages are engine specs, deeper
            # directory paths are generations, anything else is ignored.
            if clean_url.endswith(".html"):
                target_level = 'engine'
            elif clean_url.count('/') >= 6:
                target_level = 'generation'
            else:
                continue
            # Persist into the crawl queue (duplicates silently skipped).
            await db.execute(text("""
                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                ON CONFLICT (url) DO NOTHING
            """), {"url": clean_url, "level": target_level, "p_id": p_id, "name": name})
            found_count += 1
        await db.commit()
    return found_count
async def process_target(context, t_id, t_url, t_name, t_level):
    """Fully process one claimed queue row (URL).

    Scrapes the page, enqueues the discovered variants, then marks the row
    'completed' (or 'completed_leaf' when nothing new was found). On any
    failure the row is marked 'error' with the exception message.
    """
    page = await context.new_page()
    try:
        logger.info(f"🚀 Mélységi fúrás [{t_level}]: {t_name}")
        await get_page_safe(page, t_url)
        # Extract variants and generations from the loaded page.
        found = await extract_scoped_links(page, t_id, t_url)
        async with AsyncSessionLocal() as db:
            # Leaf pages (no further links) get a distinct status so they
            # are not re-queued for depth exploration.
            new_status = 'completed' if found > 0 else 'completed_leaf'
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = :s, error_msg = NULL, updated_at = NOW()
                WHERE id = :id
            """), {"s": new_status, "id": t_id})
            await db.commit()
        logger.info(f"✅ Befejezve: {t_name} -> {found} új variáció rögzítve.")
    except Exception as e:
        logger.error(f"❌ Kritikus hiba feldolgozás közben ({t_name}): {e}")
        # Persist the failure for later retry/inspection.
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'error', error_msg = :msg, updated_at = NOW()
                WHERE id = :id
            """), {"msg": str(e), "id": t_id})
            await db.commit()
    finally:
        await page.close()
async def main():
    """Main loop with a depth-first strategy (level ASC).

    Claims 'model'/'generation' rows one at a time with SKIP LOCKED and
    hands each to process_target; sleeps when the queue is drained.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R2 Motoros Mélységi Felderítő aktív.")
        while True:
            # Atomically claim the next pending row (safe across workers).
            async with AsyncSessionLocal() as db:
                res = await db.execute(text("""
                    UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.auto_data_crawler_queue
                        WHERE status = 'pending'
                        AND level IN ('model', 'generation')
                        AND category = 'bike'
                        AND url LIKE '%autoevolution.com%'
                        ORDER BY level ASC, id ASC
                        LIMIT 1 FOR UPDATE SKIP LOCKED
                    ) RETURNING id, url, name, level
                """))
                target = res.fetchone()
                await db.commit()
            if not target:
                logger.info("🏁 Minden variáció felderítve. Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            await process_target(context, target[0], target[1], target[2], target[3])
        # NOTE(review): unreachable — the while-loop above never breaks.
        await browser.close()
# Entry point: run the depth worker until interrupted (Ctrl+C).
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")

View File

@@ -0,0 +1,95 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
import asyncio
import logging
import random
import json
import re
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Logging: every message tagged [R3-EXTRACTOR].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR] %(message)s')
logger = logging.getLogger("R3")
class R3DataMiner:
    """Engine-level spec extractor.

    Drains pending 'engine' queue rows, scrapes each spec page with
    BeautifulSoup, and upserts the parsed result into
    vehicle.external_reference_library.
    """

    def clean_key(self, key):
        """Normalize a spec-table header into a short capitalized key.

        Keeps only the part after the last comma, strips leading
        'What is the'/'How much'/'How many' question phrasing, and drops
        anything after a '?'.
        """
        if "," in key: key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        return key.split("?")[0].strip().capitalize()

    async def scrape_specs(self, context, url):
        """Open *url* in a fresh page and parse its <tr><th>/<td> spec rows.

        Returns a dict with identity fields (make/model/generation/
        modification), numeric fields (year_from, power_kw, engine_cc) and
        the full raw 'specifications' mapping — or None on any error.
        """
        page = await context.new_page()
        try:
            # Randomized delay so the worker looks less like a bot.
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            data = {"make": "", "model": "", "generation": "", "modification": "",
                    "year_from": None, "power_kw": 0, "engine_cc": 0,
                    "specifications": {}, "source_url": url}
            # The spec table is a sequence of <tr><th>key</th><td>value</td> rows.
            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td: continue
                k_raw, v = th.get_text(strip=True), td.get_text(strip=True)
                k_low = k_raw.lower()
                if "brand" == k_low: data["make"] = v
                elif "model" == k_low: data["model"] = v
                elif "generation" == k_low: data["generation"] = v
                elif "modification" == k_low: data["modification"] = v
                elif "start of production" in k_low:
                    # First 4-digit number in the cell is taken as the year.
                    m = re.search(r'(\d{4})', v)
                    data["year_from"] = int(m.group(1)) if m else None
                elif "power" == k_low:
                    # Convert horsepower to kW (divide by 1.36, metric hp).
                    hp = re.search(r'(\d+)\s*Hp', v, re.I)
                    if hp: data["power_kw"] = int(int(hp.group(1)) / 1.36)
                elif "displacement" in k_low:
                    # NOTE(review): matches only a literal 'cm3' suffix —
                    # confirm the site never renders it as 'cm³'.
                    cc = re.search(r'(\d+)\s*cm3', v)
                    if cc: data["engine_cc"] = int(cc.group(1))
                # Every row also lands in the raw specifications map.
                data["specifications"][self.clean_key(k_raw)] = v
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon: {e}"); return None
        finally: await page.close()

    async def run(self):
        """Main drain loop: claim, scrape, upsert, mark completed/error.

        Exits when no pending 'engine' row remains.
        NOTE(review): unlike R1/R2/R4 this claim query has no category or
        URL filter, yet the insert labels the source 'auto-data.net' —
        confirm which site this worker is actually meant to serve.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent="Mozilla/5.0...")
            while True:
                # Atomically claim the next row; SKIP LOCKED allows parallel runs.
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (SELECT id FROM vehicle.auto_data_crawler_queue
                                    WHERE level = 'engine' AND status = 'pending'
                                    ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED)
                        RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()
                if not target: break
                data = await self.scrape_specs(context, target[1])
                if data and data["make"]:
                    # Upsert the parsed record, then close the queue row.
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                        """), {"make": data["make"], "model": data["model"], "gen": data["generation"], "mod": data["modification"],
                               "y": data["year_from"], "p": data["power_kw"], "e": data["engine_cc"], "s": json.dumps(data["specifications"]), "u": data["source_url"]})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                else:
                    # Scrape failed or the make is missing: flag the row.
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
            await browser.close()
# Entry point: run the miner until the 'engine' queue is drained.
if __name__ == "__main__": asyncio.run(R3DataMiner().run())

View File

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
import asyncio
import logging
import random
import json
import sys
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-HARVESTER-v1.2] %(message)s')
logger = logging.getLogger("R4")
# --- CONFIGURATION PARAMETERS ---
MAX_RETRY_LIMIT = 5  # max 5 attempts per vehicle before manual review
async def parse_specs(page):
    """
    THE WINNING DOM-PARSING LOGIC (complete).
    Recognizes broken legacy tables (td.left/td.right), modern dt/dd lists,
    .spec-row label/value pairs, and falls back to bolded-text scraping for
    vintage pages. Returns a dict of spec key -> value filtered to known
    relevant keys (or the unfiltered dict when the filter would drop
    everything), or None when nothing was found / the JS evaluation failed.
    """
    # NOTE: the script below runs verbatim inside the page via
    # page.evaluate(); its text (including its comments) is runtime data.
    script = """
    () => {
        let results = {};
        // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
        let leftCells = document.querySelectorAll('td.left');
        leftCells.forEach(cell => {
            let key = cell.innerText.replace(/:$/, '').trim();
            let rightCell = cell.nextElementSibling;
            if(rightCell && rightCell.classList.contains('right')) {
                results[key] = rightCell.innerText.trim();
            }
        });
        // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
        let dts = document.querySelectorAll('dt');
        dts.forEach(dt => {
            let key = dt.innerText.replace(/:$/, '').trim();
            let dd = dt.nextElementSibling;
            if(dd && dd.tagName.toLowerCase() === 'dd') {
                results[key] = dd.innerText.trim();
            }
        });
        // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
        let specRows = document.querySelectorAll('.spec-row');
        specRows.forEach(row => {
            let label = row.querySelector('.label');
            let value = row.querySelector('.value');
            if(label && value) {
                let key = label.innerText.replace(/:$/, '').trim();
                if (!results[key]) {
                    results[key] = value.innerText.trim();
                }
            }
        });
        // 4. MÓDSZER: Veterán ("Adler") fallback -> Vastagított szöveg
        if (Object.keys(results).length === 0) {
            document.querySelectorAll('b, strong').forEach(b => {
                let key = b.innerText.replace(/:$/, '').trim();
                if(key.length > 2 && key.length < 30) {
                    let val = "";
                    if(b.nextSibling && b.nextSibling.nodeType === 3) {
                        val = b.nextSibling.textContent.trim();
                    }
                    else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                        val = b.nextElementSibling.innerText.trim();
                    }
                    if(val && !results[key]) {
                        results[key] = val;
                    }
                }
            });
        }
        return results;
    }
    """
    try:
        data = await page.evaluate(script)
        if data and len(data) > 0:
            # Whitelist of spec terms worth keeping.
            relevant_keys = [
                "Production", "Year", "Segment",
                "Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                "Wet Weight", "Front", "Rear"
            ]
            # Keep keys containing any relevant term (case-insensitive); if the
            # filter strips everything, fall back to the full harvested dict.
            filtered_data = {k: v for k, v in data.items() if any(rk.lower() in k.lower() for rk in relevant_keys)}
            return filtered_data if len(filtered_data) > 0 else data
        return None
    except Exception as e:
        logger.error(f"❌ Parszolási hiba a JS kiértékeléskor: {e}")
        return None
async def main():
    """R4 harvester main loop.

    Claims pending/errored 'engine' bike rows that are still under the retry
    limit, parses each spec page with parse_specs, and either upserts the
    result into vehicle.motorcycle_specs or bumps retry_count — escalating
    to 'manual_review_needed' once the limit is exhausted.

    FIX: the retry ceiling now consistently uses the module-level
    MAX_RETRY_LIMIT constant (previously declared but unused — the literal 5
    was hard-coded in the SQL, both comparisons and the log messages).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R4 Motor Adat-Arató v1.2 elindult.")
        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    # 'completed_empty' was removed from the claimable statuses:
                    # only 'pending' and 'error' rows under the retry limit qualify.
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE status IN ('pending', 'error')
                            AND retry_count < :max_retry
                            AND level = 'engine' AND category = 'bike'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name, retry_count
                    """), {"max_retry": MAX_RETRY_LIMIT})
                    target = res.fetchone()
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue
            if not target:
                logger.info("🏁 Minden motor feldolgozva vagy manuális felülvizsgálatra vár. Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            t_id, t_url, t_name, t_retry_count = target
            if t_retry_count is None: t_retry_count = 0  # legacy rows may hold NULL
            page = await context.new_page()
            try:
                logger.info(f"📊 [{t_retry_count + 1}/{MAX_RETRY_LIMIT}] Adatbányászat: {t_name}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)
                data = await parse_specs(page)
                async with AsyncSessionLocal() as db:
                    if data and len(data) > 0:
                        # SUCCESSFUL SAVE: upsert the raw spec dict.
                        await db.execute(text("""
                            INSERT INTO vehicle.motorcycle_specs (crawler_id, full_name, raw_data, url)
                            VALUES (:cid, :name, :data, :url)
                            ON CONFLICT (crawler_id) DO UPDATE SET raw_data = :data, updated_at = NOW()
                        """), {"cid": t_id, "name": t_name, "data": json.dumps(data), "url": t_url})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ Mentve: {t_name} ({len(data)} paraméter)")
                    else:
                        # EMPTY PAGE OR MISSING DATA
                        new_retry_count = t_retry_count + 1
                        if new_retry_count >= MAX_RETRY_LIMIT:
                            # Limit reached -> flag for manual review.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 5 próbálkozás után (üres oldal)',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manuális javításra jelölve.")
                        else:
                            # Can still retry -> put the row back in 'error'.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.warning(f"⚠️ Üres maradt: {t_name} (Próbálkozás: {new_retry_count}/{MAX_RETRY_LIMIT})")
                        await db.commit()
            except Exception as e:
                logger.error(f"❌ Hiba a feldolgozás során: {t_name} -> {e}")
                # Navigation/processing crash: bump retries, escalate at the limit.
                async with AsyncSessionLocal() as db:
                    new_retry_count = t_retry_count + 1
                    status = 'error' if new_retry_count < MAX_RETRY_LIMIT else 'manual_review_needed'
                    await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = :st,
                            retry_count = :rc,
                            error_msg = :msg,
                            updated_at = NOW()
                        WHERE id = :id
                    """), {"st": status, "rc": new_retry_count, "msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                await asyncio.sleep(random.uniform(2.0, 4.0))
        await browser.close()
# Entry point: run the harvester loop until interrupted (Ctrl+C).
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")

View File

@@ -0,0 +1,113 @@
import asyncio
import json
from playwright.async_api import async_playwright
async def test_scraper():
    """Manual smoke test for the AutoEvolution DOM parser.

    Visits two problem-focused URLs — a modern Aprilia page and an older
    BMW page with broken HTML — runs the in-page parser, and prints the
    filtered spec dictionary for visual inspection.
    """
    # Two problem-focused URLs: the modern Aprilia and the old, broken-HTML BMW.
    test_urls = [
        "https://www.autoevolution.com/moto/aprilia-rs-660-factory-2025.html",
        "https://www.autoevolution.com/moto/bmw-f-650-gs-2011.html"
    ]
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        for url in test_urls:
            print(f"\n{'='*60}")
            print(f"🌍 MEGNYITÁS: {url}")
            print(f"{'='*60}")
            # Wait for the DOM to load.
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            await asyncio.sleep(2)  # brief pause for in-page JS
            # THE REFINED AUTOEVOLUTION PARSER. Runs verbatim inside the page
            # via page.evaluate(); its comments are part of the runtime string.
            script = """
            () => {
                let results = {};
                // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
                let leftCells = document.querySelectorAll('td.left');
                leftCells.forEach(cell => {
                    let key = cell.innerText.replace(/:$/, '').trim();
                    let rightCell = cell.nextElementSibling;
                    if(rightCell && rightCell.classList.contains('right')) {
                        results[key] = rightCell.innerText.trim();
                    }
                });
                // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
                let dts = document.querySelectorAll('dt');
                dts.forEach(dt => {
                    let key = dt.innerText.replace(/:$/, '').trim();
                    let dd = dt.nextElementSibling;
                    if(dd && dd.tagName.toLowerCase() === 'dd') {
                        results[key] = dd.innerText.trim();
                    }
                });
                // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
                let specRows = document.querySelectorAll('.spec-row');
                specRows.forEach(row => {
                    let label = row.querySelector('.label');
                    let value = row.querySelector('.value');
                    if(label && value) {
                        let key = label.innerText.replace(/:$/, '').trim();
                        if (!results[key]) {
                            results[key] = value.innerText.trim();
                        }
                    }
                });
                // 4. MÓDSZER: "Adler" típusú elavult leírások fallbackje -> Vastagított szöveg
                if (Object.keys(results).length === 0) {
                    document.querySelectorAll('b, strong').forEach(b => {
                        let key = b.innerText.replace(/:$/, '').trim();
                        if(key.length > 2 && key.length < 30) {
                            let val = "";
                            // Ha a szöveg közvetlenül a tag után van (Text Node)
                            if(b.nextSibling && b.nextSibling.nodeType === 3) {
                                val = b.nextSibling.textContent.trim();
                            }
                            // Ha egy másik elemben van
                            else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                                val = b.nextElementSibling.innerText.trim();
                            }
                            if(val && !results[key]) {
                                results[key] = val;
                            }
                        }
                    });
                }
                return results;
            }
            """
            data = await page.evaluate(script)
            if data and len(data) > 0:
                # Filter out the noise; keep only relevant technical spec keys.
                relevant_keys = ["Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                                 "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                                 "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                                 "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                                 "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                                 "Wet Weight", "Front", "Rear"]
                filtered_data = {k: v for k, v in data.items() if any(rk.lower() in k.lower() for rk in relevant_keys)}
                print("\n🟢 KINYERT ADATOK (DOM PARSZOLÓ):")
                print(json.dumps(filtered_data if filtered_data else data, indent=2, ensure_ascii=False))
                print(f"\n✅ Összesen {len(filtered_data if filtered_data else data)} műszaki paramétert találtam.")
            else:
                print("\n🔴 NULLA ADAT - A DOM parszoló nem talált egyezést.")
        await browser.close()
# Entry point: run the manual smoke test when executed directly.
if __name__ == "__main__":
    asyncio.run(test_scraper())