átlagos kiegészítések jó sok

This commit is contained in:
Roo
2026-03-22 11:02:05 +00:00
parent f53e0b53df
commit 5d44339f21
249 changed files with 20922 additions and 2253 deletions

View File

@@ -1,10 +1,10 @@
# /app/app/workers/monitor_dashboard.py
# docker exec sf_api python -m app.workers.monitor_dashboard
# /opt/docker/dev/service_finder/backend/app/workers/monitor_dashboard.py
import asyncio
import os
import httpx
import pynvml
import psutil
import subprocess
from datetime import datetime, timedelta
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine
@@ -13,40 +13,48 @@ from rich.table import Table
from rich.panel import Panel
from rich.live import Live
from rich.layout import Layout
from rich.text import Text
from app.core.config import settings
console = Console()
# NVIDIA inicializálása
STATUS_TRANSLATIONS = {
'published': 'Véglegesítve (Publikált)',
'awaiting_ai_synthesis': 'AI Szintézisre Vár',
'manual_review_needed': 'Kézi Javítás Szükséges',
'unverified': 'Ellenőrizetlen (Nyers)',
'research_in_progress': 'Kutatás Folyamatban',
'ai_synthesis_in_progress': 'AI Szintézis Alatt',
'gold_enriched': 'Aranyosított (Végleges)',
'pending': 'Függőben',
'processing': 'Feldolgozás alatt'
}
try:
pynvml.nvmlInit()
gpu_available = True
except Exception:
gpu_available = False
def get_gpu_content():
try:
gpu_raw = subprocess.check_output(
['nvidia-smi', '--query-gpu=name,utilization.gpu,memory.used,memory.total,temperature.gpu', '--format=csv,noheader,nounits'],
encoding='utf-8'
).strip().split(', ')
gpu_name = gpu_raw[0].replace("NVIDIA ", "")
gpu_content = f"GPU: [bold bright_white]NVIDIA {gpu_name}[/]\nTerhelés: [bold orange3]{gpu_raw[1]}%[/]\nVRAM: [bold cyan]{gpu_raw[2]} MB[/] / {gpu_raw[3]} MB\nHőmérséklet: [bold red]{gpu_raw[4]} °C[/]"
except Exception as e:
gpu_content = f"GPU adatok olvasása sikertelen: {str(e)}"
return gpu_content
async def get_hardware_stats():
"""Rendszererőforrások: CPU, RAM és GPU"""
stats = {
"cpu_usage": psutil.cpu_percent(interval=None),
"ram_total": psutil.virtual_memory().total // 1024**2,
"ram_used": psutil.virtual_memory().used // 1024**2,
"ram_perc": psutil.virtual_memory().percent,
"gpu": None
"gpu_content": get_gpu_content()
}
if gpu_available:
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
stats["gpu"] = {
"name": pynvml.nvmlDeviceGetName(handle),
"temp": pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),
"load": pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
"vram_total": pynvml.nvmlDeviceGetMemoryInfo(handle).total // 1024**2,
"vram_used": pynvml.nvmlDeviceGetMemoryInfo(handle).used // 1024**2,
"power": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
}
except: pass
return stats
async def get_ollama_models():
@@ -55,140 +63,159 @@ async def get_ollama_models():
resp = await client.get("http://ollama:11434/api/ps")
if resp.status_code == 200:
return [m['name'] for m in resp.json().get("models", [])]
except: return ["Ollama Comm Error"]
except: return ["Ollama API Offline"]
return []
async def get_stats(engine):
async with engine.connect() as conn:
# 1. Sebesség adatok
res_hr = await conn.execute(text("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' AND updated_at > NOW() - INTERVAL '1 hour'"))
hr_rate = res_hr.scalar() or 0
res_day = await conn.execute(text("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' AND updated_at > NOW() - INTERVAL '24 hours'"))
day_rate = res_day.scalar() or 0
# JAVÍTVA: Hogy valós sebességet lássunk, a 'gold_enriched' (épp elkészült) autókat is beleszámoljuk az órás rate-be!
hr_rate = (await conn.execute(text("SELECT COALESCE(count(*), 0) FROM vehicle.vehicle_model_definitions WHERE status IN ('published', 'gold_enriched') AND updated_at > NOW() - INTERVAL '1 hour'"))).scalar()
day_rate = (await conn.execute(text("SELECT COALESCE(count(*), 0) FROM vehicle.vehicle_model_definitions WHERE status IN ('published', 'gold_enriched') AND updated_at > NOW() - INTERVAL '24 hours'"))).scalar()
# 2. Pipeline
res_pipe = await conn.execute(text("""
SELECT
(SELECT count(*) FROM vehicle.catalog_discovery WHERE status = 'pending') as r1,
(SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'unverified') as r2,
(SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'awaiting_ai_synthesis') as r3,
(SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched') as r4
"""))
r_counts = res_pipe.fetchone()
r1 = (await conn.execute(text("SELECT count(*) FROM vehicle.catalog_discovery WHERE status = 'pending'"))).scalar()
r2 = (await conn.execute(text("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'unverified'"))).scalar()
r3 = (await conn.execute(text("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'awaiting_ai_synthesis'"))).scalar()
r4 = (await conn.execute(text("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched'"))).scalar()
r_counts = (r1, r2, r3, r4)
# 3. TOP 7
res_top = await conn.execute(text("SELECT make, count(*) as qty FROM vehicle.vehicle_model_definitions GROUP BY make ORDER BY qty DESC LIMIT 7"))
top_makes = res_top.fetchall()
top_makes = (await conn.execute(text("SELECT make, count(*) as qty FROM vehicle.vehicle_model_definitions GROUP BY make ORDER BY qty DESC LIMIT 7"))).fetchall()
# 4. AKTIVITÁS (3 példány per robot)
res_r4 = await conn.execute(text("SELECT make, marketing_name FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' ORDER BY updated_at DESC LIMIT 5"))
res_r3 = await conn.execute(text("SELECT make, marketing_name FROM vehicle.vehicle_model_definitions WHERE status = 'ai_synthesis_in_progress' ORDER BY updated_at DESC LIMIT 5"))
res_r12 = await conn.execute(text("SELECT make, model FROM vehicle.catalog_discovery WHERE status = 'processing' ORDER BY updated_at DESC LIMIT 5"))
res_r4 = (await conn.execute(text("SELECT make, marketing_name FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' ORDER BY updated_at DESC LIMIT 5"))).fetchall()
res_r3 = (await conn.execute(text("SELECT make, marketing_name FROM vehicle.vehicle_model_definitions WHERE status = 'ai_synthesis_in_progress' ORDER BY updated_at DESC LIMIT 5"))).fetchall()
res_r12 = (await conn.execute(text("SELECT make, model FROM vehicle.catalog_discovery WHERE status = 'processing' ORDER BY id DESC LIMIT 5"))).fetchall()
published_count = (await conn.execute(text("SELECT COUNT(*) FROM vehicle.vehicle_model_definitions WHERE status = 'published'"))).scalar()
manual_review_needed_count = (await conn.execute(text("SELECT COUNT(*) FROM vehicle.vehicle_model_definitions WHERE status = 'manual_review_needed'"))).scalar()
status_distribution = (await conn.execute(text("SELECT status, COUNT(*) as count FROM vehicle.vehicle_model_definitions GROUP BY status ORDER BY count DESC"))).fetchall()
make_distribution = (await conn.execute(text("SELECT make, COUNT(*) as count FROM vehicle.vehicle_model_definitions WHERE status = 'published' GROUP BY make ORDER BY count DESC LIMIT 15"))).fetchall()
manual_review_list = (await conn.execute(text(
"SELECT make, marketing_name, COUNT(*) as count FROM vehicle.vehicle_model_definitions WHERE status = 'manual_review_needed' GROUP BY make, marketing_name ORDER BY count DESC LIMIT 15"
))).fetchall()
hw = await get_hardware_stats()
ai = await get_ollama_models()
return (hr_rate, day_rate), r_counts, top_makes, (res_r4.fetchall(), res_r3.fetchall(), res_r12.fetchall()), hw, ai
return (hr_rate, day_rate), r_counts, top_makes, (res_r4, res_r3, res_r12), hw, ai, (published_count, manual_review_needed_count, status_distribution, make_distribution, manual_review_list)
def make_layout() -> Layout:
layout = Layout()
layout.split_column(
Layout(name="header", size=3),
Layout(name="main", ratio=1),
Layout(name="hardware", size=10), # Megnövelt hardver rész
Layout(name="hardware", size=8),
Layout(name="footer", size=3)
)
layout["main"].split_row(
Layout(name="left", ratio=1),
Layout(name="middle", ratio=1),
Layout(name="right", ratio=2)
)
layout["left"].split_column(Layout(name="robot_stats"), Layout(name="inventory"))
layout["right"].split_column(Layout(name="live_ops"))
layout["left"].split_column(Layout(name="robot_stats", ratio=1), Layout(name="inventory", ratio=2))
layout["middle"].split_column(Layout(name="db_left", ratio=1), Layout(name="db_right", ratio=2))
layout["right"].split_column(
Layout(name="live_ops", ratio=1),
Layout(name="manual_review", ratio=2)
)
return layout
def update_dashboard(layout, data):
rates, r_counts, top_makes, live_data, hw, ai_models = data
r4_list, r3_list, r12_list = live_data
def translate_status(status):
return STATUS_TRANSLATIONS.get(status, status)
def update_dashboard(layout, data, error_msg=""):
rates, r_counts, top_makes, live_data, hw, ai_models, db_stats = data
r4_list, r3_list, r12_list = live_data
published_count, manual_review_needed_count, status_distribution, make_distribution, manual_review_list = db_stats
# Óra (UTC+1 korrekció)
local_time = datetime.now() + timedelta(hours=1)
# HEADER (Változatlan)
layout["header"].update(Panel(
f"🛰️ SENTINEL MISSION CONTROL | [bold yellow]{local_time.strftime('%Y-%m-%d %H:%M:%S')}[/] | AI: [green]{rates[0]}[/] /óra — [cyan]{rates[1]}[/] /nap",
f"🛰️ SENTINEL IRÁNYÍTÓKÖZPONT | [bold yellow]{local_time.strftime('%Y-%m-%d %H:%M:%S')}[/] | AI Teljesítmény: [green]{rates[0]:,}[/] /óra — [cyan]{rates[1]:,}[/] /nap | Összes publikált: [bold green]{published_count:,}[/]",
style="bold white on blue"
))
# ROBOT PIPELINE
robot_table = Table(title="🤖 Pipeline Állapot", expand=True, border_style="cyan")
robot_table = Table(title="🤖 Robot Pipeline Állapot", expand=True, border_style="cyan")
robot_table.add_column("Robot", style="bold")
robot_table.add_column("Várakozik", justify="right")
robot_table.add_row("R1-Hunter", f"{r_counts[0]} db")
robot_table.add_row("R2-Researcher", f"{r_counts[1]} db")
robot_table.add_row("R3-Alchemist", f"{r_counts[2]} db")
robot_table.add_row("R4-Validator", f"{r_counts[3]} db")
robot_table.add_row("R1-Hunter (Nyers gyűjtés)", f"{r_counts[0]:,} db")
robot_table.add_row("R2-Researcher (Webes kutatás)", f"{r_counts[1]:,} db")
robot_table.add_row("R3-Alchemist (AI Szintézis)", f"{r_counts[2]:,} db")
robot_table.add_row("R4-Validator (Várakozó Arany)", f"[green]{r_counts[3]:,}[/] db")
layout["robot_stats"].update(robot_table)
# TOP MÁRKÁK
brand_table = Table(title="🚜 Top 7 Márka", expand=True, border_style="magenta")
brand_table = Table(title="🚜 Bányászott Márkák (Top 7)", expand=True, border_style="magenta")
brand_table.add_column("Márka", style="yellow")
brand_table.add_column("db", justify="right")
for m, q in top_makes: brand_table.add_row(m, str(q))
brand_table.add_column("Darabszám", justify="right")
for m, q in top_makes: brand_table.add_row(str(m), str(q))
layout["inventory"].update(brand_table)
# LIVE OPS (Bővítve 5-5 példányra)
ops_table = Table(title="⚡ Aktuális Folyamatok (Utolsó 3/robot)", expand=True, border_style="green")
ops_table = Table(title="⚡ Aktuális Folyamatok", expand=True, border_style="green")
ops_table.add_column("Robot", width=15)
ops_table.add_column("Márka / Típus")
for r in r4_list: ops_table.add_row("[gold1]R4-VALIDATOR[/]", f"{r[0]} {r[1] or ''}")
ops_table.add_section()
for r in r3_list: ops_table.add_row("[medium_purple1]R3-ALCHEMIST[/]", f"{r[0]} {r[1] or ''}")
ops_table.add_section()
for r in r4_list: ops_table.add_row("[gold1]R4-ARANY[/]", f"{r[0]} {r[1] or ''}")
if r4_list: ops_table.add_section()
for r in r3_list: ops_table.add_row("[medium_purple1]R3-AI[/]", f"{r[0]} {r[1] or ''}")
if r3_list: ops_table.add_section()
for r in r12_list: ops_table.add_row("[sky_blue1]R1-HUNTER[/]", f"{r[0]} {r[1] or ''}")
layout["live_ops"].update(ops_table)
# HARDWARE & AI (3 OSZLOPOS ELRENDEZÉS)
hw_layout = Layout()
hw_layout.split_row(Layout(name="sys"), Layout(name="gpu"), Layout(name="ai"))
# 1. Rendszer (CPU/RAM)
sys_info = (
f"[bold]CPU Terhelés:[/] [bright_blue]{hw['cpu_usage']}%[/]\n"
f"[bold]RAM Használat:[/] [bright_magenta]{hw['ram_perc']}%[/]\n"
f"({hw['ram_used']} / {hw['ram_total']} MB)"
hw_layout.split_row(
Layout(name="sys", ratio=1),
Layout(name="gpu_combined", ratio=2)
)
hw_layout["sys"].update(Panel(sys_info, title="💻 System Resources", border_style="bright_blue"))
# 2. GPU
if hw["gpu"]:
g = hw["gpu"]
gpu_info = (
f"[bold]{g['name']}[/]\n"
f"Load: [green]{g['load']}%[/] | Temp: {g['temp']}°C\n"
f"VRAM: {g['vram_used']} / {g['vram_total']} MB"
)
else:
gpu_info = "[red]NVIDIA GPU not detected[/]"
hw_layout["gpu"].update(Panel(gpu_info, title="🔌 GPU Monitor", border_style="orange3"))
# 3. AI Models
ai_info = "[bold]In Memory (VRAM):[/]\n" + ("\n".join([f"🧠 {m}" for m in ai_models]) if ai_models else "No active models.")
hw_layout["ai"].update(Panel(ai_info, title="🤖 AI Stack", border_style="plum1"))
sys_info = f"[bold]CPU:[/]\t[bright_blue]{hw['cpu_usage']}%[/]\n[bold]RAM:[/]\t[bright_magenta]{hw['ram_perc']}%[/] ({hw['ram_used']}/{hw['ram_total']}MB)"
hw_layout["sys"].update(Panel(sys_info, title="💻 Rendszer", border_style="bright_blue"))
gpu_info = hw.get("gpu_content", "GPU adatok nem elérhetők")
ai_info = " | ".join([f"🧠 [plum1]{m}[/]" for m in ai_models]) if ai_models else "Nincs betöltve modell."
combined_gpu_text = f"{gpu_info}\n[bold bright_white]🤖 Ollama Modellek:[/] {ai_info}"
hw_layout["gpu_combined"].update(Panel(combined_gpu_text, title="🔌 GPU & AI Központ", border_style="orange3"))
layout["hardware"].update(hw_layout)
layout["footer"].update(Panel(f"Sentinel v2.5 | Kernel: Stabil | Heartbeat: OK", style="italic grey50"))
status_table = Table(title="📈 Státusz eloszlás", expand=True, border_style="magenta")
status_table.add_column("Státusz", style="bold")
status_table.add_column("Mennyiség", justify="right")
for status, count in status_distribution:
status_table.add_row(translate_status(status), f"{count:,}")
layout["db_left"].update(Panel(status_table, title="📊 Státuszok", border_style="magenta"))
# ÚJ: Bekerült a végösszesítő mező a lista aljára!
make_table = Table(title="🚗 Márkák (véglegesített)", expand=True, border_style="green")
make_table.add_column("Márka", style="yellow")
make_table.add_column("Darab", justify="right")
for make, count in make_distribution:
make_table.add_row(str(make), f"{count:,}")
make_table.add_section()
make_table.add_row("[bold bright_white]ÖSSZES PUBLIKÁLT[/]", f"[bold green]{published_count:,}[/]")
layout["db_right"].update(Panel(make_table, title="🏆 Top Márkák", border_style="green"))
manual_table = Table(title="🛠️ Kézi Javításra Várók (Top 15)", expand=True, border_style="yellow")
manual_table.add_column("Márka", style="bold")
manual_table.add_column("Modell", style="cyan")
manual_table.add_column("Darabszám", justify="right")
for make, model, count in manual_review_list:
manual_table.add_row(str(make), str(model) if model else "N/A", f"{count:,}")
layout["manual_review"].update(Panel(manual_table, title="🛠️ Kézi Javításra Várók", border_style="yellow"))
footer_text = f"Sentinel v2.6 | Kernel: Stabil | R1 Pörög: {r_counts[0]:,} várakozik"
if error_msg: footer_text = f"[red bold]HIBA: {error_msg}[/]"
layout["footer"].update(Panel(footer_text, style="italic grey50"))
async def main():
engine = create_async_engine(settings.DATABASE_URL)
layout = make_layout()
with Live(layout, refresh_per_second=1, screen=True):
with Live(layout, refresh_per_second=2, screen=True):
while True:
try:
data = await get_stats(engine)
update_dashboard(layout, data)
except: pass
await asyncio.sleep(2)
except Exception as e:
update_dashboard(layout, ((0,0), (0,0,0,0), [], ([],[],[]), {"cpu_usage":0,"ram_perc":0,"ram_used":0,"ram_total":0,"gpu_content":""}, [], (0, 0, [], [], [])), str(e))
await asyncio.sleep(0.5)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,308 @@
# /opt/docker/dev/service_finder/backend/app/workers/monitor_dashboard2.0.py
# docker exec sf_api python -m app.workers.monitor_dashboard
import asyncio
import os
import httpx
import pynvml
import psutil
import subprocess
from datetime import datetime, timedelta
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.live import Live
from rich.layout import Layout
from rich.text import Text
from app.core.config import settings
console = Console()
# Hungarian display labels for the pipeline statuses shown on the dashboard;
# translate_status() falls back to the raw status when a key is missing.
STATUS_TRANSLATIONS = {
    'published': 'Véglegesítve (Publikált)',
    'awaiting_ai_synthesis': 'AI Szintézisre Vár',
    'manual_review_needed': 'Kézi Javítás Szükséges',
    'unverified': 'Ellenőrizetlen (Nyers)',
    'research_in_progress': 'Kutatás Folyamatban',
    'ai_synthesis_in_progress': 'AI Szintézis Alatt',
    'gold_enriched': 'Aranyosított (Végleges)',
    'pending': 'Függőben',
    'processing': 'Feldolgozás alatt'
}
# Probe the NVIDIA driver once at import time; gpu_available gates the pynvml
# path in get_hardware_stats (the nvidia-smi CLI remains as a fallback).
try:
    pynvml.nvmlInit()
    gpu_available = True
except Exception:
    gpu_available = False
def get_gpu_via_nvidia_smi():
    """Query the first GPU's stats through the nvidia-smi CLI.

    Returns a dict with integer ``load`` (%), ``vram_used``/``vram_total``
    (MiB), ``temp`` (°C) and ``source="nvidia-smi"``, or None when the tool
    is missing, fails, or its output cannot be parsed.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu',
        '--format=csv,noheader,nounits',
    ]
    try:
        raw = subprocess.check_output(query, text=True).strip()
        if not raw:
            return None
        # One CSV row per GPU; only the first GPU is reported.
        fields = [part.strip() for part in raw.split('\n')[0].split(',')]
        if len(fields) < 4:
            return None
        load, vram_used, vram_total, temp = (int(f) for f in fields[:4])
        return {
            "load": load,
            "vram_used": vram_used,
            "vram_total": vram_total,
            "temp": temp,
            "source": "nvidia-smi",
        }
    except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError):
        return None
def get_gpu_content():
    """Build the GPU panel text by shelling out to nvidia-smi.

    Returns a Hungarian, human-readable multi-line string; on any failure
    (missing binary, non-zero exit, unparseable output) it returns an error
    message instead of raising.
    """
    try:
        output = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=name,utilization.gpu,memory.used,memory.total,temperature.gpu', '--format=csv,noheader,nounits'],
            encoding='utf-8'
        ).strip()
        # Fix: with more than one GPU nvidia-smi emits one CSV row per line,
        # so a plain split(', ') mixed fields across GPUs. Parse only the
        # first row and split on ',' with stripping (more tolerant spacing).
        gpu_raw = [field.strip() for field in output.splitlines()[0].split(',')]
        gpu_content = f"NVIDIA {gpu_raw[0]}\nTerhelés: {gpu_raw[1]}%\nVRAM: {gpu_raw[2]} MB / {gpu_raw[3]} MB\nHőmérséklet: {gpu_raw[4]} °C"
    except Exception as e:
        gpu_content = f"GPU adatok olvasása sikertelen: {str(e)}"
    return gpu_content
async def get_hardware_stats():
    """Collect CPU, RAM and GPU metrics for the hardware panel.

    Returns a dict with cpu_usage (%), ram_total/ram_used (MiB), ram_perc,
    a pre-rendered ``gpu_content`` string, and ``gpu`` (dict or None).
    GPU metrics prefer pynvml and fall back to the nvidia-smi CLI.
    """
    # Hoisted: virtual_memory() was previously sampled three times.
    vm = psutil.virtual_memory()
    stats = {
        "cpu_usage": psutil.cpu_percent(interval=None),
        "ram_total": vm.total // 1024**2,
        "ram_used": vm.used // 1024**2,
        "ram_perc": vm.percent,
        "gpu": None,
        "gpu_content": get_gpu_content()
    }
    # Primary source: pynvml (only attempted when nvmlInit succeeded).
    gpu_data = None
    if gpu_available:
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            gpu_data = {
                "name": pynvml.nvmlDeviceGetName(handle),
                "temp": pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),
                "load": pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
                "vram_total": pynvml.nvmlDeviceGetMemoryInfo(handle).total // 1024**2,
                "vram_used": pynvml.nvmlDeviceGetMemoryInfo(handle).used // 1024**2,
                "power": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
                "source": "pynvml"
            }
        except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
            gpu_data = None
    # Fallback source: parse nvidia-smi output.
    if not gpu_data:
        gpu_data = get_gpu_via_nvidia_smi()
        if gpu_data:
            gpu_data["name"] = "NVIDIA GPU (via nvidia-smi)"
    stats["gpu"] = gpu_data
    return stats
async def get_ollama_models():
    """Return the names of the models currently loaded in Ollama.

    Returns [] when the API answers with a non-200 status (or lists no
    models) and a one-element error marker when it is unreachable.
    """
    try:
        async with httpx.AsyncClient(timeout=2.0) as client:
            resp = await client.get("http://ollama:11434/api/ps")
            if resp.status_code == 200:
                return [m['name'] for m in resp.json().get("models", [])]
    except Exception:
        # Fix: the bare `except:` also trapped asyncio.CancelledError
        # (a BaseException since Python 3.8), blocking task cancellation.
        return ["Ollama API Offline"]
    return []
async def get_stats(engine):
    """Run every dashboard query on one connection and gather HW/AI stats.

    Returns the 7-tuple consumed by update_dashboard:
    (rates, r_counts, top_makes, live_lists, hw, ai, db_stats).
    """
    async with engine.connect() as conn:

        async def one(sql):
            # Helper for single-value (COUNT-style) queries.
            return (await conn.execute(text(sql))).scalar()

        async def many(sql):
            # Helper for multi-row queries.
            return (await conn.execute(text(sql))).fetchall()

        # 1. Throughput (COALESCE keeps the result 0 instead of NULL)
        hr_rate = await one("SELECT COALESCE(count(*), 0) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' AND updated_at > NOW() - INTERVAL '1 hour'")
        day_rate = await one("SELECT COALESCE(count(*), 0) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' AND updated_at > NOW() - INTERVAL '24 hours'")
        # 2. Pipeline backlog per robot (kept as separate queries on purpose)
        r_counts = (
            await one("SELECT count(*) FROM vehicle.catalog_discovery WHERE status = 'pending'"),
            await one("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'unverified'"),
            await one("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'awaiting_ai_synthesis'"),
            await one("SELECT count(*) FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched'"),
        )
        # 3. Top 7 makes in the definitions table
        top_makes = await many("SELECT make, count(*) as qty FROM vehicle.vehicle_model_definitions GROUP BY make ORDER BY qty DESC LIMIT 7")
        # 4. Latest activity per robot (the discovery table's column is "model")
        res_r4 = await many("SELECT make, marketing_name FROM vehicle.vehicle_model_definitions WHERE status = 'gold_enriched' ORDER BY updated_at DESC LIMIT 5")
        res_r3 = await many("SELECT make, marketing_name FROM vehicle.vehicle_model_definitions WHERE status = 'ai_synthesis_in_progress' ORDER BY updated_at DESC LIMIT 5")
        res_r12 = await many("SELECT make, model FROM vehicle.catalog_discovery WHERE status = 'processing' ORDER BY id DESC LIMIT 5")
        # 5. Catalogue summaries (published vs. still-unverified records)
        published_count = await one("SELECT COUNT(*) FROM vehicle.vehicle_model_definitions WHERE status = 'published'")
        manual_review_needed_count = await one("SELECT COUNT(*) FROM vehicle.vehicle_model_definitions WHERE status = 'unverified'")
        status_distribution = await many("SELECT status, COUNT(*) as count FROM vehicle.vehicle_model_definitions GROUP BY status ORDER BY count DESC")
        make_distribution = await many("SELECT make, COUNT(*) as count FROM vehicle.vehicle_model_definitions WHERE status = 'published' GROUP BY make ORDER BY count DESC LIMIT 15")
        # 6. Top 15 make/model groups waiting for manual review
        manual_review_list = await many("SELECT make, marketing_name, COUNT(*) as count FROM vehicle.vehicle_model_definitions WHERE status = 'manual_review_needed' GROUP BY make, marketing_name ORDER BY count DESC LIMIT 15")
        hw = await get_hardware_stats()
        ai = await get_ollama_models()
    return (hr_rate, day_rate), r_counts, top_makes, (res_r4, res_r3, res_r12), hw, ai, (published_count, manual_review_needed_count, status_distribution, make_distribution, manual_review_list)
def make_layout() -> Layout:
    """Assemble the fixed dashboard skeleton (header / main / hardware / footer)."""
    root = Layout()
    root.split_column(
        Layout(name="header", size=3),
        Layout(name="main", ratio=1),
        Layout(name="hardware", size=6),
        Layout(name="footer", size=3),
    )
    # Three columns in the main area; the right one is twice as wide.
    root["main"].split_row(
        Layout(name="left", ratio=1),
        Layout(name="middle", ratio=1),
        Layout(name="right", ratio=2),
    )
    # Each column stacks two panels.
    root["left"].split_column(Layout(name="robot_stats"), Layout(name="inventory"))
    root["middle"].split_column(Layout(name="db_left"), Layout(name="db_right"))
    root["right"].split_column(
        Layout(name="live_ops", ratio=1),
        Layout(name="manual_review", ratio=1),
    )
    return root
def translate_status(status):
    """Map an English pipeline status to its Hungarian label (unknown → unchanged)."""
    try:
        return STATUS_TRANSLATIONS[status]
    except KeyError:
        return status
def update_dashboard(layout, data, error_msg=""):
    """Repaint every panel of the dashboard from one get_stats() snapshot.

    Parameters:
        layout: the Layout created by make_layout().
        data: the 7-tuple returned by get_stats().
        error_msg: when non-empty, replaces the footer with a red error line.
    """
    rates, r_counts, top_makes, live_data, hw, ai_models, db_stats = data
    r4_list, r3_list, r12_list = live_data
    published_count, manual_review_needed_count, status_distribution, make_distribution, manual_review_list = db_stats
    # NOTE(review): naive UTC+1 clock correction — presumably the container runs
    # in UTC; confirm and consider a zoneinfo-aware conversion.
    local_time = datetime.now() + timedelta(hours=1)
    layout["header"].update(Panel(
        f"🛰️ SENTINEL IRÁNYÍTÓKÖZPONT | [bold yellow]{local_time.strftime('%Y-%m-%d %H:%M:%S')}[/] | R4 (Arany): [green]{rates[0]}[/] /óra — [cyan]{rates[1]}[/] /nap | Összes feldolgozott: [bold green]{published_count:,}[/]",
        style="bold white on blue"
    ))
    # Per-robot backlog counts
    robot_table = Table(title="🤖 Robot Pipeline Állapot", expand=True, border_style="cyan")
    robot_table.add_column("Robot", style="bold")
    robot_table.add_column("Várakozik", justify="right")
    robot_table.add_row("R1-Hunter (Nyers gyűjtés)", f"{r_counts[0]:,} db")
    robot_table.add_row("R2-Researcher (Webes kutatás)", f"{r_counts[1]:,} db")
    robot_table.add_row("R3-Alchemist (AI Szintézis)", f"{r_counts[2]:,} db")
    robot_table.add_row("R4-Validator (Várakozó Arany)", f"[green]{r_counts[3]:,}[/] db")
    layout["robot_stats"].update(robot_table)
    # Top makes across the whole catalogue
    brand_table = Table(title="🚜 Bányászott Márkák (Top 7)", expand=True, border_style="magenta")
    brand_table.add_column("Márka", style="yellow")
    brand_table.add_column("Darabszám", justify="right")
    for m, q in top_makes: brand_table.add_row(str(m), str(q))
    layout["inventory"].update(brand_table)
    # Live activity: latest rows per robot, separated by sections
    ops_table = Table(title="⚡ Aktuális Folyamatok", expand=True, border_style="green")
    ops_table.add_column("Robot", width=15)
    ops_table.add_column("Márka / Típus")
    for r in r4_list: ops_table.add_row("[gold1]R4-ARANY[/]", f"{r[0]} {r[1] or ''}")
    if r4_list: ops_table.add_section()
    for r in r3_list: ops_table.add_row("[medium_purple1]R3-AI[/]", f"{r[0]} {r[1] or ''}")
    if r3_list: ops_table.add_section()
    for r in r12_list: ops_table.add_row("[sky_blue1]R1-HUNTER[/]", f"{r[0]} {r[1] or ''}")
    layout["live_ops"].update(ops_table)
    # Hardware strip: system panel on the left, GPU + AI stacked on the right
    hw_layout = Layout()
    hw_layout.split_row(
        Layout(name="sys"),
        Layout(name="gpu_ai_column")
    )
    hw_layout["gpu_ai_column"].split_column(
        Layout(name="gpu"),
        Layout(name="ai")
    )
    sys_info = f"[bold]CPU:[/]\t[bright_blue]{hw['cpu_usage']}%[/]\n[bold]RAM:[/]\t[bright_magenta]{hw['ram_perc']}%[/] ({hw['ram_used']}/{hw['ram_total']}MB)"
    hw_layout["sys"].update(Panel(sys_info, title="💻 Rendszer", border_style="bright_blue"))
    # GPU panel uses the pre-rendered text from get_gpu_content()
    gpu_info = hw.get("gpu_content", "GPU adatok nem elérhetők")
    hw_layout["gpu"].update(Panel(gpu_info, title="🔌 GPU Adatok", border_style="orange3"))
    ai_info = "\n".join([f"🧠 {m}" for m in ai_models]) if ai_models else "Nincs betöltve modell."
    hw_layout["ai"].update(Panel(ai_info, title="🤖 Ollama VRAM", border_style="plum1"))
    layout["hardware"].update(hw_layout)
    # Fix: dropped the unused summary_text/summary_panel pair — it was built
    # here but never attached to any layout slot (dead code).
    # Left DB panel: status distribution with Hungarian labels
    status_table = Table(title="📈 Státusz eloszlás", expand=True, border_style="magenta")
    status_table.add_column("Státusz", style="bold")
    status_table.add_column("Mennyiség", justify="right")
    for status, count in status_distribution:
        translated = translate_status(status)
        status_table.add_row(translated, f"{count:,}")
    layout["db_left"].update(Panel(status_table, title="📊 Státuszok", border_style="magenta"))
    # Right DB panel: make distribution (published records only)
    make_table = Table(title="🚗 Márkák (véglegesített)", expand=True, border_style="green")
    make_table.add_column("Márka", style="yellow")
    make_table.add_column("Véglegesített DB", justify="right")
    for make, count in make_distribution:
        make_table.add_row(str(make), f"{count:,}")
    layout["db_right"].update(Panel(make_table, title="🏆 Top Márkák", border_style="green"))
    # Manual-review backlog table
    manual_table = Table(title="🛠️ Kézi Javításra Várók (Top 15)", expand=True, border_style="yellow")
    manual_table.add_column("Márka", style="bold")
    manual_table.add_column("Modell", style="cyan")
    manual_table.add_column("Darabszám", justify="right")
    for make, model, count in manual_review_list:
        manual_table.add_row(str(make), str(model) if model else "N/A", f"{count:,}")
    layout["manual_review"].update(Panel(manual_table, title="🛠️ Kézi Javításra Várók", border_style="yellow"))
    # Any fetch error replaces the footer text so it stays visible
    footer_text = f"Sentinel v2.6 | Kernel: Stabil | R1 Pörög: {r_counts[0]} várakozik"
    if error_msg: footer_text = f"[red bold]HIBA: {error_msg}[/]"
    layout["footer"].update(Panel(footer_text, style="italic grey50"))
async def main():
    """Run the live dashboard loop: query stats and repaint twice a second."""
    engine = create_async_engine(settings.DATABASE_URL)
    layout = make_layout()
    with Live(layout, refresh_per_second=2, screen=True):
        while True:
            try:
                data = await get_stats(engine)
                update_dashboard(layout, data)
            except Exception as e:
                # Render an empty dashboard with the error in the footer
                # instead of crashing the Live loop.
                # Fix: the db_stats fallback must contain FIVE items — the
                # previous 4-tuple made the unpack inside update_dashboard
                # raise ValueError, so the error screen itself crashed.
                empty_hw = {"cpu_usage": 0, "ram_perc": 0, "ram_used": 0, "ram_total": 0, "gpu": None, "gpu_content": ""}
                empty = ((0, 0), (0, 0, 0, 0), [], ([], [], []), empty_hw, [], (0, 0, [], [], []))
                update_dashboard(layout, empty, str(e))
            await asyncio.sleep(0.5)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -2,11 +2,13 @@
import asyncio
import os
import logging
import json
from PIL import Image
from sqlalchemy import select, update
from app.db.session import AsyncSessionLocal
from app.models.document import Document
from app.models import Document
from app.models.identity import User
from app.models.marketplace.organization import Organization
from app.services.ai_service import AIService
from app.core.config import settings
@@ -95,6 +97,35 @@ class OCRRobot:
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, cls._sync_resize_and_save, temp_path, final_path)
# TRUST MATCHING: Keresés a fleet.organizations táblában adószám alapján
verified_org_id = None
tax_number = ocr_result.get("tax_number")
if tax_number:
org_stmt = select(Organization.id).where(
Organization.tax_number == tax_number,
Organization.is_active == True,
Organization.is_deleted == False
)
org_result = await db.execute(org_stmt)
org = org_result.scalar_one_or_none()
if org:
verified_org_id = org
logger.info(f"✅ Trust Matching sikeres: {tax_number} → org_id {verified_org_id}")
else:
logger.info(f" Trust Matching: nincs egyezés adószámra: {tax_number}")
# OCR adatok frissítése verified_org_id-vel
if isinstance(ocr_result, dict):
ocr_result["verified_org_id"] = verified_org_id
else:
# Ha az ocr_result nem dict (pl. string), konvertáljuk
try:
ocr_dict = json.loads(ocr_result) if isinstance(ocr_result, str) else {}
ocr_dict["verified_org_id"] = verified_org_id
ocr_result = ocr_dict
except:
ocr_result = {"raw": ocr_result, "verified_org_id": verified_org_id}
# 4. LOGIKA: Adatbázis frissítés (Gold Data előkészítés)
doc.ocr_data = ocr_result
doc.status = "processed"

View File

@@ -1,4 +1,4 @@
# /opt/docker/dev/service_finder/backend/app/workers/service_hunter.py
# /opt/docker/dev/service_finder/backend/app/workers/service/service_robot_0_hunter.py
import asyncio
import httpx
import logging
@@ -8,7 +8,7 @@ from datetime import datetime, timezone
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, text, update
from app.db.session import AsyncSessionLocal
from app.models.staged_data import ServiceStaging, DiscoveryParameter
from app.models.marketplace.staged_data import ServiceStaging, DiscoveryParameter
# Naplózás beállítása a Sentinel monitorozáshoz
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')
@@ -119,7 +119,8 @@ class ServiceHunter:
@classmethod
async def run_grid_search(cls, db: AsyncSession, task: DiscoveryParameter):
""" A város koordináta-alapú bejárása. """
bbox = await cls._get_city_bounds(task.city, task.country_code or 'HU')
# DiscoveryParameter modellnek nincs country_code mezője, ezért alapértelmezett 'HU'-t használunk
bbox = await cls._get_city_bounds(task.city, 'HU')
if not bbox:
return

View File

@@ -6,7 +6,7 @@ import httpx
from urllib.parse import quote
from sqlalchemy import select, text
from app.database import AsyncSessionLocal
from app.models.service import ServiceStaging # JAVÍTOTT IMPORT ÚTVONAL!
from app.models.marketplace.service import ServiceStaging # JAVÍTOTT IMPORT ÚTVONAL!
import re
# Logolás MB 2.0 szabvány szerint

View File

@@ -3,7 +3,7 @@ import logging
import warnings
from sqlalchemy import text, update
from app.database import AsyncSessionLocal
from app.models.service import ServiceStaging
from app.models.marketplace.service import ServiceStaging
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
from duckduckgo_search import DDGS
@@ -23,8 +23,8 @@ class ServiceResearcher:
try:
def search():
with DDGS() as ddgs:
results = ddgs.text(query, max_results=3)
return [f"- {r.get('body', '')}" for r in results] if results else []
results = ddgs.search(query, max_results=3)
return [f"- {r.get('body', r.get('snippet', ''))}" for r in results] if results else []
results = await asyncio.wait_for(asyncio.to_thread(search), timeout=self.search_timeout)
if not results: return ""

View File

@@ -1,63 +1,46 @@
import asyncio
import logging
import json
from sqlalchemy import select, text, update, func
from app.database import AsyncSessionLocal # JAVÍTVA
from app.models.service import ServiceProfile, ExpertiseTag, ServiceExpertise, ServiceStaging
from sqlalchemy import select, text
from app.database import AsyncSessionLocal
from app.models.marketplace.service import ExpertiseTag
# Logolás MB 2.0 szabvány
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')
logger = logging.getLogger("Service-Robot-3-Enricher")
class ServiceEnricher:
"""
Service Robot 3: Professional Classifier (Atomi Zárolással)
"""
""" Service Robot 3: Professional Classifier (Bíró-Kompatibilis Verzió) """
@staticmethod
async def match_expertise_to_service(db, service_profile_id: int, scraped_text: str):
""" Kulcsszó-alapú elemző motor az ExpertiseTag tábla alapján. """
if not scraped_text: return
async def match_expertise_and_score(db, scraped_text: str, current_trust_score: int) -> int:
""" Keresi a szakmákat és bónusz pontokat ad értük a Staging adatnak. """
if not scraped_text: return current_trust_score
tags_query = await db.execute(select(ExpertiseTag).where(ExpertiseTag.is_official == True))
all_tags = tags_query.scalars().all()
found_any = False
match_count = 0
for tag in all_tags:
match_count = 0
for kw in (tag.search_keywords or []):
if kw.lower() in scraped_text.lower():
match_count += 1
if match_count > 0:
existing_check = await db.execute(
select(ServiceExpertise).where(
ServiceExpertise.service_id == service_profile_id,
ServiceExpertise.expertise_id == tag.id
)
)
if not existing_check.scalar():
new_link = ServiceExpertise(
service_id=service_profile_id,
expertise_id=tag.id,
confidence_level=min(match_count, 2)
)
db.add(new_link)
found_any = True
logger.info(f"{tag.key} szakma azonosítva a szerviznél.")
break # Egy tag elég, ha egyszer megvan
if found_any:
await db.commit()
# +5 pont minden megtalált szakmáért, max 30 bónusz pont
bonus = min(match_count * 5, 30)
new_score = min(current_trust_score + bonus, 100)
if bonus > 0:
logger.info(f"{match_count} szakma azonosítva. Bónusz: +{bonus} pont.")
return new_score
@classmethod
async def run_worker(cls):
logger.info("🧠 Service Enricher ONLINE - Szakmai elemzés indítása (Atomi Zárolás)")
logger.info("🧠 Service Enricher ONLINE - Adatdúsítás (Nem publikál, csak pontoz!)")
while True:
try:
async with AsyncSessionLocal() as db:
# 1. Zárolunk egy "enrich_ready" szervizt a Staging táblából
query = text("""
UPDATE marketplace.service_staging
SET status = 'enriching'
@@ -67,41 +50,34 @@ class ServiceEnricher:
FOR UPDATE SKIP LOCKED
LIMIT 1
)
RETURNING id, name, city, full_address, fingerprint, raw_data;
RETURNING id, name, trust_score, raw_data;
""")
result = await db.execute(query)
task = result.fetchone()
await db.commit()
if task:
s_id, name, city, address, fprint, raw_data = task
s_id, name, t_score, raw_data = task
web_context = raw_data.get('web_context', '') if isinstance(raw_data, dict) else ''
async with AsyncSessionLocal() as process_db:
try:
# 2. Áttesszük a végleges ServiceProfile táblába (mert már van elég adatunk a webről)
profile_stmt = text("""
INSERT INTO marketplace.service_profiles
(fingerprint, status, trust_score, location, is_verified, bio)
VALUES (:fp, 'active', 40, ST_SetSRID(ST_MakePoint(19.04, 47.49), 4326), false, :bio)
ON CONFLICT (fingerprint) DO UPDATE SET bio = EXCLUDED.bio
RETURNING id;
""") # Megjegyzés: A GPS koordinátát (19.04, 47.49) majd a Validator (Robot-4) pontosítja!
# 1. Kiszámoljuk az új pontszámot a webes adatok (kulcsszavak) alapján
new_score = await cls.match_expertise_and_score(process_db, web_context, t_score)
p_result = await process_db.execute(profile_stmt, {"fp": fprint, "bio": name + " - " + city})
profile_id = p_result.scalar()
await process_db.commit()
# 3. Futtatjuk a kulcsszó-elemzést
await cls.match_expertise_to_service(process_db, profile_id, web_context)
# 4. Lezárjuk a Staging feladatot
await process_db.execute(text("UPDATE marketplace.service_staging SET status = 'processed' WHERE id = :id"), {"id": s_id})
# 2. Visszaírjuk a Staging táblába, és átadjuk az Auditor-nak (Gamification 2.0: auditor_ready státusz)
upd_query = text("""
UPDATE marketplace.service_staging
SET status = 'auditor_ready', trust_score = :ns
WHERE id = :id
""")
await process_db.execute(upd_query, {"ns": new_score, "id": s_id})
await process_db.commit()
logger.info(f"✅ Dúsítás kész: {name} (Pont: {t_score} -> {new_score}). Átadva az Auditor-nak (auditor_ready).")
except Exception as e:
await process_db.rollback()
logger.error(f"Hiba a dúsítás során ({s_id}): {e}")
logger.error(f"Hiba a dúsítás során ({s_id}): {e}")
await process_db.execute(text("UPDATE marketplace.service_staging SET status = 'error' WHERE id = :id"), {"id": s_id})
await process_db.commit()
else:

View File

@@ -1,3 +1,4 @@
# /opt/docker/dev/service_finder/backend/app/workers/service/service_robot_4_validator_google.py
import asyncio
import httpx
import logging
@@ -7,7 +8,7 @@ import json
from datetime import datetime
from sqlalchemy import text, update, func
from app.database import AsyncSessionLocal
from app.models.service import ServiceProfile
from app.models.marketplace.service import ServiceProfile
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-4-Validator: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Service-Robot-4-Google-Validator")

View File

@@ -0,0 +1,368 @@
# /opt/docker/dev/service_finder/backend/app/workers/service/service_robot_5_auditor.py
import asyncio
import logging
import json
import random
from datetime import datetime
from sqlalchemy import select, text, update, insert
from sqlalchemy.dialects.postgresql import insert as pg_insert
from app.database import AsyncSessionLocal
# MB 2.0: Közvetlen és teljes importok a hiánytalan működéshez
from app.models.marketplace.service import ServiceStaging, ServiceProfile, ExpertiseTag, ServiceExpertise
from app.models.marketplace.organization import Organization
from app.models.identity.identity import User, Person
from app.models.gamification.gamification import UserContribution, PointsLedger, UserStats
from app.models.system.system import SystemParameter
from app.core.config import settings
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Service-Robot-5-Auditor: %(message)s')
logger = logging.getLogger("Service-Robot-5-Auditor")
class ServiceAuditor:
    """
    Service Robot 5: Auditor and Publisher (Staging -> Production).

    Version 1.3 - uncompressed, with full data-structure sync and ADAPTIVE scheduling.

    Per-record flow:
      1. ``run_worker`` atomically claims one 'auditor_ready' staging row
         (FOR UPDATE SKIP LOCKED) whose trust score meets the threshold.
      2. ``process_staging_record`` re-checks the trust score, creates or
         reuses the Digital Twin (Organization), creates the live
         ServiceProfile, rewards the submitting user, and writes an audit
         trail back onto the staging row.
    Cadence: 20 s between jobs while there is work; 5 minutes after
    5 consecutive empty polls.
    """

    @staticmethod
    async def get_trust_threshold(db) -> int:
        """Fetch the trust-score threshold for validation from SystemParameter.

        Falls back to 70 when the parameter is missing or the query fails.
        """
        try:
            query = select(SystemParameter).where(
                SystemParameter.key == "service_trust_threshold"
            )
            result = await db.execute(query)
            param = result.scalar_one_or_none()
            if param and param.value:
                logger.info(f"🔍 Rendszer trust küszöb értéke: {param.value}")
                return int(param.value)
        except Exception as e:
            logger.warning(f"⚠️ Trust threshold lekérdezés hiba, alapértelmezett 70 használata: {e}")
        # Parameter absent or unreadable: use the conservative default.
        return 70

    @staticmethod
    async def create_digital_twin(db, staging_data: dict) -> int:
        """Create or find the Digital Twin (Organization) entity.

        Lookup order: tax number first (strongest identifier), then exact
        name. Returns the organization id; raises on database errors so the
        caller's transaction can roll back.
        """
        try:
            tax_no = staging_data.get("tax_number")
            org_name = staging_data.get("name", "").strip()
            existing_org = None
            # 1) Lookup by tax number when one was extracted from staging metadata.
            if tax_no:
                logger.info(f"🔎 Digital Twin keresése adószám alapján: {tax_no}")
                tax_query = select(Organization).where(
                    Organization.tax_number == tax_no.strip(),
                    Organization.is_deleted == False
                )
                tax_result = await db.execute(tax_query)
                existing_org = tax_result.scalar_one_or_none()
            # 2) Fall back to exact-name lookup when the tax number found nothing.
            if not existing_org and org_name:
                logger.info(f"🔎 Digital Twin keresése név alapján: {org_name}")
                org_query = select(Organization).where(
                    Organization.name == org_name,
                    Organization.is_deleted == False
                )
                name_result = await db.execute(org_query)
                existing_org = name_result.scalar_one_or_none()
            if existing_org:
                logger.info(f"✅ Meglévő Digital Twin azonosítva: {existing_org.name} (ID: {existing_org.id})")
                return existing_org.id
            # 3) No match: create a brand-new Organization (Digital Twin).
            new_org = Organization(
                name=org_name,
                full_name=staging_data.get("full_name") or org_name,
                tax_number=tax_no,
                reg_number=staging_data.get("registration_number"),
                contact_email=staging_data.get("contact_email"),
                contact_phone=staging_data.get("contact_phone"),
                website=staging_data.get("website"),
                address_zip=staging_data.get("postal_code"),
                address_city=staging_data.get("city"),
                address_street_name=staging_data.get("address_line1"),
                country_code=staging_data.get("country_code", "HU"),
                is_active=True,
                status="active",
                created_at=datetime.utcnow(),
                updated_at=datetime.utcnow()
            )
            db.add(new_org)
            await db.flush()      # assign the primary key without committing
            await db.refresh(new_org)
            logger.info(f"✨ Új Digital Twin (Organization) létrehozva: {new_org.name}")
            return new_org.id
        except Exception as e:
            logger.error(f"❌ Digital Twin hiba: {e}")
            raise

    @staticmethod
    async def create_service_profile(db, staging_data: dict, org_id: int) -> int:
        """Create the live ServiceProfile row in the marketplace schema.

        Returns the new profile id; raises on failure so the caller can
        roll back the whole publication.
        """
        try:
            new_service = ServiceProfile(
                organization_id=org_id,
                name=staging_data.get("name", "").strip(),
                description=staging_data.get("description") or "",
                contact_email=staging_data.get("contact_email"),
                contact_phone=staging_data.get("contact_phone"),
                website=staging_data.get("website"),
                address_line1=staging_data.get("address_line1"),
                address_line2=staging_data.get("address_line2"),
                city=staging_data.get("city"),
                postal_code=staging_data.get("postal_code"),
                country_code=staging_data.get("country_code", "HU"),
                latitude=staging_data.get("latitude"),
                longitude=staging_data.get("longitude"),
                trust_score=staging_data.get("trust_score", 0),
                status="active",
                external_id=staging_data.get("external_id"),
                metadata=staging_data.get("metadata") or {},
                created_at=datetime.utcnow(),
                updated_at=datetime.utcnow()
            )
            db.add(new_service)
            await db.flush()      # assign the primary key without committing
            await db.refresh(new_service)
            logger.info(f"✅ Éles ServiceProfile rögzítve: {new_service.name} (ID: {new_service.id})")
            return new_service.id
        except Exception as e:
            logger.error(f"❌ ServiceProfile rögzítési hiba: {e}")
            raise

    @staticmethod
    async def award_user_contribution(db, user_id: int, service_id: int, staging_id: int):
        """Grant XP and points to the submitting user (best effort).

        Any error is logged and swallowed so a gamification failure can
        never block the publication itself.
        """
        try:
            # 1) Find the currently active season, if any.
            season_query = text("""
                SELECT id FROM system.seasons
                WHERE is_active = true
                AND start_date <= CURRENT_DATE
                AND end_date >= CURRENT_DATE
                LIMIT 1
            """)
            result = await db.execute(season_query)
            season_row = result.fetchone()
            season_id = season_row[0] if season_row else None
            # 2) Create the UserContribution record.
            contribution = UserContribution(
                user_id=user_id,
                season_id=season_id,
                contribution_type="service_submission",
                entity_type="service",
                entity_id=service_id,
                points_awarded=50,
                xp_awarded=100,
                status="approved",
                metadata={
                    "staging_id": staging_id,
                    "awarded_at": datetime.utcnow().isoformat(),
                    "reason": "Auditor publication approval"
                },
                created_at=datetime.utcnow()
            )
            db.add(contribution)
            # 3) Update the global UserStats row (create it on first award).
            stats_query = select(UserStats).where(UserStats.user_id == user_id)
            stats_result = await db.execute(stats_query)
            user_stats = stats_result.scalar_one_or_none()
            if user_stats:
                user_stats.total_points += 50
                user_stats.total_xp += 100
                user_stats.services_submitted += 1
                user_stats.updated_at = datetime.utcnow()
            else:
                new_stats = UserStats(
                    user_id=user_id, total_points=50, total_xp=100,
                    services_submitted=1, created_at=datetime.utcnow()
                )
                db.add(new_stats)
            # 4) Append a PointsLedger entry for auditability.
            ledger = PointsLedger(
                user_id=user_id, points=50, xp=100,
                source_type="service_submission",
                source_id=service_id,
                description="Reward for verified service publication",
                created_at=datetime.utcnow()
            )
            db.add(ledger)
            logger.info(f"🏆 Jutalmazás elvégezve: User {user_id} (+50 PT, +100 XP)")
        except Exception as e:
            logger.error(f"⚠️ Hiba a jutalmazási folyamatban: {e}")

    @classmethod
    async def process_staging_record(cls, db, staging_id: int):
        """Process one claimed staging record end-to-end in a transaction.

        Returns True on successful publication, False on rejection or error.
        """
        try:
            # 1) Load the record; it must be in 'auditing' (set by run_worker).
            query = select(ServiceStaging).where(
                ServiceStaging.id == staging_id,
                ServiceStaging.status == 'auditing'
            )
            result = await db.execute(query)
            staging = result.scalar_one_or_none()
            if not staging:
                logger.error(f"❌ Staging rekord nem található vagy rossz státuszban van: {staging_id}")
                return False
            # 2) Re-check the trust score against the (possibly changed) threshold.
            trust_threshold = await cls.get_trust_threshold(db)
            if staging.trust_score < trust_threshold:
                logger.warning(f"🚫 Trust Score elégtelen: {staging.trust_score} < {trust_threshold}")
                staging.status = 'rejected'
                staging.rejection_reason = f'Low trust score ({staging.trust_score})'
                staging.updated_at = datetime.utcnow()
                await db.commit()
                return False
            # 3) Collect the staging payload explicitly (no implicit attribute copy).
            staging_data = {
                "name": staging.name,
                "description": staging.description,
                "contact_email": staging.contact_email,
                "contact_phone": staging.contact_phone,
                "website": staging.website,
                "address_line1": staging.address_line1,
                "address_line2": staging.address_line2,
                "city": staging.city,
                "postal_code": staging.postal_code,
                "country_code": staging.country_code,
                "latitude": staging.latitude,
                "longitude": staging.longitude,
                "trust_score": staging.trust_score,
                "external_id": staging.external_id,
                "metadata": staging.metadata or {},
                "tax_number": staging.metadata.get("tax_number") if staging.metadata else None,
                "registration_number": staging.metadata.get("registration_number") if staging.metadata else None
            }
            # 4) Digital Twin (company) phase.
            org_id = await cls.create_digital_twin(db, staging_data)
            # 5) Production (service) phase.
            service_id = await cls.create_service_profile(db, staging_data, org_id)
            # 6) Gamification phase - only when the submitter is known.
            if staging.submitted_by:
                await cls.award_user_contribution(db, staging.submitted_by, service_id, staging_id)
            # 7) Close out the staging row and persist the audit trail.
            staging.status = 'published'
            staging.published_at = datetime.utcnow()
            staging.service_profile_id = service_id
            staging.organization_id = org_id
            staging.updated_at = datetime.utcnow()
            staging.audit_trail = {
                "audited_by": "robot_5",
                "audited_at": datetime.utcnow().isoformat(),
                "trust_threshold_used": trust_threshold,
                "final_trust_score": staging.trust_score,
                "organization_id": org_id,
                "service_profile_id": service_id,
                "version": "1.3"
            }
            await db.commit()
            logger.info(f"✅ SIKER: Staging {staging_id} -> Production {service_id}")
            return True
        except Exception as e:
            logger.error(f"❌ Kritikus feldolgozási hiba (Staging ID: {staging_id}): {e}")
            await db.rollback()
            return False

    @classmethod
    async def run_worker(cls):
        """
        Main auditor loop.

        Adaptive cadence: 20 s while there is work, 5 minutes after
        5 consecutive empty polls.
        """
        logger.info("🚀 Service Auditor v1.3 ONLINE - Adaptív üzemmód")
        empty_counter = 0
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # 1) Atomically claim the next eligible record.
                    # NOTE(review): rows below the threshold are never claimed
                    # here, so the rejection branch in process_staging_record
                    # only fires if the threshold rises between claim and
                    # processing - confirm low-score rows are handled elsewhere.
                    query = text("""
                        UPDATE marketplace.service_staging
                        SET status = 'auditing'
                        WHERE id = (
                            SELECT id FROM marketplace.service_staging
                            WHERE status = 'auditor_ready'
                            AND trust_score >= (
                                SELECT COALESCE(
                                    (SELECT value::integer FROM system.system_parameters
                                     WHERE key = 'service_trust_threshold'),
                                    70
                                )
                            )
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id
                    """)
                    result = await db.execute(query)
                    row = result.fetchone()
                    if not row:
                        empty_counter += 1
                        if empty_counter >= 5:
                            # BUGFIX: was 600 s (10 minutes), contradicting both
                            # the docstring and the log text, which promise 5 min.
                            sleep_time = 300  # 5 minutes
                            logger.info(f"💤 Nincs adat (5x üres). Lassítás {sleep_time} másodpercre (5 perc)...")
                        else:
                            sleep_time = 20  # fast polling while the queue may refill
                            logger.info(f"⏳ Várólista üres, következő próba {sleep_time} mp múlva. (Próba: {empty_counter}/5)")
                        await db.commit()
                        await asyncio.sleep(sleep_time)
                        continue
                    # Work found: reset the back-off counter.
                    empty_counter = 0
                    staging_id = row[0]
                    await db.commit()  # release the row lock before the long processing
                    logger.info(f"🎯 Auditor feldolgozás indítása: staging_id={staging_id}")
                # 2) Process the claimed record in a fresh session.
                async with AsyncSessionLocal() as process_db:
                    await cls.process_staging_record(process_db, staging_id)
                # 3) Mandated 20 s rest after each processed record.
                await asyncio.sleep(20)
            except Exception as e:
                logger.error(f"❌ Auditor fő ciklus hiba: {e}")
                await asyncio.sleep(10)
async def main():
    """Container entry point: instantiate the auditor and run its loop forever."""
    await ServiceAuditor().run_worker()

if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -23,7 +23,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
from app.models.identity import User
from app.models.audit import FinancialLedger, LedgerEntryType, WalletType
from app.models import FinancialLedger, LedgerEntryType, WalletType
from app.services.billing_engine import record_ledger_entry
from app.services.notification_service import NotificationService

View File

@@ -1,12 +1,13 @@
# /app/app/workers/system/system_robot_2_service_auditor.py
# /opt/docker/dev/service_finder/backend/app/workers/system/system_robot_2_service_auditor.py
import asyncio
import logging
from datetime import datetime, timezone
from sqlalchemy import select, and_, update
from sqlalchemy import select, and_, update, func
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.organization import Organization, OrgType
from app.models.service import ServiceProfile
from app.models.staged_data import ServiceStaging
from app.models.marketplace.organization import Organization, OrgType
from app.models.marketplace.service import ServiceProfile
from app.models.marketplace.staged_data import ServiceStaging
# MB 2.0 Naplózás
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s]: %(message)s')
@@ -14,42 +15,127 @@ logger = logging.getLogger("System-Robot-2-ServiceAuditor")
class ServiceAuditor:
"""
System Robot 2: Service Auditor & Judge
System Robot 2: Service Auditor & Judge (Gamification 2.0)
Feladata:
1. Meglévő szervizek auditálása (ne legyenek "halott" adatok).
2. Staging adatok automatikus élesíse, ha a bizalmi szint eléri a küszöböt.
1. Staging adatok auditálása dinamikus trust_score küszöb alapján.
2. Sikeres audit esen Organization és ServiceProfile létrehozás.
3. Bukott audit esetén needs_moderation státusz.
"""
TRUST_THRESHOLD = 80 # Ezen pontszám felett automatikusan élesítünk
@classmethod
async def get_promotion_threshold(cls, db):
""" Dinamikus küszöbérték kiolvasása a system_parameters táblából (SQL text) """
from sqlalchemy import text
try:
result = await db.execute(
text("SELECT value FROM system.system_parameters WHERE key = 'service_promotion_threshold' AND scope_level = 'global'")
)
row = result.fetchone()
if row:
import json
value = json.loads(row[0]) if isinstance(row[0], str) else row[0]
threshold = value.get('trust_score', 50) if isinstance(value, dict) else 50
else:
threshold = 50
except Exception as e:
logger.warning(f"⚠️ Nem sikerült lekérni a küszöbértéket: {e}, alapértelmezett 50")
threshold = 50
logger.info(f"📊 Dinamikus trust_score küszöb: {threshold}")
return threshold
@classmethod
async def promote_staging_data(cls):
"""
AZ AUTOMATA BÍRÓ:
Megnézi a Staging táblát, és ha valami elérte a ponthatárt,
automatikusan átemeli az éles profilok közé.
AZ AUTOMATA BÍRÓ (Gamification 2.0):
Atomikus tranzakcióban feldolgozza az auditor_ready státuszú rekordokat.
"""
async with AsyncSessionLocal() as db:
stmt = select(ServiceStaging).where(
and_(
ServiceStaging.status == "researched",
ServiceStaging.trust_score >= cls.TRUST_THRESHOLD
)
)
result = await db.execute(stmt)
to_promote = result.scalars().all()
for stage in to_promote:
logger.info(f"⚖️ Automatikus élesítés (Admin nélkül): {stage.name} (Bizalom: {stage.trust_score})")
# Itt jön az átemelő logika:
# 1. Organization létrehozása
# 2. ServiceProfile létrehozása
# 3. ExpertiseTags átmásolása
stage.status = "promoted"
# Dinamikus küszöb lekérdezése
threshold = await cls.get_promotion_threshold(db)
await db.commit()
# FOR UPDATE SKIP LOCKED használata
stmt = select(ServiceStaging).where(
ServiceStaging.status == "auditor_ready"
).with_for_update(skip_locked=True).limit(10)
result = await db.execute(stmt)
to_process = result.scalars().all()
processed = 0
for stage in to_process:
try:
# Audit logika
if stage.trust_score >= threshold:
# A) SIKERES AUDIT
logger.info(f"✅ Sikeres audit: {stage.name} (trust_score={stage.trust_score})")
# Organization létrehozása vagy meglévő keresése név alapján
org_stmt = select(Organization).where(
and_(
Organization.name == stage.name,
Organization.org_type == OrgType.service
)
)
org_result = await db.execute(org_stmt)
org = org_result.scalar_one_or_none()
if not org:
org = Organization(
name=stage.name,
org_type=OrgType.service,
is_active=True,
created_by=stage.submitted_by if stage.submitted_by else None
)
db.add(org)
await db.flush() # ID generáláshoz
# ServiceProfile létrehozása
profile = ServiceProfile(
organization_id=org.id,
name=stage.name,
description=stage.description,
latitude=None, # TODO: később geokódolás
longitude=None,
address=stage.full_address,
contact_phone=stage.contact_phone,
website=stage.website,
status='pending_validation', # Következő robot/ember dúsíthatja
trust_score=stage.trust_score,
raw_data=stage.raw_data
)
db.add(profile)
await db.flush()
# Staging rekord frissítése
stage.status = 'pending_validation'
stage.organization_id = org.id
stage.service_profile_id = profile.id
stage.updated_at = func.now() # audited_at helyett updated_at
logger.info(f" ➡️ Organization #{org.id} és ServiceProfile #{profile.id} létrehozva")
else:
# B) BUKOTT AUDIT
logger.warning(f"❌ Bukott audit: {stage.name} (trust_score={stage.trust_score} < {threshold})")
stage.status = 'needs_moderation'
stage.updated_at = func.now()
processed += 1
except Exception as e:
logger.error(f"💥 Hiba a staging feldolgozás közben (ID {stage.id}): {e}")
await db.rollback()
# Staging rekord hibás státuszba helyezése
stage.status = 'error'
stage.updated_at = func.now()
# További hibakezelés: lehet naplózni audit_trail-be
continue
if processed > 0:
await db.commit()
logger.info(f"📦 Feldolgozva {processed} staging rekord")
else:
logger.debug(" Nincs feldolgozható staging rekord")
@classmethod
async def audit_existing_services(cls):
@@ -92,7 +178,7 @@ class ServiceAuditor:
@classmethod
async def run(cls):
logger.info("⚖️ System Auditor ONLINE - Bírói és Karbantartó üzemmód")
logger.info("⚖️ System Auditor ONLINE - Gamification 2.0 Bírói mód")
while True:
# 1. Először élesítjük az új felfedezéseket
await cls.promote_staging_data()
@@ -100,8 +186,8 @@ class ServiceAuditor:
# 2. Utána karbantartjuk a meglévőket
await cls.audit_existing_services()
# Naponta egyszer fut le a teljes kör
await asyncio.sleep(86400)
# Rövid várakozás a következő ciklus előtt (teszteléshez 60 másodperc)
await asyncio.sleep(60)
if __name__ == "__main__":
asyncio.run(ServiceAuditor.run())

View File

@@ -0,0 +1,208 @@
import asyncio
import httpx
import logging
import os
import sys
from datetime import datetime, timedelta
from sqlalchemy import text, select
from app.database import AsyncSessionLocal
from app.models.asset import AssetCatalog
# MB 2.0 Szigorú naplózás
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-0-Discovery: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Vehicle-Robot-0-Discovery")
class DiscoveryEngine:
    """
    THOUGHT PROCESS (INDUSTRIAL MODE 2.0):
    1. Watchdog: finds and releases stuck tasks once an hour.
    2. Differential Sync: records only missing or new models, skipping
       anything already 'gold_enriched'.
    3. Monthly Scheduler: downloads the full RDW database, paginated,
       at most once every 30 days.
    """
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    # State file so a Docker restart does not immediately restart the full sync.
    SYNC_STATE_FILE = "/app/temp/.last_rdw_sync"

    @staticmethod
    async def run_watchdog():
        """Phase 1: the Watchdog (Dead-Letter Queue Manager).

        Resets tasks stuck in transient states so workers can pick them
        up again: Hunter 'processing' rows unconditionally, AI rows only
        after a 2-hour timeout.
        """
        logger.info("🐕 Őrkutya: Beragadt feladatok keresése a rendszerben...")
        try:
            async with AsyncSessionLocal() as db:
                # A) Hunter cleanup: revert to 'pending' if the Hunter died mid-task.
                res1 = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'pending' WHERE status = 'processing' RETURNING id;"))
                hunter_resets = len(res1.fetchall())
                if hunter_resets > 0:
                    logger.warning(f"🔄 {hunter_resets} db beragadt Hunter feladat (processing) visszaállítva 'pending'-re.")
                # B) AI robot cleanup (2-hour timeout on in-progress states).
                query2 = text("""
                    UPDATE vehicle.vehicle_model_definitions
                    SET status = CASE
                        WHEN status = 'research_in_progress' THEN 'unverified'
                        WHEN status = 'ai_synthesis_in_progress' THEN 'awaiting_ai_synthesis'
                    END
                    WHERE status IN ('research_in_progress', 'ai_synthesis_in_progress')
                    AND updated_at < NOW() - INTERVAL '2 hours'
                    RETURNING id;
                """)
                res2 = await db.execute(query2)
                ai_resets = len(res2.fetchall())
                if ai_resets > 0:
                    logger.warning(f"🔄 {ai_resets} db beragadt AI feladat visszaállítva.")
                await db.commit()
        except Exception as e:
            logger.error(f"❌ Őrkutya hiba: {e}")

    @staticmethod
    async def seed_manual_bootstrap():
        """Phase 2: seed a couple of baseline records so downstream robots
        always have something to work on after a fresh deployment."""
        initial_data = [
            {"make": "AUDI", "model": "A4", "generation": "B8 (2008-2015)"},  # vehicle_class removed
            {"make": "BMW", "model": "3 SERIES", "generation": "F30 (2012-2019)"}
        ]
        try:
            async with AsyncSessionLocal() as db:
                for item in initial_data:
                    stmt = select(AssetCatalog).where(AssetCatalog.make == item["make"], AssetCatalog.model == item["model"])
                    if not (await db.execute(stmt)).scalar_one_or_none():
                        db.add(AssetCatalog(**item))
                await db.commit()
        except Exception as e:
            logger.warning(f"Manual bootstrap hiba (Ignorálható, ha az adatbázis már tele van): {e}")

    @classmethod
    async def fetch_with_retry(cls, client: "httpx.AsyncClient", url: str, params: dict, retries: int = 3):
        """Fault-tolerant HTTP GET against API outages.

        Returns the response on HTTP 200; None on any other final status,
        on exhausted 429 retries, or on a persistent network error.
        (The annotation is a string so the class can be imported without httpx.)
        """
        for attempt in range(retries):
            try:
                resp = await client.get(url, params=params, headers=cls.HEADERS)
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 429:
                    # Rate limited: exponential backoff before the next try.
                    await asyncio.sleep(2 ** attempt)
                else:
                    return None
            except httpx.RequestError:
                if attempt == retries - 1:
                    return None
                await asyncio.sleep(2 ** attempt)
        return None

    @classmethod
    async def seed_from_rdw(cls):
        """Phase 3: remote discovery - DIFFERENTIAL SYNC against the RDW registry.

        Pages through the aggregated RDW vehicle registry and upserts
        make/model rows into vehicle.catalog_discovery, skipping models
        that are already 'gold_enriched'. The 30-day state file is written
        only when the sync actually completed (see BUGFIX below).
        """
        logger.info("📥 RDW TÖMEGES LETÖLTÉS: Új modellek keresése (Differential Sync)...")
        limit = 10000
        offset = 0
        inserted_count = 0
        updated_count = 0
        completed = False  # True only when the API was fully paged without error
        async with httpx.AsyncClient(timeout=60.0) as client:
            while True:
                params = {
                    "$select": "merk,handelsbenaming,voertuigsoort,count(*) as total",
                    "$group": "merk,handelsbenaming,voertuigsoort",
                    "$order": "total DESC",
                    "$limit": limit,
                    "$offset": offset
                }
                resp = await cls.fetch_with_retry(client, "https://opendata.rdw.nl/resource/m9d7-ebf2.json", params)
                if not resp:
                    break  # API failure: abort, but do NOT mark the sync as done
                raw_data = resp.json()
                if not raw_data:
                    completed = True  # paged through everything successfully
                    break
                logger.info(f"📊 Lapozás: {offset} - {offset + len(raw_data)} tételek analízise...")
                async with AsyncSessionLocal() as db:
                    for entry in raw_data:
                        make = str(entry.get("merk", "")).upper().strip()
                        model = str(entry.get("handelsbenaming", "")).upper().strip()
                        v_kind = entry.get("voertuigsoort", "")
                        total_count = int(entry.get("total", 0))
                        if not make or not model: continue
                        if "Personenauto" in v_kind: v_class = 'car'
                        elif "Motorfiets" in v_kind: v_class = 'motorcycle'
                        else: v_class = 'truck'
                        # Differential-sync SQL with explicit type casting:
                        # insert only if not already gold_enriched; otherwise
                        # refresh the priority of non-processed rows.
                        query = text("""
                            INSERT INTO vehicle.catalog_discovery (make, model, vehicle_class, status, priority_score)
                            SELECT
                                CAST(:make AS VARCHAR),
                                CAST(:model AS VARCHAR),
                                CAST(:v_class AS VARCHAR),
                                'pending',
                                :priority
                            WHERE NOT EXISTS (
                                SELECT 1 FROM vehicle.vehicle_model_definitions
                                WHERE make = CAST(:make AS VARCHAR)
                                AND marketing_name = CAST(:model AS VARCHAR)
                                AND status = 'gold_enriched'
                            )
                            ON CONFLICT (make, model)
                            DO UPDATE SET priority_score = EXCLUDED.priority_score
                            WHERE vehicle.catalog_discovery.status != 'processed'
                            RETURNING xmax;
                        """)
                        result = await db.execute(query, {
                            "make": make, "model": model, "v_class": v_class, "priority": total_count
                        })
                        row = result.fetchone()
                        if row:
                            # Postgres xmax == 0 marks a fresh insert; anything
                            # else means an existing row was updated.
                            if row[0] == 0: inserted_count += 1
                            else: updated_count += 1
                    await db.commit()
                offset += limit
                await asyncio.sleep(1)  # be polite to the public API
        logger.info(f"✅ RDW Szinkron kész! Új modellek a listán: {inserted_count} | Frissített prioritások: {updated_count}")
        if not completed:
            # BUGFIX: previously the state file was written even after an API
            # failure mid-download, which silenced retries for 30 days after a
            # partial sync. Now a failed run is retried on the next cycle.
            logger.warning("⚠️ Az RDW szinkron nem fejeződött be hiba miatt, az állapotfájl nem frissül.")
            return
        # Register the successful run on the filesystem.
        os.makedirs(os.path.dirname(cls.SYNC_STATE_FILE), exist_ok=True)
        with open(cls.SYNC_STATE_FILE, 'w') as f:
            f.write(datetime.now().isoformat())

    @classmethod
    def should_run_rdw_sync(cls) -> bool:
        """Return True if 30 days have passed since the last successful RDW sync
        (or the state file is missing/unreadable)."""
        if not os.path.exists(cls.SYNC_STATE_FILE):
            return True
        try:
            with open(cls.SYNC_STATE_FILE, 'r') as f:
                last_sync = datetime.fromisoformat(f.read().strip())
            return datetime.now() - last_sync > timedelta(days=30)
        except Exception:
            # Corrupt/unreadable state file: err on the side of re-syncing.
            return True

    @classmethod
    async def run(cls):
        """MAIN LOOP: monthly scheduler plus hourly watchdog."""
        logger.info("🚀 ÉLES ÜZEM: Discovery Engine (Differential Sync) & Watchdog indítása...")
        await cls.seed_manual_bootstrap()
        while True:
            # 1) Hourly cleanup of stuck tasks.
            await cls.run_watchdog()
            # 2) Check whether the monthly sync is due.
            if cls.should_run_rdw_sync():
                await cls.seed_from_rdw()
            else:
                logger.info("🛌 Az RDW szinkronizáció már lefutott az elmúlt 30 napban. Ugrás...")
            # 3) Sleep one hour until the next watchdog pass (heartbeat).
            logger.info("⏱️ A Discovery Engine most 1 órát pihen a következő Őrkutya futásig.")
            await asyncio.sleep(3600)
if __name__ == "__main__":
    # Container entry point: runs the hourly watchdog / monthly sync loop forever.
    asyncio.run(DiscoveryEngine.run())

View File

@@ -0,0 +1,224 @@
import asyncio
import httpx
import logging
import os
import re
import sys
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
# Naplózás beállítása a standard kimenetre
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("Robot-1-Hunter")
class CatalogHunter:
    """
    Vehicle Robot 1.9.3: The Truly Invincible Hunter (SAVEPOINT PATCH)

    Drains tasks from ``vehicle.catalog_discovery``, harvests matching vehicle
    records from the Dutch RDW open-data API and upserts them into
    ``VehicleModelDefinition``. Handles the ALL_VARIANTS wildcard instruction
    and uses row-level transaction protection (PostgreSQL savepoints) so a
    single bad row cannot abort a whole batch.
    """
    # RDW (Dutch vehicle authority) Socrata endpoints.
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
    # Optional Socrata app token — lifts the anonymous rate limit when set.
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # rows fetched per RDW page

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Lower-case *text_val* and strip every non-alphanumeric character."""
        if not text_val: return ""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; blank or invalid input yields 0."""
        try:
            if value is None or str(value).strip() == "": return 0
            return int(float(value))
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; blank or invalid input yields 0.0."""
        try:
            if value is None or str(value).strip() == "": return 0.0
            return float(value)
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, retries: int = 3):
        """Fault-tolerant HTTP GET with exponential back-off on 429 and network errors."""
        for attempt in range(retries):
            try:
                resp = await client.get(url, headers=cls.HEADERS)
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 429:  # Rate limit
                    await asyncio.sleep(2 ** attempt)
                else:
                    # Any other status is returned to the caller unchanged.
                    return resp
            except httpx.RequestError as e:
                if attempt == retries - 1:
                    logger.debug(f"Hálózati hiba: {e}")
                    raise
                await asyncio.sleep(2 ** attempt)
        return None

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect technical data (fuel, power, engine code) for one license plate."""
        results = {
            "power_kw": 0, "engine_code": None, "euro_class": None,
            "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0
        }
        try:
            # Fuel data — power can live in either of two RDW field spellings.
            f_resp = await cls.fetch_with_retry(client, f"{cls.RDW_FUEL}?kenteken={plate}")
            if f_resp and f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                p1 = cls.parse_int(f.get("netto_maximum_vermogen") or f.get("nettomaximumvermogen"))
                p2 = cls.parse_int(f.get("nominaal_continu_maximum_vermogen") or f.get("nominaalcontinuvermogen"))
                results.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            # Engine code data.
            e_resp = await cls.fetch_with_retry(client, f"{cls.RDW_ENGINE}?kenteken={plate}")
            if e_resp and e_resp.status_code == 200 and e_resp.json():
                results["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception:
            # Best-effort enrichment: missing tech data must not kill the row.
            pass
        return results

    @classmethod
    async def process_make_model(cls, db, task_id, make_name, model_name, v_class, priority):
        """Process one make/model discovery task (or an ALL_VARIANTS wildcard)."""
        clean_make = make_name.strip().upper()
        clean_model = model_name.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")
        offset = 0
        async with httpx.AsyncClient(timeout=30.0) as client:
            while True:
                # Dynamic parameters: with ALL_VARIANTS we do not filter on model name.
                if clean_model == 'ALL_VARIANTS':
                    params = f"merk={clean_make}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                else:
                    params = f"merk={clean_make}&handelsbenaming={clean_model}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                try:
                    r = await cls.fetch_with_retry(client, f"{cls.RDW_MAIN}?{params}")
                    batch = r.json() if r and r.status_code == 200 else []
                except Exception as e:
                    logger.error(f"❌ API hiba: {e}")
                    break
                if not batch:
                    break
                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: if saving one record fails, the enclosing
                        # transaction block stays usable for the rest of the batch.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            # Resolve the real model name (important for wildcard tasks).
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)
                            stmt = insert(VehicleModelDefinition).values(
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                technical_code=plate,
                                engine_capacity=cls.parse_int(item.get("cilinderinhoud")),
                                power_kw=tech["power_kw"],
                                fuel_type=tech["fuel_desc"],
                                engine_code=tech["engine_code"],
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")),
                                max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                body_type=item.get("inrichting"),
                                co2_emissions_combined=tech["co2"],
                                fuel_consumption_combined=tech["consumption"],
                                euro_classification=tech["euro_class"],
                                cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                vehicle_class=v_class,
                                priority_score=priority,
                                status="unverified",  # staged for the R2 Researcher
                                source="MEGA-HUNTER-v1.9.3"
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type']
                            )
                            await db.execute(stmt)
                    except Exception as e:
                        logger.warning(f"⚠️ Sor eldobva ({plate}): {e}")
                # Batch commit covering the successful rows.
                await db.commit()
                offset += len(batch)
                if offset >= 500:  # safety cap per make
                    break
                await asyncio.sleep(0.5)
        # Close out the discovery task.
        await db.execute(
            text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"),
            {"id": task_id}
        )
        await db.commit()

    @classmethod
    async def run(cls):
        """Worker loop: atomically claim one pending discovery task at a time."""
        logger.info("🤖 Mega-Hunter v1.9.3 ONLINE (SAVEPOINT ENABLED)")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # ATOMIC CLAIM: find, lock and flip the task state in one statement.
                    query = text("""
                        UPDATE vehicle.catalog_discovery
                        SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.catalog_discovery
                            WHERE status = 'pending'
                            ORDER BY priority_score DESC
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id, make, model, vehicle_class, priority_score;
                    """)
                    result = await db.execute(query)
                    task = result.fetchone()
                    await db.commit()
                    if task:
                        await cls.process_make_model(db, task[0], task[1], task[2], task[3], task[4])
                    else:
                        # No work available — rest for 30 seconds.
                        await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
# Entry point: run the hunter as a standalone worker process.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())

View File

@@ -0,0 +1,179 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_catalog_hunter.py
# version: 1.9.6
import asyncio
import httpx
import logging
import os
import re
import sys
from datetime import datetime
from sqlalchemy import text, func
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
# MB 2.0 Standard Naplózás
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("Robot-1-Hunter")
class CatalogHunter:
    """
    Vehicle Robot 1.9.6: Mega-Hunter (TIMESTAMP & INTEGRITY PATCH)

    Handles the ALL_VARIANTS wildcard, per-row savepoints and all mandatory
    columns (market, is_manual, created_at/updated_at) when upserting RDW
    records into ``VehicleModelDefinition``.
    """
    # RDW (Dutch vehicle authority) Socrata endpoints.
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
    # Optional Socrata app token — lifts the anonymous rate limit when set.
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # rows fetched per RDW page

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Lower-case *text_val* and strip every non-alphanumeric character."""
        if not text_val: return ""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; blank or invalid input yields 0."""
        try:
            if value is None or str(value).strip() == "": return 0
            return int(float(value))
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; blank or invalid input yields 0.0."""
        try:
            if value is None or str(value).strip() == "": return 0.0
            return float(value)
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, retries: int = 3):
        """Fault-tolerant HTTP GET with exponential back-off on 429 and network errors."""
        for attempt in range(retries):
            try:
                resp = await client.get(url, headers=cls.HEADERS)
                if resp.status_code == 200: return resp
                elif resp.status_code == 429: await asyncio.sleep(2 ** attempt)
                else: return resp
            except httpx.RequestError:
                if attempt == retries - 1: raise
                await asyncio.sleep(2 ** attempt)
        return None

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect technical data (fuel, power, engine code) for one license plate."""
        results = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await cls.fetch_with_retry(client, f"{cls.RDW_FUEL}?kenteken={plate}")
            if f_resp and f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                # Power can live in either of two RDW field spellings.
                p1 = cls.parse_int(f.get("netto_maximum_vermogen") or f.get("nettomaximumvermogen"))
                p2 = cls.parse_int(f.get("nominaal_continu_maximum_vermogen") or f.get("nominaalcontinuvermogen"))
                results.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await cls.fetch_with_retry(client, f"{cls.RDW_ENGINE}?kenteken={plate}")
            if e_resp and e_resp.status_code == 200 and e_resp.json():
                results["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception: pass  # best-effort: missing tech data must not kill the row
        return results

    @classmethod
    async def process_make_model(cls, db, task_id, make_name, model_name, v_class, priority):
        """Process one make/model discovery task (or an ALL_VARIANTS wildcard)."""
        clean_make = make_name.strip().upper()
        clean_model = model_name.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")
        offset = 0
        async with httpx.AsyncClient(timeout=30.0) as client:
            while True:
                # ALL_VARIANTS: query by make only, otherwise filter on model name too.
                if clean_model == 'ALL_VARIANTS':
                    params = f"merk={clean_make}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                else:
                    params = f"merk={clean_make}&handelsbenaming={clean_model}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                try:
                    r = await cls.fetch_with_retry(client, f"{cls.RDW_MAIN}?{params}")
                    batch = r.json() if r and r.status_code == 200 else []
                except Exception: break
                if not batch: break
                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: a failing row cannot poison the batch transaction.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)
                            stmt = insert(VehicleModelDefinition).values(
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                engine_capacity=cls.parse_int(item.get("cilinderinhoud")),
                                power_kw=tech["power_kw"],
                                fuel_type=tech["fuel_desc"],
                                engine_code=tech["engine_code"],
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                vehicle_class=v_class,
                                priority_score=priority,
                                market='EU',  # MANDATORY column
                                status="unverified",
                                is_manual=False,
                                created_at=func.now(),  # MANDATORY timestamps
                                updated_at=func.now(),
                                source="MEGA-HUNTER-v1.9.6"
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type']
                            )
                            await db.execute(stmt)
                    except Exception as e:
                        logger.warning(f"⚠️ Sor eldobva ({plate}): {e}")
                await db.commit()
                offset += len(batch)
                if offset >= 500: break  # safety cap per make
                await asyncio.sleep(0.5)
        # Close out the discovery task.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task_id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Worker loop: atomically claim one pending discovery task at a time."""
        logger.info("🤖 Mega-Hunter v1.9.6 ONLINE (TIMESTAMP PATCH)")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Claim via FOR UPDATE SKIP LOCKED so parallel workers never collide.
                    query = text("""
                        UPDATE vehicle.catalog_discovery SET status = 'processing'
                        WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending'
                        ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1)
                        RETURNING id, make, model, vehicle_class, priority_score;
                    """)
                    result = await db.execute(query)
                    task = result.fetchone()
                    await db.commit()
                    if task: await cls.process_make_model(db, task[0], task[1], task[2], task[3], task[4])
                    else: await asyncio.sleep(30)  # no work — rest 30s
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
# Entry point: run the hunter as a standalone worker process.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())

View File

@@ -0,0 +1,168 @@
import asyncio
import httpx
import logging
import os
import re
import sys
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1")
class CatalogHunter:
    """
    Vehicle Robot 2.1.2: The Final Hunter

    Exact data-type synchronisation with the model layer:
    ``raw_search_context`` is stored as a string.
    """
    # RDW (Dutch vehicle authority) Socrata endpoints.
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
    # Optional Socrata app token — lifts the anonymous rate limit when set.
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # rows fetched per RDW page

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Lower-case and strip non-alphanumerics; empty input yields "UNKNOWN"."""
        if not text_val: return "UNKNOWN"
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; blank or invalid input yields 0."""
        try:
            if value is None or str(value).strip() == "": return 0
            return int(float(value))
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; blank or invalid input yields 0.0."""
        try:
            if value is None or str(value).strip() == "": return 0.0
            return float(value)
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect technical data (fuel, power, engine code) for one license plate."""
        res = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                p1 = cls.parse_int(f.get("netto_maximum_vermogen"))
                p2 = cls.parse_int(f.get("nominaal_continu_maximum_vermogen"))
                res.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if e_resp.status_code == 200 and e_resp.json():
                res["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception: pass  # best-effort: missing tech data must not kill the row
        return res

    @classmethod
    async def process_task(cls, db, task):
        """Process one discovery task row (``id, make, model, vehicle_class, priority_score``)."""
        clean_make = task.make.strip().upper()
        clean_model = task.model.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")
        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                # ALL_VARIANTS wildcard: do not filter on model name.
                params = f"merk={clean_make}"
                if clean_model != 'ALL_VARIANTS':
                    params += f"&handelsbenaming={clean_model}"
                params += f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception: break
                if not batch: break
                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: a failing row cannot poison the batch transaction.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)
                            # First-registration date (YYYYMMDD string) → model year.
                            datum_eerste_toelating = str(item.get("datum_eerste_toelating", ""))
                            year_from = cls.parse_int(datum_eerste_toelating[:4]) if len(datum_eerste_toelating) >= 4 else 0
                            stmt = insert(VehicleModelDefinition).values(
                                market='EU',
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")),
                                max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                fuel_consumption_combined=tech["consumption"],
                                co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class,
                                body_type=item.get("inrichting"),
                                fuel_type=tech["fuel_desc"],
                                engine_capacity=cls.parse_int(item.get("cilinderinhoud")),
                                power_kw=tech["power_kw"],
                                cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                engine_code=tech["engine_code"],
                                euro_classification=tech["euro_class"],
                                year_from=year_from,
                                priority_score=task.priority_score,
                                status="unverified",
                                source="MEGA-HUNTER-v2.1.2",
                                # FIX: raw_search_context is an empty STRING (''), as the model expects.
                                raw_search_context='',
                                research_metadata={},
                                specifications={},
                                marketing_name_aliases=[]
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from']
                            )
                            await db.execute(stmt)
                    except Exception as e:
                        logger.warning(f"⚠️ Sor hiba ({plate}): {e}")
                await db.commit()
                offset += len(batch)
                if offset >= 500: break  # safety cap per make
                await asyncio.sleep(0.5)
        # Close out the discovery task.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task.id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Worker loop: atomically claim one pending discovery task at a time."""
        logger.info("🤖 Mega-Hunter v2.1.2 (Adattípus Fix) ONLINE")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Claim via FOR UPDATE SKIP LOCKED so parallel workers never collide.
                    query = text("UPDATE vehicle.catalog_discovery SET status = 'processing' WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending' ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING id, make, model, vehicle_class, priority_score;")
                    res = await db.execute(query)
                    task = res.fetchone()
                    await db.commit()
                    if task: await cls.process_task(db, task)
                    else: await asyncio.sleep(30)  # no work — rest 30s
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
# Entry point: run the hunter as a standalone worker process.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())

View File

@@ -0,0 +1,205 @@
# /app/app/workers/vehicle/vehicle_robot_1_catalog_hunter.py
import asyncio
import httpx
import logging
import os
import re
import sys
import json
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1")
class CatalogHunter:
    """
    Vehicle Robot 2.2.0: Fast-Track to Gold Edition

    When RDW already supplies every key datum (kW, ccm, fuel) the vehicle is
    immediately marked 'gold_enriched' and published into the
    ``vehicle_catalog`` master table, bypassing the AI enrichment pipeline.
    """
    # RDW (Dutch vehicle authority) Socrata endpoints.
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
    # Optional Socrata app token — lifts the anonymous rate limit when set.
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # rows fetched per RDW page

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Lower-case and strip non-alphanumerics; empty input yields "UNKNOWN"."""
        if not text_val: return "UNKNOWN"
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; blank or invalid input yields 0."""
        try:
            if value is None or str(value).strip() == "": return 0
            return int(float(value))
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; blank or invalid input yields 0.0."""
        try:
            if value is None or str(value).strip() == "": return 0.0
            return float(value)
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect technical data (fuel, power, engine code) for one license plate."""
        res = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                p1 = cls.parse_int(f.get("netto_maximum_vermogen"))
                p2 = cls.parse_int(f.get("nominaal_continu_maximum_vermogen"))
                res.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if e_resp.status_code == 200 and e_resp.json():
                res["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception: pass  # best-effort: missing tech data must not kill the row
        return res

    @classmethod
    async def process_task(cls, db, task):
        """Process one discovery task; gold-quality rows are published immediately."""
        clean_make = task.make.strip().upper()
        clean_model = task.model.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")
        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                # ALL_VARIANTS wildcard: do not filter on model name.
                params = f"merk={clean_make}"
                if clean_model != 'ALL_VARIANTS':
                    params += f"&handelsbenaming={clean_model}"
                params += f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception: break
                if not batch: break
                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: a failing row cannot poison the batch transaction.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)
                            # First-registration date (YYYYMMDD string) → model year.
                            datum_eerste_toelating = str(item.get("datum_eerste_toelating", ""))
                            year_from = cls.parse_int(datum_eerste_toelating[:4]) if len(datum_eerste_toelating) >= 4 else 0
                            engine_ccm = cls.parse_int(item.get("cilinderinhoud"))
                            power_kw = tech["power_kw"]
                            fuel_type = tech["fuel_desc"]
                            # FAST-TRACK LOGIC: if the mandatory technical data is present,
                            # the record is immediately rated GOLD. Electric vehicles may
                            # legitimately have ccm == 0, which is handled here too.
                            is_gold = False
                            if (power_kw > 0 and engine_ccm > 0) or (power_kw > 0 and "elektri" in fuel_type.lower()):
                                is_gold = True
                            final_status = "gold_enriched" if is_gold else "unverified"
                            # 1. Write into VMD (staging table).
                            stmt = insert(VehicleModelDefinition).values(
                                market='EU',
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")),
                                max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                fuel_consumption_combined=tech["consumption"],
                                co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class,
                                body_type=item.get("inrichting"),
                                fuel_type=fuel_type,
                                engine_capacity=engine_ccm,
                                power_kw=power_kw,
                                cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                engine_code=tech["engine_code"],
                                euro_classification=tech["euro_class"],
                                year_from=year_from,
                                priority_score=task.priority_score,
                                status=final_status,  # dynamic status
                                source="MEGA-HUNTER-v2.2.0-FAST",
                                raw_search_context='',
                                research_metadata={},
                                specifications={"fast_track": True},  # marks a direct-from-RDW record
                                marketing_name_aliases=[]
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from']
                            ).returning(VehicleModelDefinition.id)
                            res = await db.execute(stmt)
                            vmd_id = res.scalar()
                            # 2. If GOLD, publish straight into the final catalog
                            #    (mirrors what the Alchemist worker would do).
                            if is_gold and vmd_id:
                                cat_stmt = text("""
                                    INSERT INTO vehicle.vehicle_catalog
                                    (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
                                    VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
                                    ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING;
                                """)
                                await db.execute(cat_stmt, {
                                    "m_id": vmd_id,
                                    "make": clean_make,
                                    "model": actual_model[:50],
                                    "kw": power_kw,
                                    "ccm": engine_ccm,
                                    "fuel": fuel_type,
                                    "factory": json.dumps({"source": "RDW API Direct", "verified": True})
                                })
                                logger.info(f"✨ FAST-TRACK ARANY: {clean_make} {actual_model} (KW: {power_kw}, CCM: {engine_ccm})")
                    except Exception as e:
                        logger.warning(f"⚠️ Sor hiba ({plate}): {e}")
                await db.commit()
                offset += len(batch)
                if offset >= 500: break  # safety cap per make
                await asyncio.sleep(0.5)
        # Close out the discovery task.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task.id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Worker loop: atomically claim one pending discovery task at a time."""
        logger.info("🤖 Mega-Hunter v2.2.0 (Fast-Track Edition) ONLINE")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Claim via FOR UPDATE SKIP LOCKED so parallel workers never collide.
                    query = text("UPDATE vehicle.catalog_discovery SET status = 'processing' WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending' ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING id, make, model, vehicle_class, priority_score;")
                    res = await db.execute(query)
                    task = res.fetchone()
                    await db.commit()
                    if task: await cls.process_task(db, task)
                    else: await asyncio.sleep(30)  # no work — rest 30s
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
# Entry point: run the hunter as a standalone worker process.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())

View File

@@ -0,0 +1,140 @@
import asyncio, httpx, logging, os, re, sys, json
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1")
class CatalogHunter:
    """
    Vehicle Robot 2.2.0: Fast-Track to Gold Edition (compact build).

    Drains pending tasks from ``vehicle.catalog_discovery``, harvests matching
    vehicles from the Dutch RDW open-data API and upserts them into
    ``VehicleModelDefinition``. Records that already carry the key technical
    data (power + displacement, or power for electrics) are promoted straight
    to 'gold_enriched' and published into ``vehicle.vehicle_catalog``.
    """
    # RDW (Dutch vehicle authority) Socrata endpoints.
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
    # Optional Socrata app token — lifts the anonymous rate limit when set.
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # rows fetched per RDW page

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Lower-case and strip non-alphanumerics; empty input yields "UNKNOWN"."""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower() if text_val else "UNKNOWN"

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; blank or invalid input yields 0."""
        # FIX: narrowed the bare `except:` — it used to swallow
        # KeyboardInterrupt/SystemExit as well as conversion errors.
        try: return int(float(value)) if value and str(value).strip() else 0
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; blank or invalid input yields 0.0."""
        # FIX: narrowed the bare `except:` (same rationale as parse_int).
        try: return float(value) if value and str(value).strip() else 0.0
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect technical data (fuel, power, engine code) for one license plate."""
        res = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                p1, p2 = cls.parse_int(f.get("netto_maximum_vermogen")), cls.parse_int(f.get("nominaal_continu_maximum_vermogen"))
                res.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if e_resp.status_code == 200 and e_resp.json():
                res["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception: pass  # best-effort: missing tech data must not kill the row
        return res

    @classmethod
    async def process_task(cls, db, task):
        """Process one discovery task; gold-quality rows are published immediately."""
        clean_make, clean_model = task.make.strip().upper(), task.model.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")
        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                # ALL_VARIANTS wildcard: query by make only.
                params = f"merk={clean_make}" + (f"&handelsbenaming={clean_model}" if clean_model != 'ALL_VARIANTS' else "") + f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception: break
                if not batch: break
                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: a failing row cannot poison the batch transaction.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)
                            # First-registration date (YYYYMMDD string) → model year.
                            datum = str(item.get("datum_eerste_toelating", ""))
                            year_from = cls.parse_int(datum[:4]) if len(datum) >= 4 else 0
                            engine_ccm, power_kw, fuel_type = cls.parse_int(item.get("cilinderinhoud")), tech["power_kw"], tech["fuel_desc"]
                            # FAST-TRACK: kW + ccm present (or kW for electrics) → straight to GOLD.
                            is_gold = (power_kw > 0 and engine_ccm > 0) or (power_kw > 0 and "elektri" in fuel_type.lower())
                            final_status = "gold_enriched" if is_gold else "unverified"
                            stmt = insert(VehicleModelDefinition).values(
                                market='EU', make=clean_make, marketing_name=actual_model, normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"), version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate, type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")), doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")), wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")), max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")), max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                fuel_consumption_combined=tech["consumption"], co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class, body_type=item.get("inrichting"), fuel_type=fuel_type,
                                engine_capacity=engine_ccm, power_kw=power_kw, cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                engine_code=tech["engine_code"], euro_classification=tech["euro_class"], year_from=year_from,
                                priority_score=task.priority_score, status=final_status, source="MEGA-HUNTER-v2.2.0-FAST",
                                raw_search_context='', research_metadata={}, specifications={"fast_track": True} if is_gold else {}, marketing_name_aliases=[]
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from']
                            ).returning(VehicleModelDefinition.id)
                            res = await db.execute(stmt)
                            vmd_id = res.scalar()
                            # Automatic publication when gold.
                            if is_gold and vmd_id:
                                cat_stmt = text("""
                                    INSERT INTO vehicle.vehicle_catalog (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
                                    VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
                                    ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING;
                                """)
                                await db.execute(cat_stmt, {"m_id": vmd_id, "make": clean_make, "model": actual_model[:50], "kw": power_kw, "ccm": engine_ccm, "fuel": fuel_type, "factory": '{"source": "RDW Fast-Track"}'})
                                logger.info(f"✨ FAST-TRACK ARANY: {clean_make} {actual_model}")
                    except Exception as e: logger.warning(f"⚠️ Sor hiba ({plate}): {e}")
                await db.commit()
                offset += len(batch)
                if offset >= 500: break  # safety cap per make
                await asyncio.sleep(0.5)
        # Close out the discovery task.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task.id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Worker loop: atomically claim one pending discovery task at a time."""
        logger.info("🤖 Mega-Hunter v2.2.0 (Fast-Track) ONLINE")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Claim via FOR UPDATE SKIP LOCKED so parallel workers never collide.
                    res = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processing' WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending' ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING id, make, model, vehicle_class, priority_score;"))
                    task = res.fetchone()
                    await db.commit()
                    if task: await cls.process_task(db, task)
                    else: await asyncio.sleep(30)  # no work — rest 30s
            except Exception as e:
                # FIX: the main loop used to swallow the error silently;
                # log it like the sibling hunter versions do.
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
# Entry point: run the hunter as a standalone worker process.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())

View File

@@ -0,0 +1,239 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_2_researcher.py
import asyncio
import logging
import warnings
import os
import json
from datetime import datetime
from sqlalchemy import text, update, func
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
from duckduckgo_search import DDGS
# MB 2.0 standard logging for this worker: timestamped, tagged with the robot name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-2-Researcher: %(message)s')
logger = logging.getLogger("Vehicle-Robot-2-Researcher")
class QuotaManager:
    """Strict daily request-quota tracker for paid/government APIs.

    Persists a small ``{"date", "count"}`` JSON state file so the counter
    survives process restarts; the counter rolls over on a new calendar day.
    """

    def __init__(self, service_name: str, daily_limit: int):
        self.service_name = service_name
        self.daily_limit = daily_limit
        self.state_file = f"/app/temp/.quota_{service_name}.json"
        self._ensure_file()

    def _ensure_file(self):
        """Create the state directory and a zeroed state file on first use."""
        os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
        if os.path.exists(self.state_file):
            return
        fresh_state = {"date": datetime.now().strftime("%Y-%m-%d"), "count": 0}
        with open(self.state_file, 'w') as fh:
            json.dump(fresh_state, fh)

    def can_make_request(self) -> bool:
        """Consume one unit of today's quota; return False once the limit is reached."""
        with open(self.state_file, 'r') as fh:
            state = json.load(fh)
        today = datetime.now().strftime("%Y-%m-%d")
        if state["date"] != today:
            # New day: the persisted counter is stale, restart from zero.
            state = {"date": today, "count": 0}
        if state["count"] >= self.daily_limit:
            return False
        # Persist the incremented counter before reporting success.
        state["count"] += 1
        with open(self.state_file, 'w') as fh:
            json.dump(state, fh)
        return True
class VehicleResearcher:
    """
    Vehicle Robot 2.5: Sniper Researcher (targeted data collector).

    Builds a compact, structured research dossier per vehicle from a few
    highly targeted web searches, so the downstream AI synthesis step
    (Robot 3) receives minimal, high-signal context.
    """
    def __init__(self):
        # Give up (suspend) after this many research attempts per vehicle.
        self.max_attempts = 5
        # Hard ceiling (seconds) for a single DuckDuckGo query.
        self.search_timeout = 15.0
        # Quota managers for paid/government APIs (limits read from .env).
        dvla_limit = int(os.getenv("DVLA_DAILY_LIMIT", "1000"))
        self.dvla_quota = QuotaManager("dvla", dvla_limit)
        self.dvla_token = os.getenv("DVLA_API_KEY")
    async def fetch_ddg_targeted(self, label: str, query: str) -> str:
        """Run one targeted DuckDuckGo search thread-safely.

        Returns a small labelled text section. Never raises: a failing source
        yields an error marker instead of aborting the whole research task.
        """
        try:
            def search():
                with DDGS() as ddgs:
                    # max_results=2: only the two most relevant hits, keep noise low.
                    results = ddgs.text(query, max_results=2)
                    return [f"- {r.get('body', '')}" for r in results] if results else []
            results = await asyncio.wait_for(asyncio.to_thread(search), timeout=self.search_timeout)
            if not results:
                return f"[SOURCE: {label}]\nNincs érdemi találat.\n"
            content = f"[SOURCE: {label} | KERESÉS: {query}]\n"
            content += "\n".join(results) + "\n"
            return content
        except Exception as e:
            logger.debug(f"Keresési hiba ({label}): {e}")
            return f"[SOURCE: {label}]\nKERESÉSI HIBA.\n"
    def extract_specs_from_text(self, text: str) -> dict:
        """Regex-based extraction of ccm / kW / engine code from raw search text.

        Returns a dict with any of 'ccm', 'kw', 'engine_code' that could be
        recognised; unrecognised values are simply omitted.
        """
        import re
        specs = {}
        # Displacement patterns: "1998 cc", "2000 cm³", ...
        ccm_pattern = r'(\d{3,4})\s*(?:cc|ccm|cm³|cm3|cc\.)'
        match = re.search(ccm_pattern, text, re.IGNORECASE)
        if match:
            specs['ccm'] = int(match.group(1))
        else:
            # Fallback: "2.0 L" / "2.0 liter" -> 2000 cc.
            # BUGFIX: the previous pattern ended in an empty alternative
            # ("(?:L|liter|)") which made the unit OPTIONAL, so any bare number
            # (e.g. "150 units") was treated as litres -> ccm=150000.
            liter_pattern = r'(\d+\.?\d*)\s*(?:L\b|liter)'
            match = re.search(liter_pattern, text, re.IGNORECASE)
            if match:
                liters = float(match.group(1))
                specs['ccm'] = int(liters * 1000)
        # Power pattern: "150 kW".
        kw_pattern = r'(\d{2,4})\s*(?:kW|kw|KW)'
        match = re.search(kw_pattern, text, re.IGNORECASE)
        if match:
            specs['kw'] = int(match.group(1))
        else:
            # Horsepower fallback: 150 HP/LE/PS -> ~110 kW.
            hp_pattern = r'(\d{2,4})\s*(?:HP|hp|LE|le|Ps)'
            match = re.search(hp_pattern, text, re.IGNORECASE)
            if match:
                hp = int(match.group(1))
                specs['kw'] = int(hp * 0.7355)  # approximate HP -> kW conversion
        # Engine code pattern: "motor kód: 1.8 TSI", "engine code: N47".
        engine_pattern = r'(?:motor\s*kód|engine\s*code|motor\s*code)[:\s]+([A-Z0-9\.\- ]+)'
        match = re.search(engine_pattern, text, re.IGNORECASE)
        if match:
            specs['engine_code'] = match.group(1).strip()
        return specs
    async def research_vehicle(self, db, vehicle_id: int, make: str, model: str, engine: str, year: str, current_attempts: int):
        """Research one vehicle and persist the structured dossier for the AI step.

        Writes either status 'awaiting_ai_synthesis' (dossier ready),
        'unverified' (too little data, retry), or 'suspended_research'
        (attempt limit reached with no usable data).
        """
        engine_safe = engine or ""
        year_safe = str(year) if year else ""
        logger.info(f"🔎 Mesterlövész Kutatás: {make} {model} (Motor: {engine_safe})")
        # TIER 1: free, targeted searches against the most reliable spec sites.
        queries = [
            ("ULTIMATE_SPECS", f"{make} {model} {engine_safe} {year_safe} site:ultimatespecs.com"),
            ("AUTO_DATA", f"{make} {model} {engine_safe} {year_safe} site:auto-data.net"),
            ("COMMON_ISSUES", f"{make} {model} {engine_safe} reliability common problems")
        ]
        tasks = [self.fetch_ddg_targeted(label, q) for label, q in queries]
        search_results = await asyncio.gather(*tasks)
        # TIER 2: paid / quota-limited APIs (placeholder for a future DVLA call):
        # if has_uk_plate and self.dvla_quota.can_make_request():
        #     uk_data = await self.fetch_dvla_data(plate)
        #     search_results.append(uk_data)
        # TIER 3: assemble the dossier; cap its size so the AI GPU is not overloaded.
        full_context = "\n".join(search_results)
        if len(full_context) > 2500:
            full_context = full_context[:2500] + "\n...[TRUNCATED TO SAVE GPU TOKENS]"
        # Regex-based spec extraction from the raw text.
        extracted_specs = self.extract_specs_from_text(full_context)
        try:
            if len(full_context.strip()) > 150:  # low threshold: targeted search is dense
                await db.execute(
                    update(VehicleModelDefinition)
                    .where(VehicleModelDefinition.id == vehicle_id)
                    .values(
                        raw_search_context=full_context,
                        research_metadata=extracted_specs,
                        status='awaiting_ai_synthesis',  # dossier ready, hand over to the Alchemist
                        last_research_at=func.now(),
                        attempts=current_attempts + 1
                    )
                )
                logger.info(f"✅ Akta rögzítve ({len(full_context)} karakter): {make} {model}")
            else:
                # Not enough material: either retry later or suspend for good.
                new_status = 'suspended_research' if current_attempts + 1 >= self.max_attempts else 'unverified'
                await db.execute(
                    update(VehicleModelDefinition)
                    .where(VehicleModelDefinition.id == vehicle_id)
                    .values(
                        status=new_status,
                        attempts=current_attempts + 1,
                        last_research_at=func.now()
                    )
                )
                if new_status == 'suspended_research':
                    logger.warning(f"🛑 Felfüggesztve (Nincs nyom a weben): {make} {model}")
                else:
                    logger.warning(f"⚠️ Kevés adat: {make} {model}, visszatéve a sorba.")
            await db.commit()
        except Exception as e:
            await db.rollback()
            logger.error(f"🚨 Adatbázis hiba az eredmény mentésénél ({vehicle_id}): {e}")
    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim one vehicle, research it, repeat."""
        self_instance = cls()
        logger.info("🚀 Vehicle Researcher 2.5 ONLINE (Sniper & Quota Manager)")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim via FOR UPDATE SKIP LOCKED (multi-worker safe).
                    query = text("""
                        UPDATE vehicle.vehicle_model_definitions
                        SET status = 'research_in_progress'
                        WHERE id = (
                            SELECT id FROM vehicle.vehicle_model_definitions
                            WHERE status IN ('unverified', 'awaiting_research', 'ACTIVE')
                            AND attempts < :max_attempts
                            AND is_manual = FALSE
                            ORDER BY
                                CASE WHEN make = 'TOYOTA' THEN 1 ELSE 2 END,
                                attempts ASC
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id, make, marketing_name, engine_code, year_from, attempts;
                    """)
                    result = await db.execute(query, {"max_attempts": self_instance.max_attempts})
                    task = result.fetchone()
                    await db.commit()
                    if task:
                        v_id, v_make, v_model, v_engine, v_year, v_attempts = task
                        # Fresh session for processing (the research can take a while).
                        async with AsyncSessionLocal() as process_db:
                            await self_instance.research_vehicle(process_db, v_id, v_make, v_model, v_engine, v_year, v_attempts)
                        await asyncio.sleep(2)  # rate-limit protection towards DDG
                    else:
                        await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                await asyncio.sleep(10)
# Script entry point: run the researcher loop until interrupted (Ctrl+C).
if __name__ == "__main__":
    try:
        asyncio.run(VehicleResearcher.run())
    except KeyboardInterrupt:
        logger.info("🛑 Kutató robot leállítva.")

View File

@@ -0,0 +1,225 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_3_alchemist_pro.py
import asyncio
import logging
import datetime
import random
import sys
import json
import os
from sqlalchemy import text, func, update, case
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
from app.models.asset import AssetCatalog
from app.services.ai_service import AIService
# Worker-standard logging: timestamped, tagged with the robot name, to stdout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Vehicle-Alchemist-Pro: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Vehicle-Robot-3-Alchemist-Pro")
class TechEnricher:
    """
    Vehicle Robot 3: Alchemist Pro (atomic locking + manual-moderation patch).

    Pure GPU focus: only the AI analysis and data merging happen here — no
    redundant web searching. Applies a strict but forgiving sanity check;
    after the hybrid merge, authority (RDW) data always overrides AI output.
    """
    def __init__(self):
        # Retry ceiling per record before escalation to manual review.
        self.max_attempts = 5
        # Daily AI-call budget (from .env); rolls over daily via check_budget().
        self.daily_ai_limit = int(os.getenv("AI_DAILY_LIMIT", "10000"))
        self.ai_calls_today = 0
        self.last_reset_date = datetime.date.today()
    def check_budget(self) -> bool:
        # Reset the counter when the calendar day has rolled over, then test limit.
        if datetime.date.today() > self.last_reset_date:
            self.ai_calls_today = 0
            self.last_reset_date = datetime.date.today()
        return self.ai_calls_today < self.daily_ai_limit
    def validate_merged_data(self, merged_kw: int, merged_ccm: int, v_class: str, fuel: str, current_attempts: int) -> tuple[bool, str]:
        """ Intelligent validation after the MERGE. Returns (is_valid, reason). """
        if merged_ccm > 18000:
            return False, f"Irreális CCM érték ({merged_ccm})"
        if merged_kw > 1500 and v_class != "truck":
            return False, f"Irreális KW érték ({merged_kw})"
        # Missing kW: request re-research while attempts are low, accept as
        # a partial record after 3 tries.
        if merged_kw == 0:
            if current_attempts < 3:
                return False, "Hiányzó KW adat. Újrakutatás javasolt."
            else:
                logger.warning("Sane-check: Többszöri próbálkozás után sincs KW, de átengedjük részlegesként.")
        # Missing displacement on a combustion engine; "elektric" also covers
        # Dutch RDW "Elektriciteit"; trailers legitimately have no engine.
        if merged_ccm == 0 and "electric" not in fuel and "elektric" not in fuel and v_class != "trailer":
            if current_attempts < 3:
                return False, "Hiányzó CCM (belsőégésű motornál). Újrakutatás javasolt."
            else:
                logger.warning("Sane-check: Többszöri próbálkozás után sincs CCM, átengedjük részlegesként.")
        return True, "OK"
    async def process_single_record(self, db, record_id: int, base_info: dict, current_attempts: int):
        # Precise identifier for the logs (make, model, ID, RDW data).
        v_ident = f"{base_info['make'].upper()} {base_info['m_name']} (ID: {record_id}, RDW: {base_info['rdw_ccm']}ccm, KW: {base_info['rdw_kw']})"
        attempt_str = f"[Próba: {current_attempts + 1}/{self.max_attempts}]"
        ai_data = {}  # empty dict in case the AI call blows up
        try:
            logger.info(f"🧠 AI dúsítás indul: {v_ident} {attempt_str}")
            # STEP 1: AI call (hand the base data over to the model).
            ai_data = await AIService.get_clean_vehicle_data(
                base_info['make'],
                base_info['m_name'],
                base_info
            )
            if not ai_data:
                raise ValueError("Teljesen üres AI válasz (API hiba vagy extrém hallucináció).")
            # STEP 2: HYBRID MERGE (before validation!) — authority (RDW)
            # values override the AI for the official parameters.
            final_kw = base_info['rdw_kw'] if base_info['rdw_kw'] > 0 else int(ai_data.get("kw", 0) or 0)
            final_ccm = base_info['rdw_ccm'] if base_info['rdw_ccm'] > 0 else int(ai_data.get("ccm", 0) or 0)
            # Fuel-type cleanup.
            fuel_rdw = base_info.get('rdw_fuel', '')
            final_fuel = fuel_rdw if fuel_rdw and fuel_rdw != "Unknown" else ai_data.get("fuel_type", "petrol")
            final_engine = base_info['rdw_engine'] if base_info['rdw_engine'] else ai_data.get("engine_code", "Unknown")
            final_euro = base_info['rdw_euro'] or ai_data.get("euro_classification")
            final_cylinders = base_info['rdw_cylinders'] or ai_data.get("cylinders")
            # STEP 3: intelligent validation of the merged record.
            is_valid, error_msg = self.validate_merged_data(final_kw, final_ccm, base_info['v_type'], final_fuel.lower(), current_attempts)
            if not is_valid:
                raise ValueError(f"Validációs hiba: {error_msg}")
            # STEP 4: write into the Gold catalog.
            clean_model = str(ai_data.get("marketing_name", base_info['m_name']))[:50].upper()
            cat_stmt = text("""
                INSERT INTO vehicle.vehicle_catalog
                (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
                VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
                ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING
                RETURNING id;
            """)
            await db.execute(cat_stmt, {
                "m_id": record_id,
                "make": base_info['make'].upper(),
                "model": clean_model,
                "kw": final_kw,
                "ccm": final_ccm,
                "fuel": final_fuel,
                "factory": json.dumps(ai_data)
            })
            # STEP 5: close out the staging (VMD) row.
            await db.execute(
                update(VehicleModelDefinition)
                .where(VehicleModelDefinition.id == record_id)
                .values(
                    status="gold_enriched",
                    engine_capacity=final_ccm,
                    power_kw=final_kw,
                    fuel_type=final_fuel,
                    engine_code=final_engine,
                    euro_classification=final_euro,
                    cylinders=final_cylinders,
                    specifications=ai_data,  # keep the AI's full output in the master table too
                    updated_at=func.now()
                )
            )
            await db.commit()
            logger.info(f"✨ ARANY REKORD KÉSZ: {v_ident}")
            self.ai_calls_today += 1
        except Exception as e:
            await db.rollback()
            logger.warning(f"⚠️ Alkimista hiba - {v_ident}: {e}")
            # At the attempt limit the record goes to MANUAL moderation,
            # otherwise back to the Researcher.
            new_status = 'manual_review_needed' if current_attempts + 1 >= self.max_attempts else 'unverified'
            # Store the AI's partial answer (or the error) so an admin can see
            # what the machine got wrong.
            review_data = ai_data if ai_data else {"error": "Nincs értékelhető JSON adat az AI-tól", "raw_context": base_info['web_context']}
            await db.execute(
                update(VehicleModelDefinition)
                .where(VehicleModelDefinition.id == record_id)
                .values(
                    attempts=current_attempts + 1,
                    last_error=str(e)[:200],
                    status=new_status,
                    specifications=review_data,  # surface the broken data for manual review!
                    updated_at=func.now()
                )
            )
            await db.commit()
            if new_status == 'unverified':
                logger.info(f"♻️ Akta visszaküldve a Robot-2-nek (Kutató). {attempt_str}")
            else:
                logger.error(f"🛑 Max próbálkozás elérve! Kézi moderációra küldve: {v_ident}")
    async def run(self):
        logger.info(f"🚀 Alchemist Pro HIBRID ONLINE (Atomi Zárolás + Moderáció Patch)")
        while True:
            if not self.check_budget():
                logger.warning("💸 Napi AI limit kimerítve! Pihenés...")
                await asyncio.sleep(3600); continue
            try:
                async with AsyncSessionLocal() as db:
                    # ATOMIC LOCK (the "holy grail" against race conditions).
                    query = text("""
                        UPDATE vehicle.vehicle_model_definitions
                        SET status = 'ai_synthesis_in_progress'
                        WHERE id = (
                            SELECT id FROM vehicle.vehicle_model_definitions
                            WHERE status IN ('awaiting_ai_synthesis', 'ACTIVE')
                            AND attempts < :max_attempts
                            AND is_manual = FALSE
                            ORDER BY
                                CASE WHEN status = 'awaiting_ai_synthesis' THEN 1 ELSE 2 END,
                                priority_score DESC
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id, make, marketing_name, vehicle_class, power_kw, engine_capacity,
                                  fuel_type, engine_code, euro_classification, cylinders, raw_search_context, attempts;
                    """)
                    result = await db.execute(query, {"max_attempts": self.max_attempts})
                    task = result.fetchone()
                    await db.commit()
                    if task:
                        # Unpack the claimed row into the base_info dict.
                        r_id = task[0]
                        base_info = {
                            "make": task[1], "m_name": task[2], "v_type": task[3] or "car",
                            "rdw_kw": task[4] or 0, "rdw_ccm": task[5] or 0,
                            "rdw_fuel": task[6] or "petrol", "rdw_engine": task[7] or "",
                            "rdw_euro": task[8], "rdw_cylinders": task[9],
                            "web_context": task[10] or ""
                        }
                        attempts = task[11]
                        # Separate DB session for processing (the AI call is long).
                        async with AsyncSessionLocal() as process_db:
                            await self.process_single_record(process_db, r_id, base_info, attempts)
                        # GPU cooldown / Ollama rate limit.
                        await asyncio.sleep(random.uniform(1.5, 3.5))
                    else:
                        logger.info("😴 Nincs feldolgozandó akta, az Alkimista pihen...")
                        await asyncio.sleep(15)
            except Exception as e:
                logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                await asyncio.sleep(10)
# Script entry point: start the Alchemist Pro worker loop.
if __name__ == "__main__":
    asyncio.run(TechEnricher().run())

View File

@@ -0,0 +1,168 @@
import asyncio
import logging
import datetime
import random
import sys
import json
import os
from sqlalchemy import text, func, update
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
from app.services.ai_service import AIService
# Worker-standard logging: timestamped, tagged R3-Alchemist, to stdout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] R3-Alchemist: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-3-Alchemist")
class TechEnricher:
    """
    Vehicle Robot 3: Alchemist Pro (Sentinel Gateway Edition).

    Uses AIService 2.2 (Ollama -> Groq fallback). Extracts the trim level
    (trim_level) and backfills missing power/displacement data; authority
    (RDW) values always win over AI output in the merge.
    """
    def __init__(self):
        # Retry ceiling per record before escalation to manual review.
        self.max_attempts = 5
        # Daily AI-call budget from .env; enforced by check_budget().
        self.daily_ai_limit = int(os.getenv("AI_DAILY_LIMIT", "10000"))
        self.ai_calls_today = 0
        self.last_reset_date = datetime.date.today()
    def check_budget(self) -> bool:
        """Return True while today's AI-call budget is not exhausted (resets daily)."""
        if datetime.date.today() > self.last_reset_date:
            self.ai_calls_today = 0
            self.last_reset_date = datetime.date.today()
        return self.ai_calls_today < self.daily_ai_limit
    def validate_merged_data(self, merged_kw: int, merged_ccm: int, v_class: str, fuel: str, current_attempts: int) -> tuple[bool, str]:
        """Sanity-check the merged record; returns (is_valid, reason)."""
        if merged_ccm > 18000:
            return False, f"Irreális CCM érték ({merged_ccm})"
        if merged_kw > 1500 and v_class not in ["truck", "other"]:
            return False, f"Irreális KW érték ({merged_kw})"
        if merged_kw == 0 and current_attempts < 3:
            return False, "Hiányzó KW adat. Újrakutatás javasolt."
        # BUGFIX: also accept the English spelling "electric"/"electricity" —
        # the previous check only matched "elektr*" (Hungarian/Dutch/German),
        # so English-labelled EVs with ccm=0 were bounced back for re-research.
        # (The sibling Alchemist class already handles "electric".)
        is_electric = "elektr" in fuel.lower() or "electr" in fuel.lower()
        if merged_ccm == 0 and not is_electric and v_class != "trailer" and current_attempts < 3:
            return False, "Hiányzó CCM (belsőégésű motornál)."
        return True, "OK"
    async def process_single_record(self, db, record_id: int, base_info: dict, current_attempts: int):
        """Enrich one claimed staging record via the AI gateway and mark it gold.

        On failure the attempt counter is bumped and the record is either
        returned to the Researcher ('unverified') or escalated to
        'manual_review_needed' once max_attempts is reached.
        """
        v_ident = f"{base_info['make'].upper()} {base_info['m_name']} (ID: {record_id})"
        attempt_str = f"[Próba: {current_attempts + 1}/{self.max_attempts}]"
        try:
            logger.info(f"🧠 AI dúsítás indul: {v_ident} {attempt_str}")
            # Strict prompt for the master AI service.
            prompt = f"""
            Elemezd az alábbi járműadatokat és a webes kutatást! Készíts belőle egy JSON objektumot.
            Jármű: {base_info['make']} {base_info['m_name']}
            Hatósági adatok: {base_info['rdw_ccm']} ccm, {base_info['rdw_kw']} kW, Üzemanyag: {base_info['rdw_fuel']}
            Webes szöveg: {base_info['web_context'][:2000]}
            FELADATOK:
            1. Keresd meg a felszereltségi szintet (trim_level) a modell nevéből vagy a szövegből (pl. AMG, Highline, Titanium, M-Sport, Elegance, ST-Line). Ha nincs, legyen üres string.
            2. Ha az RDW adatokban a kW vagy a ccm 0, pótold a szövegből a helyes értéket!
            KIZÁRÓLAG EGY ÉRVÉNYES JSON-T ADJ VISSZA! (A Groq/Gemini miatt kötelező a JSON szó használata).
            Várt kulcsok: "kw" (int), "ccm" (int), "trim_level" (string), "transmission" (string), "drive_type" (string).
            """
            # Call the gateway (_execute_ai_call also takes the db session for its settings).
            ai_data = await AIService._execute_ai_call(db, prompt, model_key="text")
            if not ai_data:
                raise ValueError("Üres AI válasz (Minden fallback elbukott).")
            # HYBRID MERGE: authority (RDW) values win whenever present.
            final_kw = base_info['rdw_kw'] if base_info['rdw_kw'] > 0 else int(ai_data.get("kw", 0) or 0)
            final_ccm = base_info['rdw_ccm'] if base_info['rdw_ccm'] > 0 else int(ai_data.get("ccm", 0) or 0)
            trim_level = str(ai_data.get("trim_level", ""))[:100]
            # Sanity check on the merged record.
            is_valid, error_msg = self.validate_merged_data(final_kw, final_ccm, base_info['v_type'], base_info['rdw_fuel'], current_attempts)
            if not is_valid:
                raise ValueError(f"Validációs hiba: {error_msg}")
            # Promote the staging row to gold.
            await db.execute(
                update(VehicleModelDefinition)
                .where(VehicleModelDefinition.id == record_id)
                .values(
                    status="gold_enriched",
                    engine_capacity=final_ccm,
                    power_kw=final_kw,
                    trim_level=trim_level if trim_level.lower() not in ["null", "none"] else "",
                    specifications=ai_data,
                    updated_at=func.now()
                )
            )
            await db.commit()
            logger.info(f"✨ ARANY REKORD KÉSZ: {v_ident} | Trim: {trim_level}")
            self.ai_calls_today += 1
        except Exception as e:
            await db.rollback()
            logger.warning(f"⚠️ Alkimista hiba - {v_ident}: {e}")
            # At the attempt limit: manual moderation; otherwise back to the Researcher.
            new_status = 'manual_review_needed' if current_attempts + 1 >= self.max_attempts else 'unverified'
            await db.execute(
                update(VehicleModelDefinition)
                .where(VehicleModelDefinition.id == record_id)
                .values(
                    attempts=current_attempts + 1,
                    last_error=str(e)[:200],
                    status=new_status,
                    updated_at=func.now()
                )
            )
            await db.commit()
            if new_status == 'unverified':
                logger.info(f"♻️ Akta visszaküldve a Kutatónak (R2). {attempt_str}")
    async def run(self):
        """Main loop: budget check, atomic claim, enrich, repeat."""
        logger.info("🚀 R3 Alchemist Pro ONLINE (Sentinel Gateway Integráció)")
        while True:
            if not self.check_budget():
                logger.warning("💸 Napi AI limit kimerítve! Pihenés...")
                await asyncio.sleep(3600)
                continue
            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim: FOR UPDATE SKIP LOCKED keeps parallel workers apart.
                    query = text("""
                        UPDATE vehicle.vehicle_model_definitions
                        SET status = 'ai_synthesis_in_progress'
                        WHERE id = (
                            SELECT id FROM vehicle.vehicle_model_definitions
                            WHERE status = 'awaiting_ai_synthesis'
                            AND attempts < :max_attempts
                            AND is_manual = FALSE
                            ORDER BY priority_score DESC
                            FOR UPDATE SKIP LOCKED LIMIT 1
                        )
                        RETURNING id, make, marketing_name, vehicle_class, power_kw, engine_capacity, fuel_type, raw_search_context, attempts;
                    """)
                    result = await db.execute(query, {"max_attempts": self.max_attempts})
                    task = result.fetchone()
                    await db.commit()
                    if task:
                        base_info = {
                            "make": task[1], "m_name": task[2], "v_type": task[3] or "car",
                            "rdw_kw": task[4] or 0, "rdw_ccm": task[5] or 0,
                            "rdw_fuel": task[6] or "petrol", "web_context": task[7] or ""
                        }
                        # Separate DB session for the long-running AI processing.
                        async with AsyncSessionLocal() as process_db:
                            await self.process_single_record(process_db, task[0], base_info, task[8])
                    else:
                        await asyncio.sleep(10)
            except Exception as e:
                logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                await asyncio.sleep(10)
# Script entry point: start the R3 Alchemist worker loop.
if __name__ == "__main__":
    asyncio.run(TechEnricher().run())

View File

@@ -0,0 +1,40 @@
import asyncio, logging, random
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Minimal logging setup for the R0 brand-collector script.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R0-BRANDS] %(message)s')
logger = logging.getLogger("R0")
async def run_r0():
    """Collect every brand link from auto-data.net and enqueue it for crawling.

    One-shot job: scrapes the all-brands page headlessly and inserts each
    brand URL as a 'pending' queue row (duplicates are skipped via
    ON CONFLICT DO NOTHING).
    """
    url = "https://www.auto-data.net/en/allbrands"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        logger.info(f"Márkák gyűjtése innen: {url}")
        await page.goto(url, wait_until="networkidle")
        # Robust harvesting: every <a> whose href contains 'brand-'.
        brand_links = await page.eval_on_selector_all(
            "a[href*='brand-']",
            "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
        )
        async with AsyncSessionLocal() as db:
            inserted = 0
            for item in brand_links:
                if not item['name'] or 'brand' not in item['url']:
                    continue
                insert_sql = text("""
                    INSERT INTO vehicle.auto_data_crawler_queue (url, level, name, status)
                    VALUES (:url, 'brand', :name, 'pending')
                    ON CONFLICT (url) DO NOTHING
                """)
                outcome = await db.execute(insert_sql, {"url": item['url'], "name": item['name']})
                # rowcount > 0 only for genuinely new rows (conflicts are skipped).
                if outcome.rowcount > 0:
                    inserted += 1
            await db.commit()
            logger.info(f"✅ Kész! {inserted} új márkát találtam és mentettem el.")
        await browser.close()
# Script entry point: one-shot brand collection run.
if __name__ == "__main__":
    asyncio.run(run_r0())

View File

@@ -0,0 +1,137 @@
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R1-RECOVERY] %(message)s'
)
logger = logging.getLogger("R1")
async def analyze_and_extract_links(page, current_url, current_level):
    """Classify every anchor on the page as a 'model' list or an 'engine' datasheet.

    Recognises the URL layouts of autoevolution, bikez and motorcyclespecs
    (including the .htm /model/ fix); anchors with non-Latin text or
    unrecognised URLs are dropped.
    """
    def classify(url, name):
        # Returns 'engine' / 'model' for known datasheet/list URLs, else None.
        if "autoevolution.com/moto/" in url:
            # .html pages without fragments are concrete spec sheets.
            if url.endswith(".html") and "#" not in url:
                return 'engine'
            if url.count('/') >= 5:
                return 'model'
        elif "bikez.com" in url:
            if "/motorcycles/" in url:
                return 'engine'
            if "/models/" in url:
                return 'model'
        elif "motorcyclespecs.co.za" in url:
            # /model/....htm(l) links are datasheets (critical fix).
            if "/model/" in url and (".htm" in url or ".html" in url):
                return 'engine'
            # Further listing pages reachable from the brand page.
            if "/bikes/" in url and name.lower() not in current_url.lower():
                return 'model'
        return None

    anchors = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    logger.info(f"🔎 Oldal elemzése: {len(anchors)} link található összesen.")
    collected = []
    for anchor in anchors:
        label = anchor['name']
        if not label or len(label) < 2:
            continue
        if re.search(r'[^\x00-\x7F]+', label):  # language shield: Latin-only names
            continue
        level = classify(anchor['url'], label)
        if level is not None:
            collected.append({'name': label, 'url': anchor['url'], 'level': level})
    return collected
async def main():
    """
    Rationale: the main control loop — claim one queued brand page, scout it,
    persist the discovered model/engine links, then move on.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        logger.info("🤖 R1 Recovery Scout elindult...")
        while True:
            target = None
            async with AsyncSessionLocal() as db:
                try:
                    # Claim one task (brand level), including retryable
                    # 'error' / 'completed_empty' rows (recovery mode).
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE (status = 'pending' OR status = 'error' OR status = 'completed_empty')
                            AND level = 'brand'
                            AND category = 'bike'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name, level
                    """))
                    target = res.fetchone()
                    await db.commit()
                except Exception as e:
                    logger.error(f"❌ DB Hiba: {e}")
                    await db.rollback()
            if not target:
                logger.info("🏁 Nincs több feladat. Alvás 30mp...")
                await asyncio.sleep(30)
                continue
            t_id, t_url, t_name, t_level = target
            page = await context.new_page()
            try:
                logger.info(f"🚀 [{t_level}] {t_name} felderítése -> {t_url}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)  # give client-side JavaScript time to run
                links = await analyze_and_extract_links(page, t_url, t_level)
                async with AsyncSessionLocal() as db:
                    if links:
                        for link in links:
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": t_id})
                        logger.info(f"✅ Siker: {t_name} -> {len(links)} új link mentve.")
                    else:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed_empty' WHERE id = :id"), {"id": t_id})
                        logger.warning(f"⚠️ Üres: {t_name} oldalon nem találtam motorokat.")
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ Hiba: {t_name} -> {e}")
            finally:
                await page.close()
                await asyncio.sleep(random.uniform(3, 5))  # polite crawl pacing
        await browser.close()  # NOTE(review): unreachable after `while True` — confirm intended
# Script entry point: start the R1 recovery scout loop.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,214 @@
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R2-AUTOS-ONLY] %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("R2")
async def get_page_safe(page, url):
    """Navigate to *url* with human-like pacing and a Cloudflare block check.

    Logs and re-raises on load failure or when a bot-protection interstitial
    is detected, so the caller can mark the task as errored.
    """
    # Randomised pre-navigation pause to mimic human browsing cadence
    # (helps against Cloudflare-style anti-bot heuristics).
    await asyncio.sleep(random.uniform(4, 7))
    try:
        # domcontentloaded is faster than networkidle yet sufficient for link scraping.
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        # Detect bot-protection interstitials by their tell-tale titles.
        page_title = await page.title()
        blocked = "Just a moment" in page_title or "Cloudflare" in page_title
        if blocked:
            raise Exception(f"Bot védelem észlelve az URL-en: {url}")
        return page
    except Exception as e:
        logger.error(f"Hiba az oldal betöltésekor: {url} -> {e}")
        raise
async def extract_scoped_links(page, p_id, current_url):
    """
    Rationale: the 'Scope-Lock' technique — a brand-name anchor derived from
    the current URL prevents the robot from leaving the current car family.
    Includes a built-in language filter and a 'Language Shield' against
    unwanted (Greek, Spanish, Bulgarian, ...) page variants. Every new link
    found is stored with the 'car' category.
    """
    # Derive the brand/type anchor from the URL (e.g. 'alfa-romeo').
    url_parts = current_url.split('/')[-1].split('-')
    brand_anchor = "-".join(url_parts[:2])
    # Collect only links that represent real navigation.
    hrefs = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    found_count = 0
    async with AsyncSessionLocal() as db:
        for link in hrefs:
            url = link['url']
            name = link['name'].replace('\n', ' ').strip()
            # --- 1. BASIC VALIDITY ---
            if not name or len(name) < 2:
                continue
            # --- 2. LANGUAGE SHIELD ---
            # Character-set check: drop names containing Greek, Cyrillic or
            # any other non-ASCII characters.
            if re.search(r'[^\x00-\x7F]+', name):
                continue
            # Strict English-only enforcement in the URL path.
            if '/en/' not in url:
                continue
            # Text-based noise filter (excludes meta links such as
            # privacy/cookie/settings pages in several languages).
            junk_keywords = [
                'privacy', 'configuracion', 'ρυθμίσεις', 'cookie', 'settings',
                'contact', 'about us', 'terms', 'advertising', 'login', 'registration',
                'pribatutasun', 'configuració', 'naslovnica', 'stisni',
                'personvern', 'prywatnosci', 'ustawienia', 'endre', 'zmień'
            ]
            if any(junk in name.lower() for junk in junk_keywords):
                continue
            # --- 3. ORIGINAL LANGUAGE LOCK ---
            # Keeps the original logic: reject domain.com/bg/..., domain.com/se/...
            path_segments = url.split('/')
            if len(path_segments) > 3:
                lang_segment = path_segments[3]
                if len(lang_segment) == 2 and lang_segment != 'en':
                    continue
            # --- 4. SCOPE FILTER ---
            # Only allow links belonging to the current brand.
            if brand_anchor not in url:
                continue
            # --- 5. NAVIGATION FILTER ---
            # Never step back to listing pages; exclude foreign-language dirs (full list).
            excluded_patterns = [
                '-brand-', 'allbrands', 'en/brands',
                '/bg/', '/ru/', '/de/', '/it/', '/fr/', '/es/',
                '/tr/', '/ro/', '/fi/', '/se/', '/no/', '/pl/', '/gr/',
                '/hr/', '/cz/', '/sk/', '/ua/'
            ]
            if any(x in url for x in excluded_patterns):
                continue
            # --- 6. SELF-REFERENCE FILTER ---
            if url.strip('/') == current_url.strip('/'):
                continue
            # --- 7. LEVEL DETECTION BY URL PATTERN ---
            if '-generation-' in url:
                target_level = 'generation'
            elif re.search(r'-\d+$', url) and '-model-' not in url:
                target_level = 'engine'
            else:
                continue
            # --- 8. PERSIST TO THE DATABASE ---
            # NOTE(review): found_count counts attempted inserts, not rows
            # actually added (ON CONFLICT DO NOTHING may skip) — the caller
            # uses it for completed vs completed_leaf; confirm intended.
            await db.execute(text("""
                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                VALUES (:url, :level, :p_id, :name, 'pending', 'car')
                ON CONFLICT (url) DO NOTHING
            """), {"url": url, "level": target_level, "p_id": p_id, "name": name})
            found_count += 1
        await db.commit()
    return found_count
async def process_target(context, t_id, t_url, t_name, t_level):
    """Fully process one queued URL: load it, harvest in-scope links, persist status.

    The queue row ends as 'completed' (links found), 'completed_leaf' (none
    found) or 'error' (load/parse failure); the browser page is always closed.
    """
    work_page = await context.new_page()
    try:
        logger.info(f"🚀 Autós felderítés indítása [{t_level}]: {t_name}")
        await get_page_safe(work_page, t_url)
        # Harvest and enqueue the in-scope links.
        link_total = await extract_scoped_links(work_page, t_id, t_url)
        if link_total > 0:
            final_status = 'completed'
        else:
            final_status = 'completed_leaf'
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = :s, error_msg = NULL, updated_at = NOW()
                WHERE id = :id
            """), {"s": final_status, "id": t_id})
            await db.commit()
        logger.info(f"✅ Befejezve: {t_name} -> {link_total} új link.")
    except Exception as e:
        logger.error(f"❌ Kritikus hiba feldolgozás közben ({t_name}): {e}")
        # Record the failure so the row can be retried or inspected later.
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'error', error_msg = :msg, updated_at = NOW()
                WHERE id = :id
            """), {"msg": str(e), "id": t_id})
            await db.commit()
    finally:
        await work_page.close()
async def main():
    """
    Main control loop for the car-depth crawler.

    STRATEGY: only 'car'-category tasks are claimed (category='car') at the
    'model' or 'generation' level; each claim flips the row to 'processing'
    atomically via FOR UPDATE SKIP LOCKED so multiple workers can coexist.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R2 Autós Felderítő Robot aktív. (Filter: category='car')")
        while True:
            async with AsyncSessionLocal() as db:
                # Claim exactly one pending 'car' task (brand depth first: level ASC).
                res = await db.execute(text("""
                    UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.auto_data_crawler_queue
                        WHERE status = 'pending'
                        AND level IN ('model', 'generation')
                        AND category = 'car'
                        ORDER BY level ASC, id ASC
                        LIMIT 1 FOR UPDATE SKIP LOCKED
                    ) RETURNING id, url, name, level
                """))
                target = res.fetchone()
                await db.commit()
            if not target:
                logger.info("🏁 Nincs több autós feladat (car). Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            await process_target(context, target[0], target[1], target[2], target[3])
        # NOTE(review): unreachable — the loop above never breaks.
        await browser.close()
if __name__ == "__main__":
    # CLI entry point: run the crawler loop until the user interrupts it.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás (Ctrl+C).")

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
import asyncio
import logging
import random
import json
import re
import sys
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR-v1.2] %(message)s')
logger = logging.getLogger("R3")
# --- TUNING PARAMETERS ---
# Maximum scrape attempts per vehicle before a task is flagged for manual review.
MAX_RETRY_LIMIT = 3
class R3DataMiner:
    """Harvest engine-level spec sheets from auto-data.net detail pages.

    Claims 'engine' tasks from vehicle.auto_data_crawler_queue, scrapes the
    spec table with Playwright + BeautifulSoup and upserts the result into
    vehicle.external_reference_library. Failed pages are retried up to
    MAX_RETRY_LIMIT times, then flagged 'manual_review_needed'.
    """

    def clean_key(self, key):
        """Normalize a raw spec-table header into a clean dictionary key.

        Drops SEO question phrasing ("What is the ...?"), keeps only the last
        comma-separated fragment and capitalizes the result.
        """
        if "," in key: key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        return key.split("?")[0].strip().capitalize()

    async def scrape_specs(self, context, url):
        """Scrape one detail page; return the parsed dict or None on failure.

        None is returned when the page yields no make or no specifications,
        which the caller treats as a retryable failure.
        """
        page = await context.new_page()
        try:
            # Random delay to avoid triggering bot protection.
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            data = {"make": "", "model": "", "generation": "", "modification": "",
                    "year_from": None, "power_kw": 0, "engine_cc": 0,
                    "specifications": {}, "source_url": url}
            # Original parsing logic: one <th>/<td> pair per spec row.
            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td: continue
                k_raw, v = th.get_text(strip=True), td.get_text(strip=True)
                k_low = k_raw.lower()
                if "brand" == k_low: data["make"] = v
                elif "model" == k_low: data["model"] = v
                elif "generation" == k_low: data["generation"] = v
                elif "modification" == k_low: data["modification"] = v
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', v)
                    data["year_from"] = int(m.group(1)) if m else None
                elif "power" == k_low:
                    hp = re.search(r'(\d+)\s*Hp', v, re.I)
                    # hp -> kW via the 1.36 hp/kW factor, truncated to int.
                    if hp: data["power_kw"] = int(int(hp.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc = re.search(r'(\d+)\s*cm3', v)
                    if cc: data["engine_cc"] = int(cc.group(1))
                data["specifications"][self.clean_key(k_raw)] = v
            if not data["make"] or not data["specifications"]:
                return None
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()

    async def run(self):
        """Main worker loop: claim a task, scrape it, persist the outcome.

        Exits when no claimable task remains. FOR UPDATE SKIP LOCKED makes
        parallel workers safe. FIX: the retry ceiling is now driven by the
        module constant MAX_RETRY_LIMIT instead of a hard-coded 3, so the
        configuration and the queries cannot drift apart.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
            while True:
                target = None
                async with AsyncSessionLocal() as db:
                    try:
                        # NOTE: priority_score was removed earlier — that column
                        # does not exist on the crawler_queue table.
                        res = await db.execute(text("""
                            UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                            WHERE id = (
                                SELECT id FROM vehicle.auto_data_crawler_queue
                                WHERE level = 'engine'
                                AND status IN ('pending', 'error')
                                AND retry_count < :limit
                                ORDER BY id ASC
                                LIMIT 1 FOR UPDATE SKIP LOCKED
                            ) RETURNING id, url, name, retry_count
                        """), {"limit": MAX_RETRY_LIMIT})
                        target = res.fetchone()
                        await db.commit()
                    except Exception as e:
                        logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                        await asyncio.sleep(5)
                        continue
                if not target:
                    logger.info("🏁 Minden feladat elvégezve. Leállás.")
                    break
                t_id, t_url, t_name, t_retry = target
                if t_retry is None: t_retry = 0
                logger.info(f"🚀 [{t_retry + 1}/{MAX_RETRY_LIMIT}] Dolgozom: {t_name}")
                data = await self.scrape_specs(context, t_url)
                async with AsyncSessionLocal() as db:
                    if data and data["make"]:
                        # Upsert keyed on source_url so re-scrapes refresh specs.
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET
                                specifications = EXCLUDED.specifications,
                                last_scraped_at = NOW();
                        """), {
                            "make": data["make"], "model": data["model"], "gen": data["generation"],
                            "mod": data["modification"], "y": data["year_from"], "p": data["power_kw"],
                            "e": data["engine_cc"], "s": json.dumps(data["specifications"]), "u": data["source_url"]
                        })
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                    else:
                        new_retry = t_retry + 1
                        if new_retry >= MAX_RETRY_LIMIT:
                            # Retry budget exhausted -> hand the row to a human.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 3 próbálkozás után',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manual_review_needed")
                        else:
                            # Still within budget: mark 'error' so it is re-picked.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.warning(f"⚠️ Sikertelen próbálkozás ({new_retry}/{MAX_RETRY_LIMIT}): {t_name}")
                    await db.commit()
            await browser.close()
if __name__ == "__main__":
    # CLI entry point: run the miner until the queue is exhausted.
    miner = R3DataMiner()
    try:
        asyncio.run(miner.run())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
import asyncio
import logging
import random
import json
import re
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-EXTRACTOR] %(message)s')
logger = logging.getLogger("R4")
class FinalExtractor:
    """Engine-spec extractor: scrapes detail pages and stores them in the
    external reference library.

    NOTE(review): near-duplicate of R3DataMiner elsewhere in this dump, but
    without retry tracking — consider consolidating the two workers.
    """
    def __init__(self):
        # Concurrency cap. NOTE(review): run() processes tasks sequentially,
        # so this semaphore currently never contends — confirm intent.
        self.semaphore = asyncio.Semaphore(2)
    def clean_key(self, key):
        """Normalize a spec-table header: strip SEO question phrasing, keep
        the last comma fragment, capitalize."""
        if "," in key: key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        key = key.split("?")[0].strip()
        return key.capitalize()
    async def scrape_engine(self, context, url):
        """Scrape one engine page into a flat dict; None on any failure."""
        page = await context.new_page()
        try:
            await asyncio.sleep(random.uniform(3, 6))  # anti-bot delay
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            data = {
                "make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "year_to": None, "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url
            }
            # One <th>/<td> pair per spec row; well-known keys are promoted
            # into dedicated columns, everything else lands in specifications.
            rows = soup.find_all('tr')
            for row in rows:
                th, td = row.find('th'), row.find('td')
                if not th or not td: continue
                raw_k, val = th.get_text(strip=True), td.get_text(strip=True)
                k_low = raw_k.lower()
                if "brand" == k_low: data["make"] = val
                elif "model" == k_low: data["model"] = val
                elif "generation" == k_low: data["generation"] = val
                elif "modification" == k_low: data["modification"] = val
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', val)
                    if m: data["year_from"] = int(m.group(1))
                elif "end of production" in k_low:
                    m = re.search(r'(\d{4})', val)
                    if m: data["year_to"] = int(m.group(1))
                elif "power" == k_low:
                    hp_m = re.search(r'(\d+)\s*Hp', val, re.I)
                    # hp -> kW via the 1.36 hp/kW factor, truncated to int.
                    if hp_m: data["power_kw"] = int(int(hp_m.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc_m = re.search(r'(\d+)\s*cm3', val)
                    if cc_m: data["engine_cc"] = int(cc_m.group(1))
                clean_k = self.clean_key(raw_k)
                if clean_k and val: data["specifications"][clean_k] = val
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()
    async def save_to_library(self, data):
        """Upsert a scraped record into external_reference_library.

        DB errors are logged and swallowed (best-effort persistence).
        """
        if not data or not data["make"]: return
        async with AsyncSessionLocal() as db:
            try:
                await db.execute(text("""
                    INSERT INTO vehicle.external_reference_library
                    (source_name, make, model, generation, modification, year_from, year_to, power_kw, engine_cc, specifications, source_url)
                    VALUES ('auto-data.net', :make, :model, :gen, :mod, :y_f, :y_t, :p_kw, :e_cc, :specs, :url)
                    ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                """), {
                    "make": data["make"], "model": data["model"], "gen": data["generation"],
                    "mod": data["modification"], "y_f": data["year_from"], "y_t": data["year_to"],
                    "p_kw": data["power_kw"], "e_cc": data["engine_cc"],
                    "specs": json.dumps(data["specifications"]), "url": data["source_url"]
                })
                await db.commit()
                logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} ({data['power_kw']} kW)")
            except Exception as e:
                logger.error(f"DB Hiba: {e}")
    async def run(self):
        """Worker loop: claim one pending 'engine' task at a time, scrape it,
        save on success and mark the queue row completed/error."""
        logger.info("🚀 R4 Adatbányász indítása...")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            # NOTE(review): truncated user-agent string ("Mozilla/5.0...") —
            # presumably a placeholder; confirm it passes the target site.
            context = await browser.new_context(user_agent="Mozilla/5.0...")
            while True:
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE level = 'engine' AND status = 'pending'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()
                if not target:
                    logger.info("🏁 Nincs több feldolgozandó motoradat. Alvás 60mp...")
                    await asyncio.sleep(60)
                    continue
                t_id, t_url, t_name = target
                async with self.semaphore:
                    data = await self.scrape_engine(context, t_url)
                    if data:
                        await self.save_to_library(data)
                        new_status = 'completed'
                    else:
                        new_status = 'error'
                async with AsyncSessionLocal() as db:
                    await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = :s WHERE id = :id"),
                                     {"s": new_status, "id": t_id})
                    await db.commit()
if __name__ == "__main__":
asyncio.run(FinalExtractor().run())

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
import asyncio, logging
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
logging.basicConfig(level=logging.INFO, format='%(asctime)s [BIKE-R0] %(message)s')
logger = logging.getLogger("R0")
# Seed sources for motorcycle brand discovery; each entry is one listing page
# that run_r0() scans for brand links.
SOURCES = [
    {
        "name": "AutoEvolution",
        "url": "https://www.autoevolution.com/moto/",
        # Broad selector covering several brand-tile layout variants.
        "selector": ".brand a, .all-brands a, .moto-brand a",
        "category": "bike"
    }
]
async def run_r0():
    """Discover motorcycle brands and seed the crawler queue.

    Visits every entry in SOURCES, harvests all /moto/ links, keeps only the
    ones shaped like brand roots, and inserts them as 'brand'-level,
    'bike'-category queue rows (duplicates ignored via ON CONFLICT DO NOTHING).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0")
        async with AsyncSessionLocal() as db:
            for src in SOURCES:
                page = await context.new_page()
                try:
                    logger.info(f"Márkák kinyerése: {src['name']}...")
                    await page.goto(src['url'], wait_until="networkidle", timeout=60000)
                    # If the dedicated selector matched nothing we still get
                    # every /moto/ link via this broad anchor query.
                    links = await page.eval_on_selector_all("a[href*='/moto/']",
                        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))")
                    # Keep only clean brand roots (e.g. .../moto/aprilia): brand
                    # URLs carry exactly 5 slashes (https:// + domain + moto + brand)
                    # and never end in .html.
                    brand_links = [l for l in links if l['url'].count('/') == 5 and not l['url'].endswith('.html')]
                    count = 0
                    for link in brand_links:
                        if len(link['name']) < 2: continue
                        await db.execute(text("""
                            INSERT INTO vehicle.auto_data_crawler_queue (url, level, name, status, category)
                            VALUES (:url, 'brand', :name, 'pending', 'bike')
                            ON CONFLICT (url) DO NOTHING
                        """), {"url": link['url'], "name": link['name']})
                        count += 1
                    await db.commit()
                    # NOTE(review): `count` counts attempted inserts including
                    # conflict-skipped rows, so the log may overstate new brands.
                    logger.info(f"✅ [{src['name']}] kész: {count} márkát találtam.")
                except Exception as e:
                    logger.error(f"❌ Hiba: {e}")
                finally:
                    await page.close()
        await browser.close()
if __name__ == "__main__":
    # CLI entry point: one-shot brand discovery run.
    asyncio.run(run_r0())

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
# Megtartjuk a részletes naplózást minden eseményhez
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [BIKE-R1-AUTOEVO] %(message)s'
)
logger = logging.getLogger("R1")
async def analyze_and_extract_links(page, current_url):
    """Collect and classify AutoEvolution /moto/ links found on *page*.

    Returns a list of {'name', 'url', 'level'} dicts. Names must be latin-only
    and free of navigation noise; URLs are stripped of fragments/queries.
    '.html' pages are classified as 'engine', deeper directory paths as
    'model'; everything else (including the current page and the /moto root)
    is dropped.
    """
    noise_terms = (
        'privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising',
        'about us', 'copyright', 'login', 'registration'
    )
    base_url = current_url.rstrip('/')

    def _classify(url):
        # Map a cleaned URL to its queue level, or None when it is ignored.
        if "autoevolution.com/moto/" not in url:
            return None
        # Skip self-references and backwards navigation to the /moto root.
        if url == base_url or url.endswith('/moto'):
            return None
        if url.endswith(".html"):
            return 'engine'
        # At least 6 path segments without .html => sub-model / generation.
        if len(url.strip('/').split('/')) >= 6:
            return 'model'
        return None

    raw_links = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    results = []
    for entry in raw_links:
        label = entry['name']
        # Drop the fragment/query but keep the link itself for inspection.
        tidy_url = entry['url'].split('#')[0].split('?')[0].rstrip('/')
        if not label or len(label) < 2:
            continue
        if re.search(r'[^\x00-\x7F]+', label):  # latin-characters-only shield
            continue
        if any(term in label.lower() for term in noise_terms):
            continue
        level = _classify(tidy_url)
        if level is not None:
            results.append({'name': label, 'url': tidy_url, 'level': level})
    return results
async def get_next_task(db):
    """
    Claim the next pending AutoEvolution bike task with brand priority.

    Atomically flips the chosen row to 'processing' and returns
    (id, url, name, level) or None. 'brand'-level rows are preferred over
    'model' rows; FOR UPDATE SKIP LOCKED keeps parallel workers safe.
    """
    query = text("""
        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
        WHERE id = (
            SELECT id FROM vehicle.auto_data_crawler_queue
            WHERE status = 'pending'
            AND category = 'bike'
            AND url LIKE '%autoevolution.com%'
            AND level IN ('brand', 'model')
            ORDER BY
                CASE WHEN level = 'brand' THEN 0 ELSE 1 END ASC,
                id ASC
            LIMIT 1 FOR UPDATE SKIP LOCKED
        ) RETURNING id, url, name, level
    """)
    res = await db.execute(query)
    return res.fetchone()
async def main():
    """
    Main control loop with full error handling and transaction safety.

    Claims a task via get_next_task, loads the page, enqueues every
    discovered link, then marks the task 'completed' — or 'error' with the
    exception message when anything in the navigation/save path fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        logger.info("🤖 R1 AutoEvolution Specialist elindult...")
        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    target = await get_next_task(db)
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ Adatbázis hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue
            if not target:
                logger.info("🏁 Nincs több AutoEvolution feladat. Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            t_id, t_url, t_name, t_level = target
            page = await context.new_page()
            try:
                logger.info(f"🚀 Felderítés ({t_level}): {t_name} -> {t_url}")
                # domcontentloaded is faster; we then wait briefly for JS to settle.
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(random.uniform(2, 3))
                links = await analyze_and_extract_links(page, t_url)
                async with AsyncSessionLocal() as db:
                    try:
                        new_links_count = 0
                        for link in links:
                            # Every discovered variation goes onto the queue.
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
                            new_links_count += 1
                        # Close out the parent task.
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ {t_name} kész. Talált AutoEvolution linkek: {new_links_count}")
                    except Exception as inner_db_error:
                        # Roll back the partial batch, then escalate to the
                        # outer handler which marks the task as 'error'.
                        await db.rollback()
                        logger.error(f"❌ Belső mentési hiba: {inner_db_error}")
                        raise inner_db_error
            except Exception as e:
                logger.error(f"❌ Kritikus hiba a navigáció során: {t_name} -> {e}")
                async with AsyncSessionLocal() as db:
                    await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error', error_msg = :msg, updated_at = NOW() WHERE id = :id"),
                                     {"msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                # Pace requests to avoid getting banned by the server.
                await asyncio.sleep(random.uniform(3, 5))
        # NOTE(review): unreachable — the loop above never breaks.
        await browser.close()
if __name__ == "__main__":
    # CLI entry point: run the discovery loop until interrupted.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
import asyncio
import logging
import random
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- NAPLÓZÁS ---
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [R2-BIKE-DEPTH] %(message)s',
handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("R2")
async def get_page_safe(page, url):
    """Navigate *page* to *url* with human-like pacing and a Cloudflare check.

    Sleeps a random 4-7 s before navigating; raises when bot protection is
    detected or the page fails to load. Returns the same *page* on success.
    """
    # Randomised pause so request timing does not look machine-generated.
    await asyncio.sleep(random.uniform(4, 7))
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        page_title = await page.title()
        cf_markers = ("Just a moment", "Cloudflare")
        if any(marker in page_title for marker in cf_markers):
            logger.error(f"Bot védelem észlelve: {url}")
            raise Exception("Bot védelem (CF) megállította a robotot.")
    except Exception as e:
        logger.error(f"Hiba az oldal betöltésekor: {url} -> {e}")
        raise
    return page
async def extract_scoped_links(page, p_id, current_url):
    """
    DEPTH DISCOVERY: extract Generation -> Engine variations from *page*.

    Scope-lock: only links belonging to the brand of *current_url* are
    followed. Returns the number of rows enqueued (0 when the URL is too
    shallow to carry a brand segment).
    """
    # Brand slug for the scope-lock: segment 4 of e.g.
    # https://www.autoevolution.com/moto/<brand>/... (['https:', '', domain, 'moto', brand]).
    path_segments = current_url.strip('/').split('/')
    if len(path_segments) < 5:
        return 0
    brand_anchor = path_segments[4]
    hrefs = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )
    junk = ['privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising', 'login', 'about', 'copyright']
    found_count = 0
    async with AsyncSessionLocal() as db:
        for link in hrefs:
            # CLEANUP: drop the #fragment and ?query so we see the real page.
            clean_url = link['url'].split('#')[0].split('?')[0].rstrip('/')
            name = link['name'].replace('\n', ' ').strip()
            # Basic filters: non-empty latin-only names, no navigation noise.
            if not name or len(name) < 2: continue
            if re.search(r'[^\x00-\x7F]+', name): continue
            if any(k in name.lower() for k in junk): continue
            # SCOPE LOCK: only links within the current brand pass.
            # NOTE(review): assumes the brand slug is lowercase in the queue
            # URL (clean_url is lowered, brand_anchor is not) — confirm.
            if brand_anchor not in clean_url.lower():
                continue
            # Navigation filter. NOTE(review): '/moto/' matches EVERY harvested
            # URL (the selector requires it), so this effectively drops any URL
            # with fewer than 5 slashes regardless of the other patterns.
            if any(x in clean_url for x in ['-brand-', 'allbrands', 'en/brands', '/moto/']):
                if clean_url.count('/') < 5: continue
            # Skip self-references.
            if clean_url == current_url.rstrip('/'):
                continue
            # Level classification: .html => engine spec page; deep paths
            # (>= 6 slashes) => generation; everything else ignored.
            if clean_url.endswith(".html"):
                target_level = 'engine'
            elif clean_url.count('/') >= 6:
                target_level = 'generation'
            else:
                continue
            # Enqueue; duplicates are ignored via ON CONFLICT DO NOTHING.
            await db.execute(text("""
                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                ON CONFLICT (url) DO NOTHING
            """), {"url": clean_url, "level": target_level, "p_id": p_id, "name": name})
            found_count += 1
        await db.commit()
    return found_count
async def process_target(context, t_id, t_url, t_name, t_level):
    """
    Fully process one queued bike task: load the page, harvest scoped links,
    and record the outcome ('completed' / 'completed_leaf' / 'error').

    NOTE(review): structurally identical to the car worker's process_target —
    candidate for a shared helper.
    """
    page = await context.new_page()
    try:
        logger.info(f"🚀 Mélységi fúrás [{t_level}]: {t_name}")
        await get_page_safe(page, t_url)
        # Extract generations / engine variations and enqueue them.
        found = await extract_scoped_links(page, t_id, t_url)
        async with AsyncSessionLocal() as db:
            # Leaf pages (no new links) get their own terminal status.
            new_status = 'completed' if found > 0 else 'completed_leaf'
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = :s, error_msg = NULL, updated_at = NOW()
                WHERE id = :id
            """), {"s": new_status, "id": t_id})
            await db.commit()
        logger.info(f"✅ Befejezve: {t_name} -> {found} új variáció rögzítve.")
    except Exception as e:
        logger.error(f"❌ Kritikus hiba feldolgozás közben ({t_name}): {e}")
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'error', error_msg = :msg, updated_at = NOW()
                WHERE id = :id
            """), {"msg": str(e), "id": t_id})
            await db.commit()
    finally:
        await page.close()
async def main():
    """
    Main loop with depth-first strategy (level ASC) for bike tasks.

    Claims one pending 'model'/'generation' AutoEvolution task at a time via
    FOR UPDATE SKIP LOCKED and hands it to process_target.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R2 Motoros Mélységi Felderítő aktív.")
        while True:
            async with AsyncSessionLocal() as db:
                res = await db.execute(text("""
                    UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.auto_data_crawler_queue
                        WHERE status = 'pending'
                        AND level IN ('model', 'generation')
                        AND category = 'bike'
                        AND url LIKE '%autoevolution.com%'
                        ORDER BY level ASC, id ASC
                        LIMIT 1 FOR UPDATE SKIP LOCKED
                    ) RETURNING id, url, name, level
                """))
                target = res.fetchone()
                await db.commit()
            if not target:
                logger.info("🏁 Minden variáció felderítve. Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            await process_target(context, target[0], target[1], target[2], target[3])
        # NOTE(review): unreachable — the loop above never breaks.
        await browser.close()
if __name__ == "__main__":
    # CLI entry point: run the depth crawler until interrupted.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
import asyncio
import logging
import random
import json
import re
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR] %(message)s')
logger = logging.getLogger("R3")
class R3DataMiner:
    """Engine-spec miner for bike pages.

    NOTE(review): near-duplicate of the car R3DataMiner elsewhere in this
    dump, but WITHOUT retry tracking — a failing page goes straight to
    status 'error' and is never re-picked. Consider consolidating.
    """
    def clean_key(self, key):
        """Normalize a spec-table header: strip SEO question phrasing, keep
        the last comma fragment, capitalize."""
        if "," in key: key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        return key.split("?")[0].strip().capitalize()
    async def scrape_specs(self, context, url):
        """Scrape one detail page into a flat dict; None on any failure.

        Unlike the car variant, this returns the dict even when make/
        specifications stayed empty — the caller checks data["make"].
        """
        page = await context.new_page()
        try:
            # Random delay against bot protection.
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            data = {"make": "", "model": "", "generation": "", "modification": "",
                    "year_from": None, "power_kw": 0, "engine_cc": 0,
                    "specifications": {}, "source_url": url}
            # One <th>/<td> pair per spec row; known keys promoted to columns.
            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td: continue
                k_raw, v = th.get_text(strip=True), td.get_text(strip=True)
                k_low = k_raw.lower()
                if "brand" == k_low: data["make"] = v
                elif "model" == k_low: data["model"] = v
                elif "generation" == k_low: data["generation"] = v
                elif "modification" == k_low: data["modification"] = v
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', v)
                    data["year_from"] = int(m.group(1)) if m else None
                elif "power" == k_low:
                    hp = re.search(r'(\d+)\s*Hp', v, re.I)
                    # hp -> kW via the 1.36 hp/kW factor, truncated to int.
                    if hp: data["power_kw"] = int(int(hp.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc = re.search(r'(\d+)\s*cm3', v)
                    if cc: data["engine_cc"] = int(cc.group(1))
                data["specifications"][self.clean_key(k_raw)] = v
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon: {e}"); return None
        finally: await page.close()
    async def run(self):
        """Worker loop: drain pending 'engine' tasks, then exit (no sleep/retry)."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            # NOTE(review): truncated user-agent placeholder ("Mozilla/5.0...").
            context = await browser.new_context(user_agent="Mozilla/5.0...")
            while True:
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE level = 'engine' AND status = 'pending'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED)
                        RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()
                if not target: break
                data = await self.scrape_specs(context, target[1])
                if data and data["make"]:
                    async with AsyncSessionLocal() as db:
                        # Upsert keyed on source_url so re-scrapes refresh specs.
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                        """), {"make": data["make"], "model": data["model"], "gen": data["generation"], "mod": data["modification"],
                               "y": data["year_from"], "p": data["power_kw"], "e": data["engine_cc"], "s": json.dumps(data["specifications"]), "u": data["source_url"]})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                else:
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
            await browser.close()
# CLI entry point: run the miner loop (exits when the queue drains).
if __name__ == "__main__": asyncio.run(R3DataMiner().run())

# ---- diff-dump artifact removed ("View File" / hunk header): a new worker file begins below ----
#!/usr/bin/env python3
import asyncio
import logging
import random
import json
import sys
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- LOGGING CONFIGURATION ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-HARVESTER-v1.2] %(message)s')
logger = logging.getLogger("R4")
# --- TUNING PARAMETERS ---
# Retry ceiling per vehicle. NOTE(review): main() below hard-codes 5 in its
# SQL and comparisons instead of referencing this constant — keep in sync.
MAX_RETRY_LIMIT = 5
async def parse_specs(page):
    """Extract the spec table from a loaded vehicle page as a flat dict.

    Runs a JS probe handling the four known page layouts (legacy
    td.left/td.right tables, dt/dd lists, .spec-row label/value pairs and a
    bold-text fallback for veteran pages). The harvest is then narrowed to a
    whitelist of known spec fields; if nothing matches, the raw harvest is
    returned instead so a page never comes back artificially empty.

    Returns the spec dict, or None when the page had no parseable data or
    the JS evaluation failed.
    """
    script = """
    () => {
        let results = {};
        // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
        let leftCells = document.querySelectorAll('td.left');
        leftCells.forEach(cell => {
            let key = cell.innerText.replace(/:$/, '').trim();
            let rightCell = cell.nextElementSibling;
            if(rightCell && rightCell.classList.contains('right')) {
                results[key] = rightCell.innerText.trim();
            }
        });
        // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
        let dts = document.querySelectorAll('dt');
        dts.forEach(dt => {
            let key = dt.innerText.replace(/:$/, '').trim();
            let dd = dt.nextElementSibling;
            if(dd && dd.tagName.toLowerCase() === 'dd') {
                results[key] = dd.innerText.trim();
            }
        });
        // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
        let specRows = document.querySelectorAll('.spec-row');
        specRows.forEach(row => {
            let label = row.querySelector('.label');
            let value = row.querySelector('.value');
            if(label && value) {
                let key = label.innerText.replace(/:$/, '').trim();
                if (!results[key]) {
                    results[key] = value.innerText.trim();
                }
            }
        });
        // 4. MÓDSZER: Veterán ("Adler") fallback -> Vastagított szöveg
        if (Object.keys(results).length === 0) {
            document.querySelectorAll('b, strong').forEach(b => {
                let key = b.innerText.replace(/:$/, '').trim();
                if(key.length > 2 && key.length < 30) {
                    let val = "";
                    if(b.nextSibling && b.nextSibling.nodeType === 3) {
                        val = b.nextSibling.textContent.trim();
                    }
                    else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                        val = b.nextElementSibling.innerText.trim();
                    }
                    if(val && !results[key]) {
                        results[key] = val;
                    }
                }
            });
        }
        return results;
    }
    """
    try:
        data = await page.evaluate(script)
        if data and len(data) > 0:
            relevant_keys = [
                "Production", "Year", "Segment",
                "Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                "Wet Weight", "Front", "Rear"
            ]
            # Hoisted: lowercase the whitelist once instead of re-lowering every
            # entry for every harvested key (was O(keys * whitelist) .lower() calls).
            lowered_keys = [rk.lower() for rk in relevant_keys]
            # Substring match keeps variants like "Dry Weight (kg)".
            filtered_data = {k: v for k, v in data.items() if any(rk in k.lower() for rk in lowered_keys)}
            return filtered_data if len(filtered_data) > 0 else data
        return None
    except Exception as e:
        logger.error(f"❌ Parszolási hiba a JS kiértékeléskor: {e}")
        return None
async def main():
    """
    R4 harvester main loop: claim a bike 'engine' task, parse its spec page
    and persist the result; failed/empty pages are retried up to 5 times,
    then flagged 'manual_review_needed'.

    NOTE(review): the limit 5 is hard-coded here although MAX_RETRY_LIMIT = 5
    is declared above — keep the two in sync.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R4 Motor Adat-Arató v1.2 elindult.")
        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    # FIX: 'completed_empty' was removed from the claimable
                    # statuses — only 'pending' and 'error' with retry_count < 5.
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE status IN ('pending', 'error')
                            AND retry_count < 5
                            AND level = 'engine' AND category = 'bike'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name, retry_count
                    """))
                    target = res.fetchone()
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue
            if not target:
                logger.info("🏁 Minden motor feldolgozva vagy manuális felülvizsgálatra vár. Alvás 60mp...")
                await asyncio.sleep(60)
                continue
            t_id, t_url, t_name, t_retry_count = target
            # retry_count may be NULL for legacy rows.
            if t_retry_count is None: t_retry_count = 0
            page = await context.new_page()
            try:
                logger.info(f"📊 [{t_retry_count + 1}/5] Adatbányászat: {t_name}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)
                data = await parse_specs(page)
                async with AsyncSessionLocal() as db:
                    if data and len(data) > 0:
                        # Successful save: raw specs upserted, task completed.
                        await db.execute(text("""
                            INSERT INTO vehicle.motorcycle_specs (crawler_id, full_name, raw_data, url)
                            VALUES (:cid, :name, :data, :url)
                            ON CONFLICT (crawler_id) DO UPDATE SET raw_data = :data, updated_at = NOW()
                        """), {"cid": t_id, "name": t_name, "data": json.dumps(data), "url": t_url})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ Mentve: {t_name} ({len(data)} paraméter)")
                    else:
                        # Empty page or missing data.
                        new_retry_count = t_retry_count + 1
                        if new_retry_count >= 5:
                            # Limit reached -> flag for manual review.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 5 próbálkozás után (üres oldal)',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manuális javításra jelölve.")
                        else:
                            # Still within budget -> back to 'error' for a re-pick.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.warning(f"⚠️ Üres maradt: {t_name} (Próbálkozás: {new_retry_count}/5)")
                        await db.commit()
            except Exception as e:
                logger.error(f"❌ Hiba a feldolgozás során: {t_name} -> {e}")
                async with AsyncSessionLocal() as db:
                    new_retry_count = t_retry_count + 1
                    status = 'error' if new_retry_count < 5 else 'manual_review_needed'
                    await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = :st,
                            retry_count = :rc,
                            error_msg = :msg,
                            updated_at = NOW()
                        WHERE id = :id
                    """), {"st": status, "rc": new_retry_count, "msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                await asyncio.sleep(random.uniform(2.0, 4.0))
        # NOTE(review): unreachable — the loop above never breaks.
        await browser.close()
if __name__ == "__main__":
    # Run the harvester until the user interrupts with Ctrl-C.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")

View File

@@ -0,0 +1,113 @@
import asyncio
import json
from playwright.async_api import async_playwright
async def test_scraper():
    """Manual smoke test for the AutoEvolution spec parser.

    Visits two structurally different pages — a modern Aprilia layout and a
    legacy BMW layout with broken HTML — runs the multi-strategy in-page
    parser, filters the result to known technical keys and prints it.
    """
    # Two problem-focused URLs: the modern Aprilia and the legacy BMW markup.
    test_urls = [
        "https://www.autoevolution.com/moto/aprilia-rs-660-factory-2025.html",
        "https://www.autoevolution.com/moto/bmw-f-650-gs-2011.html"
    ]
    # In-page parser; tries four strategies in order (td pairs, dt/dd,
    # .spec-row spans, bold-tag fallback for article-style pages).
    parser_js = """
    () => {
        let results = {};
        // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
        let leftCells = document.querySelectorAll('td.left');
        leftCells.forEach(cell => {
            let key = cell.innerText.replace(/:$/, '').trim();
            let rightCell = cell.nextElementSibling;
            if(rightCell && rightCell.classList.contains('right')) {
                results[key] = rightCell.innerText.trim();
            }
        });
        // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
        let dts = document.querySelectorAll('dt');
        dts.forEach(dt => {
            let key = dt.innerText.replace(/:$/, '').trim();
            let dd = dt.nextElementSibling;
            if(dd && dd.tagName.toLowerCase() === 'dd') {
                results[key] = dd.innerText.trim();
            }
        });
        // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
        let specRows = document.querySelectorAll('.spec-row');
        specRows.forEach(row => {
            let label = row.querySelector('.label');
            let value = row.querySelector('.value');
            if(label && value) {
                let key = label.innerText.replace(/:$/, '').trim();
                if (!results[key]) {
                    results[key] = value.innerText.trim();
                }
            }
        });
        // 4. MÓDSZER: "Adler" típusú elavult leírások fallbackje -> Vastagított szöveg
        if (Object.keys(results).length === 0) {
            document.querySelectorAll('b, strong').forEach(b => {
                let key = b.innerText.replace(/:$/, '').trim();
                if(key.length > 2 && key.length < 30) {
                    let val = "";
                    // Ha a szöveg közvetlenül a tag után van (Text Node)
                    if(b.nextSibling && b.nextSibling.nodeType === 3) {
                        val = b.nextSibling.textContent.trim();
                    }
                    // Ha egy másik elemben van
                    else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                        val = b.nextElementSibling.innerText.trim();
                    }
                    if(val && !results[key]) {
                        results[key] = val;
                    }
                }
            });
        }
        return results;
    }
    """
    # Technical keys worth keeping; everything else is page noise.
    spec_keys = ["Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                 "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                 "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                 "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                 "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                 "Wet Weight", "Front", "Rear"]
    lowered_keys = [k.lower() for k in spec_keys]
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        for url in test_urls:
            print(f"\n{'='*60}")
            print(f"🌍 MEGNYITÁS: {url}")
            print(f"{'='*60}")
            # Wait for the DOM, then give client-side JS a moment to run.
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            await asyncio.sleep(2)
            raw_specs = await page.evaluate(parser_js)
            if raw_specs:
                # Keep only keys that mention one of the known spec labels;
                # fall back to the unfiltered dict when nothing matches.
                filtered = {
                    key: value
                    for key, value in raw_specs.items()
                    if any(token in key.lower() for token in lowered_keys)
                }
                payload = filtered or raw_specs
                print("\n🟢 KINYERT ADATOK (DOM PARSZOLÓ):")
                print(json.dumps(payload, indent=2, ensure_ascii=False))
                print(f"\n✅ Összesen {len(payload)} műszaki paramétert találtam.")
            else:
                print("\n🔴 NULLA ADAT - A DOM parszoló nem talált egyezést.")
        await browser.close()
if __name__ == "__main__":
    # Manual smoke-test entry point; not part of the worker fleet.
    asyncio.run(test_scraper())

View File

@@ -0,0 +1,73 @@
{
"rdw": {
"field_map": {
"merk": "make",
"handelsbenaming": "marketing_name",
"inrichting": "body_type",
"massa_ledig_voertuig": "curb_weight",
"technische_max_massa_voertuig": "max_weight",
"cilinderinhoud": "engine_capacity",
"aantal_cilinders": "cylinders",
"wielbasis": "wheelbase",
"aantal_deuren": "doors",
"aantal_zitplaatsen": "seats",
"catalogusprijs": "list_price",
"maximale_constructiesnelheid": "max_speed",
"datum_eerste_toelating": "year_from"
},
"fuel_map": {
"brandstof_omschrijving": "fuel_type",
"nettomaximumvermogen": "power_kw",
"netto_max_vermogen_elektrisch": "power_kw_electric",
"uitlaatemissieniveau": "euro_class",
"brandstofverbruik_gecombineerd": "consumption",
"co2_uitstoot_gecombineerd": "co2"
},
"engine_map": {
"motorcode": "engine_code"
},
"body_type_translations": {
"stationwagen": "KOMBI",
"hatchback": "FERDEHÁTÚ",
"sedan": "LÉPCSŐSHÁTÚ (SEDAN)",
"terreinwagen": "TEREPJÁRÓ (SUV)",
"cabriolet": "KABRIÓ",
"motorfiets": "MOTORKERÉKPÁR",
"land- of bosbouwtrekker": "TRAKTOR",
"niet geregistreerd": "NEM_REGISZTRÁLT",
"onbekend": "ISMERETLEN",
"niet geregistreerd": "NOT_REGISTERED",
"onbekend": "UNKNOWN",
"stationwagen": "ESTATE",
"hatchback": "HATCHBACK",
"sedan": "SEDAN",
"mpv": "MPV",
"terreinwagen": "SUV",
"cabriolet": "CONVERTIBLE",
"coupe": "COUPE",
"personenbus": "MPV",
"pick-up": "PICKUP",
"open wagen": "PICKUP",
"gesloten opbouw": "VAN",
"kampeerwagen": "RV"
},
"power_calculation": {
"ratio_source": "vermogen_massarijklaar",
"weight_source": "massa_rijklaar"
},
"fuel_translations": {
"Benzine": "Benzin",
"Elektriciteit": "Elektromos",
"Diesel": "Dízel",
"LPG": "Autógáz (LPG)",
"Niet geregistreerd": "ISMERETLEN",
"Benzine": "Petrol",
"Elektriciteit": "Electric",
"Diesel": "Diesel",
"LPG": "LPG",
"CNG": "CNG",
"Waterstof": "Hydrogen",
"Niet geregistreerd": "UNKNOWN"
}
}
}

View File

@@ -1,4 +1,4 @@
# /app/app/workers/vehicle/mapping_rules.py
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/mapping_rules.py
SOURCE_MAPPINGS = {
"os-vehicle-db": {

View File

@@ -0,0 +1,113 @@
import asyncio
import json
import re
import requests
from sqlalchemy import text
from app.database import AsyncSessionLocal
# --- TECHNICAL DICTIONARY / MAPPING ---
# Translates UltimateSpecs spec-table labels into database column names;
# iterated by r5_test_run() when building the MDM record.
MAPPING = {
    "Maximum power": "power_kw",
    "Engine capacity": "engine_capacity",
    "Maximum torque": "torque_nm",
    "Top Speed": "max_speed",
    "Acceleration 0 to 100 km/h": "acceleration_0_100",
    "Curb Weight": "curb_weight",
    "Wheelbase": "wheelbase",
    "Num. of Seats": "seats",
    "Drive wheels - Traction - Layout": "drive_type",
    "Body": "body_type"
}
async def r5_test_run():
    """One-shot preview of the R5 hybrid enrichment flow.

    Picks one car that is still missing core engine data, augments it with
    Dutch RDW registry figures (when a technical code is known) plus a
    simulated UltimateSpecs scrape, then prints the merged MDM record for
    review. Nothing is written back to the database.
    """
    print("🚀 R5 Hibrid Robot indítása (Teszt üzemmód)...")
    async with AsyncSessionLocal() as db:
        # 1. SELECTION: one not-yet-enriched car from the R1 base set.
        query = text("""
            SELECT id, make, marketing_name, year_from, technical_code, fuel_type
            FROM vehicle.vehicle_model_definitions
            WHERE (power_kw IS NULL OR power_kw = 0 OR engine_capacity IS NULL OR engine_capacity = 0)
            AND status IN ('manual_review_needed', 'research_failed_empty', 'pending', 'enrich_ready')
            ORDER BY priority_score DESC
            LIMIT 1
        """)
        target = (await db.execute(query)).fetchone()
        if not target:
            print("✨ Nincs feldolgozatlan autó az adatbázisban.")
            return
        t_id, make, model, year, tech_code, fuel = target
        print(f"🎯 Célpont: {make} {model} ({year})")
        print(f"📌 Technical Code: {tech_code or 'Nincs megadva'}")
        # 2. RDW DATA (Dutch vehicle authority); the m9d7-ebf2 dataset holds
        # the per-type specifications.
        rdw_data = {}
        if tech_code:
            print("🇳🇱 RDW adatok lekérése...")
            try:
                # FIX: pass the type code via `params` so it is URL-encoded,
                # and fail fast on HTTP errors instead of parsing error pages.
                # (requests is blocking; acceptable in this one-shot test.)
                resp = requests.get(
                    "https://opendata.rdw.nl/resource/m9d7-ebf2.json",
                    params={"handelsbenaming": tech_code.upper()},
                    timeout=5,
                )
                resp.raise_for_status()
                res = resp.json()
                if res:
                    # `or 0` also covers empty-string values in the feed.
                    rdw_data = {
                        "power_kw": int(float(res[0].get('nettomaximumvermogen') or 0)),
                        "engine_capacity": int(res[0].get('cilinderinhoud') or 0),
                        "curb_weight": int(res[0].get('massa_ledig_voertuig') or 0)
                    }
                    print("✅ RDW adatok sikeresen betöltve.")
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # SystemExit / KeyboardInterrupt.
                print("⚠️ RDW nem elérhető vagy nincs találat.")
        # 3. ULTIMATESPECS DATA (simulated scrape per the requested logic).
        print("🏁 UltimateSpecs adatok gyűjtése...")
        # The Playwright scraper would run here; sample raw values below:
        raw_web_data = {
            "Maximum power": "103 PS / 76 kW @ 5750 rpm",
            "Engine capacity": "1581 cm3",
            "Maximum torque": "144 Nm @ 4000 rpm",
            "Top Speed": "180 km/h",
            "Acceleration 0 to 100 km/h": "11.5 s",
            "Curb Weight": "1090 kg",
            "Wheelbase": "254 cm",
            "Body": "Hatchback"
        }
        # 4. MERGE & TRANSLATE
        final_mdm_record = {
            "id": t_id,
            "make": make,
            "marketing_name": model,
            "year_from": year,
            "fuel_type": fuel
        }
        # Apply the mapping with numeric clean-up.
        for web_key, db_key in MAPPING.items():
            val = raw_web_data.get(web_key)
            if not val:
                continue
            # FIX: keep decimals ("11.5 s" previously truncated to 11) and
            # store real numbers instead of digit strings.
            nums = re.findall(r'\d+(?:\.\d+)?', str(val))
            if nums:
                # When both PS and kW appear, prefer the kW figure (second number).
                chosen = nums[1] if "kW" in str(val) and len(nums) > 1 else nums[0]
                number = float(chosen)
                final_mdm_record[db_key] = int(number) if number.is_integer() else number
            else:
                final_mdm_record[db_key] = val
        # RDW values take priority (authoritative), overriding web data.
        final_mdm_record.update({k: v for k, v in rdw_data.items() if v})
        # --- TERMINAL OUTPUT ---
        print("\n" + "="*50)
        print("📊 VÉGLEGES MDM REKORD (ELŐNÉZET)")
        print("="*50)
        print(json.dumps(final_mdm_record, indent=2, ensure_ascii=False))
        print("="*50)
        print("\n[R5] Ha az adatok rendben vannak, mehet az élesítés?")
if __name__ == "__main__":
    # One-shot preview run against the live database.
    asyncio.run(r5_test_run())

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
import asyncio
import json
import re
import logging
import random
import urllib.parse
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] [R5-SENTINEL] %(message)s')
logger = logging.getLogger("R5")
# Maps lowercased UltimateSpecs table labels onto vehicle_model_definitions
# columns; values are converted to integers by R5Harvester.clean_number().
COLUMN_MAPPING = {
    "horsepower": "power_kw",
    "engine displacement": "engine_capacity",
    "maximum torque": "torque_nm",
    "top speed": "max_speed",
    "acceleration 0 to 100 km/h": "acceleration_0_100",
    "curb weight": "curb_weight",
    "wheelbase": "wheelbase",
    "num. of seats": "seats"
}
class R5Harvester:
    """Autonomous enrichment worker ("R5 Sentinel").

    Pulls car definitions missing engine power from
    vehicle.vehicle_model_definitions, scrapes the UltimateSpecs spec table,
    maps the values onto database columns and publishes the row.
    """
    def __init__(self):
        # Desktop Chrome UA to avoid trivial bot filtering.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    def clean_number(self, val: str, key: str = "") -> int:
        """Extract an integer from a raw spec string.

        Prefers the kW figure when the string mentions hp/kW
        (e.g. "103 hp / 76 kW" -> 76); otherwise returns the first digit run.
        Returns 0 for empty / dash / unparsable input.

        NOTE(review): stripping '.' treats it as a thousands separator, so a
        decimal such as "11.5 s" collapses to 115 — confirm the source data
        never uses '.' as a decimal point before trusting e.g. acceleration.
        """
        if not val or val == "-": return 0
        try:
            if "hp" in val.lower() or "kw" in val.lower():
                kw_match = re.search(r'(\d+)\s*kw', val.lower())
                if kw_match: return int(kw_match.group(1))
            nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
            return int(nums[0]) if nums else 0
        except (ValueError, TypeError, AttributeError):
            # FIX: was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt; narrowed to the parse failures we expect.
            return 0
    async def scrape_car_details(self, page, make, model, year):
        """Search UltimateSpecs for one car and return its spec dict, or None."""
        try:
            # 1. Internal site search
            search_url = f"https://www.ultimatespecs.com/index.php?brand={urllib.parse.quote(make)}&q={urllib.parse.quote(model + ' ' + str(year))}"
            logger.info(f"🔍 Keresés indítása...")
            await page.goto(search_url, wait_until="networkidle", timeout=30000)
            # 2. Find the result link and take its URL instead of clicking.
            # NOTE(review): wait_for_selector raises on timeout, so the
            # `if not link_element` branch below is effectively dead code.
            link_element = await page.wait_for_selector("a[href*='/car-specs/']", timeout=15000)
            if not link_element:
                return None
            href = await link_element.get_attribute("href")
            target_url = href if href.startswith("http") else f"https://www.ultimatespecs.com{href}"
            # 3. Direct jump to the spec page — this skips interstitial ads.
            logger.info(f"🚀 Közvetlen ugrás az adatlapra: {target_url}")
            await page.goto(target_url, wait_until="networkidle", timeout=30000)
            # 4. Parse every spec table on the page.
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                            let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                            if(t && v) {
                                let k = t.innerText.replace(':','').trim().toLowerCase();
                                let val = v.innerText.trim();
                                if(k && val && val !== "-") results[k] = val;
                            }
                        });
                    });
                    return results;
                }
            """)
            return full_specs
        except Exception as e:
            logger.error(f"❌ Scrape hiba: {str(e)[:100]}...")
            return None
    async def run(self):
        """Main loop: claim -> scrape -> map -> publish, until the queue drains."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=self.user_agent)
            page = await context.new_page()
            while True:
                async with AsyncSessionLocal() as db:
                    query = text("""
                        SELECT id, make, marketing_name, year_from
                        FROM vehicle.vehicle_model_definitions
                        WHERE (power_kw IS NULL OR power_kw = 0)
                        AND status IN ('manual_review_needed', 'pending', 'enrich_ready')
                        ORDER BY priority_score DESC LIMIT 1
                    """)
                    target = (await db.execute(query)).fetchone()
                    if not target:
                        logger.info("✨ Pipeline üres.")
                        break
                    t_id, make, model, year = target
                    logger.info(f"🚜 Feldolgozás: {make} {model} ({year})")
                    web_data = await self.scrape_car_details(page, make, model, year)
                    # Fewer than 5 extracted keys is treated as a failed scrape.
                    if not web_data or len(web_data) < 5:
                        logger.warning(f"⚠️ Sikertelen gyűjtés, státusz: research_failed_empty")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        continue
                    # Translate web labels to DB columns; missing keys become 0.
                    updates = {col: self.clean_number(web_data.get(k)) for k, col in COLUMN_MAPPING.items()}
                    if updates.get('power_kw', 0) > 0:
                        # NOTE(review): `updates` also carries 'seats', which has no
                        # matching bind below — verify SQLAlchemy tolerates the extra key.
                        await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions
                            SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                torque_nm = :torque_nm, max_speed = :max_speed,
                                acceleration_0_100 = :acceleration_0_100, curb_weight = :curb_weight,
                                wheelbase = :wheelbase, specifications = specifications || :full_json,
                                status = 'published', updated_at = NOW()
                            WHERE id = :id
                        """), {**updates, "id": t_id, "full_json": json.dumps(web_data)})
                        await db.commit()
                        logger.info(f"✅ PUBLIKÁLVA: {make} {model} ({updates['power_kw']} kW)")
                    else:
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"), {"id": t_id})
                        await db.commit()
                # Randomized politeness delay between targets.
                await asyncio.sleep(random.uniform(3, 6))
            await browser.close()
if __name__ == "__main__":
    # Start a single harvester instance; runs until the queue drains.
    harvester = R5Harvester()
    asyncio.run(harvester.run())

View File

@@ -1,4 +1,5 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/robot_report.py
# docker exec sf_api python -m app.workers.vehicle.robot_report
import asyncio
import psutil
import pynvml

View File

@@ -0,0 +1,425 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r0_spider
Producer-Consumer lánc első eleme. Kivesz egy autót a vehicle.vehicle_model_definitions táblából,
keres az UltimateSpecs oldalán, és a talált .html linkeket beszúrja a vehicle.auto_data_crawler_queue táblába.
"""
import asyncio
import logging
import random
import sys
import signal
import urllib.parse
from datetime import datetime
from typing import Optional, Dict, Any, List
from playwright.async_api import async_playwright, Page, Browser, BrowserContext
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
from app.models.vehicle.vehicle_definitions import VehicleModelDefinition
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R0-SPIDER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R0-SPIDER")
# Configuration
# NOTE(review): random.uniform() runs once at import time, so the "3-6 s"
# delay is fixed for the whole process instead of re-rolled per wait —
# presumably per-wait jitter was intended; confirm.
SLEEP_INTERVAL = random.uniform(3, 6)  # wait 3-6 s between iterations
MAX_RETRIES = 3
# Site search endpoint; {query} is filled with a URL-encoded search string.
BASE_URL = "https://www.ultimatespecs.com/index.php?q={query}"
class UltimateSpecsSpider:
    """R0 producer: turns pending rows of vehicle.vehicle_model_definitions
    into crawlable UltimateSpecs links queued in vehicle.auto_data_crawler_queue."""
    def __init__(self):
        # Loop flag; flipped to False by stop() / signal handlers.
        self.running = True
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # Desktop Chrome user agent used for all page loads.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
    async def init_browser(self):
        """Start Playwright and open a headless Chromium browser + context."""
        try:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                ]
            )
            self.context = await self.browser.new_context(
                user_agent=self.user_agent,
                viewport={'width': 1920, 'height': 1080},
                java_script_enabled=True
            )
            logger.info("Playwright böngésző inicializálva")
        except Exception as e:
            logger.error(f"Hiba a böngésző inicializálásakor: {e}")
            raise
    async def close_browser(self):
        """Tear down the Playwright context, browser and driver (in that order)."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        logger.info("Playwright böngésző lezárva")
    async def fetch_next_vehicle(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
        """
        Claim one processable vehicle from vehicle_model_definitions.

        Uses FOR UPDATE SKIP LOCKED so parallel spiders do not grab the same
        row; the lock lives until this session's transaction is committed.
        Returns a plain dict, or None when the queue is empty or on error.
        """
        query = text("""
            SELECT id, make, marketing_name, year_from, vehicle_class
            FROM vehicle.vehicle_model_definitions
            WHERE status IN ('pending', 'manual_review_needed')
            AND vehicle_class IN ('car', 'motorcycle')
            ORDER BY priority_score DESC, updated_at ASC
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        """)
        try:
            result = await session.execute(query)
            row = result.fetchone()
            if row:
                return {
                    'id': row[0],
                    'make': row[1],
                    'marketing_name': row[2],
                    'year_from': row[3],
                    'vehicle_class': row[4]
                }
            return None
        except Exception as e:
            logger.error(f"Hiba a következő jármű lekérdezésekor: {e}")
            return None
    def build_search_query(self, make: str, marketing_name: str, year_from: Optional[int]) -> str:
        """
        Build the UltimateSpecs search query string ("make model [year]").
        """
        # Clean and prepare the query
        make_clean = make.lower().replace(' ', '-').replace('.', '')
        model_clean = marketing_name.lower().replace(' ', '-').replace('.', '')
        # Remove common suffixes (each checked once, in this order)
        for suffix in ['-', 'series', 'class', 'model']:
            if model_clean.endswith(suffix):
                model_clean = model_clean[:-len(suffix)].rstrip('-')
        query_parts = [make_clean, model_clean]
        if year_from:
            query_parts.append(str(year_from))
        return ' '.join(query_parts)
    async def extract_links_with_js(self, page: Page, make_url: str, model_word: str) -> List[Dict[str, str]]:
        """
        Run the in-page JS filter and return [{'name', 'url'}, ...] spec links.

        The filter keeps only .html links under /car-specs/ or
        /motorcycles-specs/ that also match the target make (URL) and
        model (text or URL) — this screens out ad/cross-sell links.
        """
        js_code = """
        (args) => {
            let targetMakeUrl = args.makeUrl; // pl. 'honda' vagy 'alfa-romeo'
            let targetModel = args.modelWord; // pl. 'civic'
            let specs = [];
            document.querySelectorAll('a').forEach(a => {
                let href = a.getAttribute('href') || '';
                let text = a.innerText.trim();
                let hrefLow = href.toLowerCase();
                let textLow = text.toLowerCase();
                if (hrefLow.includes('/car-specs/') || hrefLow.includes('/motorcycles-specs/')) {
                    // SZIGORÚ MÁRKA SZŰRŐ AZ URL-BEN (Reklámok ellen)
                    if (hrefLow.includes('/' + targetMakeUrl + '/') || hrefLow.includes(targetMakeUrl + '-models')) {
                        // MODELL SZŰRŐ A SZÖVEGBEN VAGY URL-BEN
                        if (targetModel === '' || textLow.includes(targetModel) || hrefLow.includes(targetModel)) {
                            if (hrefLow.endsWith('.html') && text.length > 1) {
                                specs.push({ name: text, url: href });
                            }
                        }
                    }
                }
            });
            return specs;
        }
        """
        try:
            # Prepare arguments for the JS function
            args = {
                'makeUrl': make_url.lower(),
                'modelWord': model_word.lower()
            }
            # Execute the JavaScript
            specs = await page.evaluate(js_code, args)
            return specs
        except Exception as e:
            logger.error(f"Hiba a JS szűrő futtatásakor: {e}")
            return []
    async def search_and_extract_links(self, vehicle: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Search on UltimateSpecs and extract links using two-step drill-down:
        run the JS filter on the search page, and if that yields nothing,
        click the first relevant result and filter again.
        """
        search_query = self.build_search_query(
            vehicle['make'],
            vehicle['marketing_name'],
            vehicle['year_from']
        )
        # Prepare make URL part
        make_url = vehicle['make'].lower().replace(' ', '-').replace('.', '')
        model_word = vehicle['marketing_name'].lower().split()[0] if vehicle['marketing_name'] else ''
        encoded_query = urllib.parse.quote(search_query)
        search_url = BASE_URL.format(query=encoded_query)
        logger.info(f"Keresés: {search_query} | URL: {search_url}")
        page = None
        try:
            page = await self.context.new_page()
            # 1. Step: Go to search page
            await page.goto(search_url, wait_until='networkidle', timeout=30000)
            # Check if we're on a category page or search results
            current_url = page.url
            # 2. Step: Extract links with JS filter
            all_links = await self.extract_links_with_js(page, make_url, model_word)
            # If no links found on first page, try to click on first result
            if not all_links and 'index.php' in current_url:
                # Try to find and click on first relevant link
                first_link = await page.query_selector('a[href*="/car-specs/"], a[href*="/motorcycles-specs/"]')
                if first_link:
                    await first_link.click()
                    await page.wait_for_load_state('networkidle')
                    # Extract links from the new page
                    all_links = await self.extract_links_with_js(page, make_url, model_word)
            # Ensure URLs are absolute
            for link in all_links:
                if not link['url'].startswith('http'):
                    link['url'] = f"https://www.ultimatespecs.com{link['url']}"
            logger.info(f"{len(all_links)} link találva")
            return all_links
        except Exception as e:
            logger.error(f"Hiba a keresés során: {e}")
            return []
        finally:
            if page:
                await page.close()
    async def save_links_to_queue(self, session: AsyncSession, links: List[Dict[str, str]],
                                   vehicle: Dict[str, Any]) -> int:
        """
        Save extracted links to the external reference queue.

        Commits per link so a duplicate URL only rolls back that single
        insert; returns the number of newly inserted rows.
        """
        saved_count = 0
        for link in links:
            try:
                # Check if URL already exists
                existing_query = select(ExternalReferenceQueue).where(
                    ExternalReferenceQueue.url == link['url']
                )
                existing_result = await session.execute(existing_query)
                if existing_result.scalar_one_or_none():
                    logger.debug(f"URL már létezik: {link['url']}")
                    continue
                # Create new queue entry
                queue_entry = ExternalReferenceQueue(
                    url=link['url'],
                    level='engine',
                    category=vehicle['vehicle_class'] or 'car',
                    name=link['name'][:255],
                    parent_id=vehicle['id'],
                    status='pending'
                )
                session.add(queue_entry)
                await session.commit()
                saved_count += 1
                logger.debug(f"URL mentve: {link['url']}")
            except IntegrityError:
                # Unique constraint race: another worker inserted it first.
                await session.rollback()
                logger.debug(f"URL már létezik (integrity): {link['url']}")
            except Exception as e:
                await session.rollback()
                logger.error(f"Hiba a URL mentésekor: {e}")
        return saved_count
    async def update_vehicle_status(self, session: AsyncSession, vehicle_id: int,
                                     status: str, error_msg: str = None):
        """
        Update the vehicle's status (plus last_error and attempts counter).
        """
        try:
            query = text("""
                UPDATE vehicle.vehicle_model_definitions
                SET status = :status,
                    last_error = :error_msg,
                    updated_at = NOW(),
                    attempts = attempts + 1
                WHERE id = :id
            """)
            await session.execute(
                query,
                {'status': status, 'error_msg': error_msg, 'id': vehicle_id}
            )
            await session.commit()
            logger.info(f"Jármű státusz frissítve: {vehicle_id} -> {status}")
        except Exception as e:
            await session.rollback()
            logger.error(f"Hiba a státusz frissítésekor: {e}")
    async def process_single_vehicle(self):
        """
        Process a single vehicle: fetch, search, extract links, save to queue.

        Returns False when there was nothing to process (caller sleeps
        longer), True otherwise — including handled failures.
        """
        async with AsyncSessionLocal() as session:
            try:
                # 1. Fetch next vehicle
                vehicle = await self.fetch_next_vehicle(session)
                if not vehicle:
                    logger.info("Nincs feldolgozandó jármű")
                    return False
                logger.info(f"Feldolgozás: {vehicle['make']} {vehicle['marketing_name']} "
                            f"(ID: {vehicle['id']})")
                # 2. Search and extract links
                links = await self.search_and_extract_links(vehicle)
                if not links:
                    # No links found
                    await self.update_vehicle_status(
                        session, vehicle['id'],
                        'research_failed_empty',
                        'No links found on UltimateSpecs'
                    )
                    logger.warning(f"Nem található link: {vehicle['make']} {vehicle['marketing_name']}")
                    return True
                # 3. Save links to queue
                saved_count = await self.save_links_to_queue(session, links, vehicle)
                # 4. Update vehicle status
                if saved_count > 0:
                    await self.update_vehicle_status(
                        session, vehicle['id'],
                        'spider_dispatched',
                        f'{saved_count} links added to queue'
                    )
                    logger.info(f"{saved_count} link mentve a queue-ba")
                else:
                    # All links already existed
                    await self.update_vehicle_status(
                        session, vehicle['id'],
                        'spider_dispatched',
                        'All links already in queue'
                    )
                    logger.info("Minden link már szerepel a queue-ban")
                return True
            except Exception as e:
                logger.error(f"Hiba a jármű feldolgozása során: {e}")
                # Try to update status with error
                # NOTE(review): the `locals()` check is a code smell — `vehicle`
                # may also be None here; the bare `except: pass` below hides that.
                try:
                    if 'vehicle' in locals():
                        await self.update_vehicle_status(
                            session, vehicle['id'],
                            'research_failed_network',
                            str(e)[:500]
                        )
                except:
                    pass
                return True
    async def run(self):
        """
        Main loop of the spider: process one vehicle per iteration until
        stop() flips self.running; always closes the browser on exit.
        """
        logger.info("UltimateSpecs R0 Spider indítása...")
        try:
            await self.init_browser()
            while self.running:
                try:
                    # Process a single vehicle
                    processed = await self.process_single_vehicle()
                    if not processed:
                        # No vehicles to process, wait longer
                        await asyncio.sleep(SLEEP_INTERVAL * 2)
                    else:
                        # Wait before next iteration
                        await asyncio.sleep(SLEEP_INTERVAL)
                except KeyboardInterrupt:
                    logger.info("Keyboard interrupt, leállítás...")
                    self.running = False
                    break
                except Exception as e:
                    logger.error(f"Hiba a fő ciklusban: {e}")
                    await asyncio.sleep(SLEEP_INTERVAL)
        finally:
            await self.close_browser()
            logger.info("UltimateSpecs R0 Spider leállt")
    def stop(self):
        """Stop the spider gracefully (takes effect after the current item)."""
        self.running = False
        logger.info("Leállítás kérése érkezett")
async def main():
    """Entry point: wire up signal handlers, then run the spider to completion."""
    spider = UltimateSpecsSpider()

    def _on_signal(signum, frame):
        # Request a graceful stop; the run loop exits after the current item.
        logger.info(f"Signal {signum} received, stopping...")
        spider.stop()

    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _on_signal)

    try:
        await spider.run()
    except Exception as e:
        logger.error(f"Váratlan hiba: {e}")
        sys.exit(1)
if __name__ == "__main__":
    # Module entry point (e.g. `python -m app.workers...`).
    asyncio.run(main())

View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r1_scraper
Producer-Consumer lánc második eleme (A Nyers Letöltő).
Kivesz egy feldolgozandó linket a vehicle.auto_data_crawler_queue táblából (level='engine'),
letölti a HTML tartalmat Playwright böngészővel, kinyeri a specifikációkat JS parserrel,
és elmenti a vehicle.external_reference_library táblába.
"""
import asyncio
import logging
import random
import sys
import signal
import json
from datetime import datetime
from typing import Optional, Dict, Any, List
from playwright.async_api import async_playwright, Page, Browser, BrowserContext, TimeoutError as PlaywrightTimeoutError
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal, ensure_models_loaded
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
from app.models.vehicle.external_reference import ExternalReferenceLibrary
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R1-SCRAPER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R1-SCRAPER")
# Configuration
# NOTE(review): random.uniform() runs once at import time, so the "3-6 s"
# delay is fixed for the whole process rather than re-rolled per wait.
SLEEP_INTERVAL = random.uniform(3, 6)  # wait 3-6 s between iterations
MAX_RETRIES = 3
# Page-title fragments that indicate a Cloudflare challenge page.
CLOUDFLARE_KEYWORDS = ["just a moment", "cloudflare", "checking your browser"]
class UltimateSpecsScraper:
def __init__(self):
self.running = True
self.playwright = None
self.browser: Optional[Browser] = None
self.context: Optional[BrowserContext] = None
self.user_agent = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
async def init_browser(self):
"""Playwright böngésző inicializálása"""
try:
self.playwright = await async_playwright().start()
self.browser = await self.playwright.chromium.launch(
headless=True,
args=[
'--disable-blink-features=AutomationControlled',
'--disable-dev-shm-usage',
'--no-sandbox',
]
)
self.context = await self.browser.new_context(
user_agent=self.user_agent,
viewport={'width': 1920, 'height': 1080},
java_script_enabled=True
)
logger.info("Playwright böngésző inicializálva")
except Exception as e:
logger.error(f"Hiba a böngésző inicializálásakor: {e}")
raise
async def close_browser(self):
"""Playwright böngésző lezárása"""
if self.context:
await self.context.close()
if self.browser:
await self.browser.close()
if self.playwright:
await self.playwright.stop()
logger.info("Playwright böngésző lezárva")
async def fetch_next_queue_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
"""
Kivesz egy feldolgozandó linket a vehicle.auto_data_crawler_queue táblából.
"""
query = text("""
SELECT id, url, category, parent_id
FROM vehicle.auto_data_crawler_queue
WHERE level = 'engine' AND status = 'pending'
FOR UPDATE SKIP LOCKED LIMIT 1
""")
try:
result = await session.execute(query)
row = result.fetchone()
if row:
return {
"id": row[0],
"url": row[1],
"category": row[2],
"parent_id": row[3]
}
return None
except Exception as e:
logger.error(f"Hiba a queue lekérdezésekor: {e}")
return None
async def scrape_with_retry(self, url: str, max_retries: int = MAX_RETRIES) -> Optional[Dict[str, Any]]:
"""
Playwright böngészővel letölti a HTML tartalmat, retry logikával.
"""
for attempt in range(1, max_retries + 1):
try:
logger.info(f"Próbálkozás {attempt}/{max_retries}: {url}")
page = await self.context.new_page()
# Navigáció
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
# Várjunk a táblázatokra
try:
await page.wait_for_selector('table', timeout=5000)
except PlaywrightTimeoutError:
logger.warning("Nem található táblázat 5 másodpercen belül, de folytatjuk")
# Ellenőrizzük Cloudflare blokkolást
title = await page.title()
title_lower = title.lower()
if any(keyword in title_lower for keyword in CLOUDFLARE_KEYWORDS):
raise Exception(f"Cloudflare blokkolás észlelve: {title}")
# JS parser futtatása
specs = await page.evaluate("""() => {
let results = {};
// 1. ÖSSZES táblázat letapogatása
document.querySelectorAll('table').forEach(table => {
table.querySelectorAll('tr').forEach(row => {
let t = row.querySelector('.table_specs_title, .td_title, td:first-child, th:first-child');
let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
if(t && v) {
let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
let val = v.innerText.trim();
if(k && val && val !== "-") { results[k] = val; }
}
});
});
// 2. Extra szekciók és dimenziók mentése
const sections = {};
document.querySelectorAll('h2, h3, h4, .section-title, .specs-header').forEach(header => {
const title = header.innerText.trim();
if (title && title.length > 0) {
let nextElement = header.nextElementSibling;
let sectionData = {};
for (let i = 0; i < 5 && nextElement; i++) {
if (nextElement.tagName === 'TABLE') {
nextElement.querySelectorAll('tr').forEach(row => {
let t = row.querySelector('td:first-child, th:first-child');
let v = row.querySelector('td:last-child');
if(t && v) {
let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
let val = v.innerText.trim();
if(k && val && val !== "-") {
sectionData[k] = val;
results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
}
}
});
}
nextElement = nextElement.nextElementSibling;
}
sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
}
});
results['_sections'] = sections;
return results;
}""")
await page.close()
if specs and len(specs) > 0:
logger.info(f"Sikeres letöltés, {len(specs)} specifikáció kinyerve")
return specs
else:
logger.warning("Üres specifikációk, újrapróbálkozás")
raise Exception("Üres specifikációk")
except Exception as e:
logger.error(f"Hiba a {attempt}. próbálkozásnál: {e}")
if attempt < max_retries:
backoff = random.uniform(2, 5)
logger.info(f"Várakozás {backoff:.1f} másodpercet...")
await asyncio.sleep(backoff)
else:
logger.error(f"Összes próbálkozás sikertelen: {e}")
return None
return None
    async def process_queue_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
        """Process one queue item: scrape it, store the result, close the item.

        On success inserts a raw-spec row into
        vehicle.external_reference_library (pipeline_status='pending_enrich')
        and marks the queue row 'completed'; on failure marks the queue row
        'error' and bumps its retry counter.

        Args:
            item: Dict produced by fetch_next_queue_item()
                  (keys: id, url, category, parent_id).

        Returns:
            True on success, False on any failure.
        """
        queue_id = item["id"]
        url = item["url"]
        category = item["category"]
        try:
            # 1. Scrape the page.
            specs = await self.scrape_with_retry(url)
            if not specs:
                # Scrape failed: record the error state on the queue row.
                await session.execute(
                    text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
                        WHERE id = :id
                    """),
                    {"error_msg": "Sikertelen letöltés (üres specifikációk vagy Cloudflare)", "id": queue_id}
                )
                await session.commit()
                logger.error(f"Queue {queue_id} sikertelen, státusz: error")
                return False
            # 2. Insert a new row into external_reference_library (raw SQL).
            # The specifications dict is serialized to a JSON string and cast
            # to jsonb inside the statement.
            import json
            specs_json = json.dumps(specs)
            insert_query = text("""
                INSERT INTO vehicle.external_reference_library
                (source_name, source_url, category, specifications, pipeline_status, created_at, last_scraped_at)
                VALUES (:source_name, :source_url, :category, CAST(:specifications AS jsonb), :pipeline_status, NOW(), NOW())
                RETURNING id
            """)
            result = await session.execute(
                insert_query,
                {
                    "source_name": "ultimatespecs",
                    "source_url": url,
                    "category": category,
                    "specifications": specs_json,
                    "pipeline_status": "pending_enrich"
                }
            )
            new_id = result.scalar()
            # 3. Mark the queue row completed.
            await session.execute(
                text("""
                    UPDATE vehicle.auto_data_crawler_queue
                    SET status = 'completed', updated_at = NOW()
                    WHERE id = :id
                """),
                {"id": queue_id}
            )
            await session.commit()
            logger.info(f"Queue {queue_id} sikeresen feldolgozva, library ID: {new_id}")
            return True
        except Exception as e:
            logger.error(f"Hiba a queue {queue_id} feldolgozásakor: {e}")
            await session.rollback()
            # Best effort: still try to record the error on the queue row
            # in a fresh transaction after the rollback.
            try:
                await session.execute(
                    text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
                        WHERE id = :id
                    """),
                    {"error_msg": str(e)[:500], "id": queue_id}
                )
                await session.commit()
            except Exception as update_err:
                logger.error(f"Hiba a queue frissítésekor: {update_err}")
            return False
    async def run_once(self) -> bool:
        """Run a single processing cycle: claim one queue item and process it.

        Returns:
            True when an item was processed successfully, False when the
            queue is empty or processing failed.
        """
        # Make sure the ORM models are registered before touching the DB.
        ensure_models_loaded()
        async with AsyncSessionLocal() as session:
            try:
                # Open a transaction so the FOR UPDATE row lock taken by
                # fetch_next_queue_item() is held while the item is worked on.
                # NOTE(review): process_queue_item() calls session.commit()
                # itself inside this session.begin() block — confirm this
                # interacts cleanly with the context manager's own commit.
                async with session.begin():
                    item = await self.fetch_next_queue_item(session)
                    if not item:
                        logger.info("Nincs feldolgozandó queue tétel")
                        return False
                    logger.info(f"Feldolgozás: {item['url']}")
                    success = await self.process_queue_item(session, item)
                    return success
            except Exception as e:
                logger.error(f"Hiba a run_once-ban: {e}")
                return False
    async def run_loop(self) -> None:
        """Main worker loop: process queue items until stop() is called.

        Sleeps SLEEP_INTERVAL seconds when the queue is empty and 1-2 s
        between successful items; always shuts the browser down on exit.
        """
        await self.init_browser()
        try:
            while self.running:
                success = await self.run_once()
                if not success:
                    # Queue empty (or failure): back off before polling again.
                    # NOTE(review): SLEEP_INTERVAL is presumably a module-level
                    # constant drawn once at import, so this delay is fixed for
                    # the process lifetime — confirm that is intended.
                    sleep_time = SLEEP_INTERVAL
                    logger.debug(f"Várakozás {sleep_time:.1f} másodpercet...")
                    await asyncio.sleep(sleep_time)
                else:
                    # Short randomized pause between successful items to be
                    # gentle on the target site.
                    await asyncio.sleep(random.uniform(1, 2))
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt, leállítás...")
        except Exception as e:
            logger.error(f"Váratlan hiba a fő ciklusban: {e}")
        finally:
            # Always release the Playwright browser, even on errors.
            await self.close_browser()
def stop(self):
"""Leállítási jelzés"""
self.running = False
logger.info("Leállítási jelzés küldve")
async def main():
    """Entry point: install signal handlers and run the scraper loop.

    Exits with status 1 if the loop dies with an unhandled exception.
    """
    scraper = UltimateSpecsScraper()

    def _handle_signal(signum, frame):
        # Ask the loop to stop after the current item.
        scraper.stop()

    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _handle_signal)

    try:
        await scraper.run_loop()
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r2_enricher
Producer-Consumer lánc harmadik eleme (Az Elemző). Offline adattisztítást és strukturálást végez.
Kivesz egy feldolgozandó sort a vehicle.external_reference_library táblából (pipeline_status='pending_enrich'),
hozzácsatolja a vehicle.auto_data_crawler_queue adatait, kinyeri a standard értékeket a nyers JSON-ből,
és strukturált JSON-be csomagolja (standardized + _raw).
"""
import asyncio
import logging
import random
import sys
import signal
import json
import re
from datetime import datetime
from typing import Optional, Dict, Any, List, Tuple
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
# Logging configuration: timestamped lines tagged with the worker name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R2-ENRICHER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R2-ENRICHER")
# Configuration.
# NOTE(review): this draws ONE random value at import time — the idle delay
# is fixed for the whole process lifetime, not re-randomized to 1-3 s per
# wait as the original comment suggested. Confirm which was intended.
SLEEP_INTERVAL = random.uniform(1, 3)  # 1-3 mp között várakozás
# Fuzzy mapping for numeric metrics: target field -> substrings searched
# for (case-insensitively) in the raw specification keys.
FUZZY_MAPPING = {
    "power_kw": ["horsepower", "total electric power", "engine power", "maximum power", "power"],
    "engine_capacity": ["engine displacement", "displacement", "capacity", "cm3", "cu-in"],
    "torque_nm": ["maximum torque", "total electric torque", "torque"],
    "max_speed": ["top speed", "maximum speed"],
    "curb_weight": ["curb weight", "weight"],
    "wheelbase": ["wheelbase"],
    "seats": ["num. of seats", "seats"]
}
# Search keywords for the free-text fields.
TEXT_FIELD_KEYWORDS = {
    "fuel_type": ["fuel type", "fuel", "engine fuel", "fuel system"],
    "transmission_type": ["transmission", "gear", "gearbox"],
    "drive_type": ["drive type", "drive", "drivetrain"],
    "body_type": ["body type", "body", "car body"]
}
class UltimateSpecsEnricher:
    """Offline enricher (R2): turns raw scraped specs into standardized data.

    Pulls rows with pipeline_status='pending_enrich' from
    vehicle.external_reference_library, extracts standard metrics from the
    raw specification JSON via fuzzy key matching, and rewrites the
    specifications column as {'standardized': ..., '_raw': ...} while
    advancing the row to pipeline_status='pending_match'.
    """

    def __init__(self):
        # Cooperative shutdown flag checked by run_loop().
        self.running = True

    async def fetch_next_library_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
        """Claim one 'pending_enrich' library row (FOR UPDATE SKIP LOCKED).

        Returns:
            Dict with id/specifications/make/model/year_from, or None when
            nothing is pending or the query failed.
        """
        query = text("""
            SELECT id, specifications, make, model, year_from
            FROM vehicle.external_reference_library
            WHERE pipeline_status = 'pending_enrich'
            FOR UPDATE SKIP LOCKED LIMIT 1
        """)
        try:
            result = await session.execute(query)
            row = result.fetchone()
            if row:
                return {
                    "id": row[0],
                    # Guard against non-dict JSON payloads (e.g. NULL column).
                    "specifications": row[1] if isinstance(row[1], dict) else {},
                    "make": row[2],
                    "model": row[3],
                    "year_from": row[4]
                }
            return None
        except SQLAlchemyError as e:
            logger.error(f"SQL hiba a lekérdezés során: {e}")
            return None

    def extract_fuzzy_metric(self, specifications: Dict[str, Any], target_key: str, keywords: List[str]) -> Optional[float]:
        """Find a numeric metric by fuzzy substring matching on the raw keys.

        Args:
            specifications: Raw spec dict (keys as scraped from the page).
            target_key: Standardized field being filled (e.g. 'power_kw').
            keywords: Substrings to look for in the keys, in priority order.

        Returns:
            The first parsable number found (hp converted to kW when the
            target is 'power_kw' and the key mentions hp), or None.
        """
        if not specifications:
            return None
        # Case-insensitive view of the raw keys.
        spec_lower = {k.lower(): v for k, v in specifications.items()}
        for keyword in keywords:
            for key, value in spec_lower.items():
                if keyword.lower() in key:
                    # Value may be a number or a string like "120 kW".
                    num = self.clean_number(value)
                    if num is not None:
                        # Convert hp sources to kW for the power target.
                        if target_key == "power_kw" and ("hp" in key or "horsepower" in key):
                            # hp -> kW conversion (1 hp = 0.7457 kW)
                            num = num * 0.7457
                        return num
        return None

    def clean_number(self, value: Any) -> Optional[float]:
        """Extract a float from a string (e.g. '120 kW'), int or float.

        Returns:
            The parsed number, or None when nothing numeric can be found.
        """
        if value is None:
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            # Prefer a number optionally followed by a known unit,
            # e.g. "120 kW" or "120kW".
            match = re.search(r'([-+]?\d*\.?\d+)\s*(?:kW|hp|cc|Nm|kg|km/h|mph)?', value, re.IGNORECASE)
            if match:
                try:
                    return float(match.group(1))
                except ValueError:
                    pass
            # Fallback: first bare number anywhere in the string.
            matches = re.findall(r'[-+]?\d*\.?\d+', value)
            if matches:
                try:
                    return float(matches[0])
                except ValueError:
                    pass
        return None

    def extract_text_field(self, specifications: Dict[str, Any], keywords: List[str]) -> Optional[str]:
        """Find a free-text field (e.g. fuel type) by fuzzy key matching.

        Returns:
            The first matching value as a stripped string, or None.
        """
        if not specifications:
            return None
        spec_lower = {k.lower(): v for k, v in specifications.items()}
        for keyword in keywords:
            for key, value in spec_lower.items():
                if keyword.lower() in key:
                    if isinstance(value, str):
                        return value.strip()
                    elif isinstance(value, (int, float)):
                        return str(value)
        return None

    def enrich_specifications(self, raw_specs: Dict[str, Any], make: str, model: str, year_from: int) -> Dict[str, Any]:
        """Build the structured specification JSON from the raw R1 payload.

        NOTE(review): make/model/year_from are currently unused here —
        confirm whether they were meant to feed into the output.

        Returns:
            {'standardized': {...metrics and text fields...}, '_raw': raw_specs}
        """
        standardized = {}
        # Numeric metrics via fuzzy key matching.
        for target_key, keywords in FUZZY_MAPPING.items():
            value = self.extract_fuzzy_metric(raw_specs, target_key, keywords)
            standardized[target_key] = value
        # Free-text fields.
        for field, keywords in TEXT_FIELD_KEYWORDS.items():
            value = self.extract_text_field(raw_specs, keywords)
            standardized[field] = value
        # Assemble the new JSON structure.
        updated_specifications = {
            "standardized": standardized,
            "_raw": raw_specs  # the original R1 data stays untouched
        }
        return updated_specifications

    async def process_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
        """Enrich one library row and persist the result.

        Mirrors power_kw/engine_cc into physical columns and advances the
        row to pipeline_status='pending_match'.

        Returns:
            True on success, False on any failure (transaction rolled back).
        """
        try:
            logger.info(f"Feldolgozás: ID={item['id']}, {item['make']} {item['model']} ({item['year_from']})")
            # Extract and structure the data.
            updated_specs = self.enrich_specifications(
                item['specifications'],
                item['make'],
                item['model'],
                item['year_from']
            )
            # Values mirrored into physical columns for indexed filtering.
            power_kw = updated_specs['standardized'].get('power_kw')
            engine_cc = updated_specs['standardized'].get('engine_capacity')
            # Run the UPDATE.
            # NOTE(review): unlike the R1 insert, there is no
            # CAST(... AS jsonb) around :updated_specifications — confirm
            # the driver adapts the JSON string to the jsonb column.
            update_query = text("""
                UPDATE vehicle.external_reference_library
                SET power_kw = :power_kw,
                    engine_cc = :engine_cc,
                    make = :make,
                    model = :model,
                    year_from = :year_from,
                    specifications = :updated_specifications,
                    pipeline_status = 'pending_match'
                WHERE id = :id
            """)
            params = {
                "power_kw": int(power_kw) if power_kw is not None else None,
                "engine_cc": int(engine_cc) if engine_cc is not None else None,
                "make": item['make'],
                "model": item['model'],
                "year_from": item['year_from'],
                "updated_specifications": json.dumps(updated_specs),
                "id": item['id']
            }
            await session.execute(update_query, params)
            await session.commit()
            logger.info(f"Sikeres frissítés: ID={item['id']}, power_kw={power_kw}, engine_cc={engine_cc}")
            return True
        except Exception as e:
            logger.error(f"Hiba a feldolgozás során ID={item['id']}: {e}")
            await session.rollback()
            return False

    async def run_once(self) -> bool:
        """Run a single processing cycle: claim one row and enrich it.

        Returns:
            True when a row was enriched, False when nothing is pending or
            an error occurred.
        """
        async with AsyncSessionLocal() as session:
            try:
                # Open a transaction so the FOR UPDATE lock taken by
                # fetch_next_library_item() is held while enriching.
                # NOTE(review): process_item() calls session.commit() itself
                # inside this session.begin() block — confirm this interacts
                # cleanly with the context manager's own commit.
                async with session.begin():
                    item = await self.fetch_next_library_item(session)
                    if not item:
                        logger.debug("Nincs feldolgozandó elem")
                        return False
                    success = await self.process_item(session, item)
                    return success
            except SQLAlchemyError as e:
                logger.error(f"Adatbázis hiba: {e}")
                return False

    async def run_loop(self) -> None:
        """Main infinite loop: enrich rows until stop() flips the flag.

        Sleeps SLEEP_INTERVAL seconds when idle or after an unexpected error.
        """
        logger.info("R2 Enricher indítva...")
        while self.running:
            try:
                success = await self.run_once()
                if not success:
                    # Nothing pending: back off before polling again.
                    await asyncio.sleep(SLEEP_INTERVAL)
            except KeyboardInterrupt:
                logger.info("Keyboard interrupt, leállítás...")
                self.running = False
                break
            except Exception as e:
                logger.error(f"Váratlan hiba a ciklusban: {e}")
                await asyncio.sleep(SLEEP_INTERVAL)
        logger.info("R2 Enricher leállt")

    def stop(self) -> None:
        """Request a graceful shutdown of the worker loop."""
        self.running = False
async def main():
    """Entry point: install signal handlers and run the enricher loop."""
    enricher = UltimateSpecsEnricher()

    def _on_signal(signum, frame):
        logger.info(f"Signal {signum} fogadva, leállítás...")
        enricher.stop()

    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _on_signal)

    try:
        await enricher.run_loop()
    except asyncio.CancelledError:
        logger.info("Task cancelled")
    finally:
        logger.info("R2 Enricher befejezte a munkát.")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,400 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r3_finalizer
Producer-Consumer lánc negyedik, utolsó eleme (Az Összevezető).
Offline dolgozik egy végtelen while ciklusban (1-3 mp delay), és a meglévő adatbázis-táblákat szinkronizálja.
1. Lekérdezés (JOIN a Queue-val): Kivesz egy `pending_match` sort a Library-ből, és a Queue-ból lekéri az eredeti `parent_id`-t és a link nevét.
2. Szülő (Base VMD) ellenőrzése: Lekérdezi az eredeti szülő rekordot a VMD táblából a parent_id alapján.
3. Összevezetés (UPDATE vagy INSERT): A letisztított adatok a lib.specifications['standardized'] dict-ből jönnek.
- A ÁG: Ha a szülő status értéke IN ('pending', 'manual_review_needed'): UPDATE a szülő (VMD) rekordon
- B ÁG: Ha a szülő status MÁR NEM 'pending': INSERT új variációként a VMD táblába
4. Library lezárása: Frissíti a Library táblát pipeline_status = 'completed', matched_vmd_id beállítása.
"""
import asyncio
import logging
import random
import sys
import signal
import json
from datetime import datetime
from typing import Optional, Dict, Any, List, Tuple
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
# Logging configuration: timestamped lines tagged with the worker name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R3-FINALIZER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R3-FINALIZER")
# Configuration.
# NOTE(review): this draws ONE random value at import time — the idle delay
# is fixed for the whole process lifetime, not re-randomized to 1-3 s per
# wait as the original comment suggested. Confirm which was intended.
SLEEP_INTERVAL = random.uniform(1, 3)  # 1-3 mp között várakozás
class UltimateSpecsFinalizer:
    """R3 finalizer: merges enriched library rows into the VMD table.

    For every 'pending_match' row in vehicle.external_reference_library it
    either UPDATEs the original parent VMD record (branch A — parent still
    'pending'/'manual_review_needed') or INSERTs a brand-new variant row
    (branch B), then closes the library row with pipeline_status='completed'
    and the matched VMD id.
    """

    def __init__(self):
        # Cooperative shutdown flag checked by run().
        self.running = True

    async def fetch_pending_match(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
        """Claim one 'pending_match' library row, JOINed with the crawler queue.

        FOR UPDATE OF lib SKIP LOCKED guarantees concurrent finalizers never
        grab the same library row.

        Returns:
            Dict with the library fields plus the queue's parent_id and
            variant name, or None when nothing is pending.
        """
        query = text("""
            SELECT lib.id, lib.source_url, lib.make, lib.model, lib.year_from,
                   lib.power_kw, lib.engine_cc, lib.specifications, lib.category,
                   q.parent_id, q.name AS variant_name
            FROM vehicle.external_reference_library lib
            JOIN vehicle.auto_data_crawler_queue q ON lib.source_url = q.url
            WHERE lib.pipeline_status = 'pending_match'
            FOR UPDATE OF lib SKIP LOCKED LIMIT 1
        """)
        result = await session.execute(query)
        row = result.fetchone()
        if not row:
            return None
        return {
            "lib_id": row[0],
            "source_url": row[1],
            "make": row[2],
            "model": row[3],
            "year_from": row[4],
            "power_kw": row[5],
            "engine_cc": row[6],
            # Guard against a NULL specifications column.
            "specifications": row[7] if row[7] else {},
            "category": row[8],
            "parent_id": row[9],
            "variant_name": row[10]
        }

    async def get_parent_vmd(self, session: AsyncSession, parent_id: int) -> Optional[Dict[str, Any]]:
        """Look up (and row-lock) the parent VMD record by id.

        FOR UPDATE prevents concurrent workers from racing on the same
        parent while we decide between UPDATE and INSERT.

        Returns:
            Dict with id and status, or None when the parent does not exist.
        """
        query = text("""
            SELECT id, status FROM vehicle.vehicle_model_definitions
            WHERE id = :parent_id FOR UPDATE
        """)
        result = await session.execute(query, {"parent_id": parent_id})
        row = result.fetchone()
        if not row:
            return None
        return {
            "id": row[0],
            "status": row[1]
        }

    def extract_standardized_data(self, specifications: Dict[str, Any]) -> Dict[str, Any]:
        """Pull the standardized values out of specifications['standardized'].

        Text fields are truncated to 50 characters to fit the VMD columns;
        None values are dropped from the result.

        Returns:
            Dict of standardized field name -> non-None value.
        """
        standardized = specifications.get('standardized', {})
        # Core numeric and text fields produced by the R2 enricher.
        extracted = {
            "power_kw": standardized.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
        }

        # Clamp strings to the VMD column limits.
        def truncate(value: Any, max_len: int = 50) -> Any:
            if isinstance(value, str) and len(value) > max_len:
                return value[:max_len]
            return value

        for field in ["fuel_type", "transmission_type", "drive_type", "body_type"]:
            if extracted.get(field):
                extracted[field] = truncate(extracted[field], 50)
        # Drop None values so they never overwrite existing columns.
        return {k: v for k, v in extracted.items() if v is not None}

    async def update_parent_vmd(self, session: AsyncSession, parent_id: int,
                                lib_data: Dict[str, Any], standardized: Dict[str, Any]) -> int:
        """Branch A: refresh the parent VMD record in place.

        Fills the technical columns from the standardized data (falling back
        to the library's physical columns) and advances the parent's status
        to 'awaiting_ai_synthesis'.

        Returns:
            The parent_id (used as matched_vmd_id).
        """
        # Standardized values win; library columns are the fallback.
        update_fields = {
            "power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
            "status": "awaiting_ai_synthesis",
            "updated_at": datetime.utcnow(),
            "source": "ultimatespecs",
            "priority_score": 30,
        }
        # Skip None values so existing column data is never nulled out.
        update_fields = {k: v for k, v in update_fields.items() if v is not None}
        # Build the SET clause from the surviving fields.
        set_clause = ", ".join([f"{k} = :{k}" for k in update_fields.keys()])
        query = text(f"""
            UPDATE vehicle.vehicle_model_definitions
            SET {set_clause}
            WHERE id = :parent_id
            RETURNING id
        """)
        params = {"parent_id": parent_id, **update_fields}
        result = await session.execute(query, params)
        updated_id = result.scalar()
        logger.info(f"UPDATE parent VMD {parent_id} with {len(update_fields)} fields")
        return updated_id

    async def insert_variant_vmd(self, session: AsyncSession, lib_data: Dict[str, Any],
                                 standardized: Dict[str, Any], variant_name: str) -> int:
        """Branch B: insert a new variant row into the VMD table.

        make = lib.make, marketing_name = variant_name, year_from =
        lib.year_from; status = 'awaiting_ai_synthesis', source =
        'ultimatespecs', priority_score = 30. On a duplicate-key violation
        the existing row's id is returned instead.

        Returns:
            The new (or pre-existing) VMD id to use as matched_vmd_id.
        """
        insert_data = {
            "make": lib_data["make"],
            "marketing_name": variant_name,
            "official_marketing_name": variant_name,
            "year_from": lib_data["year_from"],
            "power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
            "status": "awaiting_ai_synthesis",
            "vehicle_class": lib_data.get("category"),
            "source": "ultimatespecs",
            "priority_score": 30,
            "created_at": datetime.utcnow(),
            "updated_at": datetime.utcnow(),
            "market": "EU",
            "normalized_name": f"{lib_data['make']} {variant_name}",
            "technical_code": "UNKNOWN",
            "variant_code": "UNKNOWN",
            "version_code": "UNKNOWN",
            "specifications": json.dumps({}),  # empty JSON: column is NOT NULL
            "raw_api_data": json.dumps({}),  # empty JSON
            "research_metadata": json.dumps({}),  # empty JSON
            "raw_search_context": "",  # empty string
        }
        # Skip None values so column defaults apply.
        insert_data = {k: v for k, v in insert_data.items() if v is not None}
        columns = ", ".join(insert_data.keys())
        placeholders = ", ".join([f":{k}" for k in insert_data.keys()])
        try:
            query = text(f"""
                INSERT INTO vehicle.vehicle_model_definitions ({columns})
                VALUES ({placeholders})
                RETURNING id
            """)
            result = await session.execute(query, insert_data)
            new_id = result.scalar()
            logger.info(f"INSERT new variant VMD {new_id} for {lib_data['make']} {variant_name}")
            return new_id
        except IntegrityError as e:
            # Duplicate key: roll back the failed statement and reuse the
            # existing record.
            # NOTE(review): this rollback also discards the FOR UPDATE locks
            # and any earlier work in the transaction — the subsequent
            # library close runs in a fresh transaction. Confirm this
            # ordering is acceptable under concurrency.
            logger.warning(f"Duplicate key violation for {lib_data['make']} {variant_name}: {e}. Rolling back and looking for existing record...")
            await session.rollback()
            find_query = text("""
                SELECT id FROM vehicle.vehicle_model_definitions
                WHERE make = :make
                AND marketing_name = :marketing_name
                AND year_from = :year_from
                LIMIT 1
            """)
            find_params = {
                "make": lib_data["make"],
                "marketing_name": variant_name,
                "year_from": lib_data["year_from"]
            }
            result = await session.execute(find_query, find_params)
            existing_id = result.scalar()
            if existing_id:
                logger.info(f"Found existing VMD {existing_id} for {lib_data['make']} {variant_name}")
                return existing_id
            else:
                # Conflict reported but no matching row found: re-raise.
                logger.error(f"Duplicate key but could not find existing record for {lib_data['make']} {variant_name}")
                raise

    async def close_library_entry(self, session: AsyncSession, lib_id: int, matched_vmd_id: int):
        """Close the library row: pipeline_status='completed' + matched_vmd_id."""
        query = text("""
            UPDATE vehicle.external_reference_library
            SET pipeline_status = 'completed',
                matched_vmd_id = :matched_vmd_id
            WHERE id = :lib_id
        """)
        await session.execute(query, {"lib_id": lib_id, "matched_vmd_id": matched_vmd_id})
        logger.info(f"Library {lib_id} closed with matched_vmd_id {matched_vmd_id}")

    async def process_one(self):
        """Process a single pending_match record.

        Returns:
            True on success, False when there is no work or on error.
        """
        # BUGFIX: pre-bind lib_data so the except block below cannot raise
        # UnboundLocalError (masking the real error) when
        # fetch_pending_match() itself fails.
        lib_data = None
        async with AsyncSessionLocal() as session:
            try:
                # 1. Claim a library row (locked via FOR UPDATE SKIP LOCKED).
                lib_data = await self.fetch_pending_match(session)
                if not lib_data:
                    return False
                logger.info(f"Processing library ID {lib_data['lib_id']} for {lib_data['make']} {lib_data['model']}")
                # 2. Look up (and lock) the parent VMD record, if any.
                parent_vmd = None
                if lib_data['parent_id']:
                    parent_vmd = await self.get_parent_vmd(session, lib_data['parent_id'])
                # 3. Pull the standardized values out of the enriched JSON.
                standardized = self.extract_standardized_data(lib_data['specifications'])
                # 4. Branch: UPDATE the parent or INSERT a new variant.
                matched_vmd_id = None
                if parent_vmd and parent_vmd['status'] in ('pending', 'manual_review_needed'):
                    # Branch A: refresh the parent record in place.
                    matched_vmd_id = await self.update_parent_vmd(
                        session, parent_vmd['id'], lib_data, standardized
                    )
                else:
                    # Branch B: insert a brand-new variant row.
                    matched_vmd_id = await self.insert_variant_vmd(
                        session, lib_data, standardized, lib_data['variant_name']
                    )
                # 5. Close out the library row.
                await self.close_library_entry(session, lib_data['lib_id'], matched_vmd_id)
                await session.commit()
                logger.info(f"Successfully finalized library {lib_data['lib_id']} -> VMD {matched_vmd_id}")
                return True
            except Exception as e:
                await session.rollback()
                lib_id = lib_data.get('lib_id', 'unknown') if lib_data else 'unknown'
                logger.error(f"Error processing library {lib_id}: {e}")
                return False

    async def run(self, max_iterations: int = 10):
        """Main loop: run up to *max_iterations* processing cycles.

        Sleeps SLEEP_INTERVAL seconds when idle/failed and 0.5 s after a
        successful cycle. Every cycle — successful, idle or errored —
        consumes one iteration of the budget.

        Args:
            max_iterations: Maximum number of processing cycles (default: 10)
        """
        logger.info(f"R3 Finalizer started. Max iterations: {max_iterations}. Waiting for pending_match entries...")
        iteration = 0
        while self.running and iteration < max_iterations:
            try:
                processed = await self.process_one()
                if not processed:
                    # No work (or an error): back off before polling again.
                    await asyncio.sleep(SLEEP_INTERVAL)
                else:
                    # Short pause after a successful cycle.
                    await asyncio.sleep(0.5)
                # Count the iteration regardless of outcome.
                iteration += 1
                logger.info(f"Iteration {iteration}/{max_iterations} completed.")
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Unexpected error in main loop: {e}")
                await asyncio.sleep(5)
                # Errors consume an iteration too.
                iteration += 1
                logger.info(f"Iteration {iteration}/{max_iterations} completed after error.")
        logger.info(f"R3 Finalizer completed {iteration} iterations. Stopping.")
        self.stop()

    def stop(self):
        """Signal the loop to stop after the current cycle."""
        self.running = False
        logger.info("R3 Finalizer stopping...")
def main():
    """Entry point: install signal handlers and run a bounded finalizer pass."""
    finalizer = UltimateSpecsFinalizer()

    def _on_signal(signum, frame):
        logger.info(f"Received signal {signum}, shutting down...")
        finalizer.stop()

    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _on_signal)

    # Bounded run for testing: at most 5 iterations.
    try:
        asyncio.run(finalizer.run(max_iterations=5))
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt received, shutting down...")
    finally:
        logger.info("R3 Finalizer stopped.")


if __name__ == "__main__":
    main()

View File

@@ -4,205 +4,187 @@ import logging
import os
import sys
from datetime import datetime, timedelta
from sqlalchemy import text, select
from sqlalchemy import text
from app.database import AsyncSessionLocal
from app.models.asset import AssetCatalog
# MB 2.0 Szigorú naplózás
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-0-Discovery: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Vehicle-Robot-0-Discovery")
# Szigorú naplózás
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R0-DISCOVERY] %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-0")
class DiscoveryEngine:
"""
THOUGHT PROCESS (IPARI ÜZEMMÓD 2.0):
1. Őrkutya (Watchdog): Megkeresi és kiszabadítja a beragadt feladatokat óránként.
2. Differential Sync (Különbözeti Szinkron): Csak a hiányzó vagy új modelleket rögzíti, a gold_enriched-eket kihagyja.
3. Monthly Scheduler: Havonta egyszer tölti le a teljes RDW adatbázist lapozva.
"""
Vehicle Robot 0 v3.0: A Nagy Stratéga
Feladata: Végiglapozza az RDW teljes adatbázisát (autó, motor, teherautó),
kigyűjti az összes létező márka+modell kombinációt, és darabszám alapján
priorizálja őket a catalog_discovery táblában a vadászok (Hunterek) számára.
"""
RDW_API = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
SYNC_STATE_FILE = "/app/temp/.last_rdw_sync" # Állapotfájl, hogy Docker újrainduláskor se kezdje elölről azonnal
SYNC_STATE_FILE = "/app/temp/.last_rdw_sync"
BATCH_LIMIT = 10000 # RDW API maximum limit aggregálásnál
@staticmethod
async def run_watchdog():
""" 1. FÁZIS: Az Őrkutya (Dead-Letter Queue Manager) """
logger.info("🐕 Őrkutya: Beragadt feladatok keresése a rendszerben...")
try:
async with AsyncSessionLocal() as db:
# A) Hunter takarítás (visszaállítás pending-re, ha a Hunter lefagyott)
res1 = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'pending' WHERE status = 'processing' RETURNING id;"))
hunter_resets = len(res1.fetchall())
if hunter_resets > 0:
logger.warning(f"🔄 {hunter_resets} db beragadt Hunter feladat (processing) visszaállítva 'pending'-re.")
# B) AI Robotok takarítása (2 órás timeout)
query2 = text("""
UPDATE vehicle.vehicle_model_definitions
SET status = CASE
WHEN status = 'research_in_progress' THEN 'unverified'
WHEN status = 'ai_synthesis_in_progress' THEN 'awaiting_ai_synthesis'
END
WHERE status IN ('research_in_progress', 'ai_synthesis_in_progress')
AND updated_at < NOW() - INTERVAL '2 hours'
RETURNING id;
""")
res2 = await db.execute(query2)
ai_resets = len(res2.fetchall())
if ai_resets > 0:
logger.warning(f"🔄 {ai_resets} db beragadt AI feladat visszaállítva.")
await db.commit()
except Exception as e:
logger.error(f"❌ Őrkutya hiba: {e}")
@staticmethod
async def seed_manual_bootstrap():
""" 2. FÁZIS: Alapozó adatok rögzítése """
initial_data = [
{"make": "AUDI", "model": "A4", "generation": "B8 (2008-2015)"}, # vehicle_class törölve
{"make": "BMW", "model": "3 SERIES", "generation": "F30 (2012-2019)"}
]
try:
async with AsyncSessionLocal() as db:
for item in initial_data:
stmt = select(AssetCatalog).where(AssetCatalog.make == item["make"], AssetCatalog.model == item["model"])
if not (await db.execute(stmt)).scalar_one_or_none():
db.add(AssetCatalog(**item))
await db.commit()
except Exception as e:
logger.warning(f"Manual bootstrap hiba (Ignorálható, ha az adatbázis már tele van): {e}")
CATEGORIES = [
{"name": "car", "rdw_types": ["'Personenauto'"]},
{"name": "motorcycle", "rdw_types": ["'Motorfiets'"]},
{"name": "truck", "rdw_types": ["'Bedrijfsauto'", "'Vrachtwagen'", "'Opleggertrekker'"]}
]
@classmethod
async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, params: dict, retries: int = 3):
""" Hibatűrő HTTP kérés API leállások ellen. """
async def fetch_with_retry(cls, client: httpx.AsyncClient, params: dict, retries: int = 3):
for attempt in range(retries):
try:
resp = await client.get(url, params=params, headers=cls.HEADERS)
resp = await client.get(cls.RDW_API, params=params, headers=cls.HEADERS)
if resp.status_code == 200:
return resp
elif resp.status_code == 429:
return resp.json()
elif resp.status_code == 429:
await asyncio.sleep(2 ** attempt)
else:
logger.warning(f"RDW API Hiba: {resp.status_code}")
return None
except httpx.RequestError:
except httpx.RequestError as e:
if attempt == retries - 1:
logger.error(f"Hálózati hiba: {e}")
return None
await asyncio.sleep(2 ** attempt)
return None
@classmethod
async def seed_from_rdw(cls):
""" 3. FÁZIS: Távoli felfedezés - KÜLÖNBÖZETI SZINKRONIZÁCIÓ (Differential Sync) """
logger.info("📥 RDW TÖMEGES LETÖLTÉS: Új modellek keresése (Differential Sync)...")
limit = 10000
async def process_category(cls, db, v_class: str, rdw_types: list):
""" Egy adott kategória (pl. autók) teljes végiglapozása és mentése. """
type_filter = " OR ".join([f"voertuigsoort = {t}" for t in rdw_types])
offset = 0
inserted_count = 0
updated_count = 0
total_inserted = 0
total_updated = 0
logger.info(f"🔍 {v_class.upper()} kategória elemzésének indítása...")
async with httpx.AsyncClient(timeout=60.0) as client:
while True:
# Az aggregált SQL lekérdezés, amit az RDW API-nak küldünk
params = {
"$select": "merk,handelsbenaming,voertuigsoort,count(*) as total",
"$group": "merk,handelsbenaming,voertuigsoort",
"$order": "total DESC",
"$limit": limit,
"$select": "merk, handelsbenaming, count(*) AS darabszam",
"$where": type_filter,
"$group": "merk, handelsbenaming",
"$order": "darabszam DESC",
"$limit": cls.BATCH_LIMIT,
"$offset": offset
}
data = await cls.fetch_with_retry(client, params)
if not data:
break # Ha üres a válasz, végeztünk a kategóriával
logger.info(f"📊 {v_class.upper()}: Feldolgozás {offset} - {offset + len(data)}...")
# Mivel ez tömeges mentés, egy közös tranzakciót használunk
for item in data:
make_name = str(item.get("merk", "")).upper().strip()
model_name = str(item.get("handelsbenaming", "")).upper().strip()
if not make_name or not model_name:
continue
count = int(item.get("darabszam", 0))
try:
async with db.begin_nested():
# Ha még nincs ilyen (vagy ha van, frissítjük a prioritást)
query = text("""
INSERT INTO vehicle.catalog_discovery (make, model, vehicle_class, status, source, attempts, priority_score)
VALUES (:make, :model, :class, 'pending', 'STRATEGIST-V3', 0, :score)
ON CONFLICT (make, model, vehicle_class)
DO UPDATE SET priority_score = GREATEST(vehicle.catalog_discovery.priority_score, :score)
WHERE vehicle.catalog_discovery.status != 'processed'
RETURNING xmax;
""")
res = await db.execute(query, {"make": make_name, "model": model_name, "class": v_class, "score": count})
# Logika a statisztikához: xmax = 0 ha új beszúrás, > 0 ha update
row = res.fetchone()
if row:
if row[0] == 0: total_inserted += 1
else: total_updated += 1
except Exception as e:
logger.warning(f"⚠️ Hiba a mentésnél ({make_name} {model_name}): {e}")
await db.commit()
resp = await cls.fetch_with_retry(client, "https://opendata.rdw.nl/resource/m9d7-ebf2.json", params)
if not resp: break
raw_data = resp.json()
if not raw_data: break
logger.info(f"📊 Lapozás: {offset} - {offset + len(raw_data)} tételek analízise...")
async with AsyncSessionLocal() as db:
for entry in raw_data:
make = str(entry.get("merk", "")).upper().strip()
model = str(entry.get("handelsbenaming", "")).upper().strip()
v_kind = entry.get("voertuigsoort", "")
total_count = int(entry.get("total", 0))
if not make or not model: continue
if "Personenauto" in v_kind: v_class = 'car'
elif "Motorfiets" in v_kind: v_class = 'motorcycle'
else: v_class = 'truck'
# A MÁGIA: Különbözeti Szinkronizáció SQL + Explicit Type Casting
query = text("""
INSERT INTO vehicle.catalog_discovery (make, model, vehicle_class, status, priority_score)
SELECT
CAST(:make AS VARCHAR),
CAST(:model AS VARCHAR),
CAST(:v_class AS VARCHAR),
'pending',
:priority
WHERE NOT EXISTS (
SELECT 1 FROM vehicle.vehicle_model_definitions
WHERE make = CAST(:make AS VARCHAR)
AND marketing_name = CAST(:model AS VARCHAR)
AND status = 'gold_enriched'
)
ON CONFLICT (make, model)
DO UPDATE SET priority_score = EXCLUDED.priority_score
WHERE vehicle.catalog_discovery.status != 'processed'
RETURNING xmax;
""")
result = await db.execute(query, {
"make": make, "model": model, "v_class": v_class, "priority": total_count
})
row = result.fetchone()
if row:
if row[0] == 0: inserted_count += 1 # Új beszúrás
else: updated_count += 1 # Meglévő frissítése
await db.commit()
offset += limit
await asyncio.sleep(1)
# Ha kevesebb adat jött vissza, mint a limit, akkor elértük az utolsó oldalt
if len(data) < cls.BATCH_LIMIT:
break
logger.info(f"✅ RDW Szinkron kész! Új modellek a listán: {inserted_count} | Frissített prioritások: {updated_count}")
# Sikeres futás regisztrálása a fájlrendszeren
os.makedirs(os.path.dirname(cls.SYNC_STATE_FILE), exist_ok=True)
with open(cls.SYNC_STATE_FILE, 'w') as f:
f.write(datetime.now().isoformat())
offset += cls.BATCH_LIMIT
await asyncio.sleep(1) # API kímélése
logger.info(f"{v_class.upper()} kész! Új felfedezett: {total_inserted} | Frissített prioritás: {total_updated}")
@classmethod
async def run_watchdog(cls):
    """Free worker tasks stranded by a server restart.

    Two sweeps run inside one session:
      1. discovery rows stuck in 'processing' are returned to 'pending';
      2. model definitions stuck in research/AI-synthesis for more than
         two hours are returned to 'unverified'.
    Any failure is logged and swallowed so the caller's loop keeps running.
    """
    logger.info("🐕 Őrkutya: Beragadt feladatok ellenőrzése...")
    # Prepared statements; text is behavior, so kept verbatim.
    hunter_sql = text("UPDATE vehicle.catalog_discovery SET status = 'pending' WHERE status = 'processing' RETURNING id;")
    ai_sql = text("""
        UPDATE vehicle.vehicle_model_definitions
        SET status = 'unverified'
        WHERE status IN ('research_in_progress', 'ai_synthesis_in_progress')
        AND updated_at < NOW() - INTERVAL '2 hours'
        RETURNING id;
    """)
    try:
        async with AsyncSessionLocal() as db:
            hunter_resets = len((await db.execute(hunter_sql)).fetchall())
            if hunter_resets > 0:
                logger.warning(f"🔄 {hunter_resets} db beragadt Hunter feladat visszaállítva.")
            ai_resets = len((await db.execute(ai_sql)).fetchall())
            if ai_resets > 0:
                logger.warning(f"🔄 {ai_resets} db beragadt AI/Kutató feladat visszaállítva.")
            await db.commit()
    except Exception as e:
        logger.error(f"❌ Őrkutya hiba: {e}")
@classmethod
def should_run_rdw_sync(cls) -> bool:
    """Return True when the RDW discovery sync is due to run again.

    The timestamp of the last successful sync is persisted in
    ``cls.SYNC_STATE_FILE``. A missing or unreadable state file always
    triggers a sync; otherwise a re-sync is due after 7 days.
    """
    # Fix: merge residue left a duplicated os.path.exists() guard and two
    # return statements (30-day and 7-day) where the second was unreachable.
    # Kept the newer 7-day cadence, which matches run()'s weekly log message.
    if not os.path.exists(cls.SYNC_STATE_FILE):
        return True
    try:
        with open(cls.SYNC_STATE_FILE, 'r') as f:
            last_sync = datetime.fromisoformat(f.read().strip())
        # Re-discover the RDW catalog once 7 days have passed.
        return datetime.now() - last_sync > timedelta(days=7)
    except Exception:
        # Corrupt/unparsable state file: be safe and run the sync.
        return True
@classmethod
async def run(cls):
""" FŐ CIKLUS: Havi ütemező és Óránkénti Őrkutya """
logger.info("🚀 ÉLES ÜZEM: Discovery Engine (Differential Sync) & Watchdog indítása...")
await cls.seed_manual_bootstrap()
logger.info("🚀 Robot 0 (Strategist & Discovery) ONLINE")
# 1. Adatbázis séma biztosítása a priority_score-hoz
async with AsyncSessionLocal() as db:
try:
await db.execute(text("ALTER TABLE vehicle.catalog_discovery ADD COLUMN IF NOT EXISTS priority_score INTEGER DEFAULT 0;"))
await db.commit()
except Exception as e:
await db.rollback()
logger.error(f"⚠️ Séma hiba (ignorálható): {e}")
while True:
# 1. Óránkénti takarítás
await cls.run_watchdog()
# 2. Havi szinkronizáció ellenőrzése
if cls.should_run_rdw_sync():
await cls.seed_from_rdw()
logger.info("🌍 Teljes RDW Hálózat Letapogatás Indul...")
async with AsyncSessionLocal() as db:
for category in cls.CATEGORIES:
await cls.process_category(db, category["name"], category["rdw_types"])
os.makedirs(os.path.dirname(cls.SYNC_STATE_FILE), exist_ok=True)
with open(cls.SYNC_STATE_FILE, 'w') as f:
f.write(datetime.now().isoformat())
logger.info("🏁 Letapogatás befejezve. Alvás a következő ellenőrzésig.")
else:
logger.info("🛌 Az RDW szinkronizáció már lefutott az elmúlt 30 napban. Ugrás...")
logger.info("🛌 Az RDW szinkronizáció már lefutott a héten. Őrködés folytatása...")
# 3. Alvás 1 órát (Heartbeat)
logger.info("⏱️ A Discovery Engine most 1 órát pihen a következő Őrkutya futásig.")
await asyncio.sleep(3600)
await asyncio.sleep(3600) # Óránként ellenőrzi, kell-e valamit tenni
# Script entry point: start the discovery engine's scheduler loop.
if __name__ == "__main__":
    asyncio.run(DiscoveryEngine.run())

View File

@@ -1,4 +1,4 @@
# /app/app/workers/vehicle/vehicle_robot_0_gb_discovery.py
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_0_gb_discovery.py
import asyncio
import logging
import csv

View File

@@ -1,4 +1,4 @@
# /app/app/workers/vehicle/vehicle_robot_1_2_nhtsa_fetcher.py
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_2_nhtsa_fetcher.py
import asyncio
import httpx
import logging
@@ -13,16 +13,14 @@ class NHTSAFetcher:
@classmethod
async def get_eu_makes(cls):
    """Return the distinct makes already discovered from EU sources.

    Only makes previously recorded with market 'EU' or source 'RDW' are
    returned, so the US-side (NHTSA) lookup stays limited to brands that
    are actually relevant for the EU catalog.
    """
    eu_makes_sql = text("SELECT DISTINCT make FROM vehicle.catalog_discovery WHERE market = 'EU' OR source = 'RDW'")
    async with AsyncSessionLocal() as db:
        rows = (await db.execute(eu_makes_sql)).fetchall()
    makes = []
    for row in rows:
        makes.append(row[0])
    return makes
@classmethod
async def run(cls):
logger.info("🚀 Robot 1.2 (EU-Guided NHTSA) indítása...")
logger.info("🚀 Robot 1.2 (EU-Guided NHTSA) indítása - Kötegelt mód...")
while True:
target_makes = await cls.get_eu_makes()
@@ -31,36 +29,39 @@ class NHTSAFetcher:
await asyncio.sleep(60)
continue
# 2026-tól megyünk vissza a múltba
for year in range(2026, 1950, -1):
async with AsyncSessionLocal() as db:
for make in target_makes:
try:
async with httpx.AsyncClient(timeout=20.0) as client:
# A hálózati kliens a cikluson KÍVÜL van, így újrahasznosítja a kapcsolatokat!
async with httpx.AsyncClient(timeout=20.0) as client:
for year in range(2026, 1950, -1):
async with AsyncSessionLocal() as db:
for make in target_makes:
try:
url = cls.API_URL.format(make=make, year=year)
resp = await client.get(url)
if resp.status_code != 200: continue
models = resp.json().get("Results", [])
inserted = 0
if not models: continue
# Gyors lista generálás a kötegelt mentéshez
insert_data = []
for m in models:
model_name = m.get("Model_Name").upper().strip()
# USA_IMPORT jelölés, de csak EU-s márkákhoz!
insert_data.append({"make": make, "model": model_name, "year": year})
if insert_data:
query = text("""
INSERT INTO vehicle.catalog_discovery
(make, model, vehicle_class, status, market, model_year, priority_score, source)
VALUES (:make, :model, 'car', 'pending', 'USA_IMPORT', :year, 5, 'NHTSA-EU-FILTERED')
ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
""")
res = await db.execute(query, {"make": make, "model": model_name, "year": year})
if res.rowcount > 0: inserted += 1
if inserted > 0:
logger.info(f"{make} ({year}): {inserted} variáns dúsítva az USA-ból.")
await db.commit()
except Exception as e:
logger.error(f"❌ Hiba: {make} {year}: {e}")
await asyncio.sleep(0.5)
# Egyetlen SQL hívás a teljes listára!
await db.execute(query, insert_data)
await db.commit()
logger.info(f"{make} ({year}): {len(insert_data)} variáns dúsítva az USA-ból.")
except Exception as e:
logger.error(f"❌ Hiba: {make} {year}: {e}")
await asyncio.sleep(0.1) # Kisebb pihenő is elég, mert hatékonyabbak vagyunk
# Script entry point: start the EU-guided NHTSA enrichment loop.
if __name__ == "__main__":
    asyncio.run(NHTSAFetcher.run())

View File

@@ -1,11 +1,16 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_4_bike_hunter.py
import asyncio
import httpx
import logging
import random
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Naplózás finomhangolása a duplázódás elkerülésére
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(name)s] %(message)s')
logger = logging.getLogger("Robot-1-4-Bike")
logging.basicConfig(level=logging.INFO)
# SQLAlchemy zaj csökkentése
logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
BIKE_MAKES = [
"HONDA", "YAMAHA", "KAWASAKI", "SUZUKI", "HARLEY-DAVIDSON",
@@ -17,40 +22,61 @@ class BikeHunter:
@classmethod
async def run(cls):
logger.info("🏍️ Robot 1.4 (Bike Hunter) indítása...")
# 2026-tól 1970-ig pörgetjük a motorokat
"""
THOUGHT PROCESS:
A robotot úgy alakítjuk át, hogy minden egyes gyártó/év kombinációt
külön tranzakcióként kezeljen. Ha egy márka hibát dob, elvégezzük a
rollback-et, így a következő márka tiszta lappal indulhat.
"""
logger.info("🏍️ Robot 1.4 (Bike Hunter) indítása - Tranzakció-biztos mód...")
years = range(2026, 1969, -1)
async with AsyncSessionLocal() as db:
async with httpx.AsyncClient(timeout=30.0) as client:
for year in years:
for make in BIKE_MAKES:
try:
async with httpx.AsyncClient(timeout=20.0) as client:
# Minden márkához új session-t nyitunk, vagy biztosítjuk a rollback-et
async with AsyncSessionLocal() as db:
try:
resp = await client.get(cls.API_URL.format(make=make, year=year))
if resp.status_code != 200: continue
models = resp.json().get("Results", [])
if resp.status_code != 200:
logger.warning(f"⚠️ {make} ({year}) API hiba: {resp.status_code}")
continue
inserted = 0
models = resp.json().get("Results", [])
if not models:
continue
insert_data = []
for m in models:
model_name = m.get("Model_Name").upper().strip()
# TISZTA SQL - Nincs Simon!
m_name = m.get("Model_Name")
if m_name:
model_name = m_name.upper().strip()
insert_data.append({"make": make, "model": model_name, "year": year})
if insert_data:
# ON CONFLICT használata a CONSTRAINT alapján
query = text("""
INSERT INTO vehicle.catalog_discovery
(make, model, vehicle_class, status, market, model_year, priority_score, source)
VALUES (:make, :model, 'motorcycle', 'pending', 'USA_IMPORT', :year, 8, 'NHTSA-V1-BIKE')
ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
""")
await db.execute(query, {"make": make, "model": model_name, "year": year})
inserted += 1
if inserted > 0:
logger.info(f"🏍️ {make} ({year}): {inserted} új motor rögzítve.")
await db.commit()
except Exception as e:
logger.error(f"❌ Bike Error {make} ({year}): {e}")
# Évjáratonként egy pici pihenő az API-nak
await asyncio.sleep(0.5)
await db.execute(query, insert_data)
await db.commit() # Itt véglegesítjük a sikeres köteget
logger.info(f"{make} ({year}): {len(insert_data)} motor feldolgozva.")
except Exception as e:
# KRITIKUS: Hiba esetén visszaállítjuk a tranzakciót,
# így a következő kör (következő márka) nem bukik el.
await db.rollback()
logger.error(f"❌ Bike Error {make} ({year}): {str(e)}")
# API kímélése (Rate limiting megelőzése)
await asyncio.sleep(random.uniform(0.3, 0.6))
# Script entry point. Fix: merge residue left two entry-point bodies (a bare
# asyncio.run plus the newer try/except form); kept the newer one so Ctrl-C
# exits cleanly instead of dumping a KeyboardInterrupt traceback.
if __name__ == "__main__":
    try:
        asyncio.run(BikeHunter.run())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás felhasználói kérésre.")

View File

@@ -1,66 +1,82 @@
# /app/app/workers/vehicle/vehicle_robot_1_5_heavy_eu.py
import asyncio
import httpx
import logging
import sys
from sqlalchemy import text
from app.database import AsyncSessionLocal
logger = logging.getLogger("Robot-1-5-Heavy-EU")
logging.basicConfig(level=logging.INFO)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] R1.5-Heavy: %(message)s',
stream=sys.stdout
)
class HeavyEUHunter:
# RDW Open Data - Hollandia az EU kapuja
RDW_URL = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
@classmethod
async def fetch_rdw_heavy(cls, vehicle_type: str):
    """Download all unique make/trade-name pairs for one RDW heavy type.

    vehicle_type: 'Vrachtwagen' (Teher), 'Bus', 'Kampeerauto' (Lakóautó)

    Returns the parsed JSON list, or [] on any HTTP/network failure.
    """
    # One aggregated request for every distinct make/handelsbenaming pair.
    query_url = f"{cls.RDW_URL}?voertuigsoort={vehicle_type}&$select=merk,handelsbenaming&$limit=10000"
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            resp = await client.get(query_url)
            return resp.json() if resp.status_code == 200 else []
        except Exception as e:
            # Fix: a merge left two near-identical error logs on this path;
            # kept the newer message only.
            logger.error(f"❌ RDW API Error: {e}")
            return []
@classmethod
async def run(cls):
logger.info("🚛 Robot 1.5 (EU Heavy Duty) indítása...")
# Definíciók: RDW név -> Mi kategóriánk
job_list = {
"Vrachtwagen": "truck",
"Bus": "bus",
"Kampeerauto": "rv"
}
# --- DB KAPCSOLÓDÁSI VÉDELEM (RETRY) ---
db_connected = False
for i in range(12): # 1 percig próbálkozik (12 * 5mp)
try:
async with AsyncSessionLocal() as db:
await db.execute(text("SELECT 1"))
db_connected = True
logger.info("✅ Adatbázis kapcsolat aktív!")
break
except Exception:
logger.warning(f"⏳ Adatbázis nem elérhető ({i+1}/12), várakozás 5mp...")
await asyncio.sleep(5)
if not db_connected:
logger.error("💀 Nem sikerült kapcsolódni az adatbázishoz. Leállás.")
return
job_list = {"Vrachtwagen": "truck", "Bus": "bus", "Kampeerauto": "rv"}
async with AsyncSessionLocal() as db:
for rdw_name, internal_class in job_list.items():
logger.info(f"📥 {rdw_name} adatok letöltése...")
data = await cls.fetch_rdw_heavy(rdw_name)
inserted = 0
if not data: continue
insert_data = []
for item in data:
make = item.get('merk', '').upper().strip()
model = item.get('handelsbenaming', '').upper().strip()
if not make or not model: continue
if make and model:
insert_data.append({"make": make, "model": model, "v_class": internal_class})
# Szűrés a kért EU márkákra + amik jönnek az RDW-ből
if insert_data:
# JAVÍTÁS: Constraint név helyett konkrét mezők az ütközéshez
query = text("""
INSERT INTO vehicle.catalog_discovery
(make, model, vehicle_class, status, market, priority_score, source)
VALUES (:make, :model, :v_class, 'pending', 'EU', 20, 'RDW-HEAVY')
ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
ON CONFLICT (make, model, vehicle_class) DO NOTHING
""")
res = await db.execute(query, {"make": make, "model": model, "v_class": internal_class})
if res.rowcount > 0: inserted += 1
await db.commit()
logger.info(f"{rdw_name}: {inserted} új EU-s nagygép rögzítve.")
try:
await db.execute(query, insert_data)
await db.commit()
logger.info(f"{rdw_name}: {len(insert_data)} gép beküldve.")
except Exception as e:
logger.error(f"❌ Mentési hiba ({rdw_name}): {e}")
await db.rollback()
# Script entry point: run one heavy-vehicle discovery pass.
if __name__ == "__main__":
    asyncio.run(HeavyEUHunter.run())

View File

@@ -0,0 +1,62 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_5_heavy_eu1.0.py
import asyncio
import httpx
import logging
from sqlalchemy import text
from app.database import AsyncSessionLocal
logger = logging.getLogger("Robot-1-5-Heavy-EU")
logging.basicConfig(level=logging.INFO)
class HeavyEUHunter:
    """Robot 1.5 — bulk importer of EU heavy vehicles (trucks, buses, RVs)
    from the Dutch RDW open-data registry into vehicle.catalog_discovery.
    """

    # RDW vehicle registry endpoint (Socrata SODA API).
    RDW_URL = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"

    @classmethod
    async def fetch_rdw_heavy(cls, vehicle_type: str):
        """Fetch distinct make/trade-name pairs for one RDW vehicle type.

        Returns the parsed JSON list; [] on a non-200 status or any error.
        """
        query_url = f"{cls.RDW_URL}?voertuigsoort={vehicle_type}&$select=merk,handelsbenaming&$limit=10000"
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.get(query_url)
                if resp.status_code != 200:
                    return []
                return resp.json()
            except Exception as e:
                logger.error(f"❌ RDW Error: {e}")
                return []

    @classmethod
    async def run(cls):
        """One full pass: download each heavy category, clean the pairs and
        bulk-insert them with a single executemany per category."""
        logger.info("🚛 Robot 1.5 (EU Heavy Duty) indítása - Kötegelt mód...")
        # RDW category name -> our internal vehicle_class value.
        job_list = {
            "Vrachtwagen": "truck",
            "Bus": "bus",
            "Kampeerauto": "rv"
        }
        # Loop-invariant insert statement, hoisted out of the category loop.
        insert_sql = text("""
            INSERT INTO vehicle.catalog_discovery
            (make, model, vehicle_class, status, market, priority_score, source)
            VALUES (:make, :model, :v_class, 'pending', 'EU', 20, 'RDW-HEAVY')
            ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
        """)
        async with AsyncSessionLocal() as db:
            for rdw_name, internal_class in job_list.items():
                logger.info(f"📥 {rdw_name} adatok letöltése...")
                rows = await cls.fetch_rdw_heavy(rdw_name)
                if not rows:
                    continue
                # Collect the (up to 10k) cleaned rows for one bulk insert.
                cleaned = (
                    (item.get('merk', '').upper().strip(),
                     item.get('handelsbenaming', '').upper().strip())
                    for item in rows
                )
                batch = [
                    {"make": mk, "model": md, "v_class": internal_class}
                    for mk, md in cleaned
                    if mk and md
                ]
                if batch:
                    # Single executemany call even for thousands of rows.
                    await db.execute(insert_sql, batch)
                    await db.commit()
                    logger.info(f"✅ {rdw_name}: {len(batch)} EU-s nagygép beküldve kötegelve.")
# Script entry point: run one full heavy-vehicle discovery pass.
if __name__ == "__main__":
    asyncio.run(HeavyEUHunter.run())

View File

@@ -1,207 +1,310 @@
#!/usr/bin/env python3
"""
Robot-1-Catalog-Hunter (Precíz Adattrezor + Szótár-vezérelt ETL)
Felelősség: RDW API-k lekérdezése (SZŰRTEN: Csak Autó, Motor, Teherautó),
mapping_config.json alapú adatkinyerés, teljesítmény kalkuláció és teljes értékű mentés.
"""
import asyncio
import httpx
import logging
import os
import re
import sys
from sqlalchemy import text, select
import json
from datetime import datetime
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import insert
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
from app.models import VehicleModelDefinition
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1")
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] Robot-1-Nyers: %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("Robot-1-Nyers")
class CatalogHunter:
"""
Vehicle Robot 1.9.2: The Invincible Mega-Hunter (CONCURRENCY PATCH)
Szigorú sor-zárolás (SKIP LOCKED) és exponenciális API újrapróbálkozás.
"""
RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
BATCH_SIZE = 50
# Szótár betöltése induláskor
CONFIG_PATH = os.path.join(os.path.dirname(__file__), "mapping_config.json")
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
MAPPING = json.load(f)["rdw"]
@classmethod
def normalize(cls, text_val: str) -> str:
    """Lower-case *text_val* and strip every non-alphanumeric character.

    Falsy input (None / empty string) yields "". Used to build the
    normalized_name keys for model definitions.
    """
    # NOTE(review): a merge left a second, unreachable return here (same
    # regex but yielding "UNKNOWN" for empty input). Removed the dead line,
    # keeping the reachable behavior; if the "UNKNOWN" fallback was the
    # intended new contract, restore that variant instead.
    if not text_val:
        return ""
    return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()
@classmethod
def parse_int(cls, value) -> int:
    """Best-effort int conversion; None, blank or unparsable input -> 0.

    Goes through float() first so strings like "12.0" still parse.
    """
    # Fix: interleaved merge residue produced two except clauses with a
    # stray return between them (invalid syntax); restored the defensive
    # single-try version with a narrow exception filter.
    try:
        if value is None or str(value).strip() == "":
            return 0
        return int(float(value))
    except (ValueError, TypeError):
        return 0
@classmethod
def parse_float(cls, value) -> float:
    """Best-effort float conversion; None, blank or unparsable input -> 0.0."""
    # Fix: interleaved merge residue produced two except clauses with a
    # stray return between them (invalid syntax); restored the defensive
    # single-try version with a narrow exception filter.
    try:
        if value is None or str(value).strip() == "":
            return 0.0
        return float(value)
    except (ValueError, TypeError):
        return 0.0
@classmethod
async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, retries: int = 3):
    """Fault-tolerant GET with exponential backoff.

    Retries only on HTTP 429 (rate limit) and on transport errors; every
    other status — success or failure (e.g. 404) — is returned immediately.
    Returns None when all attempts were rate-limited; re-raises the last
    transport error once the retry budget is exhausted.
    """
    last_attempt = retries - 1
    for attempt in range(retries):
        backoff = 2 ** attempt  # 1s, 2s, 4s between attempts
        try:
            resp = await client.get(url, headers=cls.HEADERS)
        except httpx.RequestError as e:
            if attempt == last_attempt:
                logger.debug(f"API Hiba végleges ({url}): {e}")
                raise
            await asyncio.sleep(backoff)
            continue
        if resp.status_code == 429:
            # Rate limited: back off and try again.
            await asyncio.sleep(backoff)
            continue
        # 200 and all other statuses are handed back to the caller as-is.
        return resp
    return None
@classmethod
async def fetch_tech_details(cls, client, plate):
results = {
"power_kw": 0, "engine_code": None, "euro_class": None,
"fuel_desc": "Unknown", "co2": 0, "consumption": 0.0
}
async def fetch_raw_api_data(cls, client, plate: str) -> dict:
raw_data = {"rdw_main": [], "rdw_fuel": [], "rdw_engine": []}
try:
f_resp = await cls.fetch_with_retry(client, f"{cls.RDW_FUEL}?kenteken={plate}")
if f_resp and f_resp.status_code == 200 and f_resp.json():
f = f_resp.json()[0]
p1 = cls.parse_int(f.get("netto_maximum_vermogen") or f.get("nettomaximumvermogen"))
p2 = cls.parse_int(f.get("nominaal_continu_maximum_vermogen") or f.get("nominaalcontinuvermogen"))
results.update({
"power_kw": max(p1, p2),
"fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
"euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
"co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
"consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
})
e_resp = await cls.fetch_with_retry(client, f"{cls.RDW_ENGINE}?kenteken={plate}")
if e_resp and e_resp.status_code == 200 and e_resp.json():
results["engine_code"] = e_resp.json()[0].get("motorcode")
# 1. RDW Main
main_resp = await client.get(f"{cls.RDW_MAIN}?kenteken={plate}", headers=cls.HEADERS)
if main_resp.status_code == 200: raw_data["rdw_main"] = main_resp.json()
# 2. RDW Fuel
fuel_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
if fuel_resp.status_code == 200: raw_data["rdw_fuel"] = fuel_resp.json()
# 3. RDW Engine
engine_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
if engine_resp.status_code == 200: raw_data["rdw_engine"] = engine_resp.json()
except Exception as e:
logger.debug(f"Hiba a technikai részleteknél ({plate}): {e}")
return results
logger.error(f"Hiba a nyers adatok lekérése közben ({plate}): {e}")
return raw_data
@classmethod
async def process_make_model(cls, db, task_id, make_name, model_name, v_class, priority):
clean_make = make_name.strip().upper()
clean_model = model_name.strip().upper()
logger.info(f"🎯 IPARI ADATBÁNYÁSZAT INDUL: {clean_make} {clean_model}")
offset = 0
def apply_mapping(cls, raw_main: dict, raw_fuel: list, raw_engine: list) -> dict:
""" A JSON szótár alapján kinyeri és kiszámolja a pontos értékeket. """
tech = {
"make": raw_main.get("merk", "UNKNOWN").strip().upper(),
"marketing_name": raw_main.get("handelsbenaming", "UNKNOWN").upper(),
"curb_weight": cls.parse_int(raw_main.get("massa_ledig_voertuig")),
"max_weight": cls.parse_int(raw_main.get("technische_max_massa_voertuig")),
"engine_capacity": cls.parse_int(raw_main.get("cilinderinhoud")),
"cylinders": cls.parse_int(raw_main.get("aantal_cilinders")),
"wheelbase": cls.parse_int(raw_main.get("wielbasis")),
"doors": cls.parse_int(raw_main.get("aantal_deuren")),
"seats": cls.parse_int(raw_main.get("aantal_zitplaatsen")),
"list_price": cls.parse_int(raw_main.get("catalogusprijs")),
"max_speed": cls.parse_int(raw_main.get("maximale_constructiesnelheid")),
"year_from": 0,
"power_kw": 0,
"engine_code": None,
"euro_class": None,
"fuel_type": "Unknown",
"co2": 0,
"consumption": 0.0,
"body_type": "UNKNOWN"
}
# Évjárat kivágása (pl. "20240424" -> 2024)
datum = str(raw_main.get("datum_eerste_toelating", ""))
if len(datum) >= 4:
tech["year_from"] = cls.parse_int(datum[:4])
# Karosszéria fordítás
raw_body = str(raw_main.get("inrichting", "")).lower().strip()
tech["body_type"] = cls.MAPPING["body_type_translations"].get(raw_body, raw_body.upper())
# Üzemanyag adatok kinyerése
if raw_fuel:
f = raw_fuel[0]
raw_fuel_type = f.get("brandstof_omschrijving", "Unknown")
tech["fuel_type"] = cls.MAPPING["fuel_translations"].get(raw_fuel_type, raw_fuel_type)
tech["euro_class"] = f.get("euro_klasse") or f.get("uitlaatemissieniveau")
tech["co2"] = cls.parse_int(f.get("co2_uitstoot_gecombineerd"))
tech["consumption"] = cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
# --- JAVÍTOTT TELJESÍTMÉNY-KERESŐ (Normál, Elektromos, Névleges) ---
p_normal = cls.parse_float(f.get("nettomaximumvermogen"))
p_elec = cls.parse_float(f.get("netto_max_vermogen_elektrisch"))
p_nominal = cls.parse_float(f.get("nominaal_continu_maximumvermogen"))
power = max(p_normal, p_elec, p_nominal)
# HA MÉG MINDIG NINCS TELJESÍTMÉNY, SZÁMOLJUK KI A SÚLY/ARÁNYBÓL!
if power == 0:
ratio_key = cls.MAPPING["power_calculation"]["ratio_source"]
weight_key = cls.MAPPING["power_calculation"]["weight_source"]
ratio = cls.parse_float(raw_main.get(ratio_key))
weight = cls.parse_float(raw_main.get(weight_key))
if ratio > 0 and weight > 0:
power = ratio * weight
logger.info(f"⚡ Teljesítmény számolva arányból: {ratio} * {weight} = {power:.2f} kW")
tech["power_kw"] = cls.parse_int(power)
# Motor adatok kinyerése
if raw_engine:
tech["engine_code"] = raw_engine[0].get("motorcode")
return tech
@classmethod
async def process_task(cls, db, task):
clean_make = task.make.strip().upper()
clean_model = task.model.strip().upper()
logger.info(f"🎯 PRECÍZIÓS ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")
async with httpx.AsyncClient(timeout=30.0) as client:
offset = 0
while True:
params = f"merk={clean_make}&handelsbenaming={clean_model}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
try:
r = await cls.fetch_with_retry(client, f"{cls.RDW_MAIN}?{params}")
batch = r.json() if r and r.status_code == 200 else []
except Exception: break
# --- SZŰRÉS: Csak autó, motor és teherautó/kamion ---
allowed_types = "('Personenauto','Motorfiets','Vrachtwagen')"
params = f"merk={clean_make}&$where=voertuigsoort IN {allowed_types}"
if not batch: break
if clean_model != 'ALL_VARIANTS':
params += f" AND handelsbenaming='{clean_model}'"
params += f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
try:
r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
batch = r.json() if r.status_code == 200 else []
except Exception as e:
logger.error(f"Hiba a batch lekérés közben: {e}")
break
if not batch: break
for item in batch:
plate = item.get("kenteken", "UNKNOWN")
try:
plate = item.get("kenteken")
if not plate: continue
variant = item.get("variant") or "UNKNOWN"
version = item.get("uitvoering") or "UNKNOWN"
ccm = cls.parse_int(item.get("cilinderinhoud"))
norm_name = cls.normalize(clean_model.replace(clean_make, "").strip() or clean_model)
tech = await cls.fetch_tech_details(client, plate)
async with db.begin_nested():
raw_api_data = await cls.fetch_raw_api_data(client, plate)
# Szótár és Matek alkalmazása!
tech = cls.apply_mapping(
raw_api_data.get("rdw_main", [{}])[0] if raw_api_data.get("rdw_main") else item,
raw_api_data.get("rdw_fuel", []),
raw_api_data.get("rdw_engine", [])
)
stmt = insert(VehicleModelDefinition).values(
make=clean_make,
marketing_name=clean_model,
normalized_name=norm_name,
variant_code=variant,
version_code=version,
type_approval_number=item.get("typegoedkeuringsnummer"),
technical_code=plate,
engine_capacity=ccm,
power_kw=tech["power_kw"],
fuel_type=tech["fuel_desc"],
engine_code=tech["engine_code"],
seats=cls.parse_int(item.get("aantal_zitplaatsen")),
doors=cls.parse_int(item.get("aantal_deuren")),
width=cls.parse_int(item.get("breedte")),
wheelbase=cls.parse_int(item.get("wielbasis")),
list_price=cls.parse_int(item.get("catalogusprijs")),
max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
towing_weight_unbraked=cls.parse_int(item.get("maximum_massa_trekken_ongeremd")),
towing_weight_braked=cls.parse_int(item.get("maximum_trekken_massa_geremd")),
curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
max_weight=cls.parse_int(item.get("technische_max_massa_voertuig") or item.get("toegestane_maximum_massa_voertuig")),
body_type=item.get("inrichting"),
co2_emissions_combined=tech["co2"],
fuel_consumption_combined=tech["consumption"],
euro_classification=tech["euro_class"],
cylinders=cls.parse_int(item.get("aantal_cilinders")),
vehicle_class=v_class,
priority_score=priority,
status="ACTIVE",
source="MEGA-HUNTER-v1.9.2"
)
do_nothing_stmt = stmt.on_conflict_do_nothing(
index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type']
)
await db.execute(do_nothing_stmt)
norm_name = cls.normalize(tech["marketing_name"].replace(clean_make, "").strip() or tech["marketing_name"])
# Routing Logika
has_power_and_ccm = tech["power_kw"] > 0 and tech["engine_capacity"] > 0
is_electric = "electric" in tech["fuel_type"].lower()
if has_power_and_ccm or (tech["power_kw"] > 0 and is_electric):
final_status = "awaiting_ai_synthesis"
else:
final_status = "unverified"
stmt = insert(VehicleModelDefinition).values(
market='EU',
make=tech["make"],
marketing_name=tech["marketing_name"],
normalized_name=norm_name,
variant_code=item.get("variant", "UNKNOWN"),
version_code=item.get("uitvoering", "UNKNOWN"),
technical_code=plate,
type_approval_number=item.get("typegoedkeuringsnummer"),
seats=tech["seats"],
doors=tech["doors"],
width=cls.parse_int(item.get("breedte")),
wheelbase=tech["wheelbase"],
list_price=tech["list_price"],
max_speed=tech["max_speed"],
curb_weight=tech["curb_weight"],
max_weight=tech["max_weight"],
fuel_consumption_combined=tech["consumption"],
co2_emissions_combined=tech["co2"],
vehicle_class=task.vehicle_class,
body_type=tech["body_type"],
fuel_type=tech["fuel_type"],
engine_capacity=tech["engine_capacity"],
power_kw=tech["power_kw"],
cylinders=tech["cylinders"],
engine_code=tech["engine_code"],
euro_classification=tech["euro_class"],
year_from=tech["year_from"],
priority_score=task.priority_score,
status=final_status,
source="ROBOT-1-PRECISION-MAPPER",
raw_search_context='',
raw_api_data=raw_api_data,
research_metadata={},
specifications={"fast_track": True} if final_status == "awaiting_ai_synthesis" else {},
marketing_name_aliases=[]
).on_conflict_do_update(
index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from'],
set_={
'power_kw': tech["power_kw"],
'engine_capacity': tech["engine_capacity"],
'fuel_type': tech["fuel_type"],
'body_type': tech["body_type"],
'doors': tech["doors"],
'seats': tech["seats"],
'status': final_status,
'raw_api_data': raw_api_data,
'updated_at': datetime.utcnow()
}
).returning(VehicleModelDefinition.id)
res = await db.execute(stmt)
vmd_id = res.scalar()
if final_status == "awaiting_ai_synthesis" and vmd_id:
cat_stmt = text("""
INSERT INTO vehicle.vehicle_catalog (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING;
""")
await db.execute(cat_stmt, {
"m_id": vmd_id,
"make": tech["make"],
"model": tech["marketing_name"][:50],
"kw": tech["power_kw"],
"ccm": tech["engine_capacity"],
"fuel": tech["fuel_type"],
"factory": '{"source": "RDW Mapping System"}'
})
except Exception as e:
logger.warning(f"⚠️ Hiba a sor feldolgozásakor ({plate}): {e}")
try:
await db.commit()
except Exception as e:
await db.rollback()
logger.error(f"❌ Batch commit hiba (Ignorálva): {e}")
logger.warning(f"⚠️ Sor hiba ({plate}): {e}")
await db.commit()
offset += len(batch)
if offset >= 500: break
await asyncio.sleep(0.5) # Lassítjuk kicsit a terhelést
# Discovery státusz frissítése
await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task_id})
if offset >= 500: break
await asyncio.sleep(0.5)
await db.execute(
text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"),
{"id": task.id}
)
await db.commit()
    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim one pending discovery task and process it.

        NOTE(review): this body reads like an unresolved merge — the task is
        claimed twice with near-identical ``UPDATE ... FOR UPDATE SKIP LOCKED``
        queries and the two results are routed to different entry points
        (``process_make_model`` vs ``process_task``). Confirm which branch is
        the intended one before further changes.
        """
        logger.info("🤖 Invincible Mega-Hunter v1.9.2 ONLINE (CONCURRENCY PATCHED)")
        logger.info("🤖 Robot-1-Nyers ONLINE (Precíz Szótár-alapú feldolgozás + Jármű szűrés)")
        while True:
            async with AsyncSessionLocal() as db:
                # Atomic locking (antidote to the race condition):
                # find one pending task, lock it and flip it to 'processing' right away.
                query = text("""
                    UPDATE vehicle.catalog_discovery
                    SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.catalog_discovery
                        WHERE status = 'pending'
                        ORDER BY priority_score DESC
                        FOR UPDATE SKIP LOCKED
                        LIMIT 1
                    )
                    RETURNING id, make, model, vehicle_class, priority_score;
                """)
                task = (await db.execute(query)).fetchone()
                await db.commit()
            if task:
                # NOTE(review): ``db`` is referenced after its session context
                # manager has already exited above — verify this is intentional.
                await cls.process_make_model(db, task[0], task[1], task[2], task[3], task[4])
            else:
                try:
                    async with AsyncSessionLocal() as db:
                        # Second claim attempt with the same locking query (see merge note above).
                        res = await db.execute(text("""
                            UPDATE vehicle.catalog_discovery
                            SET status = 'processing'
                            WHERE id = (
                                SELECT id FROM vehicle.catalog_discovery
                                WHERE status = 'pending'
                                ORDER BY priority_score DESC
                                FOR UPDATE SKIP LOCKED LIMIT 1
                            ) RETURNING id, make, model, vehicle_class, priority_score;
                        """))
                        task = res.fetchone()
                        await db.commit()
                        if task:
                            await cls.process_task(db, task)
                        else:
                            # No pending work — back off before polling again.
                            await asyncio.sleep(30)
                except Exception as e:
                    logger.error(f"Hiba a fő ciklusban: {e}")
                    await asyncio.sleep(10)
# Script entry point: run the hunter's endless processing loop.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())

View File

@@ -1,3 +1,4 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_gb_hunter.py
import asyncio
import httpx
import logging
@@ -7,7 +8,7 @@ import json
from datetime import datetime
from sqlalchemy import text, func
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
from app.models import VehicleModelDefinition
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-GB: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1-GB-Hunter")

View File

@@ -0,0 +1,316 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_2_1_rdw_enricher.py
"""
Robot 2.1: RDW Enricher (Holland rendszámok dúsítása) - INTEGRÁLT SZÓTÁR ÉS MATEK
"""
import asyncio
import httpx
import logging
import json
import re
import os
import sys
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Bring in the brand-name normalisation dictionary.
try:
    # Prefer the shared dictionary via a relative import when the package layout allows it.
    from .mapping_dictionary import normalize_make
except (ImportError, ValueError):
    # Otherwise fall back to a minimal local implementation with the same contract.
    def normalize_make(make: str) -> str:
        """Return the canonical upper-case brand name for *make*."""
        canonical = make.upper().strip()
        return {
            "MERCEDES": "MERCEDES-BENZ",
            "VW": "VOLKSWAGEN",
            "ALFA": "ALFA ROMEO",
        }.get(canonical, canonical)
logger = logging.getLogger("Robot-2-1-RDW-Enricher")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] R2.1-RDW: %(message)s',
    stream=sys.stdout
)
# Socrata endpoint of the RDW (Dutch vehicle authority) open-data set,
# queried per licence plate ("kenteken").
RDW_API_URL = "https://opendata.rdw.nl/resource/m9d7-ebf2.json?kenteken={license_plate}"
# Number of vehicle rows locked and enriched per worker cycle.
BATCH_SIZE = 10
class RDWEnricher:
    """Robot 2.1: enriches Dutch-plate vehicle rows from the RDW open-data API.

    Per cycle: lock a batch of incomplete rows (``fetch_candidates``), fetch or
    reuse cached RDW data per vehicle (``process_vehicle``), derive field
    updates via the ``mapping_config.json`` dictionary (``extract_fields``),
    and persist the updates row-by-row (``update_vehicle_batch``).
    """

    # Dictionary loaded once at class-definition time; an empty mapping
    # disables extraction (extract_fields then returns {}).
    BASE_PATH = os.path.dirname(__file__)
    CONFIG_PATH = os.path.join(BASE_PATH, 'mapping_config.json')
    try:
        with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
            mapping_config = json.load(f)['rdw']
        logger.info("✅ mapping_config.json sikeresen betöltve.")
    except Exception as e:
        logger.error(f"❌ Hiba a mapping_config.json betöltésekor: {e}")
        mapping_config = {}

    @staticmethod
    def normalize_license_plate(technical_code: str) -> str:
        """Strip dashes/spaces/dots and upper-case a licence plate; '' for falsy input."""
        if not technical_code: return ""
        return re.sub(r'[-\s\.]', '', technical_code).upper()

    @classmethod
    async def fetch_candidates(cls, db):
        """Lock (SKIP LOCKED) and fetch up to BATCH_SIZE incomplete rows.

        Returns a list of dicts with keys: id, make, marketing_name,
        technical_code, power_kw, engine_capacity, body_type, raw_api_data,
        attempts, fuel_type, vehicle_class. Missing numerics default to 0,
        missing strings to "".
        """
        query = text("""
            SELECT id, make, marketing_name, technical_code, power_kw, engine_capacity,
                   body_type, raw_api_data, attempts, fuel_type, vehicle_class
            FROM vehicle.vehicle_model_definitions
            WHERE (status = 'manual_review_needed' OR status = 'unverified')
              AND technical_code IS NOT NULL AND technical_code != ''
              AND (power_kw = 0 OR engine_capacity = 0)
              AND attempts < 3
            ORDER BY priority_score DESC NULLS LAST, id ASC
            FOR UPDATE SKIP LOCKED
            LIMIT :batch_size
        """)
        result = await db.execute(query, {"batch_size": BATCH_SIZE})
        rows = result.fetchall()
        vehicles = []
        for row in rows:
            vehicles.append({
                "id": row[0], "make": row[1], "marketing_name": row[2],
                "technical_code": row[3], "power_kw": row[4] or 0,
                "engine_capacity": row[5] or 0, "body_type": row[6],
                "raw_api_data": row[7] or {}, "attempts": row[8] or 0,
                "fuel_type": row[9] or "", "vehicle_class": row[10] or ""
            })
        return vehicles

    @classmethod
    async def query_rdw_api(cls, license_plate: str, client: httpx.AsyncClient):
        """Fetch the RDW record for one plate; None on miss, empty payload or network error."""
        url = RDW_API_URL.format(license_plate=license_plate)
        try:
            resp = await client.get(url, timeout=10.0)
            if resp.status_code == 200:
                data = resp.json()
                if isinstance(data, list) and len(data) > 0:
                    return data[0]
            return None
        except httpx.RequestError as e:
            logger.error(f"RDW hálózati hiba {license_plate}: {e}")
            return None

    @classmethod
    def extract_fields(cls, rdw_data: dict):
        """Extract and enrich DB fields from *rdw_data* using mapping_config and normalize_make.

        Returns a (possibly empty) dict of column-name -> value updates.
        """
        updates = {}
        cfg = cls.mapping_config
        if not cfg:
            return {}
        # 1. Base fields and brand-name normalisation
        for r_key, db_key in cfg.get('field_map', {}).items():
            val = rdw_data.get(r_key)
            if not val: continue
            if db_key == "make":
                # Route through the shared brand-synonym dictionary
                updates[db_key] = normalize_make(val)
            elif db_key == "body_type":
                # Body-type translation via the JSON dictionary
                trans = cfg.get('body_type_translations', {})
                updates[db_key] = trans.get(val.lower(), val.upper())
            else:
                updates[db_key] = val
        # 2. Combined power computation ("Math genius 2.0")
        p_cfg = cfg.get('power_calculation', {})
        power_kw = None
        # a) try the direct kW source (petrol/diesel)
        p_val = rdw_data.get(p_cfg.get('primary_source'))
        # b) try the electric kW source when the former is absent
        if not p_val:
            p_val = rdw_data.get(p_cfg.get('electric_source'))
        if p_val:
            # NOTE(review): bare except below swallows all errors, including
            # KeyboardInterrupt — consider narrowing to (TypeError, ValueError).
            try: power_kw = int(float(p_val))
            except: pass
        # c) still zero: derive from power/weight ratio * curb weight
        if not power_kw or power_kw == 0:
            ratio = rdw_data.get(p_cfg.get('ratio_source'))
            mass = rdw_data.get(p_cfg.get('weight_source'))
            if ratio and mass:
                try:
                    power_kw = int(float(ratio) * float(mass))
                    logger.info(f"⚡ Kiszámolt teljesítmény: {power_kw} kW ({ratio} * {mass})")
                except: pass
        if power_kw:
            updates['power_kw'] = power_kw
        # Engine-capacity normalisation to int
        if 'engine_capacity' in updates:
            try: updates['engine_capacity'] = int(float(updates['engine_capacity']))
            except: pass
        return updates

    @classmethod
    async def process_vehicle(cls, vehicle: dict, client: httpx.AsyncClient):
        """Enrich one vehicle dict.

        Returns ``(vehicle, updates-or-None, error-code-or-None)``; *updates*
        carries the private markers ``_is_gold_ready`` and ``_new_attempts``
        consumed by update_vehicle_batch.
        """
        license_plate = cls.normalize_license_plate(vehicle['technical_code'])
        if not license_plate:
            return vehicle, None, "empty_license_plate"
        raw_api_data = vehicle['raw_api_data']
        if not isinstance(raw_api_data, dict): raw_api_data = {}
        # Cache check (avoid querying the same plate over and over)
        rdw_data = None
        if 'rdw' in raw_api_data and len(raw_api_data['rdw']) > 0:
            rdw_data = raw_api_data['rdw'][0]['data']
        else:
            rdw_data = await cls.query_rdw_api(license_plate, client)
        if not rdw_data:
            return vehicle, None, "no_rdw_data"
        # Dictionary-driven enrichment
        extracted = cls.extract_fields(rdw_data)
        if not extracted:
            return vehicle, None, "no_useful_data"
        updates = {}
        # Only fill fields that are still missing in the DB (0 or empty)
        for key, val in extracted.items():
            if key in ['power_kw', 'engine_capacity'] and val >= 0 and vehicle[key] == 0:
                updates[key] = val
            elif key in ['make', 'body_type', 'fuel_type'] and (not vehicle.get(key) or vehicle[key] == ""):
                updates[key] = val
        # Gatekeeper logic (decides the "gold" status)
        f_kw = updates.get('power_kw', vehicle['power_kw'])
        f_ccm = updates.get('engine_capacity', vehicle['engine_capacity'])
        fuel = str(updates.get('fuel_type', vehicle['fuel_type'])).lower()
        v_class = str(vehicle['vehicle_class']).lower()
        is_electric = any(x in fuel for x in ['electr', 'elektri', 'hydrogen'])
        is_gold_ready = False
        if 'trailer' in v_class:
            is_gold_ready = True
        elif is_electric:
            if f_kw > 0: is_gold_ready = True
            # Electric vehicles legitimately have 0 engine capacity
            if 'engine_capacity' not in updates and vehicle['engine_capacity'] != 0:
                updates['engine_capacity'] = 0
        else:
            if f_kw > 0 and f_ccm > 0: is_gold_ready = True
        updates['_is_gold_ready'] = is_gold_ready
        updates['_new_attempts'] = vehicle['attempts'] + 1
        # When going gold, guarantee power_kw and engine_capacity are in the UPDATE
        if is_gold_ready:
            if 'power_kw' not in updates:
                updates['power_kw'] = f_kw
            if 'engine_capacity' not in updates:
                updates['engine_capacity'] = f_ccm
        # Persist the raw payload (only when no rdw key was stored yet).
        # NOTE(review): the timestamp is event-loop monotonic time, not wall time.
        if 'rdw' not in raw_api_data:
            raw_api_data['rdw'] = [{'timestamp': asyncio.get_event_loop().time(), 'data': rdw_data}]
        updates['raw_api_data'] = raw_api_data
        return vehicle, updates, None

    @classmethod
    async def update_vehicle_batch(cls, db, updates_list):
        """Apply (vehicle_id, updates) pairs one-by-one with per-row commit/rollback.

        Returns the number of rows successfully updated.
        """
        if not updates_list: return 0
        updated_count = 0
        for vehicle_id, updates in updates_list:
            try:
                set_clauses = []
                params = {"vehicle_id": vehicle_id}
                is_gold = updates.pop('_is_gold_ready', False)
                new_attempts = updates.pop('_new_attempts', 1)
                for key, value in updates.items():
                    if key == 'raw_api_data':
                        set_clauses.append("raw_api_data = :raw_api_data")
                        params['raw_api_data'] = json.dumps(value)
                    else:
                        set_clauses.append(f"{key} = :{key}")
                        params[key] = value
                if is_gold:
                    set_clauses.append("status = 'gold_enriched'")
                    set_clauses.append("attempts = 0")
                else:
                    set_clauses.append("attempts = :attempts")
                    params['attempts'] = new_attempts
                    if new_attempts >= 3:
                        set_clauses.append("status = 'manual_review_needed'")
                set_clauses.append("updated_at = NOW()")
                query = text(f"UPDATE vehicle.vehicle_model_definitions SET {', '.join(set_clauses)} WHERE id = :vehicle_id")
                # Execute and commit immediately!
                await db.execute(query, params)
                await db.commit()
                updated_count += 1
            except Exception as e:
                logger.error(f"❌ DB Mentési hiba az {vehicle_id} járműnél: {e}")
                await db.rollback()  # Only this problematic vehicle is dropped
                continue
        return updated_count

    @classmethod
    async def run(cls):
        """Main loop: wait until the DB answers, then fetch/enrich/persist batches forever."""
        logger.info("🚀 Robot 2.1 (RDW) indítása...")
        # --- DNS and connection guard ---
        db_ready = False
        while not db_ready:
            try:
                async with AsyncSessionLocal() as db:
                    await db.execute(text("SELECT 1"))
                    db_ready = True
                    logger.info("✅ Adatbázis elérhető, indul a munka!")
            except Exception as e:
                logger.warning(f"⏳ Várakozás az adatbázisra (DNS/Hálózat hiba): {e}")
                await asyncio.sleep(5)
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    vehicles = await cls.fetch_candidates(db)
                    if not vehicles:
                        await asyncio.sleep(10)
                        continue
                    # Enrich the whole batch concurrently over one HTTP client.
                    async with httpx.AsyncClient(timeout=15.0) as client:
                        tasks = [cls.process_vehicle(v, client) for v in vehicles]
                        results = await asyncio.gather(*tasks)
                    updates_list = []
                    for vehicle, updates, error in results:
                        if updates:
                            updates_list.append((vehicle['id'], updates))
                            if updates.get('_is_gold_ready'):
                                logger.info(f"✨ ARANY: {vehicle['make']} {vehicle['marketing_name']}")
                        else:
                            # Count the failed attempt so the row eventually leaves the queue.
                            await db.execute(
                                text("UPDATE vehicle.vehicle_model_definitions SET attempts = attempts + 1, updated_at = NOW() WHERE id = :id"),
                                {"id": vehicle['id']}
                            )
                    if updates_list:
                        await cls.update_vehicle_batch(db, updates_list)
                await asyncio.sleep(2)
            except Exception as e:
                logger.error(f"⚠️ Hiba a főciklusban: {e}")
                await asyncio.sleep(5)
# Script entry point: run the RDW enricher's endless loop.
if __name__ == "__main__":
    asyncio.run(RDWEnricher.run())

View File

@@ -0,0 +1,427 @@
#!/usr/bin/env python3
import asyncio
import json
import logging
import random
import urllib.parse
import sys
import signal
import re
from datetime import datetime
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# R2.3 - SENTINEL (Hardened, Drill-Up/Drill-Down & Omnivorous Parser Edition)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R2.3-SENTINEL] %(message)s')
logger = logging.getLogger("R2.3")
# --- 1. FILTERS AND BLOCK LISTS ---
# Makes excluded from scraping (trailers, agricultural and commercial gear,
# caravans) — matched against UPPER(make) in the candidate query.
JUNK_LIST = [
    'SARIS', 'ANSSEMS', 'HAPERT', 'HUMBAUR', 'EDUARD', 'IFOR WILLIAMS', 'FENDT',
    'HOBBY', 'ADRIA', 'PEECON', 'JAKO', 'KAWECO', 'POTTINGER', 'BOCKMANN',
    'JOHN DEERE', 'CLAAS', 'IVECO', 'SCANIA', 'MAN', 'DAF', 'KNAUS', 'PÖSSL',
    'HYMER', 'WESTFALIA', 'AGM', 'BRENDERUP', 'STEMA', 'DEBON', 'TEMARED',
    'MARTZ', 'NIEWIADOW', 'ZASLAW'
]
# --- 2. TRANSLATIONS ---
# German/Dutch model-name fragments mapped to the English forms used on the site.
TRANSLATIONS = {
    "3ER REIHE": "3 Series", "5ER REIHE": "5 Series", "1ER REIHE": "1 Series", "7ER REIHE": "7 Series",
    "E-KLASSE": "E Class", "C-KLASSE": "C Class", "S-KLASSE": "S Class", "A-KLASSE": "A Class",
    "REIHE": "Series", "KLASSE": "Class", "BESTELWAGEN": "Van"
}
class RobotScout:
    """R2.3 Sentinel: Playwright scraper for ultimatespecs.com.

    Picks pending rows from ``vehicle.vehicle_model_definitions``, searches the
    site, "drills up" to the model-generation page, harvests every variant
    link, enriches the first one in place and inserts the rest as new rows for
    later processing.
    """

    def __init__(self):
        # Fixed desktop Chrome user agent.
        # NOTE(review): presumably set to look like a regular browser — confirm
        # it is refreshed periodically.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        self.running = True

    def clean_name(self, make, model):
        """Produce a standardised English model name ("MAKE MODEL", duplicates stripped)."""
        m = str(model).upper()
        for de, en in TRANSLATIONS.items():
            m = m.replace(de, en)
        m = m.replace(make.upper(), "").strip()
        return f"{make} {m}"

    # Flexible keyword dictionary (fuzzy-match keywords): DB column -> substrings
    # searched for in the scraped spec-table keys.
    FUZZY_MAPPING = {
        "power_kw": ["power", "horsepower", "output", "hp"],
        "engine_capacity": ["displacement", "capacity", "cm3", "cu-in"],
        "torque_nm": ["torque"],
        "max_speed": ["top speed", "maximum speed"],
        "curb_weight": ["curb weight", "weight"],
        "wheelbase": ["wheelbase"],
        "seats": ["seats", "num. of seats"]
    }

    def extract_fuzzy_metric(self, web_data: dict, keywords: list) -> str:
        """Return the first value in *web_data* whose key contains any keyword; '' when none."""
        for key, val in web_data.items():
            k_lower = key.lower()
            for kw in keywords:
                if kw in k_lower:
                    return str(val)
        return ""

    def clean_number(self, val: str) -> int:
        """Extract the relevant integer from raw spec text (kW-aware); 0 when absent."""
        if not val or val == "-" or val == "None": return 0
        try:
            val_lower = val.lower()
            # Prefer an explicit "<n> kw" figure when present.
            if "kw" in val_lower:
                kw_match = re.search(r'(\d+)\s*kw', val_lower)
                if kw_match: return int(kw_match.group(1))
            # Otherwise take the first digit run after stripping separators.
            nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
            return int(nums[0]) if nums else 0
        except:
            return 0

    async def _retry_with_backoff(self, func, max_attempts=3, base_delay=2, exception_message="Retry failed"):
        """Retry *func* with exponential backoff + jitter; re-raises after the final attempt."""
        for attempt in range(max_attempts):
            try:
                return await func()
            except Exception as e:
                if attempt == max_attempts - 1:
                    logger.error(f"{exception_message} ({max_attempts}. kísérlet után is): {str(e)[:100]}")
                    raise
                else:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    logger.warning(f"⚠️ Próba {attempt + 1} sikertelen: {str(e)[:50]}. Újrapróbálkozás {delay:.1f}mp múlva...")
                    await asyncio.sleep(delay)
        return None

    async def get_car_links(self, page, make, model, year, use_year=True):
        """Smart search: on a hit, drill UP to the generation page, then harvest every variant (drill-down).

        Returns a list of ``{"name", "url"}`` dicts (possibly relative URLs);
        [] when nothing usable was found.
        """
        clean_model = self.clean_name(make, model)
        search_query = f"{clean_model} {year}" if use_year else clean_model
        url = f"https://www.ultimatespecs.com/index.php?q={urllib.parse.quote(search_query)}"
        make_url_safe = str(make).replace(' ', '-').lower()
        model_keyword = str(model).strip().lower().split()[0] if str(model).strip() else ""
        # This JavaScript routine extracts every spec link AND generation link
        # from the current page, filtered by make and model keyword.
        js_extractor = """
        (args) => {
            let targetMakeUrl = args.makeUrl;
            let targetModel = args.modelWord;
            let specs = [];
            let generations = [];
            let seenUrls = new Set();
            document.querySelectorAll('a').forEach(a => {
                let href = a.getAttribute('href') || '';
                let text = a.innerText.trim();
                let hrefLow = href.toLowerCase();
                let textLow = text.toLowerCase();
                if (hrefLow.includes('/car-specs/') || hrefLow.includes('/motorcycles-specs/')) {
                    // URL szintű Márka Szűrés!
                    if (hrefLow.includes('/' + targetMakeUrl + '/') || hrefLow.includes(targetMakeUrl + '-models')) {
                        // Modell Szűrés!
                        if (targetModel === '' || textLow.includes(targetModel) || hrefLow.includes(targetModel)) {
                            if (!seenUrls.has(href)) {
                                seenUrls.add(href);
                                if (hrefLow.endsWith('.html') && text.length > 1) {
                                    specs.push({ name: text, url: href });
                                } else if (href.includes('/M') && href.split('/').length >= 4) {
                                    // UltimateSpecs generáció linkek (pl. /car-specs/Jeep/M14489/Grand-Cherokee)
                                    generations.push({ name: text, url: href });
                                }
                            }
                        }
                    }
                }
            });
            return { specs: specs, generations: generations };
        }
        """
        async def _fetch_links():
            logger.info(f"🔎 KERESÉS: {search_query}")
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
            # --- CASE 1: the search redirected straight to a specific spec page ---
            if page.url.endswith('.html') and f"/{make_url_safe}/" in page.url.lower():
                logger.info("🎯 Direkt találat! Lépjünk VISSZA 1 szintet a teljes kategóriáért (Drill-Up)...")
                if data['generations']:
                    gen_url = data['generations'][0]['url']
                    if not gen_url.startswith('http'): gen_url = "https://www.ultimatespecs.com" + gen_url
                    logger.info(f"📂 Visszalépés ide: {gen_url}")
                    await page.goto(gen_url, wait_until="domcontentloaded", timeout=30000)
                    await asyncio.sleep(2)
                    gen_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                    return gen_data['specs']
                else:
                    return [{"name": await page.title(), "url": page.url}]
            # --- CASE 2: we got a list of search results ---
            if data['specs']:
                first_spec_url = data['specs'][0]['url']
                if not first_spec_url.startswith('http'): first_spec_url = "https://www.ultimatespecs.com" + first_spec_url
                logger.info(f"🕵️ Találatok megvannak. Belépés az első autóba, hogy megtaláljuk a Generációját: {first_spec_url}")
                await page.goto(first_spec_url, wait_until="domcontentloaded", timeout=30000)
                await asyncio.sleep(2)
                spec_page_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                if spec_page_data['generations']:
                    gen_url = spec_page_data['generations'][0]['url']
                    if not gen_url.startswith('http'): gen_url = "https://www.ultimatespecs.com" + gen_url
                    logger.info(f"📂 Generáció megtalálva! Visszalépés, hogy leszüreteljük a teljes családot: {gen_url}")
                    await page.goto(gen_url, wait_until="domcontentloaded", timeout=30000)
                    await asyncio.sleep(2)
                    final_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                    if final_data['specs']:
                        return final_data['specs']
                # If for some reason there is no generation link (very rare), return the search hits.
                return data['specs']
            # --- CASE 3: the search landed directly on a generation/category page ---
            if not data['specs'] and data['generations']:
                gen_url = data['generations'][0]['url']
                if not gen_url.startswith('http'): gen_url = "https://www.ultimatespecs.com" + gen_url
                logger.info(f"📂 A keresés közvetlenül egy Kategóriát dobott ki. Belépés: {gen_url}")
                await page.goto(gen_url, wait_until="domcontentloaded", timeout=30000)
                await asyncio.sleep(2)
                final_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                return final_data['specs']
            # Fallback without the year
            if not data['specs'] and use_year:
                logger.info(" ↳ Nincs találat évszámmal, próbálkozom évszám nélkül...")
                return await self.get_car_links(page, make, model, year, use_year=False)
            return data['specs']
        try:
            variants = await self._retry_with_backoff(
                _fetch_links,
                max_attempts=3,
                base_delay=2,
                exception_message=f"❌ Hálózati hiba a linkek keresésekor: {url}"
            )
            return variants if variants is not None else []
        except Exception as e:
            logger.error(f"❌ Keresési hiba (végleges): {str(e)[:50]}")
            return []

    async def scrape_car_details(self, page, url):
        """Omnivorous parser that swallows every table on the page.

        Returns a flat key/value dict (plus a nested '_sections' entry), or
        None when every retry failed.
        """
        async def _scrape():
            await page.goto(url, wait_until="networkidle", timeout=30000)
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    document.querySelectorAll('table').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let cells = row.querySelectorAll('td, th');
                            if(cells.length >= 2) {
                                let k = cells[0].innerText.replace(/:/g,'').trim().toLowerCase();
                                let v = cells[1].innerText.trim();
                                if(k && v && v !== "-") {
                                    results[k] = v;
                                }
                            }
                        });
                    });
                    const sections = {};
                    document.querySelectorAll('h2, h3, h4, .section-title, .specs-header').forEach(header => {
                        const title = header.innerText.trim();
                        if (title && title.length > 0) {
                            let nextElement = header.nextElementSibling;
                            let sectionData = {};
                            for (let i = 0; i < 5 && nextElement; i++) {
                                if (nextElement.tagName === 'TABLE') {
                                    nextElement.querySelectorAll('tr').forEach(row => {
                                        let cells = row.querySelectorAll('td');
                                        if(cells.length >= 2) {
                                            let k = cells[0].innerText.replace(/:/g,'').trim().toLowerCase();
                                            let val = cells[1].innerText.trim();
                                            if(k && val && val !== "-") {
                                                sectionData[k] = val;
                                                results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
                                            }
                                        }
                                    });
                                }
                                nextElement = nextElement.nextElementSibling;
                            }
                            sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
                        }
                    });
                    results['_sections'] = sections;
                    return results;
                }
            """)
            return full_specs
        try:
            logger.info(f"🌐 Scraping: {url}")
            full_specs = await self._retry_with_backoff(
                _scrape,
                max_attempts=3,
                base_delay=2,
                exception_message=f"❌ Scrape hiba az oldalon: {url}"
            )
            return full_specs
        except Exception as e:
            logger.error(f"❌ Scrape hiba (végleges): {str(e)[:100]}...")
            return None

    async def run(self):
        """Main loop: claim one target row, enrich the first variant, queue the rest.

        Exits when the pending queue is empty or ``self.running`` is cleared.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=self.user_agent)
            page = await context.new_page()
            while self.running:
                # Random delay between targets to look less like a bot.
                wait = random.uniform(3, 6)
                logger.info(f"💤 Várakozás {wait:.1f} mp...")
                await asyncio.sleep(wait)
                async with AsyncSessionLocal() as db:
                    target = (await db.execute(text("""
                        SELECT id, make, marketing_name, year_from FROM vehicle.vehicle_model_definitions
                        WHERE status IN ('pending', 'manual_review_needed')
                        AND (vehicle_class IN ('car', 'motorcycle') OR vehicle_class IS NULL)
                        AND NOT (UPPER(make) = ANY(:junks))
                        ORDER BY priority_score DESC LIMIT 1
                    """), {"junks": JUNK_LIST})).fetchone()
                    if not target:
                        logger.info("✨ Minden tétel feldolgozva.")
                        break
                    t_id, make, model, year = target
                    logger.info(f"🚀 CÉLPONT: {make} {model} ({year}) [ID: {t_id}]")
                    try:
                        links = await self.get_car_links(page, make, model, year)
                    except Exception as e:
                        logger.error(f"❌ Hálózati hiba linkek lekérésekor: {str(e)[:100]}")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_network' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        continue
                    if not links:
                        logger.warning(f"❌ Nem található adatlap a '{make} {model}' típushoz. research_failed_empty rögzítése.")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_empty', updated_at=NOW() WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        continue
                    # --- 1. ENRICH THE FIRST LINK ---
                    first_link = links[0]
                    full_url = first_link['url'] if first_link['url'].startswith('http') else f"https://www.ultimatespecs.com{first_link['url']}"
                    logger.info(f"⚡ Azonnali adatgyűjtés a letöltött listából: {full_url}")
                    web_data = await self.scrape_car_details(page, full_url)
                    is_enriched = False
                    if web_data is None:
                        logger.error(f"❌ Scraping sikertelen minden próbálkozás után. research_failed_parsing rögzítése.")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_parsing' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        web_data = {}
                    elif len(web_data) >= 5:
                        # Enough scraped keys — map them onto DB columns via the fuzzy dictionary.
                        updates = {}
                        for key, keywords in self.FUZZY_MAPPING.items():
                            raw_val = self.extract_fuzzy_metric(web_data, keywords)
                            updates[key] = self.clean_number(raw_val)
                        fuel_type = self.extract_fuzzy_metric(web_data, ["fuel type", "fuel"]) or 'Unknown'
                        transmission = self.extract_fuzzy_metric(web_data, ["transmission", "gearbox"]) or 'Unknown'
                        body_type = self.extract_fuzzy_metric(web_data, ["body", "type"]) or 'Unknown'
                        drive_type = self.extract_fuzzy_metric(web_data, ["drive", "traction"]) or 'Unknown'
                        power_kw = updates.get('power_kw', 0)
                        ccm = updates.get('engine_capacity', 0)
                        await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions
                            SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                torque_nm = :torque_nm, max_speed = :max_speed,
                                curb_weight = :curb_weight,
                                wheelbase = :wheelbase, seats = :seats,
                                fuel_type = :fuel_type, transmission_type = :transmission_type,
                                drive_type = :drive_type, body_type = :body_type,
                                specifications = specifications || :full_json,
                                status = 'awaiting_ai_synthesis', updated_at = NOW()
                            WHERE id = :id
                        """), {
                            **updates,
                            "id": t_id,
                            "fuel_type": fuel_type,
                            "transmission_type": transmission,
                            "drive_type": drive_type,
                            "body_type": body_type,
                            "full_json": json.dumps(web_data)
                        })
                        is_enriched = True
                        logger.info(f"✅ SIKERES DÚSÍTÁS: {make} {model} ({power_kw} kW, {ccm} ccm) -> Awaiting AI")
                    else:
                        logger.warning("⚠️ Scraping kevés adatot talált, csak a linkeket mentjük.")
                    # --- 2. SAVE THE VARIANTS FOR R3 ---
                    added = 0
                    for l in links:
                        v_url = l['url'] if l['url'].startswith('http') else f"https://www.ultimatespecs.com{l['url']}"
                        # Dedupe by stored URL before inserting.
                        check = (await db.execute(text("SELECT id FROM vehicle.vehicle_model_definitions WHERE raw_api_data->>'url' = :u"), {"u": v_url})).fetchone()
                        if not check:
                            normalized = l['name'].lower().replace(' ', '_').replace('-', '_').replace('.', '').replace(',', '')[:200]
                            await db.execute(text("""
                                INSERT INTO vehicle.vehicle_model_definitions
                                (make, marketing_name, normalized_name, year_from, status,
                                 raw_api_data, priority_score, source, market,
                                 technical_code, variant_code, version_code,
                                 specifications, marketing_name_aliases, raw_search_context)
                                VALUES (:make, :name, :normalized, :year, 'awaiting_ai_synthesis',
                                        :raw, 30, 'ultimatespecs', 'EU',
                                        'UNKNOWN', 'UNKNOWN', 'UNKNOWN',
                                        '{}'::jsonb, '[]'::jsonb, '')
                            """), {
                                "make": make, "name": l['name'], "normalized": normalized,
                                "year": year, "raw": json.dumps({"url": v_url})
                            })
                            added += 1
                    if not is_enriched:
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='expanded_to_variants', updated_at=NOW() WHERE id=:id"), {"id": t_id})
                    await db.commit()
                    logger.info(f"✅ Variációk kezelve: {added} új rekord.")
            await browser.close()
# Script entry point: install a SIGINT handler for a clean stop, then run the scout.
if __name__ == "__main__":
    scout = RobotScout()
    # SIGINT handler: clear the run flag and exit immediately.
    def stop_signal(sig, frame):
        logger.info("🛑 LEÁLLÍTÁS (Kérés érzékelve)...")
        scout.running = False
        sys.exit(0)
    signal.signal(signal.SIGINT, stop_signal)
    try:
        asyncio.run(scout.run())
    except KeyboardInterrupt:
        pass

View File

@@ -0,0 +1,387 @@
#!/usr/bin/env python3
import asyncio
import json
import logging
import random
import urllib.parse
import sys
import signal
import re
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# R2.3 - SENTINEL (Hardened & Obedient Edition)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R2.3-SENTINEL] %(message)s')
logger = logging.getLogger("R2.3")
# --- 1. FILTERS AND BLOCK LISTS ---
# Only look up types that are not trailers or work machines.
JUNK_LIST = [
    'SARIS', 'ANSSEMS', 'HAPERT', 'HUMBAUR', 'EDUARD', 'IFOR WILLIAMS', 'FENDT',
    'HOBBY', 'ADRIA', 'PEECON', 'JAKO', 'KAWECO', 'POTTINGER', 'BOCKMANN',
    'JOHN DEERE', 'CLAAS', 'IVECO', 'SCANIA', 'MAN', 'DAF', 'KNAUS', 'PÖSSL', 'HYMER', 'WESTFALIA'
]
# --- 2. TRANSLATIONS (DE/NL -> EN) ---
# Model-name fragments mapped to the English forms used on ultimatespecs.
TRANSLATIONS = {
    "3ER REIHE": "3 Series", "5ER REIHE": "5 Series", "1ER REIHE": "1 Series", "7ER REIHE": "7 Series",
    "E-KLASSE": "E Class", "C-KLASSE": "C Class", "S-KLASSE": "S Class", "A-KLASSE": "A Class",
    "REIHE": "Series", "KLASSE": "Class", "BESTELWAGEN": "Van"
}
class RobotScout:
def __init__(self):
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
self.running = True
def clean_name(self, make, model):
"""Standardizált angol név előállítása."""
m = model.upper()
for de, en in TRANSLATIONS.items():
m = m.replace(de, en)
# Márkanév duplázódás törlése (pl. VOLVO VOLVO V60 -> VOLVO V60)
m = m.replace(make.upper(), "").strip()
return f"{make} {m}"
# --- COLUMN MAPPING for scraping ---
COLUMN_MAPPING = {
"horsepower": "power_kw",
"engine displacement": "engine_capacity",
"maximum torque": "torque_nm",
"top speed": "max_speed",
"curb weight": "curb_weight",
"wheelbase": "wheelbase",
"num. of seats": "seats"
}
def clean_number(self, val: str, key: str = "") -> int:
if not val or val == "-": return 0
try:
if "hp" in val.lower() or "kw" in val.lower():
kw_match = re.search(r'(\d+)\s*kw', val.lower())
if kw_match: return int(kw_match.group(1))
nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
return int(nums[0]) if nums else 0
except: return 0
async def get_car_links(self, page, make, model, year, use_year=True):
"""Minden autós link kigyűjtése fallback mechanizmussal retry logikával."""
clean_model = self.clean_name(make, model)
search_query = f"{clean_model} {year}" if use_year else clean_model
url = f"https://www.ultimatespecs.com/index.php?q={urllib.parse.quote(search_query)}"
logger.info(f"🔎 KERESÉS: {search_query}")
async def _fetch_links():
await page.goto(url, wait_until="domcontentloaded", timeout=25000)
# 1. Ha direkt az adatlapon vagyunk
if any(x in page.url for x in ['/car-specs/', '/motorcycles-specs/']):
logger.info("🎯 Direkt találat!")
return [{"name": await page.title(), "url": page.url}]
# 2. Várakozás és linkek kigyűjtése
await asyncio.sleep(2)
variants = await page.evaluate("""
() => {
let results = [];
document.querySelectorAll('a').forEach(a => {
let href = a.getAttribute('href') || '';
let text = a.innerText.trim();
// Csak technikai adatlapokat gyűjtünk, reklámokat/kategóriákat nem
if ((href.includes('/car-specs/') || href.includes('/motorcycles-specs/'))
&& href.includes('.html') && text.length > 3) {
results.push({ name: text, url: href });
}
});
return results;
}
""")
# 3. Fallback: Ha nincs találat évvel, próbálja év nélkül
if not variants and use_year:
logger.info(" ↳ Nincs találat évszámmal, próbálkozom évszám nélkül...")
return await self.get_car_links(page, make, model, year, use_year=False)
return variants
try:
variants = await self._retry_with_backoff(
_fetch_links,
max_attempts=3,
base_delay=2,
exception_message=f"❌ Hálózati hiba a(z) {url} oldalon"
)
return variants if variants is not None else []
except Exception as e:
logger.error(f"❌ Hálózati hiba (végleges): {str(e)[:50]}")
return []
async def _retry_with_backoff(self, func, max_attempts=3, base_delay=2,
exception_message="Retry failed", retry_exceptions=True):
"""Helper function for retry logic with exponential backoff."""
for attempt in range(max_attempts):
try:
return await func()
except Exception as e:
if attempt == max_attempts - 1:
logger.error(f"{exception_message} after {max_attempts} attempts: {str(e)[:100]}")
raise
else:
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
logger.warning(f"⚠️ Attempt {attempt + 1} failed: {str(e)[:50]}. Retrying in {delay:.1f}s...")
await asyncio.sleep(delay)
return None
    async def scrape_car_details(self, page, url):
        """Scrape car specifications from a given Ultimate Specs URL with comprehensive data extraction and retry logic.

        Args:
            page: Playwright page object used for navigation and evaluation.
            url: absolute Ultimate Specs spec-page URL.

        Returns:
            dict of scraped key/value specs (plus a '_sections' sub-dict built in
            the in-page script), or None when all retries fail.
        """
        async def _scrape():
            # Full network-idle wait: spec tables are rendered before this resolves.
            await page.goto(url, wait_until="networkidle", timeout=30000)
            # Parsing all specification tables and sections
            # NOTE: everything inside the string below runs in the BROWSER, not in Python.
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    // 1. Collect all specification tables (existing logic)
                    document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                            let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                            if(t && v) {
                                let k = t.innerText.replace(':','').trim().toLowerCase();
                                let val = v.innerText.trim();
                                if(k && val && val !== "-") results[k] = val;
                            }
                        });
                    });
                    // 2. Collect section headers and their content for additional technical data
                    // Look for h2, h3, h4 elements that might contain section titles
                    const sections = {};
                    const headers = document.querySelectorAll('h2, h3, h4, .section-title, .specs-header');
                    headers.forEach(header => {
                        const title = header.innerText.trim();
                        if (title && title.length > 0) {
                            // Find the next table or div with specs after this header
                            let nextElement = header.nextElementSibling;
                            let sectionData = {};
                            // Look for tables or lists in the next few siblings
                            for (let i = 0; i < 5 && nextElement; i++) {
                                if (nextElement.tagName === 'TABLE') {
                                    nextElement.querySelectorAll('tr').forEach(row => {
                                        let t = row.querySelector('td:first-child');
                                        let v = row.querySelector('td:last-child');
                                        if(t && v) {
                                            let k = t.innerText.replace(':','').trim().toLowerCase();
                                            let val = v.innerText.trim();
                                            if(k && val && val !== "-") {
                                                sectionData[k] = val;
                                                // Also add to main results with section prefix
                                                results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
                                            }
                                        }
                                    });
                                }
                                nextElement = nextElement.nextElementSibling;
                            }
                            sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
                        }
                    });
                    // 3. Extract specific known sections by looking for text patterns
                    const pageText = document.body.innerText.toLowerCase();
                    // Check for electric/hybrid sections
                    if (pageText.includes('electric engine') || pageText.includes('battery')) {
                        // Try to find battery voltage, capacity, etc.
                        const batteryRegex = /battery\s*voltage[:\s]*([\d\.]+)\s*v/gi;
                        const match = batteryRegex.exec(document.body.innerText);
                        if (match) results['battery_voltage_v'] = match[1];
                    }
                    // 4. Extract dimensions data
                    const dimensionPatterns = {
                        'wheelbase': /wheelbase[:\s]*([\d\.]+)\s*cm/gi,
                        'length': /length[:\s]*([\d\.]+)\s*cm/gi,
                        'width': /width[:\s]*([\d\.]+)\s*cm/gi,
                        'height': /height[:\s]*([\d\.]+)\s*cm/gi,
                        'curb_weight': /curb\s*weight[:\s]*([\d\.]+)\s*kg/gi,
                        'towing_capacity': /towing\s*capacity[:\s]*([\d\.]+)\s*kg/gi
                    };
                    for (const [key, regex] of Object.entries(dimensionPatterns)) {
                        const match = regex.exec(document.body.innerText);
                        if (match) results[key] = match[1];
                    }
                    // 5. Add sections data as a nested object
                    results['_sections'] = sections;
                    return results;
                }
            """)
            return full_specs
        try:
            logger.info(f"🌐 Scraping: {url}")
            # Delegate transient-failure handling (timeouts, nav errors) to the shared retry helper.
            full_specs = await self._retry_with_backoff(
                _scrape,
                max_attempts=3,
                base_delay=2,
                exception_message=f"❌ Scrape hiba a(z) {url} oldalon"
            )
            return full_specs
        except Exception as e:
            # All retries exhausted — caller treats None as "parsing failed".
            logger.error(f"❌ Scrape hiba (végleges): {str(e)[:100]}...")
            return None
    async def run(self):
        """Main worker loop for the scout robot.

        Repeatedly: pick the highest-priority pending vehicle, search Ultimate
        Specs for its variant links, immediately enrich the record from the
        first link, then insert every discovered variant as a new row for the
        downstream R4/R5 robots. Stops when `self.running` is cleared or the
        queue is empty.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=self.user_agent)
            page = await context.new_page()
            while self.running:
                # --- THE BRAKE: strict 3-6 s pause at the start of every cycle (politeness / anti-ban) ---
                wait = random.uniform(3, 6)
                logger.info(f"💤 Várakozás {wait:.1f} mp...")
                await asyncio.sleep(wait)
                async with AsyncSessionLocal() as db:
                    # Next unprocessed vehicle (John Deere, Iveco, etc. excluded via JUNK_LIST)
                    target = (await db.execute(text("""
                        SELECT id, make, marketing_name, year_from FROM vehicle.vehicle_model_definitions
                        WHERE status IN ('pending', 'manual_review_needed')
                        AND NOT (make = ANY(:junks))
                        ORDER BY priority_score DESC LIMIT 1
                    """), {"junks": JUNK_LIST})).fetchone()
                    if not target:
                        logger.info("✨ Minden tétel feldolgozva.")
                        break
                    t_id, make, model, year = target
                    logger.info(f"🚀 CÉLPONT: {make} {model} ({year}) [ID: {t_id}]")
                    try:
                        links = await self.get_car_links(page, make, model, year)
                    except Exception as e:
                        # Network-level failure: mark the row so it is not retried immediately.
                        logger.error(f"❌ Hálózati hiba linkek lekérésekor: {str(e)[:100]}")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_network' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        continue
                    if not links:
                        logger.warning(f"❌ Nem található adatlap. research_failed_empty rögzítése.")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_empty' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        continue
                    # --- 1. SCRAPE THE FIRST LINK FOR IMMEDIATE ENRICHMENT ---
                    first_link = None
                    if links:
                        first_link = links[0]
                        first_link = links[0]  # links is non-empty here (guarded above)
                        full_url = first_link['url'] if first_link['url'].startswith('http') else f"https://www.ultimatespecs.com{first_link['url']}"
                        logger.info(f"⚡ Azonnali adatgyűjtés: {full_url}")
                        web_data = await self.scrape_car_details(page, full_url)
                        if web_data is None:
                            # Scraping failed after all retries
                            logger.error(f"❌ Scraping sikertelen minden próbálkozás után. research_failed_parsing rögzítése.")
                            await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_parsing' WHERE id=:id"), {"id": t_id})
                            await db.commit()
                            # Continue to save links as variants anyway
                            # NOTE(review): setting web_data={} makes the `if not web_data:` branch
                            # below overwrite 'research_failed_parsing' with 'expanded_to_variants'.
                            # Confirm this overwrite is intended.
                            web_data = {}
                        elif len(web_data) >= 5:
                            # Map scraped data to columns
                            # (assumes COLUMN_MAPPING yields exactly the columns bound in the UPDATE below — verify)
                            updates = {col: self.clean_number(web_data.get(k)) for k, col in self.COLUMN_MAPPING.items()}
                            # Also extract fuel_type, transmission, etc. if possible
                            fuel_type = web_data.get('fuel type', 'Unknown')
                            transmission_type = web_data.get('transmission', 'Unknown')
                            drive_type = web_data.get('drive type', 'Unknown')
                            body_type = web_data.get('body type', 'Unknown')
                            engine_capacity = updates.get('engine_capacity', 0)
                            power_kw = updates.get('power_kw', 0)
                            # Update the original record with scraped data
                            await db.execute(text("""
                                UPDATE vehicle.vehicle_model_definitions
                                SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                    torque_nm = :torque_nm, max_speed = :max_speed,
                                    curb_weight = :curb_weight,
                                    wheelbase = :wheelbase, seats = :seats,
                                    fuel_type = :fuel_type, transmission_type = :transmission_type,
                                    drive_type = :drive_type, body_type = :body_type,
                                    specifications = specifications || :full_json,
                                    status = 'awaiting_ai_synthesis', updated_at = NOW()
                                WHERE id = :id
                            """), {
                                **updates,
                                "id": t_id,
                                "fuel_type": fuel_type,
                                "transmission_type": transmission_type,
                                "drive_type": drive_type,
                                "body_type": body_type,
                                "full_json": json.dumps(web_data)
                            })
                            logger.info(f"✅ AZONNALI PUBLIKÁLÁS: {make} {model} ({power_kw} kW)")
                        else:
                            logger.warning("⚠️ Scraping kevés adatot talált, csak linkek mentve.")
                    # --- 2. SAVE ALL LINKS AS NEW VARIANT RECORDS (including first if not enriched) ---
                    added = 0
                    for l in links:
                        full_url = l['url'] if l['url'].startswith('http') else f"https://www.ultimatespecs.com{l['url']}"
                        # FIX: to avoid the missing "source_url" column, deduplicate on raw_api_data->>'url'
                        check_query = text("SELECT id FROM vehicle.vehicle_model_definitions WHERE raw_api_data->>'url' = :u")
                        exists = (await db.execute(check_query, {"u": full_url})).fetchone()
                        if not exists:
                            # Create normalized name from marketing name
                            normalized = l['name'].lower().replace(' ', '_').replace('-', '_').replace('.', '').replace(',', '')[:200]
                            await db.execute(text("""
                                INSERT INTO vehicle.vehicle_model_definitions
                                (make, marketing_name, normalized_name, year_from, status,
                                 raw_api_data, priority_score, source, market,
                                 technical_code, variant_code, version_code,
                                 specifications, marketing_name_aliases, raw_search_context)
                                VALUES (:make, :name, :normalized, :year, 'awaiting_ai_synthesis',
                                        :raw, 30, 'ultimatespecs', 'EU',
                                        'UNKNOWN', 'UNKNOWN', 'UNKNOWN',
                                        '{}'::jsonb, '[]'::jsonb, '')
                            """), {
                                "make": make, "name": l['name'], "normalized": normalized,
                                "year": year, "raw": json.dumps({"url": full_url}), "priority": 30
                            })
                            added += 1
                    # Archive the original record (if it was not already published above)
                    if not web_data:
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='expanded_to_variants', updated_at=NOW() WHERE id=:id"), {"id": t_id})
                    await db.commit()
                    logger.info(f"✅ SIKER: {added} új variáció mentve. R4-R5 robotok értesítve.")
            await browser.close()
if __name__ == "__main__":
    scout = RobotScout()

    # Graceful shutdown on CTRL+C: clear the worker flag, then exit the process.
    def _handle_sigint(signum, frame):
        logger.info("🛑 LEÁLLÍTÁS (Kérés érzékelve)...")
        scout.running = False
        sys.exit(0)

    signal.signal(signal.SIGINT, _handle_sigint)
    try:
        asyncio.run(scout.run())
    except KeyboardInterrupt:
        pass

View File

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
import asyncio
import logging
import random
import json
import re
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R2-MASTER] %(message)s')
logger = logging.getLogger("R2-AutoData")
class AutoDataMaster:
    """Crawler that walks auto-data.net (brands → models → generations → engines)
    and upserts every engine variant into vehicle.external_reference_library."""

    def __init__(self):
        # Root of the scraped site; all relative hrefs are joined onto this.
        self.base_url = "https://www.auto-data.net"

    def clean_key(self, key):
        """Normalize a spec-table header into a tidy, capitalized key.

        Keeps only the text after the last comma, strips question-style
        prefixes ("What is the ...") and a trailing question mark.
        """
        if "," in key: key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        key = key.split("?")[0].strip()
        return key.capitalize()

    async def get_soup(self, page, url):
        """Politely load *url* in the Playwright page and return a BeautifulSoup tree.

        Sleeps a random 2-5 s first as crude rate limiting.
        """
        delay = random.uniform(2, 5)
        await asyncio.sleep(delay)
        # FIX: wait until the network goes quiet (more reliable page load)
        await page.goto(url, wait_until="networkidle", timeout=60000)
        content = await page.content()
        return BeautifulSoup(content, 'html.parser')

    async def scrape_engine_details(self, page, url):
        """Scrape one engine-variant page into a flat dict, or None on failure.

        Known header rows (brand, model, power, displacement, ...) are mapped to
        typed fields; every other th/td row lands in data["specifications"].
        """
        try:
            soup = await self.get_soup(page, url)
            data = {
                "make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "year_to": None, "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url
            }
            # (The extraction logic is unchanged from the previously successful Honda test)
            rows = soup.find_all('tr')
            for row in rows:
                th, td = row.find('th'), row.find('td')
                if not th or not td: continue
                raw_k, val = th.get_text(strip=True), td.get_text(strip=True)
                k_low = raw_k.lower()
                if "brand" == k_low: data["make"] = val
                elif "model" == k_low: data["model"] = val
                elif "generation" == k_low: data["generation"] = val
                elif "modification" == k_low: data["modification"] = val
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', val);
                    if m: data["year_from"] = int(m.group(1))
                elif "end of production" in k_low:
                    m = re.search(r'(\d{4})', val);
                    if m: data["year_to"] = int(m.group(1))
                elif "power" == k_low:
                    # hp → kW via the rough 1 kW ≈ 1.36 hp conversion.
                    hp_m = re.search(r'(\d+)\s*Hp', val, re.I)
                    if hp_m: data["power_kw"] = int(int(hp_m.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc_m = re.search(r'(\d+)\s*cm3', val)
                    if cc_m: data["engine_cc"] = int(cc_m.group(1))
                clean_k = self.clean_key(raw_k)
                if clean_k and val: data["specifications"][clean_k] = val
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None

    async def save_to_db(self, data):
        """Upsert one scraped engine record keyed by source_url; no-op when data lacks a make."""
        if not data or not data["make"]: return
        async with AsyncSessionLocal() as db:
            try:
                await db.execute(text("""
                    INSERT INTO vehicle.external_reference_library
                    (source_name, make, model, generation, modification, year_from, year_to, power_kw, engine_cc, specifications, source_url)
                    VALUES ('auto-data.net', :make, :model, :gen, :mod, :y_f, :y_t, :p_kw, :e_cc, :specs, :url)
                    ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                """), {
                    "make": data["make"], "model": data["model"], "gen": data["generation"],
                    "mod": data["modification"], "y_f": data["year_from"], "y_t": data["year_to"],
                    "p_kw": data["power_kw"], "e_cc": data["engine_cc"],
                    "specs": json.dumps(data["specifications"]), "url": data["source_url"]
                })
                await db.commit()
                logger.info(f"✅ MENTVE: {data['make']} {data['model']} {data['modification']}")
            except Exception as e:
                logger.error(f"DB Hiba: {e}")

    async def crawl(self):
        """Top-level crawl: brands → models → generations → engines, saving each engine variant."""
        logger.info("🚀 Porszívózás indul...")
        async with async_playwright() as p:
            # Slowed start and a normal window size to avoid bot detection
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                viewport={'width': 1920, 'height': 1080}
            )
            page = await context.new_page()
            # 1. BRAND LIST - FIXED SELECTOR
            logger.info(f"Szint 1: Márkák betöltése...")
            soup = await self.get_soup(page, f"{self.base_url}/en/allbrands")
            # On auto-data the brand links carry class 'brandi' or 'brand'
            brand_elements = soup.select('a.brandi') or soup.select('a.brand')
            brand_links = []
            for a in brand_elements:
                href = a.get('href')
                if href and 'brand' in href:
                    full_url = href if href.startswith('http') else f"{self.base_url}/{href.lstrip('/')}"
                    brand_links.append(full_url)
            if not brand_links:
                logger.error(f"❌ 0 márkát találtam! Oldalcím: {soup.title.string if soup.title else 'Nincs'}")
                # Debug: log the start of the HTML so we can see what actually came back
                logger.info(f"HTML debug (első 500 karakter): {str(soup)[:500]}")
                await browser.close()
                return
            logger.info(f"🎯 Talált márkák: {len(brand_links)}")
            # NOTE(review): an earlier comment said "first 3 brands only as a test",
            # but this loop walks ALL brands — confirm which behavior is intended.
            for b_link in brand_links:
                try:
                    logger.info(f"Szint 2: Modellek keresése itt: {b_link}")
                    soup = await self.get_soup(page, b_link)
                    # Model links selector: a.modeli
                    model_links = [self.base_url + '/' + a['href'].lstrip('/') for a in soup.select('a.modeli')]
                    logger.info(f" -> {len(model_links)} modellt találtam.")
                    for m_link in model_links:
                        logger.info(f"Szint 3: Generációk itt: {m_link}")
                        soup = await self.get_soup(page, m_link)
                        # Generation links selector: a.generation
                        gen_links = [self.base_url + '/' + a['href'].lstrip('/') for a in soup.select('a.generation')]
                        for g_link in gen_links:
                            logger.info(f"Szint 4: Motorváltozatok itt: {g_link}")
                            soup = await self.get_soup(page, g_link)
                            # Engine-variant links selector: a.car_specs
                            engine_links = [self.base_url + '/' + a['href'].lstrip('/') for a in soup.select('a.car_specs')]
                            for e_link in engine_links:
                                data = await self.scrape_engine_details(page, e_link)
                                if data:
                                    await self.save_to_db(data)
                except Exception as e:
                    logger.error(f"Hiba a folyamatban: {e}")
            await browser.close()
await browser.close()
if __name__ == "__main__":
    # Script entry point: run the full brand → model → generation → engine crawl once.
    asyncio.run(AutoDataMaster().crawl())

View File

@@ -1,238 +1,203 @@
#!/usr/bin/env python3
import asyncio
import logging
import warnings
import os
import json
from datetime import datetime
from sqlalchemy import text, update, func
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
import httpx
import re
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# MB 2.0 Szabvány naplózás
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-2-Researcher: %(message)s')
logger = logging.getLogger("Vehicle-Robot-2-Researcher")
# Figyelmeztetések némítása (a csomag átnevezése miatti zaj elkerülésére)
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
class QuotaManager:
    """Strict daily request-limit tracker for paid/official APIs.

    State (date + count) is persisted to a small JSON file so the quota
    survives process restarts within the same day.
    """
    def __init__(self, service_name: str, daily_limit: int):
        self.service_name = service_name
        self.daily_limit = daily_limit
        # One state file per service under the worker temp directory.
        self.state_file = f"/app/temp/.quota_{service_name}.json"
        self._ensure_file()

    def _ensure_file(self):
        """Create the state file (today's date, count=0) if it does not exist yet."""
        os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
        if not os.path.exists(self.state_file):
            with open(self.state_file, 'w') as fh:
                json.dump({"date": datetime.now().strftime("%Y-%m-%d"), "count": 0}, fh)

    def can_make_request(self) -> bool:
        """Consume one unit of today's quota; False once the daily limit is reached."""
        with open(self.state_file, 'r') as fh:
            state = json.load(fh)
        today = datetime.now().strftime("%Y-%m-%d")
        if state["date"] != today:
            # New day: start over with a fresh counter.
            state = {"date": today, "count": 0}
        if state["count"] >= self.daily_limit:
            return False
        # Record the consumed request before reporting success.
        state["count"] += 1
        with open(self.state_file, 'w') as fh:
            json.dump(state, fh)
        return True
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [R2-MASTER-EDITION] %(message)s'
)
logger = logging.getLogger("R2-Researcher")
class VehicleResearcher:
"""
Vehicle Robot 2.5: Sniper Researcher (Mesterlövész Adatgyűjtő)
Célzott keresésekkel és strukturált aktakészítéssel dolgozik az AI kímélése érdekében.
"""
def __init__(self):
self.max_attempts = 5
self.search_timeout = 15.0
def __init__(self, concurrency=5):
# Egyszerre 5 böngésző fület kezelünk a sebesség érdekében
self.semaphore = asyncio.Semaphore(concurrency)
self.ollama_url = "http://sf_ollama:11434/api/generate"
# Kvóta menedzserek beállítása (.env-ből olvasva)
dvla_limit = int(os.getenv("DVLA_DAILY_LIMIT", "1000"))
self.dvla_quota = QuotaManager("dvla", dvla_limit)
self.dvla_token = os.getenv("DVLA_API_KEY")
# FORDÍTÓ SZÓTÁR: Holland RDW -> Nemzetközi keresési nevek
self.translation_map = {
"ER REIHE": "Series",
"T-MODELL": "Estate",
"KLASSE": "Class",
"PERSONENAUTO": "Car",
"STATIONWAGEN": "Estate",
"MERCEDES-BENZ": "Mercedes",
"Vrachtwagen": "Truck",
"Oplegger": "Trailer"
}
async def fetch_ddg_targeted(self, label: str, query: str) -> str:
""" Célzott keresés szálbiztosan a DuckDuckGo-n. """
def clean_name(self, make, model):
"""Lefordítja a holland modellneveket, hogy a Google/Bing megtalálja őket."""
name = f"{make} {model}".upper()
for dutch, eng in self.translation_map.items():
name = name.replace(dutch, eng)
return name.title()
async def get_url(self, make, model, year, kw):
"""Keresés a DuckDuckGo-val. JAVÍTVA: 0kW fix és több találat."""
clean_n = self.clean_name(make, model)
# Ha a kW 0, None vagy érvénytelen, kihagyjuk a keresésből a találati arány javítására
kw_val = 0
try:
def search():
if kw and str(kw).replace('.','').isdigit():
kw_val = int(float(kw))
except: pass
kw_part = f"{kw_val}kW" if kw_val > 0 else ""
query = f"site:auto-data.net {clean_n} {year} {kw_part} specifications"
try:
def _search():
with DDGS() as ddgs:
# max_results=2: Nem kell sok zaj, csak a legrelevánsabb 2 találat
results = ddgs.text(query, max_results=2)
return [f"- {r.get('body', '')}" for r in results] if results else []
results = await asyncio.wait_for(asyncio.to_thread(search), timeout=self.search_timeout)
if not results:
return f"[SOURCE: {label}]\nNincs érdemi találat.\n"
content = f"[SOURCE: {label} | KERESÉS: {query}]\n"
content += "\n".join(results) + "\n"
return content
# Megnézzük az első 3 találatot, hátha az első nem direkt link
res = ddgs.search(query, max_results=3)
return [r.get('link', r.get('href', '')) for r in res if 'auto-data.net' in r.get('link', r.get('href', ''))]
links = await asyncio.to_thread(_search)
return links[0] if links else None
except Exception as e:
logger.debug(f"Keresési hiba ({label}): {e}")
return f"[SOURCE: {label}]\nKERESÉSI HIBA.\n"
logger.warning(f"Keresési hiba ({query}): {e}")
return None
def extract_specs_from_text(self, text: str) -> dict:
""" Regex alapú kinyerés a nyers szövegből: ccm, kW, motoradatok. """
import re
async def scrape_auto_data(self, url, browser):
"""Letölti az oldalt és kinyeri az összes technikai adatot."""
specs = {}
# CCM (köbcentiméter) minta: 1998 cc, 2.0 L, 2000 cm³
ccm_pattern = r'(\d{3,4})\s*(?:cc|ccm|cm³|cm3|cc\.)'
match = re.search(ccm_pattern, text, re.IGNORECASE)
if match:
specs['ccm'] = int(match.group(1))
else:
# Alternatív minta: 2.0 liter -> 2000 cc
liter_pattern = r'(\d+\.?\d*)\s*(?:L|liter|)'
match = re.search(liter_pattern, text, re.IGNORECASE)
if match:
liters = float(match.group(1))
specs['ccm'] = int(liters * 1000)
# KW (kilowatt) minta: 150 kW, 150kW, 150 KW
kw_pattern = r'(\d{2,4})\s*(?:kW|kw|KW)'
match = re.search(kw_pattern, text, re.IGNORECASE)
if match:
specs['kw'] = int(match.group(1))
else:
# Le (lóerő) átváltás: 150 LE -> 110 kW (kb)
hp_pattern = r'(\d{2,4})\s*(?:HP|hp|LE|le|Ps)'
match = re.search(hp_pattern, text, re.IGNORECASE)
if match:
hp = int(match.group(1))
specs['kw'] = int(hp * 0.7355) # hozzávetőleges átváltás
# Motor kód minta: motor kód: 1.8 TSI, engine code: N47
engine_pattern = r'(?:motor\s*kód|engine\s*code|motor\s*code)[:\s]+([A-Z0-9\.\- ]+)'
match = re.search(engine_pattern, text, re.IGNORECASE)
if match:
specs['engine_code'] = match.group(1).strip()
return specs
async def research_vehicle(self, db, vehicle_id: int, make: str, model: str, engine: str, year: str, current_attempts: int):
""" Egy jármű átvilágítása és a strukturált 'Akta' elkészítése a GPU számára. """
engine_safe = engine or ""
year_safe = str(year) if year else ""
logger.info(f"🔎 Mesterlövész Kutatás: {make} {model} (Motor: {engine_safe})")
# 1. TIER: Ingyenes, Célzott Keresések (A legmegbízhatóbb források)
queries = [
("ULTIMATE_SPECS", f"{make} {model} {engine_safe} {year_safe} site:ultimatespecs.com"),
("AUTO_DATA", f"{make} {model} {engine_safe} {year_safe} site:auto-data.net"),
("COMMON_ISSUES", f"{make} {model} {engine_safe} reliability common problems")
]
tasks = [self.fetch_ddg_targeted(label, q) for label, q in queries]
search_results = await asyncio.gather(*tasks)
# 2. TIER: Fizetős / Kvótás API-k (Példa a DVLA helyére)
# Ha a jövőben bejön brit rendszám, itt hívjuk meg a DVLA-t:
# if has_uk_plate and self.dvla_quota.can_make_request():
# uk_data = await self.fetch_dvla_data(plate)
# search_results.append(uk_data)
# 3. ÖSSZESÍTÉS (Az Akta összeállítása)
# Maximalizáljuk a szöveg hosszát, hogy az AI GPU ne fulladjon le!
full_context = "\n".join(search_results)
if len(full_context) > 2500:
full_context = full_context[:2500] + "\n...[TRUNCATED TO SAVE GPU TOKENS]"
# Regex alapú specifikáció kinyerés
extracted_specs = self.extract_specs_from_text(full_context)
full_text = ""
try:
if len(full_context.strip()) > 150: # Csökkentettük az elvárást, mert a célzott keresés tömörebb
await db.execute(
update(VehicleModelDefinition)
.where(VehicleModelDefinition.id == vehicle_id)
.values(
raw_search_context=full_context,
research_metadata=extracted_specs,
status='awaiting_ai_synthesis', # Kész az Akta, mehet az Alkimistának!
last_research_at=func.now(),
attempts=current_attempts + 1
)
)
logger.info(f"✅ Akta rögzítve ({len(full_context)} karakter): {make} {model}")
else:
new_status = 'suspended_research' if current_attempts + 1 >= self.max_attempts else 'unverified'
await db.execute(
update(VehicleModelDefinition)
.where(VehicleModelDefinition.id == vehicle_id)
.values(
status=new_status,
attempts=current_attempts + 1,
last_research_at=func.now()
)
)
if new_status == 'suspended_research':
logger.warning(f"🛑 Felfüggesztve (Nincs nyom a weben): {make} {model}")
else:
logger.warning(f"⚠️ Kevés adat: {make} {model}, visszatéve a sorba.")
page = await browser.new_page()
# Gyorsítás: képek, videók és stíluslapok tiltása
await page.route("**/*.{png,jpg,jpeg,gif,css,woff2}", lambda r: r.abort())
await page.goto(url, wait_until="domcontentloaded", timeout=20000)
html = await page.content()
# Kimentjük a tiszta szöveget is, ha az AI-nak kellene később
full_text = await page.evaluate("() => document.body.innerText")
await page.close()
soup = BeautifulSoup(html, 'html.parser')
# Végigfutunk minden táblázat soron
for row in soup.find_all('tr'):
th = row.find('th')
td = row.find('td')
if th and td:
k, v = th.get_text(strip=True).lower(), td.get_text(strip=True)
await db.commit()
# Minden fontos mező kinyerése
if "engine model/code" in k: specs["engine_code"] = v
elif "engine oil capacity" in k: specs["oil_l"] = v
elif "acceleration 0 - 100" in k: specs["acc_0_100"] = v
elif "maximum speed" in k: specs["max_speed"] = v
elif "fuel consumption" in k and "combined" in k: specs["cons_avg"] = v
elif "co2 emissions" in k: specs["co2"] = v
elif "generation" in k: specs["generation"] = v
elif "tires size" in k: specs["tires"] = v
elif "trunk (boot) space" in k: specs["trunk_l"] = v
elif "kerb weight" in k: specs["weight_kg"] = v
elif "drivetrain" in k: specs["drivetrain"] = v
elif "number of gears" in k: specs["transmission"] = v
return specs, full_text
except Exception as e:
await db.rollback()
logger.error(f"🚨 Adatbázis hiba az eredmény mentésénél ({vehicle_id}): {e}")
logger.error(f"Scraping hiba az oldalon ({url}): {e}")
return {}, ""
@classmethod
async def run(cls):
self_instance = cls()
logger.info("🚀 Vehicle Researcher 2.5 ONLINE (Sniper & Quota Manager)")
while True:
try:
async with AsyncSessionLocal() as db:
# ATOMI ZÁROLÁS
query = text("""
UPDATE vehicle.vehicle_model_definitions
SET status = 'research_in_progress'
WHERE id = (
SELECT id FROM vehicle.vehicle_model_definitions
WHERE status IN ('unverified', 'awaiting_research', 'ACTIVE')
AND attempts < :max_attempts
AND is_manual = FALSE
ORDER BY
CASE WHEN make = 'TOYOTA' THEN 1 ELSE 2 END,
attempts ASC
FOR UPDATE SKIP LOCKED
LIMIT 1
)
RETURNING id, make, marketing_name, engine_code, year_from, attempts;
""")
result = await db.execute(query, {"max_attempts": self_instance.max_attempts})
task = result.fetchone()
await db.commit()
async def ask_ai_fallback(self, raw_text):
"""Ha a BeautifulSoup nem talál táblázatot, megkérjük az Ollamát."""
if not raw_text or len(raw_text) < 200: return {}
prompt = f"Extract vehicle specs (engine_code, oil_capacity, tires, generation) as JSON from this text: {raw_text[:2500]}"
try:
async with httpx.AsyncClient(timeout=30.0) as client:
r = await client.post(self.ollama_url, json={
"model": "qwen2.5-coder:14b",
"prompt": prompt,
"stream": False,
"format": "json"
})
return json.loads(r.json().get("response", "{}"))
except: return {}
if task:
v_id, v_make, v_model, v_engine, v_year, v_attempts = task
async with AsyncSessionLocal() as process_db:
await self_instance.research_vehicle(process_db, v_id, v_make, v_model, v_engine, v_year, v_attempts)
await asyncio.sleep(2) # Rate limit védelem a DDG felé
async def process_vehicle(self, v_id, make, model, year, kw, browser):
"""Egy jármű dúsításának teljes folyamata."""
async with self.semaphore:
logger.info(f"🔍 Kutatás: {make} {model} ({year}) | kW: {kw}")
url = await self.get_url(make, model, year, kw)
specs = {}
if url:
logger.info(f"🔗 Találat: {url}")
specs, raw_text = await self.scrape_auto_data(url, browser)
# Ha a táblázatból nem jött ki elég adat, jöhet az AI fallback
if len(specs) < 3:
ai_specs = await self.ask_ai_fallback(raw_text)
specs.update(ai_specs)
# MENTÉS: Minden szál saját adatbázis kapcsolatot használ a biztonság érdekében
async with AsyncSessionLocal() as db:
# Csak akkor validation_ready, ha találtunk adatot. Ha nem, külön státuszba tesszük.
new_status = 'validation_ready' if len(specs) > 0 else 'research_failed_empty'
update_query = text("""
UPDATE vehicle.vehicle_model_definitions
SET specifications = specifications || CAST(:specs AS JSONB),
status = :status,
last_research_at = now()
WHERE id = :id
""")
await db.execute(update_query, {
"specs": json.dumps(specs),
"status": new_status,
"id": v_id
})
await db.commit()
if len(specs) > 0:
logger.info(f"✅ SIKER: {make} {model} ({len(specs)} adat kinyerve)")
else:
await asyncio.sleep(30)
logger.warning(f"❌ SIKERTELEN: {make} {model} (nem találtunk adatot a neten)")
except Exception as e:
logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
await asyncio.sleep(10)
    async def run(self):
        """Main loop of the R2 researcher.

        Atomically claims up to 10 'enrich_ready' rows (flipping them to
        'research_in_progress'), then enriches them concurrently via
        process_vehicle using one shared headless browser. Sleeps when the
        queue is empty; logs and backs off on any unexpected error.
        """
        logger.info("🚀 R2-Kutató MASTER-EDITION (0kW fix + AI Fallback) ONLINE")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            while True:
                try:
                    async with AsyncSessionLocal() as db:
                        # Claim 10 cars for parallel processing
                        res = await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions SET status = 'research_in_progress'
                            WHERE id IN (
                                SELECT id FROM vehicle.vehicle_model_definitions
                                WHERE status = 'enrich_ready'
                                LIMIT 10
                            ) RETURNING id, make, marketing_name, year_from, power_kw
                        """))
                        rows = res.fetchall()
                        await db.commit()
                    if not rows:
                        # Nothing to do — poll again shortly.
                        await asyncio.sleep(15)
                        continue
                    tasks = [self.process_vehicle(r[0], r[1], r[2], r[3], r[4], browser) for r in rows]
                    await asyncio.gather(*tasks)
                except Exception as e:
                    logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                    await asyncio.sleep(10)
if __name__ == "__main__":
try:
asyncio.run(VehicleResearcher.run())
except KeyboardInterrupt:
logger.info("🛑 Kutató robot leállítva.")
asyncio.run(VehicleResearcher().run())

View File

@@ -1,224 +1,232 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_3_alchemist_pro.py
"""
Robot 3: Alchemist Pro - AI Szintézis és Kapuőr
Javítások:
- Batch Size: 3 (Stabilitás a 14b modellhez)
- Szigorú Gatekeeper (Arany státusz ellenőrzés)
- Adatmegőrzés: Az AI nem bírálja felül a szótár alapú RDW adatokat (kW/ccm).
"""
import asyncio
import logging
import datetime
import random
import sys
import json
import os
from sqlalchemy import text, func, update, case
import re
from sqlalchemy import text, update, func
from sqlalchemy.ext.asyncio import AsyncSession
import httpx
from app.database import AsyncSessionLocal
from app.models.vehicle_definitions import VehicleModelDefinition
from app.models.asset import AssetCatalog
from app.services.ai_service import AIService
from app.models import VehicleModelDefinition
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Vehicle-Alchemist-Pro: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Vehicle-Robot-3-Alchemist-Pro")
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] R3-Alchemist-Pro: %(message)s',
stream=sys.stdout
)
logger = logging.getLogger("Robot-3-Alchemist-Pro")
class TechEnricher:
"""
Vehicle Robot 3: Alchemist Pro (Atomi Zárolás + Kézi Moderáció Patch)
Tiszta GPU fókusz: Csak az AI elemzésre és adategyesítésre koncentrál.
Nincs felesleges webkeresés. Szigorú, de intelligens Sane-Check.
"""
OLLAMA_URL = "http://sf_ollama:11434/api/generate"
OLLAMA_MODEL = "qwen2.5-coder:14b" # A 14b paraméteres modell az agy
MAX_ATTEMPTS = 3
TIMEOUT_SECONDS = 45 # Megemelt timeout a 14b modell lassabb válaszideje miatt
BATCH_SIZE = 3 # Maximum 3 párhuzamos AI hívás a CPU fagyás elkerülésére
class AlchemistPro:
def __init__(self):
self.max_attempts = 5
self.daily_ai_limit = int(os.getenv("AI_DAILY_LIMIT", "10000"))
self.ai_calls_today = 0
self.last_reset_date = datetime.date.today()
self.client = httpx.AsyncClient(timeout=TIMEOUT_SECONDS)
def check_budget(self) -> bool:
if datetime.date.today() > self.last_reset_date:
self.ai_calls_today = 0
self.last_reset_date = datetime.date.today()
return self.ai_calls_today < self.daily_ai_limit
    async def close(self):
        # Dispose of the shared httpx AsyncClient (releases pooled connections).
        await self.client.aclose()
def validate_merged_data(self, merged_kw: int, merged_ccm: int, v_class: str, fuel: str, current_attempts: int) -> tuple[bool, str]:
""" Intelligens validáció a MERGE után. Visszaadja a státuszt és a hiba okát. """
if merged_ccm > 18000:
return False, f"Irreális CCM érték ({merged_ccm})"
if merged_kw > 1500 and v_class != "truck":
return False, f"Irreális KW érték ({merged_kw})"
# Ha hiányzik a KW
if merged_kw == 0:
if current_attempts < 3:
return False, "Hiányzó KW adat. Újrakutatás javasolt."
else:
logger.warning("Sane-check: Többszöri próbálkozás után sincs KW, de átengedjük részlegesként.")
# Ha hiányzik a CCM (és belsőégésű)
if merged_ccm == 0 and "electric" not in fuel and "elektric" not in fuel and v_class != "trailer":
if current_attempts < 3:
return False, "Hiányzó CCM (belsőégésű motornál). Újrakutatás javasolt."
else:
logger.warning("Sane-check: Többszöri próbálkozás után sincs CCM, átengedjük részlegesként.")
return True, "OK"
async def process_single_record(self, db, record_id: int, base_info: dict, current_attempts: int):
# Pontos azonosító a logokhoz (Márka, Modell, ID, RDW adatok)
v_ident = f"{base_info['make'].upper()} {base_info['m_name']} (ID: {record_id}, RDW: {base_info['rdw_ccm']}ccm, KW: {base_info['rdw_kw']})"
attempt_str = f"[Próba: {current_attempts + 1}/{self.max_attempts}]"
async def fetch_vehicle_batch_for_processing(self, db: AsyncSession):
"""Kiválasztja azokat a járműveket, ahol a 2.1-es robot végzett, de még nem 'Arany'."""
query = text("""
SELECT id, make, marketing_name, power_kw, engine_capacity,
fuel_type, raw_api_data, raw_search_context, attempts,
vehicle_class, trim_level, transmission_type, body_type
FROM vehicle.vehicle_model_definitions
WHERE status = 'awaiting_ai_synthesis'
AND attempts < :max_attempts
AND is_manual = FALSE
ORDER BY priority_score DESC NULLS LAST, id ASC
FOR UPDATE SKIP LOCKED
LIMIT :batch_size
""")
result = await db.execute(query, {"max_attempts": MAX_ATTEMPTS, "batch_size": BATCH_SIZE})
rows = result.fetchall()
ai_data = {} # Üres dict, ha az AI hívás elszállna
try:
logger.info(f"🧠 AI dúsítás indul: {v_ident} {attempt_str}")
# 1. LÉPÉS: AI Hívás (Rábízzuk az adatokat a modellre)
ai_data = await AIService.get_clean_vehicle_data(
base_info['make'],
base_info['m_name'],
base_info
)
if not ai_data:
raise ValueError("Teljesen üres AI válasz (API hiba vagy extrém hallucináció).")
# 2. LÉPÉS: HIBRID MERGE (Még a validáció előtt!)
# Az RDW adatok felülbírálják az AI-t a hatósági paramétereknél
final_kw = base_info['rdw_kw'] if base_info['rdw_kw'] > 0 else int(ai_data.get("kw", 0) or 0)
final_ccm = base_info['rdw_ccm'] if base_info['rdw_ccm'] > 0 else int(ai_data.get("ccm", 0) or 0)
# Üzemanyag tisztítása
fuel_rdw = base_info.get('rdw_fuel', '')
final_fuel = fuel_rdw if fuel_rdw and fuel_rdw != "Unknown" else ai_data.get("fuel_type", "petrol")
final_engine = base_info['rdw_engine'] if base_info['rdw_engine'] else ai_data.get("engine_code", "Unknown")
final_euro = base_info['rdw_euro'] or ai_data.get("euro_classification")
final_cylinders = base_info['rdw_cylinders'] or ai_data.get("cylinders")
# 3. LÉPÉS: Intelligens Validáció
is_valid, error_msg = self.validate_merged_data(final_kw, final_ccm, base_info['v_type'], final_fuel.lower(), current_attempts)
if not is_valid:
raise ValueError(f"Validációs hiba: {error_msg}")
# 4. LÉPÉS: Mentés az Arany Katalógusba
clean_model = str(ai_data.get("marketing_name", base_info['m_name']))[:50].upper()
cat_stmt = text("""
INSERT INTO vehicle.vehicle_catalog
(master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING
RETURNING id;
""")
await db.execute(cat_stmt, {
"m_id": record_id,
"make": base_info['make'].upper(),
"model": clean_model,
"kw": final_kw,
"ccm": final_ccm,
"fuel": final_fuel,
"factory": json.dumps(ai_data)
vehicles = []
for row in rows:
vehicles.append({
"id": row[0], "make": row[1], "marketing_name": row[2],
"power_kw": row[3] or 0, "engine_capacity": row[4] or 0,
"fuel_type": row[5] or "Unknown", "raw_api_data": row[6] or {},
"raw_search_context": row[7] or "", "attempts": row[8] or 0,
"vehicle_class": row[9], "trim_level": row[10],
"transmission_type": row[11], "body_type": row[12]
})
return vehicles
# 5. LÉPÉS: Staging tábla (VMD) lezárása
await db.execute(
update(VehicleModelDefinition)
.where(VehicleModelDefinition.id == record_id)
.values(
status="gold_enriched",
engine_capacity=final_ccm,
power_kw=final_kw,
fuel_type=final_fuel,
engine_code=final_engine,
euro_classification=final_euro,
cylinders=final_cylinders,
specifications=ai_data, # Elmentjük az AI teljes outputját a mestertáblába is
updated_at=func.now()
)
)
await db.commit()
logger.info(f"✨ ARANY REKORD KÉSZ: {v_ident}")
self.ai_calls_today += 1
def build_prompt(self, vehicle_data: dict) -> str:
"""Megfogalmazza a feladatot az AI számára a 14b modell erejét kihasználva."""
make = vehicle_data["make"]
model = vehicle_data["marketing_name"]
# Rövidítjük a kontextust, hogy beleférjen a kontextus ablakba
raw_api = json.dumps(vehicle_data["raw_api_data"], ensure_ascii=False)[:1000]
raw_context = (vehicle_data["raw_search_context"] or "")[:2000]
prompt = f"""
Analyze the vehicle data and return missing information in valid JSON format.
Vehicle: {make} {model}
Current Specs:
- Power: {vehicle_data['power_kw']} kW (0 means missing)
- Engine: {vehicle_data['engine_capacity']} ccm (0 means missing)
- Fuel: {vehicle_data['fuel_type']}
Context Data: {raw_api}
Search Snippets: {raw_context}
INSTRUCTIONS:
1. Identify trim_level (e.g., GTI, AMG, Highline, Titanium).
2. Identify transmission (MANUAL, AUTOMATIC, CVT, DCT).
3. Identify body_type (SEDAN, SUV, HATCHBACK, ESTATE, COUPE).
4. If Power is 0, estimate it based on the engine size and fuel in context.
5. If Engine is 0, estimate it based on model name.
Return ONLY a JSON object:
{{
"trim_level": "string",
"transmission": "string",
"body_type": "string",
"estimated_kw": integer_or_null,
"estimated_ccm": integer_or_null
}}
"""
return prompt.strip()
async def call_ollama(self, prompt: str) -> dict:
"""Kommunikáció az Ollama szerverrel."""
payload = {
"model": OLLAMA_MODEL,
"prompt": prompt,
"format": "json",
"stream": False,
"options": {"temperature": 0.1, "top_p": 0.9}
}
try:
response = await self.client.post(OLLAMA_URL, json=payload)
response.raise_for_status()
data = response.json()
return json.loads(data.get("response", "{}"))
except Exception as e:
await db.rollback()
logger.warning(f"⚠️ Alkimista hiba - {v_ident}: {e}")
raise ValueError(f"Ollama hiba: {str(e)}")
def merge_vehicle_data(self, vehicle: dict, ai_result: dict) -> dict:
"""Összefésüli a meglévő adatokat az AI eredményeivel, prioritást adva a meglévőnek."""
merged = vehicle.copy()
# A szöveges mezőket frissítjük, ha az AI talált jobbat
for field, ai_key in [("trim_level", "trim_level"), ("transmission_type", "transmission"), ("body_type", "body_type")]:
if not merged.get(field) and ai_result.get(ai_key):
merged[field] = str(ai_result[ai_key]).upper() if field != "trim_level" else ai_result[ai_key]
# MATEK VÉDELEM: Csak akkor írjuk be az AI becslését, ha a 2.1-es robot nem talált adatot (még mindig 0)
if merged["power_kw"] == 0 and ai_result.get("estimated_kw"):
merged["power_kw"] = int(ai_result["estimated_kw"])
if merged["engine_capacity"] == 0 and ai_result.get("estimated_ccm"):
merged["engine_capacity"] = int(ai_result["estimated_ccm"])
# Ha elértük a limitet, KÉZI MODERÁCIÓRA küldjük, egyébként vissza a Kutatónak
new_status = 'manual_review_needed' if current_attempts + 1 >= self.max_attempts else 'unverified'
# Elmentjük az AI részleges válaszát (vagy a hibát), hogy az admin lássa, mit rontott el a gép
review_data = ai_data if ai_data else {"error": "Nincs értékelhető JSON adat az AI-tól", "raw_context": base_info['web_context']}
await db.execute(
update(VehicleModelDefinition)
.where(VehicleModelDefinition.id == record_id)
.values(
attempts=current_attempts + 1,
last_error=str(e)[:200],
status=new_status,
specifications=review_data, # Kézi ellenőrzéshez beírjuk a törött adatot!
return merged
async def update_vehicle_record(self, db: AsyncSession, vehicle_id: int, merged_data: dict):
"""Végrehajtja a mentést és a Kapuőr logikát."""
kw = merged_data.get("power_kw", 0)
ccm = merged_data.get("engine_capacity", 0)
fuel = str(merged_data.get("fuel_type", "")).lower()
v_class = str(merged_data.get("vehicle_class", "")).lower()
# Kapuőr szabályok
is_electric = any(x in fuel for x in ['electr', 'elektri', 'hydrogen'])
is_trailer = 'trailer' in v_class
is_gold = False
if is_trailer: is_gold = True
elif is_electric: is_gold = kw > 0
else: is_gold = (kw > 0 and ccm > 0)
if is_gold:
new_status = "gold_enriched"
new_attempts = 0
msg = "✨ ARANY"
else:
new_attempts = merged_data["attempts"] + 1
new_status = "manual_review_needed" if new_attempts >= MAX_ATTEMPTS else "unverified"
msg = "🔄 VISSZADOBVA"
update_values = {
"trim_level": merged_data.get("trim_level"),
"transmission_type": merged_data.get("transmission_type"),
"body_type": merged_data.get("body_type"),
"power_kw": kw,
"engine_capacity": ccm,
"status": new_status,
"attempts": new_attempts,
"updated_at": func.now()
}
stmt = update(VehicleModelDefinition).where(VehicleModelDefinition.id == vehicle_id).values(**update_values)
await db.execute(stmt)
logger.info(f"{msg}: {merged_data['make']} {merged_data['marketing_name']} (Státusz: {new_status})")
async def process_ai_task(self, vehicle: dict):
"""AI feldolgozás párhuzamosítható része."""
try:
prompt = self.build_prompt(vehicle)
ai_result = await self.call_ollama(prompt)
return vehicle, ai_result, None
except Exception as e:
return vehicle, None, e
async def process_batch(self, db: AsyncSession, vehicles: list):
"""Batch feldolgozás: Párhuzamos AI, majd szekvenciális DB mentés."""
# 1. AI kérések párhuzamosan (CPU kímélő batch mérettel)
tasks = [self.process_ai_task(v) for v in vehicles]
results = await asyncio.gather(*tasks)
# 2. Mentés szekvenciálisan a DB lakatok elkerülésére
for vehicle, ai_result, error in results:
if error:
logger.error(f"Hiba {vehicle['id']}: {error}")
# Hiba esetén növeljük a próbálkozások számát
stmt = update(VehicleModelDefinition).where(VehicleModelDefinition.id == vehicle['id']).values(
attempts=vehicle['attempts'] + 1,
updated_at=func.now()
)
)
await db.commit()
if new_status == 'unverified':
logger.info(f"♻️ Akta visszaküldve a Robot-2-nek (Kutató). {attempt_str}")
await db.execute(stmt)
else:
logger.error(f"🛑 Max próbálkozás elérve! Kézi moderációra küldve: {v_ident}")
merged = self.merge_vehicle_data(vehicle, ai_result)
await self.update_vehicle_record(db, vehicle['id'], merged)
await db.commit()
async def run(self):
logger.info(f"🚀 Alchemist Pro HIBRID ONLINE (Atomi Zárolás + Moderáció Patch)")
logger.info(f"🚀 Robot 3 indítva. Modell: {OLLAMA_MODEL}, Batch: {BATCH_SIZE}")
while True:
if not self.check_budget():
logger.warning("💸 Napi AI limit kimerítve! Pihenés...")
await asyncio.sleep(3600); continue
try:
async with AsyncSessionLocal() as db:
# ATOMI ZÁROLÁS (A "Szent Grál" a race condition ellen)
query = text("""
UPDATE vehicle.vehicle_model_definitions
SET status = 'ai_synthesis_in_progress'
WHERE id = (
SELECT id FROM vehicle.vehicle_model_definitions
WHERE status IN ('awaiting_ai_synthesis', 'ACTIVE')
AND attempts < :max_attempts
AND is_manual = FALSE
ORDER BY
CASE WHEN status = 'awaiting_ai_synthesis' THEN 1 ELSE 2 END,
priority_score DESC
FOR UPDATE SKIP LOCKED
LIMIT 1
)
RETURNING id, make, marketing_name, vehicle_class, power_kw, engine_capacity,
fuel_type, engine_code, euro_classification, cylinders, raw_search_context, attempts;
""")
result = await db.execute(query, {"max_attempts": self.max_attempts})
task = result.fetchone()
await db.commit()
if task:
# Szétbontjuk a lekérdezett rekordot a base_info dict-be
r_id = task[0]
base_info = {
"make": task[1], "m_name": task[2], "v_type": task[3] or "car",
"rdw_kw": task[4] or 0, "rdw_ccm": task[5] or 0,
"rdw_fuel": task[6] or "petrol", "rdw_engine": task[7] or "",
"rdw_euro": task[8], "rdw_cylinders": task[9],
"web_context": task[10] or ""
}
attempts = task[11]
# Külön adatbázis kapcsolat a feldolgozáshoz (hosszú AI hívás miatt)
async with AsyncSessionLocal() as process_db:
await self.process_single_record(process_db, r_id, base_info, attempts)
# GPU hűtés / Ollama rate limit
await asyncio.sleep(random.uniform(1.5, 3.5))
else:
logger.info("😴 Nincs feldolgozandó akta, az Alkimista pihen...")
await asyncio.sleep(15)
vehicles = await self.fetch_vehicle_batch_for_processing(db)
if vehicles:
logger.info(f"📦 Feldolgozás: {len(vehicles)} jármű...")
await self.process_batch(db, vehicles)
await asyncio.sleep(1)
else:
await asyncio.sleep(10)
except Exception as e:
logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
await asyncio.sleep(10)
logger.error(f"Főciklus hiba: {e}")
await asyncio.sleep(5)
if __name__ == "__main__":
    # Entry point: run the enrichment worker as a long-lived process.
    # Fix: the previous version first called asyncio.run(TechEnricher().run())
    # with an undefined TechEnricher name (merge-conflict residue), which
    # would crash with NameError before AlchemistPro ever started.
    robot = AlchemistPro()
    asyncio.run(robot.run())

View File

@@ -0,0 +1,261 @@
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_4_validator.py
#!/usr/bin/env python3
"""
Robot-4-Validator (Publisher)

Final stage of the MDM pipeline. Responsibilities:
1. Claims up to 50 vehicles in 'gold_enriched' status from the VMD table
   (FOR UPDATE SKIP LOCKED)
2. Validates the essential fields (make, marketing_name, power_kw, engine_capacity)
3. On success, assembles a factory_data JSON and UPSERTs into vehicle.vehicle_catalog
   (ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full)
4. Sets the VMD status to 'published'
5. On failure, sets the status to 'manual_review_needed'

AI-free, pure database logic.
"""
import asyncio
import logging
import sys
import json
from datetime import datetime
from sqlalchemy import text, update, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
# Log to stdout so the container runtime captures the worker's output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] R4-Publisher: %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger("Robot-4-Publisher")
# Number of rows claimed per iteration.
BATCH_SIZE = 50
class VehicleRobot4Validator:
    """Robot-4 (Publisher): validates 'gold_enriched' vehicles and publishes
    them into vehicle.vehicle_catalog.

    Pure database logic, no AI involved. Rows that fail validation are
    routed to manual review instead of being published.
    """

    def __init__(self):
        # Stateless worker: everything it needs comes from the DB session.
        pass

    async def fetch_gold_enriched_batch(self, db: "AsyncSession"):
        """Fetch one batch of 'gold_enriched' vehicles.

        FOR UPDATE SKIP LOCKED prevents concurrent workers from claiming
        the same rows twice.
        """
        query = text("""
            SELECT id, make, marketing_name, power_kw, engine_capacity,
                   fuel_type, year_from, trim_level, transmission_type,
                   body_type, specifications, status
            FROM vehicle.vehicle_model_definitions
            WHERE status = 'gold_enriched'
            ORDER BY priority_score DESC NULLS LAST, id ASC
            FOR UPDATE SKIP LOCKED
            LIMIT :batch_size
        """)
        result = await db.execute(query, {"batch_size": BATCH_SIZE})
        rows = result.fetchall()
        return rows

    def validate_vehicle(self, row):
        """Basic sanity checks on a single fetched row.

        Returns (True, "valid") or (False, reason_code).
        Electric/hydrogen vehicles may have engine_capacity == 0; every
        other vehicle needs a positive displacement.
        """
        make = row.make
        marketing_name = row.marketing_name
        power_kw = row.power_kw
        engine_capacity = row.engine_capacity
        fuel_type = (row.fuel_type or "").lower()
        # 1. make and marketing_name must both be present
        if not make or not marketing_name:
            logger.warning(f"ID {row.id}: Hiányzó make vagy marketing_name")
            return False, "missing_make_or_name"
        # 2. power must be strictly positive — even for electric vehicles
        if power_kw is None or power_kw <= 0:
            logger.warning(f"ID {row.id}: Érvénytelen power_kw ({power_kw})")
            return False, "invalid_power"
        # 3. engine capacity may be 0 (electric), but never NULL or negative
        if engine_capacity is None or engine_capacity < 0:
            logger.warning(f"ID {row.id}: Érvénytelen engine_capacity ({engine_capacity})")
            return False, "invalid_engine_capacity"
        # Exception: electric vehicles are allowed engine_capacity == 0
        is_electric = any(x in fuel_type for x in ['electr', 'elektri', 'hydrogen'])
        if not is_electric and engine_capacity == 0:
            logger.warning(f"ID {row.id}: Nem elektromos jármű engine_capacity 0 (fuel: {fuel_type})")
            return False, "zero_engine_capacity_non_electric"
        # 4. unknown fuel type: warn only, do not reject
        if not fuel_type or fuel_type == "unknown":
            logger.warning(f"ID {row.id}: Ismeretlen fuel_type")
        # 5. implausible year: warn only — the record may simply be incomplete
        if row.year_from is None or row.year_from <= 1900:
            logger.warning(f"ID {row.id}: Érvénytelen year_from ({row.year_from})")
        return True, "valid"

    async def publish_to_catalog(self, db: "AsyncSession", row):
        """Publish one validated row:

        - assembles the factory_data JSON object
        - UPSERTs into vehicle.vehicle_catalog
        - sets the VMD status to 'published'
        """
        factory_data = {
            "trim_level": row.trim_level or "",
            "transmission_type": row.transmission_type or "",
            "body_type": row.body_type or "",
            "specifications": row.specifications or {},
            "source": "robot_4_publisher",
            # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
            # switching to datetime.now(timezone.utc) would add an offset to
            # the stored ISO string — confirm downstream consumers first.
            "published_at": datetime.utcnow().isoformat()
        }
        # UPSERT into vehicle_catalog; conflict target is the
        # uix_vehicle_catalog_full constraint. The marketing_name becomes
        # the catalog "model" column.
        upsert_query = text("""
            INSERT INTO vehicle.vehicle_catalog
                (make, model, year_from, fuel_type, power_kw, engine_capacity, factory_data, master_definition_id)
            VALUES
                (:make, :model, :year_from, :fuel_type, :power_kw, :engine_capacity, :factory_data, :master_definition_id)
            ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full
            DO UPDATE SET
                power_kw = EXCLUDED.power_kw,
                engine_capacity = EXCLUDED.engine_capacity,
                factory_data = EXCLUDED.factory_data,
                master_definition_id = EXCLUDED.master_definition_id
            RETURNING id
        """)
        params = {
            "make": row.make,
            "model": row.marketing_name,
            "year_from": row.year_from if row.year_from else 0,
            "fuel_type": row.fuel_type or "Unknown",
            "power_kw": row.power_kw,
            "engine_capacity": row.engine_capacity,
            "factory_data": json.dumps(factory_data),
            "master_definition_id": row.id
        }
        result = await db.execute(upsert_query, params)
        catalog_id = result.scalar()
        logger.info(f"ID {row.id}: Sikeres publikálás a katalógusba (catalog_id: {catalog_id})")
        # Flip the staging record to 'published'
        update_query = text("""
            UPDATE vehicle.vehicle_model_definitions
            SET status = 'published',
                updated_at = NOW()
            WHERE id = :id
        """)
        await db.execute(update_query, {"id": row.id})
        logger.info(f"ID {row.id}: Státusz frissítve published-re")

    async def mark_for_manual_review(self, db: "AsyncSession", row, reason):
        """Reject one row: set its VMD status to 'manual_review_needed'."""
        update_query = text("""
            UPDATE vehicle.vehicle_model_definitions
            SET status = 'manual_review_needed',
                last_error = :reason,
                updated_at = NOW()
            WHERE id = :id
        """)
        await db.execute(update_query, {"id": row.id, "reason": reason})
        logger.warning(f"ID {row.id}: Átállítva manual_review_needed-re, ok: {reason}")

    async def process_batch(self):
        """Process one batch inside a single transaction.

        Fix: the original drove transactions with
        db.execute(text("BEGIN"/"COMMIT"/"ROLLBACK")). SQLAlchemy's
        AsyncSession begins a transaction implicitly on first statement and
        manages commit/rollback itself; issuing raw transaction-control SQL
        through execute() conflicts with that (and asyncpg rejects such
        statements in its prepared-statement path). Use the session API.
        """
        async with AsyncSessionLocal() as db:
            try:
                rows = await self.fetch_gold_enriched_batch(db)
                if not rows:
                    logger.info("Nincs gold_enriched státuszú jármű a feldolgozáshoz.")
                    await db.commit()
                    return 0
                logger.info(f"{len(rows)} gold_enriched jármű lekérdezve.")
                published_count = 0
                manual_review_count = 0
                for row in rows:
                    is_valid, reason = self.validate_vehicle(row)
                    if is_valid:
                        await self.publish_to_catalog(db, row)
                        published_count += 1
                    else:
                        await self.mark_for_manual_review(db, row, reason)
                        manual_review_count += 1
                await db.commit()
                logger.info(f"Köteg feldolgozva. Publikálva: {published_count}, Kézi ellenőrzés: {manual_review_count}")
                return published_count
            except Exception as e:
                await db.rollback()
                logger.error(f"Hiba a köteg feldolgozásában: {e}", exc_info=True)
                raise

    async def run(self, max_iterations=None):
        """Run the robot continuously (daemon mode).

        When no gold_enriched data is available, waits 30 seconds and
        retries. max_iterations=None means loop forever.
        """
        iteration = 0
        total_published = 0
        while True:
            if max_iterations is not None and iteration >= max_iterations:
                logger.info(f"Elérte a maximális iterációt ({max_iterations}).")
                break
            iteration += 1
            logger.info(f"--- Iteráció {iteration} ---")
            published = await self.process_batch()
            total_published += published
            if published == 0:
                # NOTE(review): this branch also triggers when a batch
                # contained rows but none were publishable — the 30s pause is
                # harmless, but the log message is then slightly misleading.
                logger.info("Nincs gold_enriched adat. Várakozás 30 másodperc...")
                await asyncio.sleep(30)
                continue
            # Short pause before claiming the next batch.
            await asyncio.sleep(1)
        logger.info(f"Robot leállt. Összesen publikálva: {total_published} jármű.")
        return total_published
async def main():
    """Entry point: construct the publisher robot and run it in daemon mode."""
    worker = VehicleRobot4Validator()
    try:
        # max_iterations=None -> infinite loop (daemon mode)
        published_total = await worker.run(max_iterations=None)
        logger.info(f"Robot sikeresen lefutott. Publikálva: {published_total}")
    except Exception as exc:
        logger.error(f"Robot futás közben hiba történt: {exc}", exc_info=True)
        sys.exit(1)
if __name__ == "__main__":
    # Script entry point: run the publisher as a long-lived process.
    asyncio.run(main())

View File

@@ -4,7 +4,7 @@ import sys
from sqlalchemy import select, and_, text, update
from sqlalchemy.orm import joinedload
from app.database import AsyncSessionLocal
from app.models.asset import Asset, AssetCatalog
from app.models import Asset, AssetCatalog
from app.services.ai_service import AIService
logging.basicConfig(