Átlagos kiegészítések, jó sok
This commit is contained in:
208
backend/app/workers/vehicle/.archive/vehicle_robot_0_discovery_engine_1.0.py.old
Executable file
208
backend/app/workers/vehicle/.archive/vehicle_robot_0_discovery_engine_1.0.py.old
Executable file
@@ -0,0 +1,208 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from sqlalchemy import text, select
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.asset import AssetCatalog
|
||||
|
||||
# MB 2.0 Szigorú naplózás
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-0-Discovery: %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Vehicle-Robot-0-Discovery")
|
||||
|
||||
class DiscoveryEngine:
    """RDW discovery engine: hourly watchdog plus monthly differential sync.

    Design notes (Industrial Mode 2.0):
      1. Watchdog: finds and releases stuck tasks every hour.
      2. Differential sync: records only missing or new models and skips
         entries that are already 'gold_enriched'.
      3. Monthly scheduler: pages through the full RDW database once a month.
    """

    # Optional RDW app token; without it the header dict stays empty.
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    # State file so a Docker restart does not immediately re-trigger a full sync.
    SYNC_STATE_FILE = "/app/temp/.last_rdw_sync"

    @staticmethod
    async def run_watchdog():
        """Phase 1: the watchdog (dead-letter queue manager)."""
        logger.info("🐕 Őrkutya: Beragadt feladatok keresése a rendszerben...")
        try:
            async with AsyncSessionLocal() as db:
                # (A) Hunter cleanup: rows stuck in 'processing' go back to
                # 'pending' (the Hunter presumably died mid-task).
                hunter_result = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'pending' WHERE status = 'processing' RETURNING id;"))
                hunter_resets = len(hunter_result.fetchall())
                if hunter_resets > 0:
                    logger.warning(f"🔄 {hunter_resets} db beragadt Hunter feladat (processing) visszaállítva 'pending'-re.")

                # (B) AI robot cleanup with a 2-hour timeout.
                ai_query = text("""
                    UPDATE vehicle.vehicle_model_definitions
                    SET status = CASE
                        WHEN status = 'research_in_progress' THEN 'unverified'
                        WHEN status = 'ai_synthesis_in_progress' THEN 'awaiting_ai_synthesis'
                    END
                    WHERE status IN ('research_in_progress', 'ai_synthesis_in_progress')
                    AND updated_at < NOW() - INTERVAL '2 hours'
                    RETURNING id;
                """)
                ai_result = await db.execute(ai_query)
                ai_resets = len(ai_result.fetchall())
                if ai_resets > 0:
                    logger.warning(f"🔄 {ai_resets} db beragadt AI feladat visszaállítva.")

                await db.commit()
        except Exception as e:
            logger.error(f"❌ Őrkutya hiba: {e}")

    @staticmethod
    async def seed_manual_bootstrap():
        """Phase 2: insert the bootstrap rows when they are missing."""
        initial_data = [
            {"make": "AUDI", "model": "A4", "generation": "B8 (2008-2015)"},  # vehicle_class removed
            {"make": "BMW", "model": "3 SERIES", "generation": "F30 (2012-2019)"},
        ]
        try:
            async with AsyncSessionLocal() as db:
                for item in initial_data:
                    lookup = select(AssetCatalog).where(
                        AssetCatalog.make == item["make"],
                        AssetCatalog.model == item["model"],
                    )
                    already_present = (await db.execute(lookup)).scalar_one_or_none()
                    if already_present is None:
                        db.add(AssetCatalog(**item))
                await db.commit()
        except Exception as e:
            logger.warning(f"Manual bootstrap hiba (Ignorálható, ha az adatbázis már tele van): {e}")

    @classmethod
    async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, params: dict, retries: int = 3):
        """Fault-tolerant HTTP GET: exponential backoff on 429 and transport errors."""
        for attempt in range(retries):
            try:
                resp = await client.get(url, params=params, headers=cls.HEADERS)
            except httpx.RequestError:
                if attempt == retries - 1:
                    return None
                await asyncio.sleep(2 ** attempt)
                continue
            if resp.status_code == 200:
                return resp
            if resp.status_code == 429:
                # Rate limited: back off and retry.
                await asyncio.sleep(2 ** attempt)
            else:
                # Any other status is treated as a hard failure.
                return None
        return None

    @classmethod
    async def seed_from_rdw(cls):
        """Phase 3: remote discovery via differential synchronisation."""
        logger.info("📥 RDW TÖMEGES LETÖLTÉS: Új modellek keresése (Differential Sync)...")

        page_size = 10000
        offset = 0
        inserted_count = 0
        updated_count = 0

        async with httpx.AsyncClient(timeout=60.0) as client:
            while True:
                params = {
                    "$select": "merk,handelsbenaming,voertuigsoort,count(*) as total",
                    "$group": "merk,handelsbenaming,voertuigsoort",
                    "$order": "total DESC",
                    "$limit": page_size,
                    "$offset": offset,
                }

                resp = await cls.fetch_with_retry(client, "https://opendata.rdw.nl/resource/m9d7-ebf2.json", params)
                if not resp:
                    break
                raw_data = resp.json()
                if not raw_data:
                    break

                logger.info(f"📊 Lapozás: {offset} - {offset + len(raw_data)} tételek analízise...")

                async with AsyncSessionLocal() as db:
                    for entry in raw_data:
                        make = str(entry.get("merk", "")).upper().strip()
                        model = str(entry.get("handelsbenaming", "")).upper().strip()
                        v_kind = entry.get("voertuigsoort", "")
                        total_count = int(entry.get("total", 0))

                        if not (make and model):
                            continue

                        # Map the Dutch vehicle kind onto our internal class.
                        if "Personenauto" in v_kind:
                            v_class = 'car'
                        elif "Motorfiets" in v_kind:
                            v_class = 'motorcycle'
                        else:
                            v_class = 'truck'

                        # Differential-sync SQL with explicit type casts.
                        # RETURNING xmax lets us tell inserts (xmax = 0)
                        # apart from conflict-updates.
                        query = text("""
                            INSERT INTO vehicle.catalog_discovery (make, model, vehicle_class, status, priority_score)
                            SELECT
                                CAST(:make AS VARCHAR),
                                CAST(:model AS VARCHAR),
                                CAST(:v_class AS VARCHAR),
                                'pending',
                                :priority
                            WHERE NOT EXISTS (
                                SELECT 1 FROM vehicle.vehicle_model_definitions
                                WHERE make = CAST(:make AS VARCHAR)
                                AND marketing_name = CAST(:model AS VARCHAR)
                                AND status = 'gold_enriched'
                            )
                            ON CONFLICT (make, model)
                            DO UPDATE SET priority_score = EXCLUDED.priority_score
                            WHERE vehicle.catalog_discovery.status != 'processed'
                            RETURNING xmax;
                        """)

                        result = await db.execute(query, {
                            "make": make, "model": model, "v_class": v_class, "priority": total_count
                        })

                        row = result.fetchone()
                        if row is not None:
                            if row[0] == 0:
                                inserted_count += 1  # fresh insert
                            else:
                                updated_count += 1  # existing row refreshed

                    await db.commit()
                offset += page_size
                await asyncio.sleep(1)

        logger.info(f"✅ RDW Szinkron kész! Új modellek a listán: {inserted_count} | Frissített prioritások: {updated_count}")

        # Record the successful run on the filesystem.
        os.makedirs(os.path.dirname(cls.SYNC_STATE_FILE), exist_ok=True)
        with open(cls.SYNC_STATE_FILE, 'w') as f:
            f.write(datetime.now().isoformat())

    @classmethod
    def should_run_rdw_sync(cls) -> bool:
        """Return True when 30+ days have passed since the last successful sync."""
        if not os.path.exists(cls.SYNC_STATE_FILE):
            return True
        try:
            with open(cls.SYNC_STATE_FILE, 'r') as f:
                last_sync = datetime.fromisoformat(f.read().strip())
        except Exception:
            # Unreadable/corrupt state file: err on the side of running.
            return True
        return datetime.now() - last_sync > timedelta(days=30)

    @classmethod
    async def run(cls):
        """Main loop: monthly sync scheduler plus hourly watchdog heartbeat."""
        logger.info("🚀 ÉLES ÜZEM: Discovery Engine (Differential Sync) & Watchdog indítása...")
        await cls.seed_manual_bootstrap()

        while True:
            # 1. Hourly cleanup.
            await cls.run_watchdog()

            # 2. Check whether the monthly sync is due.
            if cls.should_run_rdw_sync():
                await cls.seed_from_rdw()
            else:
                logger.info("🛌 Az RDW szinkronizáció már lefutott az elmúlt 30 napban. Ugrás...")

            # 3. Sleep one hour (heartbeat).
            logger.info("⏱️ A Discovery Engine most 1 órát pihen a következő Őrkutya futásig.")
            await asyncio.sleep(3600)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the engine's main loop.
    asyncio.run(DiscoveryEngine.run())
|
||||
@@ -0,0 +1,224 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# Naplózás beállítása a standard kimenetre
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
logger = logging.getLogger("Robot-1-Hunter")
|
||||
|
||||
class CatalogHunter:
    """Vehicle Robot 1.9.3: The Truly Invincible Hunter (SAVEPOINT PATCH).

    Handles the ALL_VARIANTS instruction and uses row-level transaction
    protection (savepoints), so one bad record cannot poison a whole batch.
    """
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"

    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Strip every non-alphanumeric character and lower-case the rest."""
        if not text_val:
            return ""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; None/empty/garbage becomes 0."""
        try:
            if value is None or str(value).strip() == "":
                return 0
            return int(float(value))
        except (ValueError, TypeError):
            return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; None/empty/garbage becomes 0.0."""
        try:
            if value is None or str(value).strip() == "":
                return 0.0
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    @classmethod
    async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, retries: int = 3):
        """Fault-tolerant HTTP GET with exponential backoff."""
        for attempt in range(retries):
            try:
                resp = await client.get(url, headers=cls.HEADERS)
            except httpx.RequestError as e:
                if attempt == retries - 1:
                    logger.debug(f"Hálózati hiba: {e}")
                    raise
                await asyncio.sleep(2 ** attempt)
                continue
            if resp.status_code == 200:
                return resp
            if resp.status_code == 429:  # rate limited -> back off
                await asyncio.sleep(2 ** attempt)
            else:
                # Non-retryable status: hand the response back to the caller.
                return resp
        return None

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect fuel, power and engine-code details for one licence plate."""
        results = {
            "power_kw": 0, "engine_code": None, "euro_class": None,
            "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0
        }
        try:
            # Fuel endpoint.
            f_resp = await cls.fetch_with_retry(client, f"{cls.RDW_FUEL}?kenteken={plate}")
            if f_resp and f_resp.status_code == 200 and f_resp.json():
                fuel_row = f_resp.json()[0]
                net_power = cls.parse_int(fuel_row.get("netto_maximum_vermogen") or fuel_row.get("nettomaximumvermogen"))
                cont_power = cls.parse_int(fuel_row.get("nominaal_continu_maximum_vermogen") or fuel_row.get("nominaalcontinuvermogen"))
                results.update({
                    "power_kw": max(net_power, cont_power),
                    "fuel_desc": fuel_row.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": fuel_row.get("euro_klasse") or fuel_row.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(fuel_row.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(fuel_row.get("brandstofverbruik_gecombineerd"))
                })

            # Engine-code endpoint.
            e_resp = await cls.fetch_with_retry(client, f"{cls.RDW_ENGINE}?kenteken={plate}")
            if e_resp and e_resp.status_code == 200 and e_resp.json():
                results["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception:
            # Tech details are best-effort; fall back to the defaults above.
            pass
        return results

    @classmethod
    async def process_make_model(cls, db, task_id, make_name, model_name, v_class, priority):
        """Process one make/model (or ALL_VARIANTS wildcard) discovery task."""
        clean_make = make_name.strip().upper()
        clean_model = model_name.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")

        offset = 0
        async with httpx.AsyncClient(timeout=30.0) as client:
            while True:
                # Dynamic query: the wildcard skips the model filter entirely.
                if clean_model == 'ALL_VARIANTS':
                    params = f"merk={clean_make}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                else:
                    params = f"merk={clean_make}&handelsbenaming={clean_model}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"

                try:
                    r = await cls.fetch_with_retry(client, f"{cls.RDW_MAIN}?{params}")
                    batch = r.json() if r and r.status_code == 200 else []
                except Exception as e:
                    logger.error(f"❌ API hiba: {e}")
                    break

                if not batch:
                    break

                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: one failing record cannot break the block.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)

                            # Resolve the real model name (matters for the wildcard).
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)

                            stmt = insert(VehicleModelDefinition).values(
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                technical_code=plate,
                                engine_capacity=cls.parse_int(item.get("cilinderinhoud")),
                                power_kw=tech["power_kw"],
                                fuel_type=tech["fuel_desc"],
                                engine_code=tech["engine_code"],
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")),
                                max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                body_type=item.get("inrichting"),
                                co2_emissions_combined=tech["co2"],
                                fuel_consumption_combined=tech["consumption"],
                                euro_classification=tech["euro_class"],
                                cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                vehicle_class=v_class,
                                priority_score=priority,
                                status="unverified",  # queued for the R2 Researcher
                                source="MEGA-HUNTER-v1.9.3"
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type']
                            )
                            await db.execute(stmt)
                    except Exception as e:
                        logger.warning(f"⚠️ Sor eldobva ({plate}): {e}")

                # Commit the rows that survived their savepoints.
                await db.commit()

                offset += len(batch)
                if offset >= 500:  # safety cap per make
                    break
                await asyncio.sleep(0.5)

        # Close the discovery task.
        await db.execute(
            text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"),
            {"id": task_id}
        )
        await db.commit()

    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim one pending task and process it."""
        logger.info("🤖 Mega-Hunter v1.9.3 ONLINE (SAVEPOINT ENABLED)")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # ATOMIC CLAIM: find, lock and flip the status in one statement.
                    claim = text("""
                        UPDATE vehicle.catalog_discovery
                        SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.catalog_discovery
                            WHERE status = 'pending'
                            ORDER BY priority_score DESC
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id, make, model, vehicle_class, priority_score;
                    """)

                    result = await db.execute(claim)
                    task = result.fetchone()
                    await db.commit()

                    if task:
                        await cls.process_make_model(db, task[0], task[1], task[2], task[3], task[4])
                    else:
                        # No work available: nap for 30 seconds.
                        await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the hunter's main loop.
    asyncio.run(CatalogHunter.run())
|
||||
@@ -0,0 +1,179 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_catalog_hunter.py
|
||||
# version: 1.9.6
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text, func
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# MB 2.0 Standard Naplózás
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
logger = logging.getLogger("Robot-1-Hunter")
|
||||
|
||||
class CatalogHunter:
    """Vehicle Robot 1.9.6: Mega-Hunter (TIMESTAMP & INTEGRITY PATCH).

    Handles ALL_VARIANTS, savepoints, and every mandatory column the
    model-definition table requires (market, is_manual, timestamps).
    """
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"

    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Strip every non-alphanumeric character and lower-case the rest."""
        if not text_val:
            return ""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; None/empty/garbage becomes 0."""
        try:
            if value is None or str(value).strip() == "":
                return 0
            return int(float(value))
        except (ValueError, TypeError):
            return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; None/empty/garbage becomes 0.0."""
        try:
            if value is None or str(value).strip() == "":
                return 0.0
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    @classmethod
    async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, retries: int = 3):
        """HTTP GET with exponential backoff on 429 and transport errors."""
        for attempt in range(retries):
            try:
                resp = await client.get(url, headers=cls.HEADERS)
            except httpx.RequestError:
                if attempt == retries - 1:
                    raise
                await asyncio.sleep(2 ** attempt)
                continue
            if resp.status_code == 200:
                return resp
            if resp.status_code == 429:
                await asyncio.sleep(2 ** attempt)
            else:
                return resp
        return None

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect fuel, power and engine-code details for one licence plate."""
        results = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await cls.fetch_with_retry(client, f"{cls.RDW_FUEL}?kenteken={plate}")
            if f_resp and f_resp.status_code == 200 and f_resp.json():
                fuel_row = f_resp.json()[0]
                net_power = cls.parse_int(fuel_row.get("netto_maximum_vermogen") or fuel_row.get("nettomaximumvermogen"))
                cont_power = cls.parse_int(fuel_row.get("nominaal_continu_maximum_vermogen") or fuel_row.get("nominaalcontinuvermogen"))
                results.update({
                    "power_kw": max(net_power, cont_power),
                    "fuel_desc": fuel_row.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": fuel_row.get("euro_klasse") or fuel_row.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(fuel_row.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(fuel_row.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await cls.fetch_with_retry(client, f"{cls.RDW_ENGINE}?kenteken={plate}")
            if e_resp and e_resp.status_code == 200 and e_resp.json():
                results["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception:
            # Tech details are best-effort; defaults above remain in place.
            pass
        return results

    @classmethod
    async def process_make_model(cls, db, task_id, make_name, model_name, v_class, priority):
        """Process one make/model (or ALL_VARIANTS wildcard) discovery task."""
        clean_make = make_name.strip().upper()
        clean_model = model_name.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")

        offset = 0
        async with httpx.AsyncClient(timeout=30.0) as client:
            while True:
                # The wildcard skips the model filter entirely.
                if clean_model == 'ALL_VARIANTS':
                    params = f"merk={clean_make}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                else:
                    params = f"merk={clean_make}&handelsbenaming={clean_model}&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"

                try:
                    r = await cls.fetch_with_retry(client, f"{cls.RDW_MAIN}?{params}")
                    batch = r.json() if r and r.status_code == 200 else []
                except Exception:
                    break

                if not batch:
                    break

                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: one failing record cannot break the block.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)

                            stmt = insert(VehicleModelDefinition).values(
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                engine_capacity=cls.parse_int(item.get("cilinderinhoud")),
                                power_kw=tech["power_kw"],
                                fuel_type=tech["fuel_desc"],
                                engine_code=tech["engine_code"],
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                vehicle_class=v_class,
                                priority_score=priority,
                                market='EU',  # mandatory column
                                status="unverified",
                                is_manual=False,
                                created_at=func.now(),  # mandatory timestamps
                                updated_at=func.now(),
                                source="MEGA-HUNTER-v1.9.6"
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type']
                            )
                            await db.execute(stmt)
                    except Exception as e:
                        logger.warning(f"⚠️ Sor eldobva ({plate}): {e}")

                # Commit the rows that survived their savepoints.
                await db.commit()
                offset += len(batch)
                if offset >= 500:  # safety cap per make
                    break
                await asyncio.sleep(0.5)

        # Close the discovery task.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task_id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim one pending task and process it."""
        logger.info("🤖 Mega-Hunter v1.9.6 ONLINE (TIMESTAMP PATCH)")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim: find, lock and flip the status in one statement.
                    claim = text("""
                        UPDATE vehicle.catalog_discovery SET status = 'processing'
                        WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending'
                                    ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1)
                        RETURNING id, make, model, vehicle_class, priority_score;
                    """)
                    result = await db.execute(claim)
                    task = result.fetchone()
                    await db.commit()
                    if task:
                        await cls.process_make_model(db, task[0], task[1], task[2], task[3], task[4])
                    else:
                        await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the hunter's main loop.
    asyncio.run(CatalogHunter.run())
|
||||
@@ -0,0 +1,168 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Robot-1")
|
||||
|
||||
class CatalogHunter:
    """Vehicle Robot 2.1.2: the definitive hunter.

    Fully type-synchronised payload: raw_search_context is persisted as a
    string, and the JSON/array columns get empty defaults.
    """
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"

    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Strip non-alphanumerics and lower-case; empty input yields 'UNKNOWN'."""
        if not text_val:
            return "UNKNOWN"
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; None/empty/garbage becomes 0."""
        try:
            if value is None or str(value).strip() == "":
                return 0
            return int(float(value))
        except (ValueError, TypeError):
            return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; None/empty/garbage becomes 0.0."""
        try:
            if value is None or str(value).strip() == "":
                return 0.0
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """Collect fuel, power and engine-code details for one licence plate."""
        res = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if f_resp.status_code == 200 and f_resp.json():
                fuel_row = f_resp.json()[0]
                net_power = cls.parse_int(fuel_row.get("netto_maximum_vermogen"))
                cont_power = cls.parse_int(fuel_row.get("nominaal_continu_maximum_vermogen"))
                res.update({
                    "power_kw": max(net_power, cont_power),
                    "fuel_desc": fuel_row.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": fuel_row.get("euro_klasse") or fuel_row.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(fuel_row.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(fuel_row.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if e_resp.status_code == 200 and e_resp.json():
                res["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception:
            # Tech details are best-effort; defaults above remain in place.
            pass
        return res

    @classmethod
    async def process_task(cls, db, task):
        """Process one claimed discovery task row (make/model or ALL_VARIANTS)."""
        clean_make = task.make.strip().upper()
        clean_model = task.model.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")

        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                # Build the query string; the wildcard skips the model filter.
                params = f"merk={clean_make}"
                if clean_model != 'ALL_VARIANTS':
                    params += f"&handelsbenaming={clean_model}"
                params += f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"

                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception:
                    break
                if not batch:
                    break

                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # SAVEPOINT: one failing record cannot break the block.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)

                            # First-registration date -> model year (YYYY prefix).
                            datum_eerste_toelating = str(item.get("datum_eerste_toelating", ""))
                            year_from = cls.parse_int(datum_eerste_toelating[:4]) if len(datum_eerste_toelating) >= 4 else 0

                            stmt = insert(VehicleModelDefinition).values(
                                market='EU',
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")),
                                max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                fuel_consumption_combined=tech["consumption"],
                                co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class,
                                body_type=item.get("inrichting"),
                                fuel_type=tech["fuel_desc"],
                                engine_capacity=cls.parse_int(item.get("cilinderinhoud")),
                                power_kw=tech["power_kw"],
                                cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                engine_code=tech["engine_code"],
                                euro_classification=tech["euro_class"],
                                year_from=year_from,
                                priority_score=task.priority_score,
                                status="unverified",
                                source="MEGA-HUNTER-v2.1.2",
                                # FIX: raw_search_context is an empty STRING (''),
                                # exactly as the model expects.
                                raw_search_context='',
                                research_metadata={},
                                specifications={},
                                marketing_name_aliases=[]
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from']
                            )
                            await db.execute(stmt)
                    except Exception as e:
                        logger.warning(f"⚠️ Sor hiba ({plate}): {e}")

                # Commit the rows that survived their savepoints.
                await db.commit()
                offset += len(batch)
                if offset >= 500:  # safety cap per make
                    break
                await asyncio.sleep(0.5)

        # Close the discovery task.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task.id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim one pending task and process it."""
        logger.info("🤖 Mega-Hunter v2.1.2 (Adattípus Fix) ONLINE")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim: find, lock and flip the status in one statement.
                    claim = text("UPDATE vehicle.catalog_discovery SET status = 'processing' WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending' ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING id, make, model, vehicle_class, priority_score;")
                    res = await db.execute(claim)
                    task = res.fetchone()
                    await db.commit()
                    if task:
                        await cls.process_task(db, task)
                    else:
                        await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the hunter's main loop.
    asyncio.run(CatalogHunter.run())
|
||||
@@ -0,0 +1,205 @@
|
||||
# /app/app/workers/vehicle/vehicle_robot_1_catalog_hunter.py
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# MB 2.0 standard logging: timestamped, robot-labelled, streamed to stdout for Docker.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1")
|
||||
|
||||
class CatalogHunter:
    """
    Vehicle Robot 2.2.0: "Fast-Track to Gold" edition.

    If RDW already supplies every key datum (kW, ccm, fuel), the vehicle is
    promoted straight to 'gold_enriched' and written into the vehicle_catalog
    master table!
    """
    # RDW (Dutch vehicle authority) open-data endpoints.
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"    # base vehicle records
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"    # fuel / emissions data
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"  # engine codes

    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    # Token header only when a token is configured; RDW works unauthenticated too (lower rate limit).
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # RDW page size per request

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Return a lowercase alphanumeric-only key for deduplication ('UNKNOWN' if empty)."""
        if not text_val: return "UNKNOWN"
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int parse (handles '3.7' -> 3); returns 0 for empty/invalid input."""
        try:
            if value is None or str(value).strip() == "": return 0
            # Go through float first so decimal strings from RDW don't raise.
            return int(float(value))
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float parse; returns 0.0 for empty/invalid input."""
        try:
            if value is None or str(value).strip() == "": return 0.0
            return float(value)
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """
        Fetch fuel/power and engine-code details for one licence plate.

        Queries the RDW fuel and engine endpoints; any failure is swallowed and
        the defaults below are returned, so a missing record never aborts a batch.
        """
        res = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                # Two possible power fields (net max vs nominal continuous); take the larger.
                p1 = cls.parse_int(f.get("netto_maximum_vermogen"))
                p2 = cls.parse_int(f.get("nominaal_continu_maximum_vermogen"))
                res.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if e_resp.status_code == 200 and e_resp.json():
                res["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception: pass  # best-effort: defaults already populated
        return res

    @classmethod
    async def process_task(cls, db, task):
        """
        Process one discovery task: page through RDW records for the task's
        make/model, stage each record into VehicleModelDefinition, and fast-track
        complete records straight into vehicle.vehicle_catalog.
        """
        clean_make = task.make.strip().upper()
        clean_model = task.model.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")

        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                params = f"merk={clean_make}"
                # 'ALL_VARIANTS' sentinel means: don't filter on trade name.
                if clean_model != 'ALL_VARIANTS':
                    params += f"&handelsbenaming={clean_model}"
                params += f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"

                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception: break
                if not batch: break

                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # Savepoint per row: one bad record rolls back alone,
                        # the rest of the batch still commits.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)

                            datum_eerste_toelating = str(item.get("datum_eerste_toelating", ""))
                            # First-registration date is YYYYMMDD; the first 4 chars are the year.
                            year_from = cls.parse_int(datum_eerste_toelating[:4]) if len(datum_eerste_toelating) >= 4 else 0

                            engine_ccm = cls.parse_int(item.get("cilinderinhoud"))
                            power_kw = tech["power_kw"]
                            fuel_type = tech["fuel_desc"]

                            # FAST-TRACK LOGIC: if the mandatory technical data is present,
                            # the record is rated GOLD immediately.
                            # For electric vehicles ccm may legitimately be 0; handle that too.
                            is_gold = False
                            if (power_kw > 0 and engine_ccm > 0) or (power_kw > 0 and "elektri" in fuel_type.lower()):
                                is_gold = True

                            final_status = "gold_enriched" if is_gold else "unverified"

                            # 1. Write into VMD (staging table).
                            stmt = insert(VehicleModelDefinition).values(
                                market='EU',
                                make=clean_make,
                                marketing_name=actual_model,
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")),
                                doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")),
                                max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")),
                                max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                fuel_consumption_combined=tech["consumption"],
                                co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class,
                                body_type=item.get("inrichting"),
                                fuel_type=fuel_type,
                                engine_capacity=engine_ccm,
                                power_kw=power_kw,
                                cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                engine_code=tech["engine_code"],
                                euro_classification=tech["euro_class"],
                                year_from=year_from,
                                priority_score=task.priority_score,
                                status=final_status,  # dynamic status
                                source="MEGA-HUNTER-v2.2.0-FAST",
                                raw_search_context='',
                                research_metadata={},
                                specifications={"fast_track": True},  # marks this came straight from RDW
                                marketing_name_aliases=[]
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from']
                            ).returning(VehicleModelDefinition.id)

                            res = await db.execute(stmt)
                            # None when on_conflict_do_nothing skipped the row (duplicate).
                            vmd_id = res.scalar()

                            # 2. If GOLD, publish straight into the final catalog
                            #    (same destination the Alchemist robot would use).
                            if is_gold and vmd_id:
                                cat_stmt = text("""
                                    INSERT INTO vehicle.vehicle_catalog
                                    (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
                                    VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
                                    ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING;
                                """)
                                await db.execute(cat_stmt, {
                                    "m_id": vmd_id,
                                    "make": clean_make,
                                    "model": actual_model[:50],
                                    "kw": power_kw,
                                    "ccm": engine_ccm,
                                    "fuel": fuel_type,
                                    "factory": json.dumps({"source": "RDW API Direct", "verified": True})
                                })
                                logger.info(f"✨ FAST-TRACK ARANY: {clean_make} {actual_model} (KW: {power_kw}, CCM: {engine_ccm})")

                    except Exception as e:
                        logger.warning(f"⚠️ Sor hiba ({plate}): {e}")

                await db.commit()
                offset += len(batch)
                # Safety cap: at most 500 records per task to bound runtime.
                if offset >= 500: break
                await asyncio.sleep(0.5)  # be polite to the RDW API

        # Mark the discovery task finished regardless of per-row failures.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task.id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim pending discovery tasks and process them."""
        logger.info("🤖 Mega-Hunter v2.2.0 (Fast-Track Edition) ONLINE")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # FOR UPDATE SKIP LOCKED: multiple hunter instances never grab the same task.
                    query = text("UPDATE vehicle.catalog_discovery SET status = 'processing' WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending' ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING id, make, model, vehicle_class, priority_score;")
                    res = await db.execute(query)
                    task = res.fetchone()
                    await db.commit()
                    if task: await cls.process_task(db, task)
                    else: await asyncio.sleep(30)  # empty queue: back off
            except Exception as e:
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
# Entry point: start the fast-track hunter's infinite polling loop.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())
|
||||
@@ -0,0 +1,140 @@
|
||||
import asyncio, httpx, logging, os, re, sys, json
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# MB 2.0 standard logging: timestamped, robot-labelled, streamed to stdout for Docker.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Robot-1")
|
||||
|
||||
class CatalogHunter:
    """
    Vehicle Robot 1 "Mega-Hunter" (compact fast-track build).

    Pulls vehicle records for one discovery task from the Dutch RDW open-data
    API, stages them in VehicleModelDefinition and — when the mandatory
    technical data (kW + ccm, or kW alone for electric vehicles) is already
    present — publishes them straight into vehicle.vehicle_catalog as gold
    records.
    """
    # RDW open-data endpoints (base record / fuel & emissions / engine code).
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"
    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50  # RDW page size per request

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Return a lowercase alphanumeric-only dedup key ('UNKNOWN' if empty)."""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower() if text_val else "UNKNOWN"

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int parse ('3.7' -> 3); 0 for empty/invalid input."""
        # FIX: narrowed bare `except:` so KeyboardInterrupt/SystemExit propagate.
        try:
            return int(float(value)) if value and str(value).strip() else 0
        except (ValueError, TypeError):
            return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float parse; 0.0 for empty/invalid input."""
        # FIX: narrowed bare `except:` so KeyboardInterrupt/SystemExit propagate.
        try:
            return float(value) if value and str(value).strip() else 0.0
        except (ValueError, TypeError):
            return 0.0

    @classmethod
    async def fetch_tech_details(cls, client, plate):
        """
        Fetch fuel/power and engine-code details for one licence plate.

        Any failure is swallowed and the defaults are returned, so a missing
        record never aborts a batch.
        """
        res = {"power_kw": 0, "engine_code": None, "euro_class": None, "fuel_desc": "Unknown", "co2": 0, "consumption": 0.0}
        try:
            f_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if f_resp.status_code == 200 and f_resp.json():
                f = f_resp.json()[0]
                # Two possible power fields; take the larger.
                p1, p2 = cls.parse_int(f.get("netto_maximum_vermogen")), cls.parse_int(f.get("nominaal_continu_maximum_vermogen"))
                res.update({
                    "power_kw": max(p1, p2),
                    "fuel_desc": f.get("brandstof_omschrijving") or "Unknown",
                    "euro_class": f.get("euro_klasse") or f.get("uitlaatemissieniveau"),
                    "co2": cls.parse_int(f.get("co2_uitstoot_gecombineerd")),
                    "consumption": cls.parse_float(f.get("brandstofverbruik_gecombineerd"))
                })
            e_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if e_resp.status_code == 200 and e_resp.json():
                res["engine_code"] = e_resp.json()[0].get("motorcode")
        except Exception:
            pass  # best-effort: defaults already populated
        return res

    @classmethod
    async def process_task(cls, db, task):
        """
        Page through RDW records for the task's make/model, stage each record,
        and auto-publish gold-quality rows into the final catalog.
        """
        clean_make, clean_model = task.make.strip().upper(), task.model.strip().upper()
        logger.info(f"🎯 ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")

        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                # 'ALL_VARIANTS' sentinel: don't filter on trade name.
                params = f"merk={clean_make}" + (f"&handelsbenaming={clean_model}" if clean_model != 'ALL_VARIANTS' else "") + f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"
                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception:
                    break
                if not batch:
                    break

                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # Savepoint per row: a bad record rolls back alone.
                        async with db.begin_nested():
                            tech = await cls.fetch_tech_details(client, plate)
                            actual_model = (item.get("handelsbenaming") or clean_model).upper()
                            norm_name = cls.normalize(actual_model.replace(clean_make, "").strip() or actual_model)

                            # First-registration date is YYYYMMDD; first 4 chars = year.
                            datum = str(item.get("datum_eerste_toelating", ""))
                            year_from = cls.parse_int(datum[:4]) if len(datum) >= 4 else 0

                            engine_ccm, power_kw, fuel_type = cls.parse_int(item.get("cilinderinhoud")), tech["power_kw"], tech["fuel_desc"]

                            # FAST-TRACK LOGIC: with kW and ccm present (or kW + electric), rate GOLD immediately.
                            is_gold = (power_kw > 0 and engine_ccm > 0) or (power_kw > 0 and "elektri" in fuel_type.lower())
                            final_status = "gold_enriched" if is_gold else "unverified"

                            stmt = insert(VehicleModelDefinition).values(
                                market='EU', make=clean_make, marketing_name=actual_model, normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"), version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate, type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=cls.parse_int(item.get("aantal_zitplaatsen")), doors=cls.parse_int(item.get("aantal_deuren")),
                                width=cls.parse_int(item.get("breedte")), wheelbase=cls.parse_int(item.get("wielbasis")),
                                list_price=cls.parse_int(item.get("catalogusprijs")), max_speed=cls.parse_int(item.get("maximale_constructiesnelheid")),
                                curb_weight=cls.parse_int(item.get("massa_ledig_voertuig")), max_weight=cls.parse_int(item.get("technische_max_massa_voertuig")),
                                fuel_consumption_combined=tech["consumption"], co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class, body_type=item.get("inrichting"), fuel_type=fuel_type,
                                engine_capacity=engine_ccm, power_kw=power_kw, cylinders=cls.parse_int(item.get("aantal_cilinders")),
                                engine_code=tech["engine_code"], euro_classification=tech["euro_class"], year_from=year_from,
                                priority_score=task.priority_score, status=final_status, source="MEGA-HUNTER-v2.2.0-FAST",
                                raw_search_context='', research_metadata={}, specifications={"fast_track": True} if is_gold else {}, marketing_name_aliases=[]
                            ).on_conflict_do_nothing(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from']
                            ).returning(VehicleModelDefinition.id)

                            res = await db.execute(stmt)
                            # None when the row was a duplicate (conflict skipped).
                            vmd_id = res.scalar()

                            # Auto-publish (if gold)
                            if is_gold and vmd_id:
                                cat_stmt = text("""
                                    INSERT INTO vehicle.vehicle_catalog (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
                                    VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
                                    ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING;
                                """)
                                await db.execute(cat_stmt, {"m_id": vmd_id, "make": clean_make, "model": actual_model[:50], "kw": power_kw, "ccm": engine_ccm, "fuel": fuel_type, "factory": '{"source": "RDW Fast-Track"}'})
                                logger.info(f"✨ FAST-TRACK ARANY: {clean_make} {actual_model}")

                    except Exception as e:
                        logger.warning(f"⚠️ Sor hiba ({plate}): {e}")

                await db.commit()
                offset += len(batch)
                # Safety cap: at most 500 records per task.
                if offset >= 500:
                    break
                await asyncio.sleep(0.5)  # be polite to the RDW API

        # Mark the discovery task finished regardless of per-row failures.
        await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"), {"id": task.id})
        await db.commit()

    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim pending discovery tasks and process them."""
        logger.info("🤖 Mega-Hunter v2.2.0 (Fast-Track) ONLINE")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # FOR UPDATE SKIP LOCKED: concurrent workers never grab the same task.
                    res = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'processing' WHERE id = (SELECT id FROM vehicle.catalog_discovery WHERE status = 'pending' ORDER BY priority_score DESC FOR UPDATE SKIP LOCKED LIMIT 1) RETURNING id, make, model, vehicle_class, priority_score;"))
                    task = res.fetchone()
                    await db.commit()
                    if task: await cls.process_task(db, task)
                    else: await asyncio.sleep(30)
            except Exception as e:
                # FIX: was a silent `except Exception:` — now logs the failure,
                # consistent with the verbose build of this robot.
                logger.error(f"💀 Főciklus hiba: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
# Entry point: start the compact fast-track hunter's infinite polling loop.
if __name__ == "__main__":
    asyncio.run(CatalogHunter.run())
|
||||
@@ -0,0 +1,239 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_2_researcher.py
|
||||
import asyncio
|
||||
import logging
|
||||
import warnings
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text, update, func
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# Silence the RuntimeWarning that duckduckgo_search emits on import.
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
from duckduckgo_search import DDGS

# MB 2.0 standard logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-2-Researcher: %(message)s')
logger = logging.getLogger("Vehicle-Robot-2-Researcher")
|
||||
|
||||
class QuotaManager:
    """Strict per-day request budget tracker for paid / official APIs.

    State (current date + spent count) is persisted as a small JSON file so
    the counter survives process restarts within the same day.
    """

    def __init__(self, service_name: str, daily_limit: int):
        self.service_name = service_name
        self.daily_limit = daily_limit
        self.state_file = f"/app/temp/.quota_{service_name}.json"
        self._ensure_file()

    def _ensure_file(self):
        """Create the state file (and its directory) with a zeroed counter if absent."""
        os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
        if os.path.exists(self.state_file):
            return
        fresh = {"date": datetime.now().strftime("%Y-%m-%d"), "count": 0}
        with open(self.state_file, 'w') as fh:
            json.dump(fresh, fh)

    def can_make_request(self) -> bool:
        """Consume one unit of today's budget.

        Returns True (and persists the incremented counter) while the daily
        limit has not been reached; returns False once it is exhausted. The
        counter resets automatically on the first call of a new day.
        """
        with open(self.state_file, 'r') as fh:
            state = json.load(fh)

        today = datetime.now().strftime("%Y-%m-%d")
        if state["date"] != today:
            # New day: start a fresh quota window.
            state = {"date": today, "count": 0}

        if state["count"] >= self.daily_limit:
            return False

        # Budget available — record the spend before confirming.
        state["count"] += 1
        with open(self.state_file, 'w') as fh:
            json.dump(state, fh)
        return True
|
||||
|
||||
class VehicleResearcher:
    """
    Vehicle Robot 2.5: Sniper Researcher.

    Builds a compact, structured "dossier" per vehicle using targeted web
    searches so the downstream AI (GPU) stage gets dense, low-noise context.
    """
    def __init__(self):
        # Max research attempts before a record is suspended.
        self.max_attempts = 5
        # Per-search timeout in seconds for the DuckDuckGo call.
        self.search_timeout = 15.0

        # Quota managers (configured from .env).
        dvla_limit = int(os.getenv("DVLA_DAILY_LIMIT", "1000"))
        self.dvla_quota = QuotaManager("dvla", dvla_limit)
        self.dvla_token = os.getenv("DVLA_API_KEY")

    async def fetch_ddg_targeted(self, label: str, query: str) -> str:
        """Run one targeted DuckDuckGo search thread-safely; returns a labelled text block.

        Never raises: failures and empty results come back as labelled placeholder text.
        """
        try:
            def search():
                with DDGS() as ddgs:
                    # max_results=2: we don't need noise, just the 2 most relevant hits.
                    results = ddgs.text(query, max_results=2)
                    return [f"- {r.get('body', '')}" for r in results] if results else []

            # DDGS is blocking; run it in a worker thread with a hard timeout.
            results = await asyncio.wait_for(asyncio.to_thread(search), timeout=self.search_timeout)

            if not results:
                return f"[SOURCE: {label}]\nNincs érdemi találat.\n"

            content = f"[SOURCE: {label} | KERESÉS: {query}]\n"
            content += "\n".join(results) + "\n"
            return content

        except Exception as e:
            logger.debug(f"Keresési hiba ({label}): {e}")
            return f"[SOURCE: {label}]\nKERESÉSI HIBA.\n"

    def extract_specs_from_text(self, text: str) -> dict:
        """Regex-based extraction from raw search text: ccm, kW, engine code.

        NOTE: the `text` parameter shadows sqlalchemy's module-level `text`;
        harmless here since this method never builds SQL.
        """
        import re
        specs = {}

        # Displacement pattern: "1998 cc", "2000 cm³" etc.
        ccm_pattern = r'(\d{3,4})\s*(?:cc|ccm|cm³|cm3|cc\.)'
        match = re.search(ccm_pattern, text, re.IGNORECASE)
        if match:
            specs['ccm'] = int(match.group(1))
        else:
            # Fallback pattern: "2.0 liter" -> 2000 cc.
            liter_pattern = r'(\d+\.?\d*)\s*(?:L|liter|ℓ)'
            match = re.search(liter_pattern, text, re.IGNORECASE)
            if match:
                liters = float(match.group(1))
                specs['ccm'] = int(liters * 1000)

        # Power pattern: "150 kW", "150kW", "150 KW".
        kw_pattern = r'(\d{2,4})\s*(?:kW|kw|KW)'
        match = re.search(kw_pattern, text, re.IGNORECASE)
        if match:
            specs['kw'] = int(match.group(1))
        else:
            # Horsepower conversion: e.g. 150 HP -> ~110 kW.
            hp_pattern = r'(\d{2,4})\s*(?:HP|hp|LE|le|Ps)'
            match = re.search(hp_pattern, text, re.IGNORECASE)
            if match:
                hp = int(match.group(1))
                specs['kw'] = int(hp * 0.7355)  # approximate conversion factor

        # Engine code pattern: "motor kód: 1.8 TSI", "engine code: N47".
        engine_pattern = r'(?:motor\s*kód|engine\s*code|motor\s*code)[:\s]+([A-Z0-9\.\- ]+)'
        match = re.search(engine_pattern, text, re.IGNORECASE)
        if match:
            specs['engine_code'] = match.group(1).strip()

        return specs

    async def research_vehicle(self, db, vehicle_id: int, make: str, model: str, engine: str, year: str, current_attempts: int):
        """Research one vehicle and assemble the structured dossier for the GPU stage.

        On success the record advances to 'awaiting_ai_synthesis'; on thin
        results it is requeued or, after max_attempts, suspended.
        """
        engine_safe = engine or ""
        year_safe = str(year) if year else ""

        logger.info(f"🔎 Mesterlövész Kutatás: {make} {model} (Motor: {engine_safe})")

        # TIER 1: free, targeted searches (the most reliable sources).
        queries = [
            ("ULTIMATE_SPECS", f"{make} {model} {engine_safe} {year_safe} site:ultimatespecs.com"),
            ("AUTO_DATA", f"{make} {model} {engine_safe} {year_safe} site:auto-data.net"),
            ("COMMON_ISSUES", f"{make} {model} {engine_safe} reliability common problems")
        ]

        tasks = [self.fetch_ddg_targeted(label, q) for label, q in queries]
        search_results = await asyncio.gather(*tasks)

        # TIER 2: paid / quota-limited APIs (placeholder for DVLA).
        # If a UK plate ever comes in, call DVLA here:
        # if has_uk_plate and self.dvla_quota.can_make_request():
        #     uk_data = await self.fetch_dvla_data(plate)
        #     search_results.append(uk_data)

        # TIER 3: assemble the dossier.
        # Cap the context length so the AI GPU is not overwhelmed.
        full_context = "\n".join(search_results)
        if len(full_context) > 2500:
            full_context = full_context[:2500] + "\n...[TRUNCATED TO SAVE GPU TOKENS]"

        # Regex-based spec extraction from the raw context.
        extracted_specs = self.extract_specs_from_text(full_context)

        try:
            # Threshold lowered to 150 chars: targeted search output is dense.
            if len(full_context.strip()) > 150:
                await db.execute(
                    update(VehicleModelDefinition)
                    .where(VehicleModelDefinition.id == vehicle_id)
                    .values(
                        raw_search_context=full_context,
                        research_metadata=extracted_specs,
                        status='awaiting_ai_synthesis',  # dossier done, off to the Alchemist!
                        last_research_at=func.now(),
                        attempts=current_attempts + 1
                    )
                )
                logger.info(f"✅ Akta rögzítve ({len(full_context)} karakter): {make} {model}")
            else:
                # Thin results: retry later, or suspend after max_attempts.
                new_status = 'suspended_research' if current_attempts + 1 >= self.max_attempts else 'unverified'
                await db.execute(
                    update(VehicleModelDefinition)
                    .where(VehicleModelDefinition.id == vehicle_id)
                    .values(
                        status=new_status,
                        attempts=current_attempts + 1,
                        last_research_at=func.now()
                    )
                )
                if new_status == 'suspended_research':
                    logger.warning(f"🛑 Felfüggesztve (Nincs nyom a weben): {make} {model}")
                else:
                    logger.warning(f"⚠️ Kevés adat: {make} {model}, visszatéve a sorba.")

            await db.commit()
        except Exception as e:
            await db.rollback()
            logger.error(f"🚨 Adatbázis hiba az eredmény mentésénél ({vehicle_id}): {e}")

    @classmethod
    async def run(cls):
        """Main worker loop: atomically claim records needing research and process them."""
        self_instance = cls()
        logger.info("🚀 Vehicle Researcher 2.5 ONLINE (Sniper & Quota Manager)")

        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # ATOMIC LOCK: FOR UPDATE SKIP LOCKED prevents worker collisions.
                    query = text("""
                        UPDATE vehicle.vehicle_model_definitions
                        SET status = 'research_in_progress'
                        WHERE id = (
                            SELECT id FROM vehicle.vehicle_model_definitions
                            WHERE status IN ('unverified', 'awaiting_research', 'ACTIVE')
                            AND attempts < :max_attempts
                            AND is_manual = FALSE
                            ORDER BY
                                CASE WHEN make = 'TOYOTA' THEN 1 ELSE 2 END,
                                attempts ASC
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id, make, marketing_name, engine_code, year_from, attempts;
                    """)

                    result = await db.execute(query, {"max_attempts": self_instance.max_attempts})
                    task = result.fetchone()
                    await db.commit()

                    if task:
                        v_id, v_make, v_model, v_engine, v_year, v_attempts = task
                        # Fresh DB session for the (long-running) research work.
                        async with AsyncSessionLocal() as process_db:
                            await self_instance.research_vehicle(process_db, v_id, v_make, v_model, v_engine, v_year, v_attempts)

                        await asyncio.sleep(2)  # rate-limit protection towards DDG
                    else:
                        await asyncio.sleep(30)

            except Exception as e:
                logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
# Entry point: run the researcher loop; Ctrl-C stops it cleanly.
if __name__ == "__main__":
    try:
        asyncio.run(VehicleResearcher.run())
    except KeyboardInterrupt:
        logger.info("🛑 Kutató robot leállítva.")
|
||||
@@ -0,0 +1,225 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_3_alchemist_pro.py
|
||||
import asyncio
|
||||
import logging
|
||||
import datetime
|
||||
import random
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
from sqlalchemy import text, func, update, case
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
from app.models.asset import AssetCatalog
|
||||
from app.services.ai_service import AIService
|
||||
|
||||
# MB 2.0 standard logging: streamed to stdout so Docker captures it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Vehicle-Alchemist-Pro: %(message)s', stream=sys.stdout)
logger = logging.getLogger("Vehicle-Robot-3-Alchemist-Pro")
|
||||
|
||||
class TechEnricher:
|
||||
"""
|
||||
Vehicle Robot 3: Alchemist Pro (Atomi Zárolás + Kézi Moderáció Patch)
|
||||
Tiszta GPU fókusz: Csak az AI elemzésre és adategyesítésre koncentrál.
|
||||
Nincs felesleges webkeresés. Szigorú, de intelligens Sane-Check.
|
||||
"""
|
||||
def __init__(self):
    """Set up the retry budget and the daily AI-call quota (read from env)."""
    # Max enrichment attempts per record before manual moderation.
    self.max_attempts = 5
    # Daily AI-call budget; configurable via the AI_DAILY_LIMIT env var.
    self.daily_ai_limit = int(os.getenv("AI_DAILY_LIMIT", "10000"))
    # In-memory counter of calls made today (resets in check_budget).
    self.ai_calls_today = 0
    self.last_reset_date = datetime.date.today()
|
||||
|
||||
def check_budget(self) -> bool:
|
||||
if datetime.date.today() > self.last_reset_date:
|
||||
self.ai_calls_today = 0
|
||||
self.last_reset_date = datetime.date.today()
|
||||
return self.ai_calls_today < self.daily_ai_limit
|
||||
|
||||
def validate_merged_data(self, merged_kw: int, merged_ccm: int, v_class: str, fuel: str, current_attempts: int) -> tuple[bool, str]:
|
||||
""" Intelligens validáció a MERGE után. Visszaadja a státuszt és a hiba okát. """
|
||||
if merged_ccm > 18000:
|
||||
return False, f"Irreális CCM érték ({merged_ccm})"
|
||||
if merged_kw > 1500 and v_class != "truck":
|
||||
return False, f"Irreális KW érték ({merged_kw})"
|
||||
|
||||
# Ha hiányzik a KW
|
||||
if merged_kw == 0:
|
||||
if current_attempts < 3:
|
||||
return False, "Hiányzó KW adat. Újrakutatás javasolt."
|
||||
else:
|
||||
logger.warning("Sane-check: Többszöri próbálkozás után sincs KW, de átengedjük részlegesként.")
|
||||
|
||||
# Ha hiányzik a CCM (és belsőégésű)
|
||||
if merged_ccm == 0 and "electric" not in fuel and "elektric" not in fuel and v_class != "trailer":
|
||||
if current_attempts < 3:
|
||||
return False, "Hiányzó CCM (belsőégésű motornál). Újrakutatás javasolt."
|
||||
else:
|
||||
logger.warning("Sane-check: Többszöri próbálkozás után sincs CCM, átengedjük részlegesként.")
|
||||
|
||||
return True, "OK"
|
||||
|
||||
async def process_single_record(self, db, record_id: int, base_info: dict, current_attempts: int):
    """Enrich one staged vehicle record with AI data and publish it as gold.

    Flow: AI call -> hybrid merge (RDW authority data wins) -> validation ->
    insert into vehicle_catalog -> close the staging row. On any failure the
    transaction is rolled back and the record is requeued or, after
    max_attempts, sent to manual moderation.
    """
    # Precise identifier for the logs (make, model, ID, RDW figures).
    v_ident = f"{base_info['make'].upper()} {base_info['m_name']} (ID: {record_id}, RDW: {base_info['rdw_ccm']}ccm, KW: {base_info['rdw_kw']})"
    attempt_str = f"[Próba: {current_attempts + 1}/{self.max_attempts}]"

    ai_data = {}  # empty dict in case the AI call blows up

    try:
        logger.info(f"🧠 AI dúsítás indul: {v_ident} {attempt_str}")

        # STEP 1: AI call (hand the collected data to the model).
        ai_data = await AIService.get_clean_vehicle_data(
            base_info['make'],
            base_info['m_name'],
            base_info
        )

        if not ai_data:
            raise ValueError("Teljesen üres AI válasz (API hiba vagy extrém hallucináció).")

        # STEP 2: HYBRID MERGE (before validation!)
        # RDW authority data overrides the AI for official parameters.
        final_kw = base_info['rdw_kw'] if base_info['rdw_kw'] > 0 else int(ai_data.get("kw", 0) or 0)
        final_ccm = base_info['rdw_ccm'] if base_info['rdw_ccm'] > 0 else int(ai_data.get("ccm", 0) or 0)

        # Fuel-type cleanup: prefer a real RDW value over the AI guess.
        fuel_rdw = base_info.get('rdw_fuel', '')
        final_fuel = fuel_rdw if fuel_rdw and fuel_rdw != "Unknown" else ai_data.get("fuel_type", "petrol")

        final_engine = base_info['rdw_engine'] if base_info['rdw_engine'] else ai_data.get("engine_code", "Unknown")
        final_euro = base_info['rdw_euro'] or ai_data.get("euro_classification")
        final_cylinders = base_info['rdw_cylinders'] or ai_data.get("cylinders")

        # STEP 3: intelligent validation of the merged values.
        is_valid, error_msg = self.validate_merged_data(final_kw, final_ccm, base_info['v_type'], final_fuel.lower(), current_attempts)
        if not is_valid:
            raise ValueError(f"Validációs hiba: {error_msg}")

        # STEP 4: save into the Gold catalog.
        clean_model = str(ai_data.get("marketing_name", base_info['m_name']))[:50].upper()

        cat_stmt = text("""
            INSERT INTO vehicle.vehicle_catalog
            (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
            VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
            ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING
            RETURNING id;
        """)

        await db.execute(cat_stmt, {
            "m_id": record_id,
            "make": base_info['make'].upper(),
            "model": clean_model,
            "kw": final_kw,
            "ccm": final_ccm,
            "fuel": final_fuel,
            "factory": json.dumps(ai_data)
        })

        # STEP 5: close out the staging (VMD) row.
        await db.execute(
            update(VehicleModelDefinition)
            .where(VehicleModelDefinition.id == record_id)
            .values(
                status="gold_enriched",
                engine_capacity=final_ccm,
                power_kw=final_kw,
                fuel_type=final_fuel,
                engine_code=final_engine,
                euro_classification=final_euro,
                cylinders=final_cylinders,
                specifications=ai_data,  # keep the AI's full output on the master row too
                updated_at=func.now()
            )
        )
        await db.commit()
        logger.info(f"✨ ARANY REKORD KÉSZ: {v_ident}")
        self.ai_calls_today += 1

    except Exception as e:
        await db.rollback()
        logger.warning(f"⚠️ Alkimista hiba - {v_ident}: {e}")

        # If the attempt limit is reached, send to MANUAL MODERATION;
        # otherwise hand the record back to the Researcher.
        new_status = 'manual_review_needed' if current_attempts + 1 >= self.max_attempts else 'unverified'

        # Store the AI's partial answer (or the error) so an admin can see
        # what the machine got wrong.
        review_data = ai_data if ai_data else {"error": "Nincs értékelhető JSON adat az AI-tól", "raw_context": base_info['web_context']}

        await db.execute(
            update(VehicleModelDefinition)
            .where(VehicleModelDefinition.id == record_id)
            .values(
                attempts=current_attempts + 1,
                last_error=str(e)[:200],
                status=new_status,
                specifications=review_data,  # broken data saved for manual review!
                updated_at=func.now()
            )
        )
        await db.commit()

        if new_status == 'unverified':
            logger.info(f"♻️ Akta visszaküldve a Robot-2-nek (Kutató). {attempt_str}")
        else:
            logger.error(f"🛑 Max próbálkozás elérve! Kézi moderációra küldve: {v_ident}")
|
||||
|
||||
    async def run(self):
        """Main worker loop: atomically claim one staged record at a time and enrich it.

        Runs forever. Sleeps when the daily AI budget is exhausted or the queue
        is empty; all exceptions are caught so the loop never dies.
        """
        logger.info(f"🚀 Alchemist Pro HIBRID ONLINE (Atomi Zárolás + Moderáció Patch)")
        while True:
            # Respect the daily AI-call budget before claiming any work.
            if not self.check_budget():
                logger.warning("💸 Napi AI limit kimerítve! Pihenés...")
                await asyncio.sleep(3600); continue

            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim: UPDATE ... WHERE id = (SELECT ... FOR UPDATE SKIP LOCKED)
                    # guarantees two concurrent workers never grab the same row.
                    query = text("""
                        UPDATE vehicle.vehicle_model_definitions
                        SET status = 'ai_synthesis_in_progress'
                        WHERE id = (
                            SELECT id FROM vehicle.vehicle_model_definitions
                            WHERE status IN ('awaiting_ai_synthesis', 'ACTIVE')
                            AND attempts < :max_attempts
                            AND is_manual = FALSE
                            ORDER BY
                                CASE WHEN status = 'awaiting_ai_synthesis' THEN 1 ELSE 2 END,
                                priority_score DESC
                            FOR UPDATE SKIP LOCKED
                            LIMIT 1
                        )
                        RETURNING id, make, marketing_name, vehicle_class, power_kw, engine_capacity,
                                  fuel_type, engine_code, euro_classification, cylinders, raw_search_context, attempts;
                    """)

                    result = await db.execute(query, {"max_attempts": self.max_attempts})
                    task = result.fetchone()
                    # Commit releases the row lock; the claimed row is now 'ai_synthesis_in_progress'.
                    await db.commit()

                    if task:
                        # Unpack the claimed row into the base_info dict, with
                        # defaults guarding against NULL columns.
                        r_id = task[0]
                        base_info = {
                            "make": task[1], "m_name": task[2], "v_type": task[3] or "car",
                            "rdw_kw": task[4] or 0, "rdw_ccm": task[5] or 0,
                            "rdw_fuel": task[6] or "petrol", "rdw_engine": task[7] or "",
                            "rdw_euro": task[8], "rdw_cylinders": task[9],
                            "web_context": task[10] or ""
                        }
                        attempts = task[11]

                        # Separate DB session for the processing step (the AI call is long).
                        async with AsyncSessionLocal() as process_db:
                            await self.process_single_record(process_db, r_id, base_info, attempts)

                        # GPU cool-down / Ollama rate limiting between records.
                        await asyncio.sleep(random.uniform(1.5, 3.5))
                    else:
                        logger.info("😴 Nincs feldolgozandó akta, az Alkimista pihen...")
                        await asyncio.sleep(15)

            except Exception as e:
                logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the enricher's infinite worker loop.
    asyncio.run(TechEnricher().run())
|
||||
@@ -0,0 +1,168 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import datetime
|
||||
import random
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
from sqlalchemy import text, func, update
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
from app.services.ai_service import AIService
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] R3-Alchemist: %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Robot-3-Alchemist")
|
||||
|
||||
class TechEnricher:
    """
    Vehicle Robot 3: Alchemist Pro (Sentinel Gateway Edition).

    Uses AIService 2.2 (Ollama -> Groq fallback) to extract the trim level
    and back-fill missing technical data on staged vehicle records.
    """
|
||||
    def __init__(self):
        # Per-record retry ceiling before a record is escalated to manual review.
        self.max_attempts = 5
        # Daily AI-call budget; overridable via the AI_DAILY_LIMIT env var.
        self.daily_ai_limit = int(os.getenv("AI_DAILY_LIMIT", "10000"))
        # Calls made since the last reset (bumped on success, see process_single_record).
        self.ai_calls_today = 0
        # Date of the last counter reset; compared against today in check_budget.
        self.last_reset_date = datetime.date.today()
|
||||
|
||||
def check_budget(self) -> bool:
|
||||
if datetime.date.today() > self.last_reset_date:
|
||||
self.ai_calls_today = 0
|
||||
self.last_reset_date = datetime.date.today()
|
||||
return self.ai_calls_today < self.daily_ai_limit
|
||||
|
||||
def validate_merged_data(self, merged_kw: int, merged_ccm: int, v_class: str, fuel: str, current_attempts: int) -> tuple[bool, str]:
|
||||
if merged_ccm > 18000:
|
||||
return False, f"Irreális CCM érték ({merged_ccm})"
|
||||
if merged_kw > 1500 and v_class not in ["truck", "other"]:
|
||||
return False, f"Irreális KW érték ({merged_kw})"
|
||||
|
||||
if merged_kw == 0 and current_attempts < 3:
|
||||
return False, "Hiányzó KW adat. Újrakutatás javasolt."
|
||||
|
||||
if merged_ccm == 0 and "elektr" not in fuel.lower() and v_class != "trailer" and current_attempts < 3:
|
||||
return False, "Hiányzó CCM (belsőégésű motornál)."
|
||||
|
||||
return True, "OK"
|
||||
|
||||
    async def process_single_record(self, db, record_id: int, base_info: dict, current_attempts: int):
        """Enrich one staged vehicle record via the AI gateway and persist the result.

        On success the record is promoted to status 'gold_enriched'; on any
        failure the attempt counter is bumped and the record is either sent
        back for re-research ('unverified') or, once max_attempts is reached,
        escalated to 'manual_review_needed'.
        """
        v_ident = f"{base_info['make'].upper()} {base_info['m_name']} (ID: {record_id})"
        attempt_str = f"[Próba: {current_attempts + 1}/{self.max_attempts}]"

        try:
            logger.info(f"🧠 AI dúsítás indul: {v_ident} {attempt_str}")

            # Strict prompt for the master AI service (JSON-only contract).
            prompt = f"""
            Elemezd az alábbi járműadatokat és a webes kutatást! Készíts belőle egy JSON objektumot.
            Jármű: {base_info['make']} {base_info['m_name']}
            Hatósági adatok: {base_info['rdw_ccm']} ccm, {base_info['rdw_kw']} kW, Üzemanyag: {base_info['rdw_fuel']}
            Webes szöveg: {base_info['web_context'][:2000]}

            FELADATOK:
            1. Keresd meg a felszereltségi szintet (trim_level) a modell nevéből vagy a szövegből (pl. AMG, Highline, Titanium, M-Sport, Elegance, ST-Line). Ha nincs, legyen üres string.
            2. Ha az RDW adatokban a kW vagy a ccm 0, pótold a szövegből a helyes értéket!

            KIZÁRÓLAG EGY ÉRVÉNYES JSON-T ADJ VISSZA! (A Groq/Gemini miatt kötelező a JSON szó használata).
            Várt kulcsok: "kw" (int), "ccm" (int), "trim_level" (string), "transmission" (string), "drive_type" (string).
            """

            # Gateway with Ollama -> Groq fallback; _execute_ai_call also takes
            # the db session to read its settings.
            ai_data = await AIService._execute_ai_call(db, prompt, model_key="text")

            if not ai_data:
                raise ValueError("Üres AI válasz (Minden fallback elbukott).")

            # HYBRID MERGE: authoritative RDW values win; the AI only fills gaps (zeros).
            final_kw = base_info['rdw_kw'] if base_info['rdw_kw'] > 0 else int(ai_data.get("kw", 0) or 0)
            final_ccm = base_info['rdw_ccm'] if base_info['rdw_ccm'] > 0 else int(ai_data.get("ccm", 0) or 0)
            trim_level = str(ai_data.get("trim_level", ""))[:100]

            # Sanity check of the merged figures; failure routes through the
            # ValueError into the retry/escalation branch below.
            is_valid, error_msg = self.validate_merged_data(final_kw, final_ccm, base_info['v_type'], base_info['rdw_fuel'], current_attempts)
            if not is_valid:
                raise ValueError(f"Validációs hiba: {error_msg}")

            # Promote the staging row to gold status with the merged values.
            await db.execute(
                update(VehicleModelDefinition)
                .where(VehicleModelDefinition.id == record_id)
                .values(
                    status="gold_enriched",
                    engine_capacity=final_ccm,
                    power_kw=final_kw,
                    # The AI sometimes returns the literal strings "null"/"none".
                    trim_level=trim_level if trim_level.lower() not in ["null", "none"] else "",
                    specifications=ai_data,
                    updated_at=func.now()
                )
            )
            await db.commit()
            logger.info(f"✨ ARANY REKORD KÉSZ: {v_ident} | Trim: {trim_level}")
            self.ai_calls_today += 1

        except Exception as e:
            await db.rollback()
            logger.warning(f"⚠️ Alkimista hiba - {v_ident}: {e}")

            # Escalate to manual moderation once the retry budget is spent,
            # otherwise send the record back to the researcher robot.
            new_status = 'manual_review_needed' if current_attempts + 1 >= self.max_attempts else 'unverified'

            await db.execute(
                update(VehicleModelDefinition)
                .where(VehicleModelDefinition.id == record_id)
                .values(
                    attempts=current_attempts + 1,
                    last_error=str(e)[:200],
                    status=new_status,
                    updated_at=func.now()
                )
            )
            await db.commit()

            if new_status == 'unverified':
                logger.info(f"♻️ Akta visszaküldve a Kutatónak (R2). {attempt_str}")
|
||||
|
||||
    async def run(self):
        """Main loop: atomically claim one 'awaiting_ai_synthesis' row and enrich it."""
        logger.info(f"🚀 R3 Alchemist Pro ONLINE (Sentinel Gateway Integráció)")
        while True:
            # Respect the daily AI-call budget before claiming any work.
            if not self.check_budget():
                logger.warning("💸 Napi AI limit kimerítve! Pihenés...")
                await asyncio.sleep(3600); continue

            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim via FOR UPDATE SKIP LOCKED: safe with many workers.
                    query = text("""
                        UPDATE vehicle.vehicle_model_definitions
                        SET status = 'ai_synthesis_in_progress'
                        WHERE id = (
                            SELECT id FROM vehicle.vehicle_model_definitions
                            WHERE status = 'awaiting_ai_synthesis'
                            AND attempts < :max_attempts
                            AND is_manual = FALSE
                            ORDER BY priority_score DESC
                            FOR UPDATE SKIP LOCKED LIMIT 1
                        )
                        RETURNING id, make, marketing_name, vehicle_class, power_kw, engine_capacity, fuel_type, raw_search_context, attempts;
                    """)

                    result = await db.execute(query, {"max_attempts": self.max_attempts})
                    task = result.fetchone()
                    # Commit releases the row lock; the claim is now visible to others.
                    await db.commit()

                    if task:
                        # Defaults guard against NULL columns in the claimed row.
                        base_info = {
                            "make": task[1], "m_name": task[2], "v_type": task[3] or "car",
                            "rdw_kw": task[4] or 0, "rdw_ccm": task[5] or 0,
                            "rdw_fuel": task[6] or "petrol", "web_context": task[7] or ""
                        }
                        # Fresh DB session for the long-running AI call.
                        async with AsyncSessionLocal() as process_db:
                            await self.process_single_record(process_db, task[0], base_info, task[8])

                    else:
                        await asyncio.sleep(10)

            except Exception as e:
                logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
                await asyncio.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the enricher's infinite worker loop.
    asyncio.run(TechEnricher().run())
|
||||
40
backend/app/workers/vehicle/R0_brand_hunter.py
Normal file
40
backend/app/workers/vehicle/R0_brand_hunter.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import asyncio, logging, random
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R0-BRANDS] %(message)s')
|
||||
logger = logging.getLogger("R0")
|
||||
|
||||
async def run_r0():
    """Seed the crawler queue with every brand link found on auto-data.net."""
    start_url = "https://www.auto-data.net/en/allbrands"
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        logger.info(f"Márkák gyűjtése innen: {start_url}")

        await page.goto(start_url, wait_until="networkidle")
        # Robust link harvest: every <a> whose href contains 'brand-'.
        brand_links = await page.eval_on_selector_all(
            "a[href*='brand-']",
            "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))")

        async with AsyncSessionLocal() as db:
            inserted = 0
            for entry in brand_links:
                link_name, link_url = entry['name'], entry['url']
                # Skip anchors without text or without a brand URL.
                if not link_name:
                    continue
                if 'brand' not in link_url:
                    continue

                insert_stmt = text("""
                    INSERT INTO vehicle.auto_data_crawler_queue (url, level, name, status)
                    VALUES (:url, 'brand', :name, 'pending')
                    ON CONFLICT (url) DO NOTHING
                """)
                outcome = await db.execute(insert_stmt, {"url": link_url, "name": link_name})
                # rowcount is 0 when ON CONFLICT suppressed the insert.
                if outcome.rowcount > 0:
                    inserted += 1

            await db.commit()
            logger.info(f"✅ Kész! {inserted} új márkát találtam és mentettem el.")

        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: one-shot brand harvest.
    asyncio.run(run_r0())
|
||||
137
backend/app/workers/vehicle/R1_model_scout.py
Normal file
137
backend/app/workers/vehicle/R1_model_scout.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R1-RECOVERY] %(message)s'
|
||||
)
|
||||
logger = logging.getLogger("R1")
|
||||
|
||||
async def analyze_and_extract_links(page, current_url, current_level):
    """Classify every <a> on the page into 'model' or 'engine' queue entries.

    Supports autoevolution.com, bikez.com and motorcyclespecs.co.za layouts.
    Links with empty, too-short, or non-ASCII (non-English) text are dropped.
    """
    # Harvest every anchor with its visible text.
    anchors = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    logger.info(f"🔎 Oldal elemzése: {len(anchors)} link található összesen.")

    results = []
    for anchor in anchors:
        url, name = anchor['url'], anchor['name']

        # Basic validity + language shield (non-Latin text is discarded).
        if not name or len(name) < 2:
            continue
        if re.search(r'[^\x00-\x7F]+', name):
            continue

        level = None
        # 1. AUTOEVOLUTION: .html leaves are spec pages, deeper paths are model lists.
        if "autoevolution.com/moto/" in url:
            if url.endswith(".html") and "#" not in url:
                level = 'engine'
            elif url.count('/') >= 5:
                level = 'model'
        # 2. BIKEZ
        elif "bikez.com" in url:
            if "/motorcycles/" in url:
                level = 'engine'
            elif "/models/" in url:
                level = 'model'
        # 3. MOTORCYCLESPECS: /model/*.htm(l) are spec sheets; /bikes/ are further lists.
        elif "motorcyclespecs.co.za" in url:
            if "/model/" in url and (".htm" in url or ".html" in url):
                level = 'engine'
            elif "/bikes/" in url and name.lower() not in current_url.lower():
                level = 'model'

        if level is not None:
            results.append({'name': name, 'url': url, 'level': level})

    return results
|
||||
|
||||
async def main():
    """Main control loop: claim queued bike-brand pages, harvest model/engine links."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )

        logger.info("🤖 R1 Recovery Scout elindult...")

        while True:
            target = None
            async with AsyncSessionLocal() as db:
                try:
                    # Claim one task (brand level, bike category); FOR UPDATE SKIP
                    # LOCKED makes this safe with several scout instances running.
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE (status = 'pending' OR status = 'error' OR status = 'completed_empty')
                            AND level = 'brand'
                            AND category = 'bike'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name, level
                    """))
                    target = res.fetchone()
                    await db.commit()
                except Exception as e:
                    logger.error(f"❌ DB Hiba: {e}")
                    await db.rollback()

            if not target:
                logger.info("🏁 Nincs több feladat. Alvás 30mp...")
                await asyncio.sleep(30)
                continue

            t_id, t_url, t_name, t_level = target
            page = await context.new_page()

            try:
                logger.info(f"🚀 [{t_level}] {t_name} felderítése -> {t_url}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)  # give client-side JavaScript time to run

                links = await analyze_and_extract_links(page, t_url, t_level)

                async with AsyncSessionLocal() as db:
                    if links:
                        # Enqueue children idempotently, then mark the parent done.
                        for link in links:
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})

                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": t_id})
                        logger.info(f"✅ Siker: {t_name} -> {len(links)} új link mentve.")
                    else:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed_empty' WHERE id = :id"), {"id": t_id})
                        logger.warning(f"⚠️ Üres: {t_name} oldalon nem találtam motorokat.")

                    await db.commit()

            except Exception as e:
                # NOTE(review): the task stays in 'processing' on failure here;
                # presumably a watchdog resets it — confirm.
                logger.error(f"❌ Hiba: {t_name} -> {e}")
            finally:
                await page.close()
                await asyncio.sleep(random.uniform(3, 5))

        # NOTE(review): unreachable — the while True loop above never breaks.
        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the scout loop.
    asyncio.run(main())
|
||||
214
backend/app/workers/vehicle/R2_generation_scout.py
Normal file
214
backend/app/workers/vehicle/R2_generation_scout.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R2-AUTOS-ONLY] %(message)s',
|
||||
handlers=[logging.StreamHandler()]
|
||||
)
|
||||
logger = logging.getLogger("R2")
|
||||
|
||||
async def get_page_safe(page, url):
    """Navigate to *url* with a randomized delay and detect Cloudflare blocks.

    Raises on navigation failure or when an anti-bot challenge page is served;
    returns the page on success.
    """
    # Randomized pause so the traffic pattern looks less like a bot.
    await asyncio.sleep(random.uniform(4, 7))

    try:
        # domcontentloaded is faster than networkidle and sufficient for link harvesting.
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)

        # Challenge pages are recognizable from their title.
        title = await page.title()
        blocked = ("Just a moment" in title) or ("Cloudflare" in title)
        if blocked:
            raise Exception(f"Bot védelem észlelve az URL-en: {url}")
    except Exception as e:
        logger.error(f"Hiba az oldal betöltésekor: {url} -> {e}")
        raise
    else:
        return page
|
||||
|
||||
async def extract_scoped_links(page, p_id, current_url):
    """Harvest generation/engine links from a model page, scoped to one brand.

    The 'Scope-Lock' technique: a brand anchor derived from the current URL
    keeps the crawl inside the current car family. Layered filters drop
    non-English variants and navigational noise. Every surviving link is
    queued with category 'car'. Returns the number of links enqueued.
    """
    # Derive the brand/type anchor from the URL tail (e.g. 'alfa-romeo').
    url_parts = current_url.split('/')[-1].split('-')
    brand_anchor = "-".join(url_parts[:2])

    # Collect every anchor together with its visible text.
    hrefs = await page.eval_on_selector_all(
        "a",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    found_count = 0
    async with AsyncSessionLocal() as db:
        for link in hrefs:
            url = link['url']
            name = link['name'].replace('\n', ' ').strip()

            # --- 1. BASIC VALIDITY ---
            if not name or len(name) < 2:
                continue

            # --- 2. LANGUAGE SHIELD ---
            # Charset check: drop link texts containing any non-ASCII character
            # (Greek, Cyrillic, etc.).
            if re.search(r'[^\x00-\x7F]+', name):
                continue

            # Strict English-only enforcement in the URL path.
            if '/en/' not in url:
                continue

            # Text-based noise filter (meta/footer links in many languages).
            junk_keywords = [
                'privacy', 'configuracion', 'ρυθμίσεις', 'cookie', 'settings',
                'contact', 'about us', 'terms', 'advertising', 'login', 'registration',
                'pribatutasun', 'configuració', 'naslovnica', 'stisni',
                'personvern', 'prywatnosci', 'ustawienia', 'endre', 'zmień'
            ]
            if any(junk in name.lower() for junk in junk_keywords):
                continue

            # --- 3. ORIGINAL LANGUAGE LOCK ---
            # Drop two-letter non-'en' language path segments (domain.com/bg/...).
            path_segments = url.split('/')
            if len(path_segments) > 3:
                lang_segment = path_segments[3]
                if len(lang_segment) == 2 and lang_segment != 'en':
                    continue

            # --- 4. SCOPE FILTER ---
            # Only let through links belonging to the current brand.
            if brand_anchor not in url:
                continue

            # --- 5. NAVIGATION FILTER ---
            # Never step back to list pages; exclude foreign-language directories.
            excluded_patterns = [
                '-brand-', 'allbrands', 'en/brands',
                '/bg/', '/ru/', '/de/', '/it/', '/fr/', '/es/',
                '/tr/', '/ro/', '/fi/', '/se/', '/no/', '/pl/', '/gr/',
                '/hr/', '/cz/', '/sk/', '/ua/'
            ]
            if any(x in url for x in excluded_patterns):
                continue

            # --- 6. SELF-REFERENCE FILTER ---
            if url.strip('/') == current_url.strip('/'):
                continue

            # --- 7. LEVEL CLASSIFICATION BY URL PATTERN ---
            if '-generation-' in url:
                target_level = 'generation'
            elif re.search(r'-\d+$', url) and '-model-' not in url:
                target_level = 'engine'
            else:
                continue

            # --- 8. SAVE TO THE DATABASE (idempotent via ON CONFLICT) ---
            await db.execute(text("""
                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                VALUES (:url, :level, :p_id, :name, 'pending', 'car')
                ON CONFLICT (url) DO NOTHING
            """), {"url": url, "level": target_level, "p_id": p_id, "name": name})
            # NOTE(review): counts attempted inserts, including ON CONFLICT no-ops.
            found_count += 1

        await db.commit()
    return found_count
|
||||
|
||||
async def process_target(context, t_id, t_url, t_name, t_level):
    """Fully process one queued URL: scrape it, enqueue children, update status.

    On failure the queue row is marked 'error' with the exception text; the
    page is always closed in the finally block.
    """
    page = await context.new_page()
    try:
        logger.info(f"🚀 Autós felderítés indítása [{t_level}]: {t_name}")
        await get_page_safe(page, t_url)

        # Harvest and persist child links.
        found = await extract_scoped_links(page, t_id, t_url)

        async with AsyncSessionLocal() as db:
            # 'completed_leaf' marks pages that yielded no further links.
            new_status = 'completed' if found > 0 else 'completed_leaf'
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = :s, error_msg = NULL, updated_at = NOW()
                WHERE id = :id
            """), {"s": new_status, "id": t_id})
            await db.commit()

        logger.info(f"✅ Befejezve: {t_name} -> {found} új link.")

    except Exception as e:
        logger.error(f"❌ Kritikus hiba feldolgozás közben ({t_name}): {e}")
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'error', error_msg = :msg, updated_at = NOW()
                WHERE id = :id
            """), {"msg": str(e), "id": t_id})
            await db.commit()
    finally:
        await page.close()
|
||||
|
||||
async def main():
    """Main control loop.

    STRATEGY: only 'car'-category tasks are claimed (category='car'),
    at model/generation level, one at a time with SKIP LOCKED.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )

        logger.info("🤖 R2 Autós Felderítő Robot aktív. (Filter: category='car')")

        while True:
            async with AsyncSessionLocal() as db:
                # Claim one pending 'car' task atomically.
                res = await db.execute(text("""
                    UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.auto_data_crawler_queue
                        WHERE status = 'pending'
                        AND level IN ('model', 'generation')
                        AND category = 'car'
                        ORDER BY level ASC, id ASC
                        LIMIT 1 FOR UPDATE SKIP LOCKED
                    ) RETURNING id, url, name, level
                """))
                target = res.fetchone()
                await db.commit()

            if not target:
                logger.info("🏁 Nincs több autós feladat (car). Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            await process_target(context, target[0], target[1], target[2], target[3])

        # NOTE(review): unreachable — the while True loop above never breaks.
        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point; Ctrl+C is caught for a clean shutdown message.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás (Ctrl+C).")
|
||||
159
backend/app/workers/vehicle/R3_engine_scout.py
Normal file
159
backend/app/workers/vehicle/R3_engine_scout.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR-v1.2] %(message)s')
|
||||
logger = logging.getLogger("R3")
|
||||
|
||||
# --- KONFIGURÁCIÓS PARAMÉTEREK ---
|
||||
MAX_RETRY_LIMIT = 3 # Max 3 próbálkozás járművenként
|
||||
|
||||
class R3DataMiner:
|
||||
def clean_key(self, key):
|
||||
if "," in key: key = key.split(",")[-1]
|
||||
key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
|
||||
return key.split("?")[0].strip().capitalize()
|
||||
|
||||
    async def scrape_specs(self, context, url):
        """Scrape one auto-data spec sheet into a normalized dict; None on failure."""
        page = await context.new_page()
        try:
            # Randomized delay to evade bot protection.
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            data = {"make": "", "model": "", "generation": "", "modification": "",
                    "year_from": None, "power_kw": 0, "engine_cc": 0,
                    "specifications": {}, "source_url": url}

            # Original parsing logic: spec tables are laid out as
            # <tr><th>key</th><td>value</td></tr> rows.
            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td: continue
                k_raw, v = th.get_text(strip=True), td.get_text(strip=True)
                k_low = k_raw.lower()

                if "brand" == k_low: data["make"] = v
                elif "model" == k_low: data["model"] = v
                elif "generation" == k_low: data["generation"] = v
                elif "modification" == k_low: data["modification"] = v
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', v)
                    data["year_from"] = int(m.group(1)) if m else None
                elif "power" == k_low:
                    hp = re.search(r'(\d+)\s*Hp', v, re.I)
                    # Convert metric horsepower to kW (divide by ~1.36).
                    if hp: data["power_kw"] = int(int(hp.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc = re.search(r'(\d+)\s*cm3', v)
                    if cc: data["engine_cc"] = int(cc.group(1))

                # Every row also goes verbatim into the raw specifications blob.
                data["specifications"][self.clean_key(k_raw)] = v

            # A sheet without a brand or without any rows counts as a failed scrape.
            if not data["make"] or not data["specifications"]:
                return None

            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()
|
||||
|
||||
    async def run(self):
        """Worker loop: claim 'engine' queue rows, scrape them, persist or retry.

        Unlike the other scouts, this loop breaks (exits) when the queue is
        empty instead of idling forever.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )

            while True:
                target = None
                async with AsyncSessionLocal() as db:
                    try:
                        # FIX: priority_score removed — that column does not exist
                        # on the crawler_queue table.
                        res = await db.execute(text("""
                            UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                            WHERE id = (
                                SELECT id FROM vehicle.auto_data_crawler_queue
                                WHERE level = 'engine'
                                AND status IN ('pending', 'error')
                                AND retry_count < 3
                                ORDER BY id ASC
                                LIMIT 1 FOR UPDATE SKIP LOCKED
                            ) RETURNING id, url, name, retry_count
                        """))
                        target = res.fetchone()
                        await db.commit()
                    except Exception as e:
                        logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                        await asyncio.sleep(5)
                        continue

                if not target:
                    logger.info("🏁 Minden feladat elvégezve. Leállás.")
                    break

                t_id, t_url, t_name, t_retry = target
                # Legacy rows may carry NULL retry_count.
                if t_retry is None: t_retry = 0

                logger.info(f"🚀 [{t_retry + 1}/3] Dolgozom: {t_name}")
                data = await self.scrape_specs(context, t_url)

                async with AsyncSessionLocal() as db:
                    if data and data["make"]:
                        # Upsert into the reference library keyed by source_url.
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET
                                specifications = EXCLUDED.specifications,
                                last_scraped_at = NOW();
                        """), {
                            "make": data["make"], "model": data["model"], "gen": data["generation"],
                            "mod": data["modification"], "y": data["year_from"], "p": data["power_kw"],
                            "e": data["engine_cc"], "s": json.dumps(data["specifications"]), "u": data["source_url"]
                        })

                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                    else:
                        # Failed scrape: bump the retry counter; escalate to
                        # manual review after the third failed attempt.
                        new_retry = t_retry + 1
                        if new_retry >= 3:
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 3 próbálkozás után',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manual_review_needed")
                        else:
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.warning(f"⚠️ Sikertelen próbálkozás ({new_retry}/3): {t_name}")

                    await db.commit()

            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point; Ctrl+C is caught for a clean shutdown message.
    miner = R3DataMiner()
    try:
        asyncio.run(miner.run())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")
|
||||
132
backend/app/workers/vehicle/R4_final_extractor.py
Normal file
132
backend/app/workers/vehicle/R4_final_extractor.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-EXTRACTOR] %(message)s')
|
||||
logger = logging.getLogger("R4")
|
||||
|
||||
class FinalExtractor:
|
||||
    def __init__(self):
        # Cap concurrent page scrapes at 2 — safe parallelism for the target site.
        self.semaphore = asyncio.Semaphore(2)
|
||||
|
||||
def clean_key(self, key):
|
||||
if "," in key: key = key.split(",")[-1]
|
||||
key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
|
||||
key = key.split("?")[0].strip()
|
||||
return key.capitalize()
|
||||
|
||||
async def scrape_engine(self, context, url):
|
||||
page = await context.new_page()
|
||||
try:
|
||||
await asyncio.sleep(random.uniform(3, 6)) # Anti-bot késleltetés
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
content = await page.content()
|
||||
soup = BeautifulSoup(content, 'html.parser')
|
||||
|
||||
data = {
|
||||
"make": "", "model": "", "generation": "", "modification": "",
|
||||
"year_from": None, "year_to": None, "power_kw": 0, "engine_cc": 0,
|
||||
"specifications": {}, "source_url": url
|
||||
}
|
||||
|
||||
rows = soup.find_all('tr')
|
||||
for row in rows:
|
||||
th, td = row.find('th'), row.find('td')
|
||||
if not th or not td: continue
|
||||
|
||||
raw_k, val = th.get_text(strip=True), td.get_text(strip=True)
|
||||
k_low = raw_k.lower()
|
||||
|
||||
if "brand" == k_low: data["make"] = val
|
||||
elif "model" == k_low: data["model"] = val
|
||||
elif "generation" == k_low: data["generation"] = val
|
||||
elif "modification" == k_low: data["modification"] = val
|
||||
elif "start of production" in k_low:
|
||||
m = re.search(r'(\d{4})', val)
|
||||
if m: data["year_from"] = int(m.group(1))
|
||||
elif "end of production" in k_low:
|
||||
m = re.search(r'(\d{4})', val)
|
||||
if m: data["year_to"] = int(m.group(1))
|
||||
elif "power" == k_low:
|
||||
hp_m = re.search(r'(\d+)\s*Hp', val, re.I)
|
||||
if hp_m: data["power_kw"] = int(int(hp_m.group(1)) / 1.36)
|
||||
elif "displacement" in k_low:
|
||||
cc_m = re.search(r'(\d+)\s*cm3', val)
|
||||
if cc_m: data["engine_cc"] = int(cc_m.group(1))
|
||||
|
||||
clean_k = self.clean_key(raw_k)
|
||||
if clean_k and val: data["specifications"][clean_k] = val
|
||||
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba az adatlapon ({url}): {e}")
|
||||
return None
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
async def save_to_library(self, data):
|
||||
if not data or not data["make"]: return
|
||||
async with AsyncSessionLocal() as db:
|
||||
try:
|
||||
await db.execute(text("""
|
||||
INSERT INTO vehicle.external_reference_library
|
||||
(source_name, make, model, generation, modification, year_from, year_to, power_kw, engine_cc, specifications, source_url)
|
||||
VALUES ('auto-data.net', :make, :model, :gen, :mod, :y_f, :y_t, :p_kw, :e_cc, :specs, :url)
|
||||
ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
|
||||
"""), {
|
||||
"make": data["make"], "model": data["model"], "gen": data["generation"],
|
||||
"mod": data["modification"], "y_f": data["year_from"], "y_t": data["year_to"],
|
||||
"p_kw": data["power_kw"], "e_cc": data["engine_cc"],
|
||||
"specs": json.dumps(data["specifications"]), "url": data["source_url"]
|
||||
})
|
||||
await db.commit()
|
||||
logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} ({data['power_kw']} kW)")
|
||||
except Exception as e:
|
||||
logger.error(f"DB Hiba: {e}")
|
||||
|
||||
async def run(self):
|
||||
logger.info("🚀 R4 Adatbányász indítása...")
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(user_agent="Mozilla/5.0...")
|
||||
|
||||
while True:
|
||||
async with AsyncSessionLocal() as db:
|
||||
res = await db.execute(text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
|
||||
WHERE id = (
|
||||
SELECT id FROM vehicle.auto_data_crawler_queue
|
||||
WHERE level = 'engine' AND status = 'pending'
|
||||
ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
|
||||
) RETURNING id, url, name
|
||||
"""))
|
||||
target = res.fetchone()
|
||||
await db.commit()
|
||||
|
||||
if not target:
|
||||
logger.info("🏁 Nincs több feldolgozandó motoradat. Alvás 60mp...")
|
||||
await asyncio.sleep(60)
|
||||
continue
|
||||
|
||||
t_id, t_url, t_name = target
|
||||
async with self.semaphore:
|
||||
data = await self.scrape_engine(context, t_url)
|
||||
if data:
|
||||
await self.save_to_library(data)
|
||||
new_status = 'completed'
|
||||
else:
|
||||
new_status = 'error'
|
||||
|
||||
async with AsyncSessionLocal() as db:
|
||||
await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = :s WHERE id = :id"),
|
||||
{"s": new_status, "id": t_id})
|
||||
await db.commit()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the extractor loop.
    asyncio.run(FinalExtractor().run())
|
||||
59
backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
Normal file
59
backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
|
||||
import asyncio, logging
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Module-wide logging: INFO level, every line tagged [BIKE-R0].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [BIKE-R0] %(message)s')
logger = logging.getLogger("R0")
|
||||
|
||||
# Seed sources for brand discovery. Each entry: landing page URL, a CSS
# selector for brand anchors, and the vehicle category to tag queue rows with.
SOURCES = [
    {
        "name": "AutoEvolution",
        "url": "https://www.autoevolution.com/moto/",
        # More robust selector covering several brand-list layouts.
        # NOTE(review): run_r0 currently queries a generic a[href*='/moto/']
        # selector directly, so this field is unused — confirm before removal.
        "selector": ".brand a, .all-brands a, .moto-brand a",
        "category": "bike"
    }
]
|
||||
|
||||
async def run_r0():
    """Collect motorcycle brand links from every configured source page and
    queue them with level='brand' in vehicle.auto_data_crawler_queue.

    Duplicate URLs are ignored via ON CONFLICT; one commit per source.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0")

        async with AsyncSessionLocal() as db:
            for source in SOURCES:
                page = await context.new_page()
                try:
                    logger.info(f"Márkák kinyerése: {source['name']}...")
                    await page.goto(source['url'], wait_until="networkidle", timeout=60000)

                    # Collect every /moto/ anchor on the page for analysis.
                    anchors = await page.eval_on_selector_all("a[href*='/moto/']",
                        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))")

                    # Keep only clean brand URLs: exactly 5 slashes
                    # (https:// + domain + moto + brand + trailing /) and no spec page.
                    brand_anchors = [a for a in anchors
                                     if a['url'].count('/') == 5 and not a['url'].endswith('.html')]

                    count = 0
                    for anchor in brand_anchors:
                        if len(anchor['name']) < 2:
                            continue
                        await db.execute(text("""
                            INSERT INTO vehicle.auto_data_crawler_queue (url, level, name, status, category)
                            VALUES (:url, 'brand', :name, 'pending', 'bike')
                            ON CONFLICT (url) DO NOTHING
                        """), {"url": anchor['url'], "name": anchor['name']})
                        count += 1

                    await db.commit()
                    logger.info(f"✅ [{source['name']}] kész: {count} márkát találtam.")
                except Exception as e:
                    logger.error(f"❌ Hiba: {e}")
                finally:
                    await page.close()
        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    asyncio.run(run_r0())
|
||||
171
backend/app/workers/vehicle/bike/bike_R1_model_scout.py
Normal file
171
backend/app/workers/vehicle/bike/bike_R1_model_scout.py
Normal file
@@ -0,0 +1,171 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- LOGGING CONFIGURATION ---
# Keep detailed logging for every event.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [BIKE-R1-AUTOEVO] %(message)s'
)
logger = logging.getLogger("R1")
|
||||
|
||||
async def analyze_and_extract_links(page, current_url):
    """Classify every /moto/ anchor on *page* into crawl levels.

    Returns a list of {'name', 'url', 'level'} dicts where level is
    'engine' for spec sheets (*.html) and 'model' for deep (>= 6 path
    segments) AutoEvolution URLs. Drops noise: short or non-ASCII names
    ("language shield"), navigation junk, self references and the /moto
    landing page. Fragments and query strings are stripped first.
    """
    anchors = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    noise_words = (
        'privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising',
        'about us', 'copyright', 'login', 'registration'
    )
    base_url = current_url.rstrip('/')
    classified = []

    for anchor in anchors:
        # Strip the fragment and query string but keep the path intact.
        url = anchor['url'].split('#')[0].split('?')[0].rstrip('/')
        label = anchor['name']

        # --- Language shield & junk filtering (guard clauses) ---
        if not label or len(label) < 2:
            continue
        if re.search(r'[^\x00-\x7F]+', label):
            continue
        lowered = label.lower()
        if any(word in lowered for word in noise_words):
            continue

        # --- AutoEvolution depth logic ---
        if "autoevolution.com/moto/" not in url:
            continue
        # Skip self references and the /moto landing page (backwards nav).
        if url == base_url or url.endswith('/moto'):
            continue

        if url.endswith(".html"):
            # Technical spec sheet -> ENGINE level.
            classified.append({'name': label, 'url': url, 'level': 'engine'})
        elif len(url.strip('/').split('/')) >= 6:
            # Deep path without .html -> sub-model / generation (MODEL level).
            classified.append({'name': label, 'url': url, 'level': 'model'})

    return classified
|
||||
|
||||
async def get_next_task(db):
    """
    Prioritized task pickup: brand-level rows are served before model-level
    ones. FOR UPDATE SKIP LOCKED makes this safe for parallel workers.
    Returns a (id, url, name, level) row or None when the queue is empty.
    """
    query = text("""
        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
        WHERE id = (
            SELECT id FROM vehicle.auto_data_crawler_queue
            WHERE status = 'pending'
            AND category = 'bike'
            AND url LIKE '%autoevolution.com%'
            AND level IN ('brand', 'model')
            ORDER BY
                CASE WHEN level = 'brand' THEN 0 ELSE 1 END ASC,
                id ASC
            LIMIT 1 FOR UPDATE SKIP LOCKED
        ) RETURNING id, url, name, level
    """)
    res = await db.execute(query)
    return res.fetchone()
|
||||
|
||||
async def main():
    """
    Main control loop with full error handling and transactional safety.

    Repeatedly claims one queued AutoEvolution task, scrapes its page,
    queues every discovered link, then marks the task 'completed' (or
    'error' with the exception message on failure).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )

        logger.info("🤖 R1 AutoEvolution Specialist elindult...")

        while True:
            target = None
            try:
                # Claim the next task in its own short transaction.
                async with AsyncSessionLocal() as db:
                    target = await get_next_task(db)
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ Adatbázis hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue

            if not target:
                logger.info("🏁 Nincs több AutoEvolution feladat. Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            t_id, t_url, t_name, t_level = target
            page = await context.new_page()

            try:
                logger.info(f"🚀 Felderítés ({t_level}): {t_name} -> {t_url}")
                # domcontentloaded is faster; give the page JS a moment afterwards.
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(random.uniform(2, 3))

                links = await analyze_and_extract_links(page, t_url)

                async with AsyncSessionLocal() as db:
                    try:
                        new_links_count = 0
                        for link in links:
                            # Queue every discovered variation (duplicates ignored).
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
                            new_links_count += 1

                        # Close out the claimed task.
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ {t_name} kész. Talált AutoEvolution linkek: {new_links_count}")
                    except Exception as inner_db_error:
                        await db.rollback()
                        logger.error(f"❌ Belső mentési hiba: {inner_db_error}")
                        raise inner_db_error

            except Exception as e:
                logger.error(f"❌ Kritikus hiba a navigáció során: {t_name} -> {e}")
                # Record the failure so the row is not stuck in 'processing'.
                async with AsyncSessionLocal() as db:
                    await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error', error_msg = :msg, updated_at = NOW() WHERE id = :id"),
                                     {"msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                # Throttle requests to avoid getting banned by the server.
                await asyncio.sleep(random.uniform(3, 5))

        # NOTE(review): unreachable — the `while True` above never breaks.
        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point; Ctrl-C exits cleanly.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")
|
||||
173
backend/app/workers/vehicle/bike/bike_R2_generation_scout.py
Normal file
173
backend/app/workers/vehicle/bike/bike_R2_generation_scout.py
Normal file
@@ -0,0 +1,173 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- LOGGING ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R2-BIKE-DEPTH] %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("R2")
|
||||
|
||||
async def get_page_safe(page, url):
    """Navigate *page* to *url* with human-like pacing and a Cloudflare check.

    Sleeps a random 4-7 s first, loads the page, then inspects the title for
    the Cloudflare interstitial. Returns the page on success; any navigation
    failure or detected bot protection is logged and re-raised.
    """
    await asyncio.sleep(random.uniform(4, 7))

    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        title = await page.title()
        if "Just a moment" in title or "Cloudflare" in title:
            logger.error(f"Bot védelem észlelve: {url}")
            raise Exception("Bot védelem (CF) megállította a robotot.")
        return page
    except Exception as e:
        logger.error(f"Hiba az oldal betöltésekor: {url} -> {e}")
        raise
|
||||
|
||||
async def extract_scoped_links(page, p_id, current_url):
    """
    DEEP DISCOVERY: extract Generation -> Engine variation links.
    Scope-lock: only follows links that stay within the current brand.
    Queues each accepted link under parent *p_id* and returns how many
    rows were submitted (duplicates are silently dropped by ON CONFLICT).
    """
    # Extract the brand slug from the URL for the scope lock
    # (segment 4 of e.g. https://www.autoevolution.com/moto/<brand>).
    path_segments = current_url.strip('/').split('/')
    if len(path_segments) < 5:
        return 0
    brand_anchor = path_segments[4]

    hrefs = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    junk = ['privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising', 'login', 'about', 'copyright']
    found_count = 0

    async with AsyncSessionLocal() as db:
        for link in hrefs:
            # CLEANUP: strip the fragment/query so we see the real spec page.
            clean_url = link['url'].split('#')[0].split('?')[0].rstrip('/')
            name = link['name'].replace('\n', ' ').strip()

            # Basic filters: length, ASCII-only "language shield", nav junk.
            if not name or len(name) < 2: continue
            if re.search(r'[^\x00-\x7F]+', name): continue
            if any(k in name.lower() for k in junk): continue

            # SCOPE LOCK: only allow links belonging to the current brand.
            if brand_anchor not in clean_url.lower():
                continue

            # Navigation filter: shallow listing/navigation pages are dropped.
            if any(x in clean_url for x in ['-brand-', 'allbrands', 'en/brands', '/moto/']):
                if clean_url.count('/') < 5: continue

            # Avoid self references.
            if clean_url == current_url.rstrip('/'):
                continue

            # Level classification: .html -> engine spec page, deep path -> generation.
            if clean_url.endswith(".html"):
                target_level = 'engine'
            elif clean_url.count('/') >= 6:
                target_level = 'generation'
            else:
                continue

            # Queue the link for the next stage.
            await db.execute(text("""
                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                ON CONFLICT (url) DO NOTHING
            """), {"url": clean_url, "level": target_level, "p_id": p_id, "name": name})
            found_count += 1

        await db.commit()
    return found_count
|
||||
|
||||
async def process_target(context, t_id, t_url, t_name, t_level):
    """
    Fully process one queued task (URL): load the page safely, harvest the
    scoped links beneath it, then mark the row 'completed' ('completed_leaf'
    when nothing new was found) or 'error' with the exception message.
    """
    page = await context.new_page()
    try:
        logger.info(f"🚀 Mélységi fúrás [{t_level}]: {t_name}")
        await get_page_safe(page, t_url)

        # Harvest variations and generations under this node.
        found = await extract_scoped_links(page, t_id, t_url)

        async with AsyncSessionLocal() as db:
            # 'completed_leaf' marks nodes that produced no children.
            new_status = 'completed' if found > 0 else 'completed_leaf'
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = :s, error_msg = NULL, updated_at = NOW()
                WHERE id = :id
            """), {"s": new_status, "id": t_id})
            await db.commit()
            logger.info(f"✅ Befejezve: {t_name} -> {found} új variáció rögzítve.")

    except Exception as e:
        logger.error(f"❌ Kritikus hiba feldolgozás közben ({t_name}): {e}")
        # Record the failure so the row is not stuck in 'processing'.
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'error', error_msg = :msg, updated_at = NOW()
                WHERE id = :id
            """), {"msg": str(e), "id": t_id})
            await db.commit()
    finally:
        await page.close()
|
||||
|
||||
async def main():
    """
    Main loop with a depth-first strategy (ORDER BY level ASC): claims one
    pending 'model'/'generation' bike task at a time and drills into it.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0",
            viewport={'width': 1920, 'height': 1080}
        )

        logger.info("🤖 R2 Motoros Mélységi Felderítő aktív.")

        while True:
            async with AsyncSessionLocal() as db:
                # Atomically claim the next task; SKIP LOCKED allows parallel workers.
                res = await db.execute(text("""
                    UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.auto_data_crawler_queue
                        WHERE status = 'pending'
                        AND level IN ('model', 'generation')
                        AND category = 'bike'
                        AND url LIKE '%autoevolution.com%'
                        ORDER BY level ASC, id ASC
                        LIMIT 1 FOR UPDATE SKIP LOCKED
                    ) RETURNING id, url, name, level
                """))
                target = res.fetchone()
                await db.commit()

            if not target:
                logger.info("🏁 Minden variáció felderítve. Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            await process_target(context, target[0], target[1], target[2], target[3])

        # NOTE(review): unreachable — the `while True` above never breaks.
        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point; Ctrl-C exits cleanly.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")
|
||||
95
backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
Normal file
95
backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Module-wide logging: INFO level, every line tagged [R3-EXTRACTOR].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR] %(message)s')
logger = logging.getLogger("R3")
|
||||
|
||||
class R3DataMiner:
    """Drains queued 'engine' spec pages, scrapes each one and upserts the
    parsed record into vehicle.external_reference_library (auto-data.net)."""

    def clean_key(self, key):
        """Normalize a spec header: drop a comma-prefixed part and
        question-style wording, e.g. "How much power?" -> "Power"."""
        if "," in key:
            key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        return key.split("?")[0].strip().capitalize()

    async def scrape_specs(self, context, url):
        """Open *url*, parse the spec table rows and return a data dict,
        or None when the page could not be scraped."""
        page = await context.new_page()
        try:
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            data = {
                "make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url,
            }

            for row in soup.find_all('tr'):
                header, cell = row.find('th'), row.find('td')
                if not header or not cell:
                    continue
                raw_key = header.get_text(strip=True)
                value = cell.get_text(strip=True)
                key_lower = raw_key.lower()

                if key_lower == "brand":
                    data["make"] = value
                elif key_lower == "model":
                    data["model"] = value
                elif key_lower == "generation":
                    data["generation"] = value
                elif key_lower == "modification":
                    data["modification"] = value
                elif "start of production" in key_lower:
                    year_match = re.search(r'(\d{4})', value)
                    data["year_from"] = int(year_match.group(1)) if year_match else None
                elif key_lower == "power":
                    hp_match = re.search(r'(\d+)\s*Hp', value, re.I)
                    if hp_match:
                        # metric hp -> kW (1 kW ~ 1.36 hp)
                        data["power_kw"] = int(int(hp_match.group(1)) / 1.36)
                elif "displacement" in key_lower:
                    cc_match = re.search(r'(\d+)\s*cm3', value)
                    if cc_match:
                        data["engine_cc"] = int(cc_match.group(1))

                # Every row is also kept verbatim under a normalized key.
                data["specifications"][self.clean_key(raw_key)] = value

            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon: {e}")
            return None
        finally:
            await page.close()

    async def run(self):
        """Drain the 'engine' queue: claim (SKIP LOCKED), scrape, save,
        mark 'completed' or 'error'; exit once the queue is empty."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent="Mozilla/5.0...")

            while True:
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (SELECT id FROM vehicle.auto_data_crawler_queue
                                    WHERE level = 'engine' AND status = 'pending'
                                    ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED)
                        RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()
                if not target:
                    break

                data = await self.scrape_specs(context, target[1])
                if data and data["make"]:
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                        """), {
                            "make": data["make"], "model": data["model"],
                            "gen": data["generation"], "mod": data["modification"],
                            "y": data["year_from"], "p": data["power_kw"],
                            "e": data["engine_cc"], "s": json.dumps(data["specifications"]),
                            "u": data["source_url"],
                        })
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                else:
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"), {"id": target[0]})
                        await db.commit()

            await browser.close()
|
||||
|
||||
# Entry point: drain the engine queue once, then exit.
if __name__ == "__main__": asyncio.run(R3DataMiner().run())
|
||||
218
backend/app/workers/vehicle/bike/bike_R4_final_extractor.py
Normal file
218
backend/app/workers/vehicle/bike/bike_R4_final_extractor.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import sys
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- LOGGING CONFIGURATION ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-HARVESTER-v1.2] %(message)s')
logger = logging.getLogger("R4")

# --- CONFIGURATION PARAMETERS ---
# NOTE(review): main() hardcodes the value 5 in its SQL and comparisons
# instead of referencing this constant — confirm and unify.
MAX_RETRY_LIMIT = 5  # At most 5 attempts per vehicle
|
||||
|
||||
async def parse_specs(page):
    """
    Run the in-page DOM parser and return a dict of spec key/value pairs.

    The injected script handles four page layouts (td.left/td.right pairs,
    dt/dd lists, .spec-row label/value spans, and a bold-text fallback for
    very old pages). The result is narrowed to a whitelist of relevant spec
    keys; when nothing matches the whitelist the full raw dict is returned,
    and None is returned for an empty page or a parsing failure.
    """
    script = """
    () => {
        let results = {};

        // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
        let leftCells = document.querySelectorAll('td.left');
        leftCells.forEach(cell => {
            let key = cell.innerText.replace(/:$/, '').trim();
            let rightCell = cell.nextElementSibling;
            if(rightCell && rightCell.classList.contains('right')) {
                results[key] = rightCell.innerText.trim();
            }
        });

        // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
        let dts = document.querySelectorAll('dt');
        dts.forEach(dt => {
            let key = dt.innerText.replace(/:$/, '').trim();
            let dd = dt.nextElementSibling;
            if(dd && dd.tagName.toLowerCase() === 'dd') {
                results[key] = dd.innerText.trim();
            }
        });

        // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
        let specRows = document.querySelectorAll('.spec-row');
        specRows.forEach(row => {
            let label = row.querySelector('.label');
            let value = row.querySelector('.value');
            if(label && value) {
                let key = label.innerText.replace(/:$/, '').trim();
                if (!results[key]) {
                    results[key] = value.innerText.trim();
                }
            }
        });

        // 4. MÓDSZER: Veterán ("Adler") fallback -> Vastagított szöveg
        if (Object.keys(results).length === 0) {
            document.querySelectorAll('b, strong').forEach(b => {
                let key = b.innerText.replace(/:$/, '').trim();
                if(key.length > 2 && key.length < 30) {
                    let val = "";
                    if(b.nextSibling && b.nextSibling.nodeType === 3) {
                        val = b.nextSibling.textContent.trim();
                    }
                    else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                        val = b.nextElementSibling.innerText.trim();
                    }
                    if(val && !results[key]) {
                        results[key] = val;
                    }
                }
            });
        }

        return results;
    }
    """
    try:
        raw = await page.evaluate(script)
        if not raw:
            return None

        relevant_keys = (
            "Production", "Year", "Segment",
            "Type", "Displacement", "Bore X Stroke", "Compression Ratio",
            "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
            "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
            "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
            "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
            "Wet Weight", "Front", "Rear"
        )

        # Keep keys that contain any whitelisted marker (case-insensitive).
        kept = {
            key: value
            for key, value in raw.items()
            if any(marker.lower() in key.lower() for marker in relevant_keys)
        }
        return kept if kept else raw

    except Exception as e:
        logger.error(f"❌ Parszolási hiba a JS kiértékeléskor: {e}")
        return None
|
||||
|
||||
async def main():
    """Harvester main loop: claim one 'engine' bike task, parse its spec page
    and persist the raw JSON; failed or empty tasks are retried up to 5 times
    before being flagged 'manual_review_needed'."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R4 Motor Adat-Arató v1.2 elindult.")

        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    # FIX: 'completed_empty' removed from the claimable statuses.
                    # Only 'pending' and 'error' rows qualify while retry_count < 5.
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE status IN ('pending', 'error')
                            AND retry_count < 5
                            AND level = 'engine' AND category = 'bike'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name, retry_count
                    """))
                    target = res.fetchone()
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue

            if not target:
                logger.info("🏁 Minden motor feldolgozva vagy manuális felülvizsgálatra vár. Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            t_id, t_url, t_name, t_retry_count = target
            # Legacy rows may have NULL retry_count.
            if t_retry_count is None: t_retry_count = 0

            page = await context.new_page()

            try:
                logger.info(f"📊 [{t_retry_count + 1}/5] Adatbányászat: {t_name}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)

                data = await parse_specs(page)

                async with AsyncSessionLocal() as db:
                    if data and len(data) > 0:
                        # SUCCESSFUL SAVE: upsert the raw spec JSON.
                        await db.execute(text("""
                            INSERT INTO vehicle.motorcycle_specs (crawler_id, full_name, raw_data, url)
                            VALUES (:cid, :name, :data, :url)
                            ON CONFLICT (crawler_id) DO UPDATE SET raw_data = :data, updated_at = NOW()
                        """), {"cid": t_id, "name": t_name, "data": json.dumps(data), "url": t_url})

                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ Mentve: {t_name} ({len(data)} paraméter)")
                    else:
                        # EMPTY PAGE OR MISSING DATA
                        new_retry_count = t_retry_count + 1

                        if new_retry_count >= 5:
                            # Retry limit reached -> flag for manual review.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 5 próbálkozás után (üres oldal)',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manuális javításra jelölve.")
                        else:
                            # Still retryable -> mark as error for a later attempt.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.warning(f"⚠️ Üres maradt: {t_name} (Próbálkozás: {new_retry_count}/5)")

                        await db.commit()

            except Exception as e:
                logger.error(f"❌ Hiba a feldolgozás során: {t_name} -> {e}")
                # Exceptions count toward the same 5-attempt budget.
                async with AsyncSessionLocal() as db:
                    new_retry_count = t_retry_count + 1
                    status = 'error' if new_retry_count < 5 else 'manual_review_needed'
                    await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = :st,
                            retry_count = :rc,
                            error_msg = :msg,
                            updated_at = NOW()
                        WHERE id = :id
                    """), {"st": status, "rc": new_retry_count, "msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                await asyncio.sleep(random.uniform(2.0, 4.0))

        # NOTE(review): unreachable — the `while True` above never breaks.
        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the worker's main loop; swallow Ctrl-C for a clean exit.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")
|
||||
113
backend/app/workers/vehicle/bike/test_aprilia.py
Normal file
113
backend/app/workers/vehicle/bike/test_aprilia.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import asyncio
|
||||
import json
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def test_scraper():
    """Manual smoke test of the autoevolution.com spec parser.

    Opens two deliberately different pages — a modern Aprilia page and an
    old BMW page with legacy HTML — and runs a multi-strategy DOM parser
    (td.left/td.right, dt/dd, .spec-row, and a <b>/<strong> fallback) on
    each, printing the extracted key/value pairs to stdout.
    """
    # Two problem-focused URLs: the modern Aprilia and the old, broken-HTML BMW.
    test_urls = [
        "https://www.autoevolution.com/moto/aprilia-rs-660-factory-2025.html",
        "https://www.autoevolution.com/moto/bmw-f-650-gs-2011.html"
    ]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        for url in test_urls:
            print(f"\n{'='*60}")
            print(f"🌍 MEGNYITÁS: {url}")
            print(f"{'='*60}")

            # Wait only for DOM load, then give client-side JS a moment to run.
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            await asyncio.sleep(2)  # brief pause for JS execution

            # The refined autoevolution parser (runs in the page context;
            # string content is executed JS and must stay verbatim).
            script = """
            () => {
                let results = {};

                // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
                let leftCells = document.querySelectorAll('td.left');
                leftCells.forEach(cell => {
                    let key = cell.innerText.replace(/:$/, '').trim();
                    let rightCell = cell.nextElementSibling;
                    if(rightCell && rightCell.classList.contains('right')) {
                        results[key] = rightCell.innerText.trim();
                    }
                });

                // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
                let dts = document.querySelectorAll('dt');
                dts.forEach(dt => {
                    let key = dt.innerText.replace(/:$/, '').trim();
                    let dd = dt.nextElementSibling;
                    if(dd && dd.tagName.toLowerCase() === 'dd') {
                        results[key] = dd.innerText.trim();
                    }
                });

                // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
                let specRows = document.querySelectorAll('.spec-row');
                specRows.forEach(row => {
                    let label = row.querySelector('.label');
                    let value = row.querySelector('.value');
                    if(label && value) {
                        let key = label.innerText.replace(/:$/, '').trim();
                        if (!results[key]) {
                            results[key] = value.innerText.trim();
                        }
                    }
                });

                // 4. MÓDSZER: "Adler" típusú elavult leírások fallbackje -> Vastagított szöveg
                if (Object.keys(results).length === 0) {
                    document.querySelectorAll('b, strong').forEach(b => {
                        let key = b.innerText.replace(/:$/, '').trim();
                        if(key.length > 2 && key.length < 30) {
                            let val = "";
                            // Ha a szöveg közvetlenül a tag után van (Text Node)
                            if(b.nextSibling && b.nextSibling.nodeType === 3) {
                                val = b.nextSibling.textContent.trim();
                            }
                            // Ha egy másik elemben van
                            else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                                val = b.nextElementSibling.innerText.trim();
                            }
                            if(val && !results[key]) {
                                results[key] = val;
                            }
                        }
                    });
                }

                return results;
            }
            """

            data = await page.evaluate(script)

            if data and len(data) > 0:
                # Filter out noise: keep only the technically relevant keys
                # (substring match, case-insensitive).
                relevant_keys = ["Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                                 "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                                 "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                                 "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                                 "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                                 "Wet Weight", "Front", "Rear"]

                filtered_data = {k: v for k, v in data.items() if any(rk.lower() in k.lower() for rk in relevant_keys)}

                # If filtering removed everything, fall back to the raw result.
                print("\n🟢 KINYERT ADATOK (DOM PARSZOLÓ):")
                print(json.dumps(filtered_data if filtered_data else data, indent=2, ensure_ascii=False))
                print(f"\n✅ Összesen {len(filtered_data if filtered_data else data)} műszaki paramétert találtam.")
            else:
                print("\n🔴 NULLA ADAT - A DOM parszoló nem talált egyezést.")

        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Manual test entry point — prints results to stdout, no DB writes.
    asyncio.run(test_scraper())
|
||||
73
backend/app/workers/vehicle/mapping_config.json
Normal file
73
backend/app/workers/vehicle/mapping_config.json
Normal file
@@ -0,0 +1,73 @@
|
||||
{
|
||||
"rdw": {
|
||||
"field_map": {
|
||||
"merk": "make",
|
||||
"handelsbenaming": "marketing_name",
|
||||
"inrichting": "body_type",
|
||||
"massa_ledig_voertuig": "curb_weight",
|
||||
"technische_max_massa_voertuig": "max_weight",
|
||||
"cilinderinhoud": "engine_capacity",
|
||||
"aantal_cilinders": "cylinders",
|
||||
"wielbasis": "wheelbase",
|
||||
"aantal_deuren": "doors",
|
||||
"aantal_zitplaatsen": "seats",
|
||||
"catalogusprijs": "list_price",
|
||||
"maximale_constructiesnelheid": "max_speed",
|
||||
"datum_eerste_toelating": "year_from"
|
||||
},
|
||||
"fuel_map": {
|
||||
"brandstof_omschrijving": "fuel_type",
|
||||
"nettomaximumvermogen": "power_kw",
|
||||
"netto_max_vermogen_elektrisch": "power_kw_electric",
|
||||
"uitlaatemissieniveau": "euro_class",
|
||||
"brandstofverbruik_gecombineerd": "consumption",
|
||||
"co2_uitstoot_gecombineerd": "co2"
|
||||
},
|
||||
"engine_map": {
|
||||
"motorcode": "engine_code"
|
||||
},
|
||||
"body_type_translations": {
    "motorfiets": "MOTORKERÉKPÁR",
    "land- of bosbouwtrekker": "TRAKTOR",
    "niet geregistreerd": "NOT_REGISTERED",
    "onbekend": "UNKNOWN",
    "stationwagen": "ESTATE",
    "hatchback": "HATCHBACK",
    "sedan": "SEDAN",
    "mpv": "MPV",
    "terreinwagen": "SUV",
    "cabriolet": "CONVERTIBLE",
    "coupe": "COUPE",
    "personenbus": "MPV",
    "pick-up": "PICKUP",
    "open wagen": "PICKUP",
    "gesloten opbouw": "VAN",
    "kampeerwagen": "RV"
},
|
||||
"power_calculation": {
|
||||
"ratio_source": "vermogen_massarijklaar",
|
||||
"weight_source": "massa_rijklaar"
|
||||
},
|
||||
"fuel_translations": {
    "Benzine": "Petrol",
    "Elektriciteit": "Electric",
    "Diesel": "Diesel",
    "LPG": "LPG",
    "CNG": "CNG",
    "Waterstof": "Hydrogen",
    "Niet geregistreerd": "UNKNOWN"
}
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
# /app/app/workers/vehicle/mapping_rules.py
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/mapping_rules.py
|
||||
|
||||
SOURCE_MAPPINGS = {
|
||||
"os-vehicle-db": {
|
||||
|
||||
113
backend/app/workers/vehicle/r5_test.py
Normal file
113
backend/app/workers/vehicle/r5_test.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- TECHNICAL DICTIONARY AND MAPPING ---
# Translates UltimateSpecs spec-table labels to database column names.
MAPPING = {
    "Maximum power": "power_kw",
    "Engine capacity": "engine_capacity",
    "Maximum torque": "torque_nm",
    "Top Speed": "max_speed",
    "Acceleration 0 to 100 km/h": "acceleration_0_100",
    "Curb Weight": "curb_weight",
    "Wheelbase": "wheelbase",
    "Num. of Seats": "seats",
    "Drive wheels - Traction - Layout": "drive_type",
    "Body": "body_type"
}
|
||||
|
||||
async def r5_test_run():
    """Dry-run of the R5 hybrid enrichment robot.

    Picks one vehicle that still lacks power/engine data from
    ``vehicle.vehicle_model_definitions``, fetches authoritative specs from
    the Dutch RDW open-data API (when a technical code is available), merges
    them with (simulated) UltimateSpecs web data via ``MAPPING``, and prints
    the resulting MDM record preview. Nothing is written back to the DB.
    """
    print("🚀 R5 Hibrid Robot indítása (Teszt üzemmód)...")

    async with AsyncSessionLocal() as db:
        # 1. SELECTION: one not-yet-enriched vehicle (from the R1 base).
        query = text("""
            SELECT id, make, marketing_name, year_from, technical_code, fuel_type
            FROM vehicle.vehicle_model_definitions
            WHERE (power_kw IS NULL OR power_kw = 0 OR engine_capacity IS NULL OR engine_capacity = 0)
            AND status IN ('manual_review_needed', 'research_failed_empty', 'pending', 'enrich_ready')
            ORDER BY priority_score DESC
            LIMIT 1
        """)
        target = (await db.execute(query)).fetchone()

        if not target:
            print("✨ Nincs feldolgozatlan autó az adatbázisban.")
            return

        t_id, make, model, year, tech_code, fuel = target
        print(f"🎯 Célpont: {make} {model} ({year})")
        print(f"📌 Technical Code: {tech_code or 'Nincs megadva'}")

        # 2. RDW DATA (official Dutch registry). The m9d7-ebf2 dataset holds
        # per-type specifications keyed by the trade name (handelsbenaming).
        rdw_data = {}
        if tech_code:
            print("🇳🇱 RDW adatok lekérése...")
            try:
                # FIX: pass the filter via `params` so it gets URL-encoded —
                # the old f-string interpolation broke on codes containing
                # spaces or special characters.
                res = requests.get(
                    "https://opendata.rdw.nl/resource/m9d7-ebf2.json",
                    params={"handelsbenaming": tech_code.upper()},
                    timeout=5,
                ).json()
                if res:
                    rdw_data = {
                        "power_kw": int(float(res[0].get('nettomaximumvermogen', 0))),
                        "engine_capacity": int(res[0].get('cilinderinhoud', 0)),
                        "curb_weight": int(res[0].get('massa_ledig_voertuig', 0))
                    }
                    print("✅ RDW adatok sikeresen betöltve.")
            # FIX: narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate; network/JSON/conversion failures
            # are the expected error modes here.
            except Exception:
                print("⚠️ RDW nem elérhető vagy nincs találat.")

        # 3. ULTIMATESPECS DATA (simulated scrape per the requested logic).
        print("🏁 UltimateSpecs adatok gyűjtése...")
        # In production the Playwright scraper would run here; this is a
        # representative sample of the raw table it extracts:
        raw_web_data = {
            "Maximum power": "103 PS / 76 kW @ 5750 rpm",
            "Engine capacity": "1581 cm3",
            "Maximum torque": "144 Nm @ 4000 rpm",
            "Top Speed": "180 km/h",
            "Acceleration 0 to 100 km/h": "11.5 s",
            "Curb Weight": "1090 kg",
            "Wheelbase": "254 cm",
            "Body": "Hatchback"
        }

        # 4. MERGE AND TRANSLATE into the MDM column names.
        final_mdm_record = {
            "id": t_id,
            "make": make,
            "marketing_name": model,
            "year_from": year,
            "fuel_type": fuel
        }

        # Apply the mapping plus regex-based numeric cleanup
        # (e.g. "76 kW" -> 76, "1581 cm3" -> 1581).
        for web_key, db_key in MAPPING.items():
            val = raw_web_data.get(web_key)
            if val:
                numbers = re.findall(r'\d+', str(val))
                if numbers:
                    # If both PS and kW figures appear, pick the kW one.
                    # NOTE(review): decimals split on '.' ("11.5" -> "11");
                    # acceptable for this integer-column preview.
                    final_mdm_record[db_key] = numbers[1] if "kW" in str(val) and len(numbers)>1 else numbers[0]
                else:
                    final_mdm_record[db_key] = val

        # RDW values are authoritative — they override the scraped web data.
        final_mdm_record.update({k: v for k, v in rdw_data.items() if v})

        # --- TERMINAL OUTPUT ---
        print("\n" + "="*50)
        print("📊 VÉGLEGES MDM REKORD (ELŐNÉZET)")
        print("="*50)
        print(json.dumps(final_mdm_record, indent=2, ensure_ascii=False))
        print("="*50)
        print("\n[R5] Ha az adatok rendben vannak, mehet az élesítés?")
|
||||
|
||||
if __name__ == "__main__":
    # Manual test entry point — preview only, no DB writes.
    asyncio.run(r5_test_run())
|
||||
138
backend/app/workers/vehicle/r5_ultimate_harvester.py
Normal file
138
backend/app/workers/vehicle/r5_ultimate_harvester.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
import random
|
||||
import urllib.parse
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Module-level logging: all R5 output is tagged [R5-SENTINEL].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] [R5-SENTINEL] %(message)s')
logger = logging.getLogger("R5")

# Lower-cased UltimateSpecs table labels -> vehicle_model_definitions columns.
# Keys must match the lowercase keys produced by the in-page parser.
COLUMN_MAPPING = {
    "horsepower": "power_kw",
    "engine displacement": "engine_capacity",
    "maximum torque": "torque_nm",
    "top speed": "max_speed",
    "acceleration 0 to 100 km/h": "acceleration_0_100",
    "curb weight": "curb_weight",
    "wheelbase": "wheelbase",
    "num. of seats": "seats"
}
|
||||
|
||||
class R5Harvester:
    """R5 enrichment worker: scrapes UltimateSpecs spec pages and publishes
    the cleaned numbers into vehicle.vehicle_model_definitions.

    Loop: pick one vehicle with missing power data, search UltimateSpecs,
    jump directly to the first matching spec page, parse every spec table,
    then either publish the record or mark it 'research_failed_empty'.
    """

    def __init__(self):
        # Desktop Chrome UA to avoid trivial bot blocking.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    def clean_number(self, val: str, key: str = "") -> int:
        """Extract an integer from a raw spec string; 0 on any failure.

        Prefers an explicit kW figure when both hp and kW are present.
        NOTE(review): stripping '.' merges decimals ("11.5" -> 115) and the
        bare `except` swallows everything — confirm both are intentional.
        """
        if not val or val == "-": return 0
        try:
            if "hp" in val.lower() or "kw" in val.lower():
                kw_match = re.search(r'(\d+)\s*kw', val.lower())
                if kw_match: return int(kw_match.group(1))
            nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
            return int(nums[0]) if nums else 0
        except: return 0

    async def scrape_car_details(self, page, make, model, year):
        """Search UltimateSpecs for make/model/year and parse the spec page.

        Returns a dict of lowercase label -> raw value strings, or None when
        the search, navigation, or parsing fails.
        """
        try:
            # 1. On-site search.
            search_url = f"https://www.ultimatespecs.com/index.php?brand={urllib.parse.quote(make)}&q={urllib.parse.quote(model + ' ' + str(year))}"
            logger.info(f"🔍 Keresés indítása...")
            await page.goto(search_url, wait_until="networkidle", timeout=30000)

            # 2. Find the first spec link but do NOT click — read its URL.
            # Looser selector to cope with large (75+) result lists.
            link_element = await page.wait_for_selector("a[href*='/car-specs/']", timeout=15000)
            if not link_element:
                return None

            href = await link_element.get_attribute("href")
            target_url = href if href.startswith("http") else f"https://www.ultimatespecs.com{href}"

            # 3. DIRECT JUMP to the spec page — bypasses interstitial ads.
            logger.info(f"🚀 Közvetlen ugrás az adatlapra: {target_url}")
            await page.goto(target_url, wait_until="networkidle", timeout=30000)

            # 4. Parse every spec table in the page context.
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                            let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                            if(t && v) {
                                let k = t.innerText.replace(':','').trim().toLowerCase();
                                let val = v.innerText.trim();
                                if(k && val && val !== "-") results[k] = val;
                            }
                        });
                    });
                    return results;
                }
            """)
            return full_specs
        except Exception as e:
            logger.error(f"❌ Scrape hiba: {str(e)[:100]}...")
            return None

    async def run(self):
        """Main loop: process queued vehicles until none remain."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=self.user_agent)
            page = await context.new_page()

            while True:
                async with AsyncSessionLocal() as db:
                    # Highest-priority vehicle still missing power data.
                    query = text("""
                        SELECT id, make, marketing_name, year_from
                        FROM vehicle.vehicle_model_definitions
                        WHERE (power_kw IS NULL OR power_kw = 0)
                        AND status IN ('manual_review_needed', 'pending', 'enrich_ready')
                        ORDER BY priority_score DESC LIMIT 1
                    """)
                    target = (await db.execute(query)).fetchone()

                    if not target:
                        logger.info("✨ Pipeline üres.")
                        break

                    t_id, make, model, year = target
                    logger.info(f"🚜 Feldolgozás: {make} {model} ({year})")

                    web_data = await self.scrape_car_details(page, make, model, year)

                    # Fewer than 5 parsed fields is treated as a failed scrape.
                    if not web_data or len(web_data) < 5:
                        logger.warning(f"⚠️ Sikertelen gyűjtés, státusz: research_failed_empty")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        continue

                    # Map raw labels to DB columns and coerce to integers.
                    updates = {col: self.clean_number(web_data.get(k)) for k, col in COLUMN_MAPPING.items()}

                    # Only publish when a plausible power figure was found.
                    if updates.get('power_kw', 0) > 0:
                        await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions
                            SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                torque_nm = :torque_nm, max_speed = :max_speed,
                                acceleration_0_100 = :acceleration_0_100, curb_weight = :curb_weight,
                                wheelbase = :wheelbase, specifications = specifications || :full_json,
                                status = 'published', updated_at = NOW()
                            WHERE id = :id
                        """), {**updates, "id": t_id, "full_json": json.dumps(web_data)})
                        await db.commit()
                        logger.info(f"✅ PUBLIKÁLVA: {make} {model} ({updates['power_kw']} kW)")
                    else:
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"), {"id": t_id})
                        await db.commit()

                # Random delay between items to stay polite to the site.
                await asyncio.sleep(random.uniform(3, 6))
            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the harvester until the pipeline is empty.
    harvester = R5Harvester()
    asyncio.run(harvester.run())
|
||||
@@ -1,4 +1,5 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/robot_report.py
|
||||
# docker exec sf_api python -m app.workers.vehicle.robot_report
|
||||
import asyncio
|
||||
import psutil
|
||||
import pynvml
|
||||
|
||||
@@ -0,0 +1,425 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r0_spider
|
||||
Producer-Consumer lánc első eleme. Kivesz egy autót a vehicle.vehicle_model_definitions táblából,
|
||||
keres az UltimateSpecs oldalán, és a talált .html linkeket beszúrja a vehicle.auto_data_crawler_queue táblába.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import urllib.parse
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Browser, BrowserContext
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
|
||||
from app.models.vehicle.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# Logging konfiguráció
|
||||
# Logging configuration — all spider output is tagged [R0-SPIDER].
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R0-SPIDER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R0-SPIDER")

# Configuration.
# NOTE(review): random.uniform is evaluated ONCE at import time, so every
# sleep uses the same fixed value for the process lifetime — if a fresh
# 3-6 s delay per iteration was intended, sample inside the loop instead.
SLEEP_INTERVAL = random.uniform(3, 6)  # 3-6 s wait between iterations
MAX_RETRIES = 3
BASE_URL = "https://www.ultimatespecs.com/index.php?q={query}"
|
||||
|
||||
|
||||
class UltimateSpecsSpider:
|
||||
def __init__(self):
|
||||
self.running = True
|
||||
self.playwright = None
|
||||
self.browser: Optional[Browser] = None
|
||||
self.context: Optional[BrowserContext] = None
|
||||
self.user_agent = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
async def init_browser(self):
|
||||
"""Playwright böngésző inicializálása"""
|
||||
try:
|
||||
self.playwright = await async_playwright().start()
|
||||
self.browser = await self.playwright.chromium.launch(
|
||||
headless=True,
|
||||
args=[
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--disable-dev-shm-usage',
|
||||
'--no-sandbox',
|
||||
]
|
||||
)
|
||||
self.context = await self.browser.new_context(
|
||||
user_agent=self.user_agent,
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
java_script_enabled=True
|
||||
)
|
||||
logger.info("Playwright böngésző inicializálva")
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a böngésző inicializálásakor: {e}")
|
||||
raise
|
||||
|
||||
async def close_browser(self):
|
||||
"""Playwright böngésző lezárása"""
|
||||
if self.context:
|
||||
await self.context.close()
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
logger.info("Playwright böngésző lezárva")
|
||||
|
||||
async def fetch_next_vehicle(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Kivesz egy feldolgozandó járművet a vehicle_model_definitions táblából.
|
||||
"""
|
||||
query = text("""
|
||||
SELECT id, make, marketing_name, year_from, vehicle_class
|
||||
FROM vehicle.vehicle_model_definitions
|
||||
WHERE status IN ('pending', 'manual_review_needed')
|
||||
AND vehicle_class IN ('car', 'motorcycle')
|
||||
ORDER BY priority_score DESC, updated_at ASC
|
||||
LIMIT 1
|
||||
FOR UPDATE SKIP LOCKED
|
||||
""")
|
||||
|
||||
try:
|
||||
result = await session.execute(query)
|
||||
row = result.fetchone()
|
||||
if row:
|
||||
return {
|
||||
'id': row[0],
|
||||
'make': row[1],
|
||||
'marketing_name': row[2],
|
||||
'year_from': row[3],
|
||||
'vehicle_class': row[4]
|
||||
}
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a következő jármű lekérdezésekor: {e}")
|
||||
return None
|
||||
|
||||
def build_search_query(self, make: str, marketing_name: str, year_from: Optional[int]) -> str:
|
||||
"""
|
||||
Build search query for UltimateSpecs.
|
||||
"""
|
||||
# Clean and prepare the query
|
||||
make_clean = make.lower().replace(' ', '-').replace('.', '')
|
||||
model_clean = marketing_name.lower().replace(' ', '-').replace('.', '')
|
||||
|
||||
# Remove common suffixes
|
||||
for suffix in ['-', 'series', 'class', 'model']:
|
||||
if model_clean.endswith(suffix):
|
||||
model_clean = model_clean[:-len(suffix)].rstrip('-')
|
||||
|
||||
query_parts = [make_clean, model_clean]
|
||||
if year_from:
|
||||
query_parts.append(str(year_from))
|
||||
|
||||
return ' '.join(query_parts)
|
||||
|
||||
async def extract_links_with_js(self, page: Page, make_url: str, model_word: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Extract .html links from the page using the provided JavaScript filter.
|
||||
"""
|
||||
js_code = """
|
||||
(args) => {
|
||||
let targetMakeUrl = args.makeUrl; // pl. 'honda' vagy 'alfa-romeo'
|
||||
let targetModel = args.modelWord; // pl. 'civic'
|
||||
let specs = [];
|
||||
document.querySelectorAll('a').forEach(a => {
|
||||
let href = a.getAttribute('href') || '';
|
||||
let text = a.innerText.trim();
|
||||
let hrefLow = href.toLowerCase();
|
||||
let textLow = text.toLowerCase();
|
||||
if (hrefLow.includes('/car-specs/') || hrefLow.includes('/motorcycles-specs/')) {
|
||||
// SZIGORÚ MÁRKA SZŰRŐ AZ URL-BEN (Reklámok ellen)
|
||||
if (hrefLow.includes('/' + targetMakeUrl + '/') || hrefLow.includes(targetMakeUrl + '-models')) {
|
||||
// MODELL SZŰRŐ A SZÖVEGBEN VAGY URL-BEN
|
||||
if (targetModel === '' || textLow.includes(targetModel) || hrefLow.includes(targetModel)) {
|
||||
if (hrefLow.endsWith('.html') && text.length > 1) {
|
||||
specs.push({ name: text, url: href });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
return specs;
|
||||
}
|
||||
"""
|
||||
|
||||
try:
|
||||
# Prepare arguments for the JS function
|
||||
args = {
|
||||
'makeUrl': make_url.lower(),
|
||||
'modelWord': model_word.lower()
|
||||
}
|
||||
|
||||
# Execute the JavaScript
|
||||
specs = await page.evaluate(js_code, args)
|
||||
return specs
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a JS szűrő futtatásakor: {e}")
|
||||
return []
|
||||
|
||||
async def search_and_extract_links(self, vehicle: Dict[str, Any]) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Search on UltimateSpecs and extract links using two-step drill-down.
|
||||
"""
|
||||
search_query = self.build_search_query(
|
||||
vehicle['make'],
|
||||
vehicle['marketing_name'],
|
||||
vehicle['year_from']
|
||||
)
|
||||
|
||||
# Prepare make URL part
|
||||
make_url = vehicle['make'].lower().replace(' ', '-').replace('.', '')
|
||||
model_word = vehicle['marketing_name'].lower().split()[0] if vehicle['marketing_name'] else ''
|
||||
|
||||
encoded_query = urllib.parse.quote(search_query)
|
||||
search_url = BASE_URL.format(query=encoded_query)
|
||||
|
||||
logger.info(f"Keresés: {search_query} | URL: {search_url}")
|
||||
|
||||
page = None
|
||||
try:
|
||||
page = await self.context.new_page()
|
||||
|
||||
# 1. Step: Go to search page
|
||||
await page.goto(search_url, wait_until='networkidle', timeout=30000)
|
||||
|
||||
# Check if we're on a category page or search results
|
||||
current_url = page.url
|
||||
|
||||
# 2. Step: Extract links with JS filter
|
||||
all_links = await self.extract_links_with_js(page, make_url, model_word)
|
||||
|
||||
# If no links found on first page, try to click on first result
|
||||
if not all_links and 'index.php' in current_url:
|
||||
# Try to find and click on first relevant link
|
||||
first_link = await page.query_selector('a[href*="/car-specs/"], a[href*="/motorcycles-specs/"]')
|
||||
if first_link:
|
||||
await first_link.click()
|
||||
await page.wait_for_load_state('networkidle')
|
||||
|
||||
# Extract links from the new page
|
||||
all_links = await self.extract_links_with_js(page, make_url, model_word)
|
||||
|
||||
# Ensure URLs are absolute
|
||||
for link in all_links:
|
||||
if not link['url'].startswith('http'):
|
||||
link['url'] = f"https://www.ultimatespecs.com{link['url']}"
|
||||
|
||||
logger.info(f"{len(all_links)} link találva")
|
||||
return all_links
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a keresés során: {e}")
|
||||
return []
|
||||
finally:
|
||||
if page:
|
||||
await page.close()
|
||||
|
||||
async def save_links_to_queue(self, session: AsyncSession, links: List[Dict[str, str]],
|
||||
vehicle: Dict[str, Any]) -> int:
|
||||
"""
|
||||
Save extracted links to the external reference queue.
|
||||
"""
|
||||
saved_count = 0
|
||||
|
||||
for link in links:
|
||||
try:
|
||||
# Check if URL already exists
|
||||
existing_query = select(ExternalReferenceQueue).where(
|
||||
ExternalReferenceQueue.url == link['url']
|
||||
)
|
||||
existing_result = await session.execute(existing_query)
|
||||
if existing_result.scalar_one_or_none():
|
||||
logger.debug(f"URL már létezik: {link['url']}")
|
||||
continue
|
||||
|
||||
# Create new queue entry
|
||||
queue_entry = ExternalReferenceQueue(
|
||||
url=link['url'],
|
||||
level='engine',
|
||||
category=vehicle['vehicle_class'] or 'car',
|
||||
name=link['name'][:255],
|
||||
parent_id=vehicle['id'],
|
||||
status='pending'
|
||||
)
|
||||
|
||||
session.add(queue_entry)
|
||||
await session.commit()
|
||||
saved_count += 1
|
||||
logger.debug(f"URL mentve: {link['url']}")
|
||||
|
||||
except IntegrityError:
|
||||
await session.rollback()
|
||||
logger.debug(f"URL már létezik (integrity): {link['url']}")
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
logger.error(f"Hiba a URL mentésekor: {e}")
|
||||
|
||||
return saved_count
|
||||
|
||||
async def update_vehicle_status(self, session: AsyncSession, vehicle_id: int,
|
||||
status: str, error_msg: str = None):
|
||||
"""
|
||||
Update the vehicle's status in the database.
|
||||
"""
|
||||
try:
|
||||
query = text("""
|
||||
UPDATE vehicle.vehicle_model_definitions
|
||||
SET status = :status,
|
||||
last_error = :error_msg,
|
||||
updated_at = NOW(),
|
||||
attempts = attempts + 1
|
||||
WHERE id = :id
|
||||
""")
|
||||
|
||||
await session.execute(
|
||||
query,
|
||||
{'status': status, 'error_msg': error_msg, 'id': vehicle_id}
|
||||
)
|
||||
await session.commit()
|
||||
logger.info(f"Jármű státusz frissítve: {vehicle_id} -> {status}")
|
||||
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
logger.error(f"Hiba a státusz frissítésekor: {e}")
|
||||
|
||||
async def process_single_vehicle(self):
|
||||
"""
|
||||
Process a single vehicle: fetch, search, extract links, save to queue.
|
||||
"""
|
||||
async with AsyncSessionLocal() as session:
|
||||
try:
|
||||
# 1. Fetch next vehicle
|
||||
vehicle = await self.fetch_next_vehicle(session)
|
||||
if not vehicle:
|
||||
logger.info("Nincs feldolgozandó jármű")
|
||||
return False
|
||||
|
||||
logger.info(f"Feldolgozás: {vehicle['make']} {vehicle['marketing_name']} "
|
||||
f"(ID: {vehicle['id']})")
|
||||
|
||||
# 2. Search and extract links
|
||||
links = await self.search_and_extract_links(vehicle)
|
||||
|
||||
if not links:
|
||||
# No links found
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'research_failed_empty',
|
||||
'No links found on UltimateSpecs'
|
||||
)
|
||||
logger.warning(f"Nem található link: {vehicle['make']} {vehicle['marketing_name']}")
|
||||
return True
|
||||
|
||||
# 3. Save links to queue
|
||||
saved_count = await self.save_links_to_queue(session, links, vehicle)
|
||||
|
||||
# 4. Update vehicle status
|
||||
if saved_count > 0:
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'spider_dispatched',
|
||||
f'{saved_count} links added to queue'
|
||||
)
|
||||
logger.info(f"{saved_count} link mentve a queue-ba")
|
||||
else:
|
||||
# All links already existed
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'spider_dispatched',
|
||||
'All links already in queue'
|
||||
)
|
||||
logger.info("Minden link már szerepel a queue-ban")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a jármű feldolgozása során: {e}")
|
||||
# Try to update status with error
|
||||
try:
|
||||
if 'vehicle' in locals():
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'research_failed_network',
|
||||
str(e)[:500]
|
||||
)
|
||||
except:
|
||||
pass
|
||||
return True
|
||||
|
||||
async def run(self):
|
||||
"""
|
||||
Main loop of the spider.
|
||||
"""
|
||||
logger.info("UltimateSpecs R0 Spider indítása...")
|
||||
|
||||
try:
|
||||
await self.init_browser()
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
# Process a single vehicle
|
||||
processed = await self.process_single_vehicle()
|
||||
|
||||
if not processed:
|
||||
# No vehicles to process, wait longer
|
||||
await asyncio.sleep(SLEEP_INTERVAL * 2)
|
||||
else:
|
||||
# Wait before next iteration
|
||||
await asyncio.sleep(SLEEP_INTERVAL)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Keyboard interrupt, leállítás...")
|
||||
self.running = False
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a fő ciklusban: {e}")
|
||||
await asyncio.sleep(SLEEP_INTERVAL)
|
||||
|
||||
finally:
|
||||
await self.close_browser()
|
||||
logger.info("UltimateSpecs R0 Spider leállt")
|
||||
|
||||
    def stop(self):
        """Request a graceful stop: run() exits after the current iteration."""
        self.running = False
        logger.info("Leállítás kérése érkezett")
|
||||
|
||||
|
||||
async def main():
    """Entry point: install signal handlers and run the spider."""
    spider = UltimateSpecsSpider()

    def _handle_signal(signum, frame):
        logger.info(f"Signal {signum} received, stopping...")
        spider.stop()

    # Graceful shutdown on Ctrl+C and on docker stop.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _handle_signal)

    try:
        await spider.run()
    except Exception as e:
        logger.error(f"Váratlan hiba: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,355 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r1_scraper
|
||||
Producer-Consumer lánc második eleme (A Nyers Letöltő).
|
||||
Kivesz egy feldolgozandó linket a vehicle.auto_data_crawler_queue táblából (level='engine'),
|
||||
letölti a HTML tartalmat Playwright böngészővel, kinyeri a specifikációkat JS parserrel,
|
||||
és elmenti a vehicle.external_reference_library táblába.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Browser, BrowserContext, TimeoutError as PlaywrightTimeoutError
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal, ensure_models_loaded
|
||||
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
|
||||
from app.models.vehicle.external_reference import ExternalReferenceLibrary
|
||||
|
||||
# Logging configuration: every record is tagged with the worker name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R1-SCRAPER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R1-SCRAPER")

# Configuration constants.
# NOTE(review): random.uniform() is evaluated ONCE at import time, so this is
# a single fixed delay for the process lifetime, not a fresh random value per
# wait — confirm whether per-wait jitter was intended.
SLEEP_INTERVAL = random.uniform(3, 6)  # one random delay from the 3-6 s range
MAX_RETRIES = 3  # attempts per URL in scrape_with_retry()
# Page-title fragments that indicate a Cloudflare challenge page.
CLOUDFLARE_KEYWORDS = ["just a moment", "cloudflare", "checking your browser"]
|
||||
|
||||
|
||||
class UltimateSpecsScraper:
    """R1 worker: pops pending 'engine' links from the crawler queue,
    renders them with Playwright and stores the extracted specifications
    in vehicle.external_reference_library."""

    def __init__(self):
        # Main-loop flag; cleared by stop() / the signal handler.
        self.running = True
        # Playwright driver/browser/context are created by init_browser().
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # Fixed desktop-Chrome UA so requests look like a normal session.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
|
||||
|
||||
async def init_browser(self):
|
||||
"""Playwright böngésző inicializálása"""
|
||||
try:
|
||||
self.playwright = await async_playwright().start()
|
||||
self.browser = await self.playwright.chromium.launch(
|
||||
headless=True,
|
||||
args=[
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--disable-dev-shm-usage',
|
||||
'--no-sandbox',
|
||||
]
|
||||
)
|
||||
self.context = await self.browser.new_context(
|
||||
user_agent=self.user_agent,
|
||||
viewport={'width': 1920, 'height': 1080},
|
||||
java_script_enabled=True
|
||||
)
|
||||
logger.info("Playwright böngésző inicializálva")
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a böngésző inicializálásakor: {e}")
|
||||
raise
|
||||
|
||||
async def close_browser(self):
|
||||
"""Playwright böngésző lezárása"""
|
||||
if self.context:
|
||||
await self.context.close()
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
logger.info("Playwright böngésző lezárva")
|
||||
|
||||
async def fetch_next_queue_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Kivesz egy feldolgozandó linket a vehicle.auto_data_crawler_queue táblából.
|
||||
"""
|
||||
query = text("""
|
||||
SELECT id, url, category, parent_id
|
||||
FROM vehicle.auto_data_crawler_queue
|
||||
WHERE level = 'engine' AND status = 'pending'
|
||||
FOR UPDATE SKIP LOCKED LIMIT 1
|
||||
""")
|
||||
|
||||
try:
|
||||
result = await session.execute(query)
|
||||
row = result.fetchone()
|
||||
if row:
|
||||
return {
|
||||
"id": row[0],
|
||||
"url": row[1],
|
||||
"category": row[2],
|
||||
"parent_id": row[3]
|
||||
}
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a queue lekérdezésekor: {e}")
|
||||
return None
|
||||
|
||||
async def scrape_with_retry(self, url: str, max_retries: int = MAX_RETRIES) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Playwright böngészővel letölti a HTML tartalmat, retry logikával.
|
||||
"""
|
||||
for attempt in range(1, max_retries + 1):
|
||||
try:
|
||||
logger.info(f"Próbálkozás {attempt}/{max_retries}: {url}")
|
||||
page = await self.context.new_page()
|
||||
|
||||
# Navigáció
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
|
||||
# Várjunk a táblázatokra
|
||||
try:
|
||||
await page.wait_for_selector('table', timeout=5000)
|
||||
except PlaywrightTimeoutError:
|
||||
logger.warning("Nem található táblázat 5 másodpercen belül, de folytatjuk")
|
||||
|
||||
# Ellenőrizzük Cloudflare blokkolást
|
||||
title = await page.title()
|
||||
title_lower = title.lower()
|
||||
if any(keyword in title_lower for keyword in CLOUDFLARE_KEYWORDS):
|
||||
raise Exception(f"Cloudflare blokkolás észlelve: {title}")
|
||||
|
||||
# JS parser futtatása
|
||||
specs = await page.evaluate("""() => {
|
||||
let results = {};
|
||||
// 1. ÖSSZES táblázat letapogatása
|
||||
document.querySelectorAll('table').forEach(table => {
|
||||
table.querySelectorAll('tr').forEach(row => {
|
||||
let t = row.querySelector('.table_specs_title, .td_title, td:first-child, th:first-child');
|
||||
let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
|
||||
if(t && v) {
|
||||
let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
|
||||
let val = v.innerText.trim();
|
||||
if(k && val && val !== "-") { results[k] = val; }
|
||||
}
|
||||
});
|
||||
});
|
||||
// 2. Extra szekciók és dimenziók mentése
|
||||
const sections = {};
|
||||
document.querySelectorAll('h2, h3, h4, .section-title, .specs-header').forEach(header => {
|
||||
const title = header.innerText.trim();
|
||||
if (title && title.length > 0) {
|
||||
let nextElement = header.nextElementSibling;
|
||||
let sectionData = {};
|
||||
for (let i = 0; i < 5 && nextElement; i++) {
|
||||
if (nextElement.tagName === 'TABLE') {
|
||||
nextElement.querySelectorAll('tr').forEach(row => {
|
||||
let t = row.querySelector('td:first-child, th:first-child');
|
||||
let v = row.querySelector('td:last-child');
|
||||
if(t && v) {
|
||||
let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
|
||||
let val = v.innerText.trim();
|
||||
if(k && val && val !== "-") {
|
||||
sectionData[k] = val;
|
||||
results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
nextElement = nextElement.nextElementSibling;
|
||||
}
|
||||
sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
|
||||
}
|
||||
});
|
||||
results['_sections'] = sections;
|
||||
return results;
|
||||
}""")
|
||||
|
||||
await page.close()
|
||||
|
||||
if specs and len(specs) > 0:
|
||||
logger.info(f"Sikeres letöltés, {len(specs)} specifikáció kinyerve")
|
||||
return specs
|
||||
else:
|
||||
logger.warning("Üres specifikációk, újrapróbálkozás")
|
||||
raise Exception("Üres specifikációk")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a {attempt}. próbálkozásnál: {e}")
|
||||
if attempt < max_retries:
|
||||
backoff = random.uniform(2, 5)
|
||||
logger.info(f"Várakozás {backoff:.1f} másodpercet...")
|
||||
await asyncio.sleep(backoff)
|
||||
else:
|
||||
logger.error(f"Összes próbálkozás sikertelen: {e}")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
async def process_queue_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Feldolgoz egy queue tételt: letölti, kinyeri, elmenti.
|
||||
"""
|
||||
queue_id = item["id"]
|
||||
url = item["url"]
|
||||
category = item["category"]
|
||||
|
||||
try:
|
||||
# 1. Letöltés
|
||||
specs = await self.scrape_with_retry(url)
|
||||
|
||||
if not specs:
|
||||
# Hiba esetén frissítjük a queue-t
|
||||
await session.execute(
|
||||
text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue
|
||||
SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
|
||||
WHERE id = :id
|
||||
"""),
|
||||
{"error_msg": "Sikertelen letöltés (üres specifikációk vagy Cloudflare)", "id": queue_id}
|
||||
)
|
||||
await session.commit()
|
||||
logger.error(f"Queue {queue_id} sikertelen, státusz: error")
|
||||
return False
|
||||
|
||||
# 2. Új rekord létrehozása az external_reference_library táblában (nyers SQL)
|
||||
# A specifications dict-et JSON stringgé alakítjuk
|
||||
import json
|
||||
specs_json = json.dumps(specs)
|
||||
insert_query = text("""
|
||||
INSERT INTO vehicle.external_reference_library
|
||||
(source_name, source_url, category, specifications, pipeline_status, created_at, last_scraped_at)
|
||||
VALUES (:source_name, :source_url, :category, CAST(:specifications AS jsonb), :pipeline_status, NOW(), NOW())
|
||||
RETURNING id
|
||||
""")
|
||||
result = await session.execute(
|
||||
insert_query,
|
||||
{
|
||||
"source_name": "ultimatespecs",
|
||||
"source_url": url,
|
||||
"category": category,
|
||||
"specifications": specs_json,
|
||||
"pipeline_status": "pending_enrich"
|
||||
}
|
||||
)
|
||||
new_id = result.scalar()
|
||||
|
||||
# 3. Queue tétel frissítése completed-re
|
||||
await session.execute(
|
||||
text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue
|
||||
SET status = 'completed', updated_at = NOW()
|
||||
WHERE id = :id
|
||||
"""),
|
||||
{"id": queue_id}
|
||||
)
|
||||
|
||||
await session.commit()
|
||||
logger.info(f"Queue {queue_id} sikeresen feldolgozva, library ID: {new_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a queue {queue_id} feldolgozásakor: {e}")
|
||||
await session.rollback()
|
||||
|
||||
# Hiba esetén error státusz
|
||||
try:
|
||||
await session.execute(
|
||||
text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue
|
||||
SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
|
||||
WHERE id = :id
|
||||
"""),
|
||||
{"error_msg": str(e)[:500], "id": queue_id}
|
||||
)
|
||||
await session.commit()
|
||||
except Exception as update_err:
|
||||
logger.error(f"Hiba a queue frissítésekor: {update_err}")
|
||||
|
||||
return False
|
||||
|
||||
    async def run_once(self):
        """Run a single processing cycle.

        Opens a fresh session/transaction, pops one queue item and processes
        it. Returns True when an item was processed successfully, False when
        there was no work or an error occurred.
        """
        # Make sure the ORM models are registered before touching the DB.
        ensure_models_loaded()

        async with AsyncSessionLocal() as session:
            try:
                # One transaction per cycle; the SKIP LOCKED row stays locked
                # until the transaction ends.
                # NOTE(review): process_queue_item() commits inside this
                # `session.begin()` block — verify the nesting is intended.
                async with session.begin():
                    item = await self.fetch_next_queue_item(session)
                    if not item:
                        logger.info("Nincs feldolgozandó queue tétel")
                        return False

                    logger.info(f"Feldolgozás: {item['url']}")
                    success = await self.process_queue_item(session, item)
                    return success

            except Exception as e:
                logger.error(f"Hiba a run_once-ban: {e}")
                return False
|
||||
|
||||
async def run_loop(self):
|
||||
"""Fő ciklus: végtelen while, 3-6 mp várakozással"""
|
||||
await self.init_browser()
|
||||
|
||||
try:
|
||||
while self.running:
|
||||
success = await self.run_once()
|
||||
|
||||
if not success:
|
||||
# Ha nincs munka, várjunk egy kicsit
|
||||
sleep_time = SLEEP_INTERVAL
|
||||
logger.debug(f"Várakozás {sleep_time:.1f} másodpercet...")
|
||||
await asyncio.sleep(sleep_time)
|
||||
else:
|
||||
# Sikeres feldolgozás után rövid várakozás
|
||||
await asyncio.sleep(random.uniform(1, 2))
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Keyboard interrupt, leállítás...")
|
||||
except Exception as e:
|
||||
logger.error(f"Váratlan hiba a fő ciklusban: {e}")
|
||||
finally:
|
||||
await self.close_browser()
|
||||
|
||||
    def stop(self):
        """Request a graceful shutdown: run_loop() exits after the current item."""
        self.running = False
        logger.info("Leállítási jelzés küldve")
|
||||
|
||||
|
||||
async def main():
    """Entry point: install signal handlers and run the scraper loop."""
    scraper = UltimateSpecsScraper()

    def _handle_signal(signum, frame):
        scraper.stop()

    # Graceful shutdown on Ctrl+C and on docker stop.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _handle_signal)

    try:
        await scraper.run_loop()
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r2_enricher
|
||||
Producer-Consumer lánc harmadik eleme (Az Elemző). Offline adattisztítást és strukturálást végez.
|
||||
Kivesz egy feldolgozandó sort a vehicle.external_reference_library táblából (pipeline_status='pending_enrich'),
|
||||
hozzácsatolja a vehicle.auto_data_crawler_queue adatait, kinyeri a standard értékeket a nyers JSON-ből,
|
||||
és strukturált JSON-be csomagolja (standardized + _raw).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Logging configuration: every record is tagged with the worker name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R2-ENRICHER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R2-ENRICHER")

# Configuration.
# NOTE(review): random.uniform() runs once at import time, so this is a single
# fixed delay for the process lifetime, not a fresh 1-3 s value per wait.
SLEEP_INTERVAL = random.uniform(1, 3)  # idle wait between polls (seconds)

# Fuzzy mapping: target metric name -> raw-spec key substrings to search for.
# Keywords are tried in order; the first key containing one wins
# (see extract_fuzzy_metric()).
FUZZY_MAPPING = {
    "power_kw": ["horsepower", "total electric power", "engine power", "maximum power", "power"],
    "engine_capacity": ["engine displacement", "displacement", "capacity", "cm3", "cu-in"],
    "torque_nm": ["maximum torque", "total electric torque", "torque"],
    "max_speed": ["top speed", "maximum speed"],
    "curb_weight": ["curb weight", "weight"],
    "wheelbase": ["wheelbase"],
    "seats": ["num. of seats", "seats"]
}

# Raw-spec key substrings used to locate free-text fields.
TEXT_FIELD_KEYWORDS = {
    "fuel_type": ["fuel type", "fuel", "engine fuel", "fuel system"],
    "transmission_type": ["transmission", "gear", "gearbox"],
    "drive_type": ["drive type", "drive", "drivetrain"],
    "body_type": ["body type", "body", "car body"]
}
|
||||
|
||||
|
||||
class UltimateSpecsEnricher:
    """R2 worker: turns raw scraped spec dicts into a standardized structure
    and advances library rows from 'pending_enrich' to 'pending_match'."""

    def __init__(self):
        # Main-loop flag; cleared by stop() / the signal handler.
        self.running = True
|
||||
|
||||
async def fetch_next_library_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Kivesz egy feldolgozandó sort a Library-ből.
|
||||
"""
|
||||
query = text("""
|
||||
SELECT id, specifications, make, model, year_from
|
||||
FROM vehicle.external_reference_library
|
||||
WHERE pipeline_status = 'pending_enrich'
|
||||
FOR UPDATE SKIP LOCKED LIMIT 1
|
||||
""")
|
||||
|
||||
try:
|
||||
result = await session.execute(query)
|
||||
row = result.fetchone()
|
||||
if row:
|
||||
return {
|
||||
"id": row[0],
|
||||
"specifications": row[1] if isinstance(row[1], dict) else {},
|
||||
"make": row[2],
|
||||
"model": row[3],
|
||||
"year_from": row[4]
|
||||
}
|
||||
return None
|
||||
except SQLAlchemyError as e:
|
||||
logger.error(f"SQL hiba a lekérdezés során: {e}")
|
||||
return None
|
||||
|
||||
def extract_fuzzy_metric(self, specifications: Dict[str, Any], target_key: str, keywords: List[str]) -> Optional[float]:
|
||||
"""
|
||||
Keres a specifications szótárban a megadott kulcsszavak alapján, és számot próbál kinyerni.
|
||||
"""
|
||||
if not specifications:
|
||||
return None
|
||||
|
||||
# Először próbáljuk meg a kulcsokat (case-insensitive)
|
||||
spec_lower = {k.lower(): v for k, v in specifications.items()}
|
||||
|
||||
for keyword in keywords:
|
||||
for key, value in spec_lower.items():
|
||||
if keyword.lower() in key:
|
||||
# Ha a érték szám vagy string, próbáljuk kinyerni a számot
|
||||
num = self.clean_number(value)
|
||||
if num is not None:
|
||||
# Ha a kulcs tartalmazza a "hp" vagy "horsepower" és a cél kW, konvertáljuk
|
||||
if target_key == "power_kw" and ("hp" in key or "horsepower" in key):
|
||||
# hp -> kW konverzió (1 hp = 0.7457 kW)
|
||||
num = num * 0.7457
|
||||
return num
|
||||
return None
|
||||
|
||||
def clean_number(self, value: Any) -> Optional[float]:
|
||||
"""
|
||||
Kinyeri a számot egy stringből vagy más típusból.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
# Távolítsuk el a nem szám karaktereket, kivéve pont és mínusz
|
||||
# Keresünk mintákat mint "120 kW" vagy "120kW"
|
||||
match = re.search(r'([-+]?\d*\.?\d+)\s*(?:kW|hp|cc|Nm|kg|km/h|mph)?', value, re.IGNORECASE)
|
||||
if match:
|
||||
try:
|
||||
return float(match.group(1))
|
||||
except ValueError:
|
||||
pass
|
||||
# Ha nincs specifikus egység, próbáljunk meg bármilyen számot kinyerni
|
||||
matches = re.findall(r'[-+]?\d*\.?\d+', value)
|
||||
if matches:
|
||||
try:
|
||||
return float(matches[0])
|
||||
except ValueError:
|
||||
pass
|
||||
return None
|
||||
|
||||
def extract_text_field(self, specifications: Dict[str, Any], keywords: List[str]) -> Optional[str]:
|
||||
"""
|
||||
Kinyer egy szöveges mezőt a specifications-ből a kulcsszavak alapján.
|
||||
"""
|
||||
if not specifications:
|
||||
return None
|
||||
|
||||
spec_lower = {k.lower(): v for k, v in specifications.items()}
|
||||
|
||||
for keyword in keywords:
|
||||
for key, value in spec_lower.items():
|
||||
if keyword.lower() in key:
|
||||
if isinstance(value, str):
|
||||
return value.strip()
|
||||
elif isinstance(value, (int, float)):
|
||||
return str(value)
|
||||
return None
|
||||
|
||||
def enrich_specifications(self, raw_specs: Dict[str, Any], make: str, model: str, year_from: int) -> Dict[str, Any]:
|
||||
"""
|
||||
Fő strukturáló függvény: kinyeri a standard értékeket és létrehozza az új JSON struktúrát.
|
||||
"""
|
||||
standardized = {}
|
||||
|
||||
# Metrikák kinyerése
|
||||
for target_key, keywords in FUZZY_MAPPING.items():
|
||||
value = self.extract_fuzzy_metric(raw_specs, target_key, keywords)
|
||||
standardized[target_key] = value
|
||||
|
||||
# Szöveges mezők kinyerése
|
||||
for field, keywords in TEXT_FIELD_KEYWORDS.items():
|
||||
value = self.extract_text_field(raw_specs, keywords)
|
||||
standardized[field] = value
|
||||
|
||||
# Készítsük az új JSON struktúrát
|
||||
updated_specifications = {
|
||||
"standardized": standardized,
|
||||
"_raw": raw_specs # Az eredeti R1 adat érintetlenül megmarad!
|
||||
}
|
||||
|
||||
return updated_specifications
|
||||
|
||||
    async def process_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
        """Enrich one library row and persist the result.

        Extracts standardized values from the raw specs, mirrors power/engine
        capacity into the physical columns, and advances pipeline_status to
        'pending_match'. Returns True on success, False after rollback on any
        error.

        NOTE(review): commits/rolls back on *session* directly although
        run_once() wraps the call in `session.begin()` — confirm the nested
        transaction handling is intentional.
        """
        try:
            logger.info(f"Feldolgozás: ID={item['id']}, {item['make']} {item['model']} ({item['year_from']})")

            # Extract and structure the data (standardized + _raw).
            updated_specs = self.enrich_specifications(
                item['specifications'],
                item['make'],
                item['model'],
                item['year_from']
            )

            # Values mirrored into the physical columns.
            power_kw = updated_specs['standardized'].get('power_kw')
            engine_cc = updated_specs['standardized'].get('engine_capacity')

            # Execute the UPDATE.
            update_query = text("""
                UPDATE vehicle.external_reference_library
                SET power_kw = :power_kw,
                    engine_cc = :engine_cc,
                    make = :make,
                    model = :model,
                    year_from = :year_from,
                    specifications = :updated_specifications,
                    pipeline_status = 'pending_match'
                WHERE id = :id
            """)

            params = {
                # int() truncates toward zero — fractional kW/cc are floored.
                "power_kw": int(power_kw) if power_kw is not None else None,
                "engine_cc": int(engine_cc) if engine_cc is not None else None,
                "make": item['make'],
                "model": item['model'],
                "year_from": item['year_from'],
                "updated_specifications": json.dumps(updated_specs),
                "id": item['id']
            }

            await session.execute(update_query, params)
            await session.commit()

            logger.info(f"Sikeres frissítés: ID={item['id']}, power_kw={power_kw}, engine_cc={engine_cc}")
            return True

        except Exception as e:
            logger.error(f"Hiba a feldolgozás során ID={item['id']}: {e}")
            await session.rollback()
            return False
|
||||
|
||||
    async def run_once(self):
        """Run a single enrichment cycle: pop one row and process it.

        Returns True when an item was processed successfully, False when
        there was no work or a database error occurred.
        """
        async with AsyncSessionLocal() as session:
            try:
                # One transaction per cycle; the SKIP LOCKED row stays locked
                # until the transaction ends.
                # NOTE(review): process_item() commits/rolls back inside this
                # `session.begin()` block — verify the nesting is intended.
                async with session.begin():
                    item = await self.fetch_next_library_item(session)
                    if not item:
                        logger.debug("Nincs feldolgozandó elem")
                        return False

                    success = await self.process_item(session, item)
                    return success
            except SQLAlchemyError as e:
                logger.error(f"Adatbázis hiba: {e}")
                return False
|
||||
|
||||
async def run_loop(self):
|
||||
"""
|
||||
Fő végtelen ciklus.
|
||||
"""
|
||||
logger.info("R2 Enricher indítva...")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
success = await self.run_once()
|
||||
if not success:
|
||||
# Ha nincs feldolgozandó elem, várjunk egy kicsit
|
||||
await asyncio.sleep(SLEEP_INTERVAL)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Keyboard interrupt, leállítás...")
|
||||
self.running = False
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Váratlan hiba a ciklusban: {e}")
|
||||
await asyncio.sleep(SLEEP_INTERVAL)
|
||||
|
||||
logger.info("R2 Enricher leállt")
|
||||
|
||||
    def stop(self):
        """Request a graceful shutdown: run_loop() exits after the current item."""
        self.running = False
|
||||
|
||||
|
||||
async def main():
    """Entry point: install signal handlers and run the enricher loop."""
    enricher = UltimateSpecsEnricher()

    def _handle_signal(signum, frame):
        logger.info(f"Signal {signum} fogadva, leállítás...")
        enricher.stop()

    # Graceful shutdown on Ctrl+C and on docker stop.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _handle_signal)

    try:
        await enricher.run_loop()
    except asyncio.CancelledError:
        logger.info("Task cancelled")
    finally:
        logger.info("R2 Enricher befejezte a munkát.")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r3_finalizer
|
||||
Producer-Consumer lánc negyedik, utolsó eleme (Az Összevezető).
|
||||
Offline dolgozik egy végtelen while ciklusban (1-3 mp delay), és a meglévő adatbázis-táblákat szinkronizálja.
|
||||
|
||||
1. Lekérdezés (JOIN a Queue-val): Kivesz egy `pending_match` sort a Library-ből, és a Queue-ból lekéri az eredeti `parent_id`-t és a link nevét.
|
||||
2. Szülő (Base VMD) ellenőrzése: Lekérdezi az eredeti szülő rekordot a VMD táblából a parent_id alapján.
|
||||
3. Összevezetés (UPDATE vagy INSERT): A letisztított adatok a lib.specifications['standardized'] dict-ből jönnek.
|
||||
- A ÁG: Ha a szülő status értéke IN ('pending', 'manual_review_needed'): UPDATE a szülő (VMD) rekordon
|
||||
- B ÁG: Ha a szülő status MÁR NEM 'pending': INSERT új variációként a VMD táblába
|
||||
4. Library lezárása: Frissíti a Library táblát pipeline_status = 'completed', matched_vmd_id beállítása.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Logging configuration: every record is tagged with the worker name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R3-FINALIZER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R3-FINALIZER")

# Configuration.
# NOTE(review): random.uniform() runs once at import time, so this is a single
# fixed delay for the process lifetime, not a fresh 1-3 s value per wait.
SLEEP_INTERVAL = random.uniform(1, 3)  # idle wait between polls (seconds)
|
||||
|
||||
|
||||
class UltimateSpecsFinalizer:
    """R3 worker: merges enriched library rows ('pending_match') back into
    the VMD table, either updating the parent or inserting a variant."""

    def __init__(self):
        # Main-loop flag; cleared by stop() / the signal handler.
        self.running = True
|
||||
|
||||
async def fetch_pending_match(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Kivesz egy `pending_match` sort a Library-ből, JOIN-olva a Queue-val.
|
||||
FOR UPDATE OF lib SKIP LOCKED LIMIT 1
|
||||
"""
|
||||
query = text("""
|
||||
SELECT lib.id, lib.source_url, lib.make, lib.model, lib.year_from,
|
||||
lib.power_kw, lib.engine_cc, lib.specifications, lib.category,
|
||||
q.parent_id, q.name AS variant_name
|
||||
FROM vehicle.external_reference_library lib
|
||||
JOIN vehicle.auto_data_crawler_queue q ON lib.source_url = q.url
|
||||
WHERE lib.pipeline_status = 'pending_match'
|
||||
FOR UPDATE OF lib SKIP LOCKED LIMIT 1
|
||||
""")
|
||||
result = await session.execute(query)
|
||||
row = result.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return {
|
||||
"lib_id": row[0],
|
||||
"source_url": row[1],
|
||||
"make": row[2],
|
||||
"model": row[3],
|
||||
"year_from": row[4],
|
||||
"power_kw": row[5],
|
||||
"engine_cc": row[6],
|
||||
"specifications": row[7] if row[7] else {},
|
||||
"category": row[8],
|
||||
"parent_id": row[9],
|
||||
"variant_name": row[10]
|
||||
}
|
||||
|
||||
async def get_parent_vmd(self, session: AsyncSession, parent_id: int) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Lekérdezi az eredeti szülő rekordot a VMD táblából a parent_id alapján.
|
||||
FOR UPDATE (zárolás a konkurrens feldolgozás elkerülésére)
|
||||
"""
|
||||
query = text("""
|
||||
SELECT id, status FROM vehicle.vehicle_model_definitions
|
||||
WHERE id = :parent_id FOR UPDATE
|
||||
""")
|
||||
result = await session.execute(query, {"parent_id": parent_id})
|
||||
row = result.fetchone()
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return {
|
||||
"id": row[0],
|
||||
"status": row[1]
|
||||
}
|
||||
|
||||
def extract_standardized_data(self, specifications: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Kinyeri a standardizált adatokat a specifications['standardized'] dict-ből.
|
||||
Csonkolja a szöveges mezőket a VMD tábla korlátaihoz (50 karakter).
|
||||
"""
|
||||
standardized = specifications.get('standardized', {})
|
||||
|
||||
# Alapvető numerikus mezők
|
||||
extracted = {
|
||||
"power_kw": standardized.get("power_kw"),
|
||||
"engine_capacity": standardized.get("engine_capacity"),
|
||||
"torque_nm": standardized.get("torque_nm"),
|
||||
"max_speed": standardized.get("max_speed"),
|
||||
"curb_weight": standardized.get("curb_weight"),
|
||||
"wheelbase": standardized.get("wheelbase"),
|
||||
"seats": standardized.get("seats"),
|
||||
"fuel_type": standardized.get("fuel_type"),
|
||||
"transmission_type": standardized.get("transmission_type"),
|
||||
"drive_type": standardized.get("drive_type"),
|
||||
"body_type": standardized.get("body_type"),
|
||||
}
|
||||
|
||||
# Csonkolás a VMD mezőhosszokhoz
|
||||
def truncate(value: Any, max_len: int = 50) -> Any:
|
||||
if isinstance(value, str) and len(value) > max_len:
|
||||
return value[:max_len]
|
||||
return value
|
||||
|
||||
# Alkalmazza a csonkolást a szöveges mezőkre
|
||||
for field in ["fuel_type", "transmission_type", "drive_type", "body_type"]:
|
||||
if extracted.get(field):
|
||||
extracted[field] = truncate(extracted[field], 50)
|
||||
|
||||
# Tisztítás: None értékek eltávolítása
|
||||
return {k: v for k, v in extracted.items() if v is not None}
|
||||
|
||||
    async def update_parent_vmd(self, session: AsyncSession, parent_id: int,
                                lib_data: Dict[str, Any], standardized: Dict[str, Any]) -> int:
        """Branch A: update the parent VMD record with the standardized data.

        Sets the VMD status to 'awaiting_ai_synthesis' and returns the parent
        id (used as matched_vmd_id by the caller).

        NOTE(review): `standardized.get(...) or lib_data.get(...)` treats 0 as
        "missing" and falls through to the library value — confirm a literal 0
        can never be a valid metric here.
        """
        # Candidate column values; None entries are filtered below so the
        # UPDATE only touches columns we actually have data for.
        update_fields = {
            "power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
            "status": "awaiting_ai_synthesis",
            # NOTE(review): datetime.utcnow() is naive (and deprecated in
            # Python 3.12+) — confirm the column expects naive UTC timestamps.
            "updated_at": datetime.utcnow(),
            "source": "ultimatespecs",
            "priority_score": 30,
        }

        # Drop unset values so existing column data is not overwritten.
        update_fields = {k: v for k, v in update_fields.items() if v is not None}

        # Build the SET clause dynamically. Safe from injection here because
        # the column names come from the hard-coded dict above, never from
        # external input; values are bound parameters.
        set_clause = ", ".join([f"{k} = :{k}" for k in update_fields.keys()])

        query = text(f"""
            UPDATE vehicle.vehicle_model_definitions
            SET {set_clause}
            WHERE id = :parent_id
            RETURNING id
        """)

        params = {"parent_id": parent_id, **update_fields}
        result = await session.execute(query, params)
        updated_id = result.scalar()

        logger.info(f"UPDATE parent VMD {parent_id} with {len(update_fields)} fields")
        return updated_id
|
||||
|
||||
async def insert_variant_vmd(self, session: AsyncSession, lib_data: Dict[str, Any],
|
||||
standardized: Dict[str, Any], variant_name: str) -> int:
|
||||
"""
|
||||
B ÁG: Beszúr egy új variációt a VMD táblába.
|
||||
make = lib.make, marketing_name = variant_name, year_from = lib.year_from.
|
||||
status = 'awaiting_ai_synthesis', source = 'ultimatespecs', priority_score = 30.
|
||||
Visszaadja az új ID-t (matched_vmd_id).
|
||||
Ha már létezik a rekord (duplicate key), visszaadja a meglévő ID-t.
|
||||
"""
|
||||
# Build insert data
|
||||
insert_data = {
|
||||
"make": lib_data["make"],
|
||||
"marketing_name": variant_name,
|
||||
"official_marketing_name": variant_name,
|
||||
"year_from": lib_data["year_from"],
|
||||
"power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
|
||||
"engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
|
||||
"torque_nm": standardized.get("torque_nm"),
|
||||
"max_speed": standardized.get("max_speed"),
|
||||
"curb_weight": standardized.get("curb_weight"),
|
||||
"wheelbase": standardized.get("wheelbase"),
|
||||
"seats": standardized.get("seats"),
|
||||
"fuel_type": standardized.get("fuel_type"),
|
||||
"transmission_type": standardized.get("transmission_type"),
|
||||
"drive_type": standardized.get("drive_type"),
|
||||
"body_type": standardized.get("body_type"),
|
||||
"status": "awaiting_ai_synthesis",
|
||||
"vehicle_class": lib_data.get("category"),
|
||||
"source": "ultimatespecs",
|
||||
"priority_score": 30,
|
||||
"created_at": datetime.utcnow(),
|
||||
"updated_at": datetime.utcnow(),
|
||||
"market": "EU",
|
||||
"normalized_name": f"{lib_data['make']} {variant_name}",
|
||||
"technical_code": "UNKNOWN",
|
||||
"variant_code": "UNKNOWN",
|
||||
"version_code": "UNKNOWN",
|
||||
"specifications": json.dumps({}), # Üres JSON, mert NOT NULL
|
||||
"raw_api_data": json.dumps({}), # Üres JSON
|
||||
"research_metadata": json.dumps({}), # Üres JSON
|
||||
"raw_search_context": "", # Üres string
|
||||
}
|
||||
|
||||
# Remove None values
|
||||
insert_data = {k: v for k, v in insert_data.items() if v is not None}
|
||||
|
||||
# Build columns and values
|
||||
columns = ", ".join(insert_data.keys())
|
||||
placeholders = ", ".join([f":{k}" for k in insert_data.keys()])
|
||||
|
||||
try:
|
||||
# Próbáljuk meg beszúrni
|
||||
query = text(f"""
|
||||
INSERT INTO vehicle.vehicle_model_definitions ({columns})
|
||||
VALUES ({placeholders})
|
||||
RETURNING id
|
||||
""")
|
||||
|
||||
result = await session.execute(query, insert_data)
|
||||
new_id = result.scalar()
|
||||
|
||||
logger.info(f"INSERT new variant VMD {new_id} for {lib_data['make']} {variant_name}")
|
||||
return new_id
|
||||
|
||||
except IntegrityError as e:
|
||||
# Duplicate key violation - rollback és új lekérdezés
|
||||
logger.warning(f"Duplicate key violation for {lib_data['make']} {variant_name}: {e}. Rolling back and looking for existing record...")
|
||||
|
||||
# Rollback a megszakított tranzakciót
|
||||
await session.rollback()
|
||||
|
||||
# Keresés a meglévő rekordra új tranzakcióban
|
||||
find_query = text("""
|
||||
SELECT id FROM vehicle.vehicle_model_definitions
|
||||
WHERE make = :make
|
||||
AND marketing_name = :marketing_name
|
||||
AND year_from = :year_from
|
||||
LIMIT 1
|
||||
""")
|
||||
|
||||
find_params = {
|
||||
"make": lib_data["make"],
|
||||
"marketing_name": variant_name,
|
||||
"year_from": lib_data["year_from"]
|
||||
}
|
||||
|
||||
result = await session.execute(find_query, find_params)
|
||||
existing_id = result.scalar()
|
||||
|
||||
if existing_id:
|
||||
logger.info(f"Found existing VMD {existing_id} for {lib_data['make']} {variant_name}")
|
||||
return existing_id
|
||||
else:
|
||||
# Ha nem találjuk, dobjuk tovább a hibát
|
||||
logger.error(f"Duplicate key but could not find existing record for {lib_data['make']} {variant_name}")
|
||||
raise
|
||||
|
||||
async def close_library_entry(self, session: AsyncSession, lib_id: int, matched_vmd_id: int):
|
||||
"""
|
||||
Frissíti a Library táblát: pipeline_status = 'completed', matched_vmd_id beállítása.
|
||||
"""
|
||||
query = text("""
|
||||
UPDATE vehicle.external_reference_library
|
||||
SET pipeline_status = 'completed',
|
||||
matched_vmd_id = :matched_vmd_id
|
||||
WHERE id = :lib_id
|
||||
""")
|
||||
await session.execute(query, {"lib_id": lib_id, "matched_vmd_id": matched_vmd_id})
|
||||
logger.info(f"Library {lib_id} closed with matched_vmd_id {matched_vmd_id}")
|
||||
|
||||
async def process_one(self):
|
||||
"""
|
||||
Feldolgoz egyetlen pending_match rekordot.
|
||||
"""
|
||||
async with AsyncSessionLocal() as session:
|
||||
try:
|
||||
# 1. Lekérdezés a Library-ből
|
||||
lib_data = await self.fetch_pending_match(session)
|
||||
if not lib_data:
|
||||
return False
|
||||
|
||||
logger.info(f"Processing library ID {lib_data['lib_id']} for {lib_data['make']} {lib_data['model']}")
|
||||
|
||||
# 2. Szülő VMD ellenőrzése
|
||||
parent_vmd = None
|
||||
if lib_data['parent_id']:
|
||||
parent_vmd = await self.get_parent_vmd(session, lib_data['parent_id'])
|
||||
|
||||
# 3. Standardizált adatok kinyerése
|
||||
standardized = self.extract_standardized_data(lib_data['specifications'])
|
||||
|
||||
# 4. Döntés: UPDATE vagy INSERT
|
||||
matched_vmd_id = None
|
||||
|
||||
if parent_vmd and parent_vmd['status'] in ('pending', 'manual_review_needed'):
|
||||
# A ÁG: Szülő frissítése
|
||||
matched_vmd_id = await self.update_parent_vmd(
|
||||
session, parent_vmd['id'], lib_data, standardized
|
||||
)
|
||||
else:
|
||||
# B ÁG: Új variáció beszúrása
|
||||
matched_vmd_id = await self.insert_variant_vmd(
|
||||
session, lib_data, standardized, lib_data['variant_name']
|
||||
)
|
||||
|
||||
# 5. Library lezárása
|
||||
await self.close_library_entry(session, lib_data['lib_id'], matched_vmd_id)
|
||||
|
||||
# Commit
|
||||
await session.commit()
|
||||
logger.info(f"Successfully finalized library {lib_data['lib_id']} -> VMD {matched_vmd_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
logger.error(f"Error processing library {lib_data.get('lib_id', 'unknown')}: {e}")
|
||||
return False
|
||||
|
||||
async def run(self, max_iterations: int = 10):
|
||||
"""
|
||||
Fő futási ciklus: korlátozott számú iteráció, 1-3 mp várakozással.
|
||||
|
||||
Args:
|
||||
max_iterations: Maximum number of processing cycles (default: 10)
|
||||
"""
|
||||
logger.info(f"R3 Finalizer started. Max iterations: {max_iterations}. Waiting for pending_match entries...")
|
||||
|
||||
iteration = 0
|
||||
while self.running and iteration < max_iterations:
|
||||
try:
|
||||
processed = await self.process_one()
|
||||
if not processed:
|
||||
# Nincs munka vagy hiba történt, várakozás
|
||||
await asyncio.sleep(SLEEP_INTERVAL)
|
||||
else:
|
||||
# Sikeres feldolgozás után rövid várakozás
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Minden esetben növeljük az iterációt (akár sikeres, akár sikertelen volt)
|
||||
iteration += 1
|
||||
logger.info(f"Iteration {iteration}/{max_iterations} completed.")
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in main loop: {e}")
|
||||
await asyncio.sleep(5)
|
||||
# Hiba esetén is növeljük az iterációt
|
||||
iteration += 1
|
||||
logger.info(f"Iteration {iteration}/{max_iterations} completed after error.")
|
||||
|
||||
logger.info(f"R3 Finalizer completed {iteration} iterations. Stopping.")
|
||||
self.stop()
|
||||
|
||||
def stop(self):
|
||||
self.running = False
|
||||
logger.info("R3 Finalizer stopping...")
|
||||
|
||||
|
||||
def main():
|
||||
# Signal kezelés
|
||||
finalizer = UltimateSpecsFinalizer()
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
logger.info(f"Received signal {signum}, shutting down...")
|
||||
finalizer.stop()
|
||||
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
# Fő ciklus indítása - korlátozott számú iterációval teszteléshez
|
||||
try:
|
||||
# Teszteléshez: maximum 5 iteráció
|
||||
asyncio.run(finalizer.run(max_iterations=5))
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Keyboard interrupt received, shutting down...")
|
||||
finally:
|
||||
logger.info("R3 Finalizer stopped.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
284
backend/app/workers/vehicle/vehicle_robot_0_discovery_engine.py
Executable file → Normal file
284
backend/app/workers/vehicle/vehicle_robot_0_discovery_engine.py
Executable file → Normal file
@@ -4,205 +4,187 @@ import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from sqlalchemy import text, select
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.asset import AssetCatalog
|
||||
|
||||
# MB 2.0 Szigorú naplózás
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-0-Discovery: %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Vehicle-Robot-0-Discovery")
|
||||
# Szigorú naplózás
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R0-DISCOVERY] %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Robot-0")
|
||||
|
||||
class DiscoveryEngine:
|
||||
"""
|
||||
THOUGHT PROCESS (IPARI ÜZEMMÓD 2.0):
|
||||
1. Őrkutya (Watchdog): Megkeresi és kiszabadítja a beragadt feladatokat óránként.
|
||||
2. Differential Sync (Különbözeti Szinkron): Csak a hiányzó vagy új modelleket rögzíti, a gold_enriched-eket kihagyja.
|
||||
3. Monthly Scheduler: Havonta egyszer tölti le a teljes RDW adatbázist lapozva.
|
||||
"""
|
||||
|
||||
Vehicle Robot 0 v3.0: A Nagy Stratéga
|
||||
Feladata: Végiglapozza az RDW teljes adatbázisát (autó, motor, teherautó),
|
||||
kigyűjti az összes létező márka+modell kombinációt, és darabszám alapján
|
||||
priorizálja őket a catalog_discovery táblában a vadászok (Hunterek) számára.
|
||||
"""
|
||||
RDW_API = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
|
||||
RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
|
||||
HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
|
||||
SYNC_STATE_FILE = "/app/temp/.last_rdw_sync" # Állapotfájl, hogy Docker újrainduláskor se kezdje elölről azonnal
|
||||
SYNC_STATE_FILE = "/app/temp/.last_rdw_sync"
|
||||
BATCH_LIMIT = 10000 # RDW API maximum limit aggregálásnál
|
||||
|
||||
@staticmethod
|
||||
async def run_watchdog():
|
||||
""" 1. FÁZIS: Az Őrkutya (Dead-Letter Queue Manager) """
|
||||
logger.info("🐕 Őrkutya: Beragadt feladatok keresése a rendszerben...")
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
# A) Hunter takarítás (visszaállítás pending-re, ha a Hunter lefagyott)
|
||||
res1 = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'pending' WHERE status = 'processing' RETURNING id;"))
|
||||
hunter_resets = len(res1.fetchall())
|
||||
if hunter_resets > 0:
|
||||
logger.warning(f"🔄 {hunter_resets} db beragadt Hunter feladat (processing) visszaállítva 'pending'-re.")
|
||||
|
||||
# B) AI Robotok takarítása (2 órás timeout)
|
||||
query2 = text("""
|
||||
UPDATE vehicle.vehicle_model_definitions
|
||||
SET status = CASE
|
||||
WHEN status = 'research_in_progress' THEN 'unverified'
|
||||
WHEN status = 'ai_synthesis_in_progress' THEN 'awaiting_ai_synthesis'
|
||||
END
|
||||
WHERE status IN ('research_in_progress', 'ai_synthesis_in_progress')
|
||||
AND updated_at < NOW() - INTERVAL '2 hours'
|
||||
RETURNING id;
|
||||
""")
|
||||
res2 = await db.execute(query2)
|
||||
ai_resets = len(res2.fetchall())
|
||||
if ai_resets > 0:
|
||||
logger.warning(f"🔄 {ai_resets} db beragadt AI feladat visszaállítva.")
|
||||
|
||||
await db.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Őrkutya hiba: {e}")
|
||||
|
||||
@staticmethod
|
||||
async def seed_manual_bootstrap():
|
||||
""" 2. FÁZIS: Alapozó adatok rögzítése """
|
||||
initial_data = [
|
||||
{"make": "AUDI", "model": "A4", "generation": "B8 (2008-2015)"}, # vehicle_class törölve
|
||||
{"make": "BMW", "model": "3 SERIES", "generation": "F30 (2012-2019)"}
|
||||
]
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
for item in initial_data:
|
||||
stmt = select(AssetCatalog).where(AssetCatalog.make == item["make"], AssetCatalog.model == item["model"])
|
||||
if not (await db.execute(stmt)).scalar_one_or_none():
|
||||
db.add(AssetCatalog(**item))
|
||||
await db.commit()
|
||||
except Exception as e:
|
||||
logger.warning(f"Manual bootstrap hiba (Ignorálható, ha az adatbázis már tele van): {e}")
|
||||
CATEGORIES = [
|
||||
{"name": "car", "rdw_types": ["'Personenauto'"]},
|
||||
{"name": "motorcycle", "rdw_types": ["'Motorfiets'"]},
|
||||
{"name": "truck", "rdw_types": ["'Bedrijfsauto'", "'Vrachtwagen'", "'Opleggertrekker'"]}
|
||||
]
|
||||
|
||||
@classmethod
|
||||
async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, params: dict, retries: int = 3):
|
||||
""" Hibatűrő HTTP kérés API leállások ellen. """
|
||||
async def fetch_with_retry(cls, client: httpx.AsyncClient, params: dict, retries: int = 3):
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
resp = await client.get(url, params=params, headers=cls.HEADERS)
|
||||
resp = await client.get(cls.RDW_API, params=params, headers=cls.HEADERS)
|
||||
if resp.status_code == 200:
|
||||
return resp
|
||||
elif resp.status_code == 429:
|
||||
return resp.json()
|
||||
elif resp.status_code == 429:
|
||||
await asyncio.sleep(2 ** attempt)
|
||||
else:
|
||||
logger.warning(f"RDW API Hiba: {resp.status_code}")
|
||||
return None
|
||||
except httpx.RequestError:
|
||||
except httpx.RequestError as e:
|
||||
if attempt == retries - 1:
|
||||
logger.error(f"Hálózati hiba: {e}")
|
||||
return None
|
||||
await asyncio.sleep(2 ** attempt)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
async def seed_from_rdw(cls):
|
||||
""" 3. FÁZIS: Távoli felfedezés - KÜLÖNBÖZETI SZINKRONIZÁCIÓ (Differential Sync) """
|
||||
logger.info("📥 RDW TÖMEGES LETÖLTÉS: Új modellek keresése (Differential Sync)...")
|
||||
|
||||
limit = 10000
|
||||
async def process_category(cls, db, v_class: str, rdw_types: list):
|
||||
""" Egy adott kategória (pl. autók) teljes végiglapozása és mentése. """
|
||||
type_filter = " OR ".join([f"voertuigsoort = {t}" for t in rdw_types])
|
||||
offset = 0
|
||||
inserted_count = 0
|
||||
updated_count = 0
|
||||
|
||||
total_inserted = 0
|
||||
total_updated = 0
|
||||
|
||||
logger.info(f"🔍 {v_class.upper()} kategória elemzésének indítása...")
|
||||
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
while True:
|
||||
# Az aggregált SQL lekérdezés, amit az RDW API-nak küldünk
|
||||
params = {
|
||||
"$select": "merk,handelsbenaming,voertuigsoort,count(*) as total",
|
||||
"$group": "merk,handelsbenaming,voertuigsoort",
|
||||
"$order": "total DESC",
|
||||
"$limit": limit,
|
||||
"$select": "merk, handelsbenaming, count(*) AS darabszam",
|
||||
"$where": type_filter,
|
||||
"$group": "merk, handelsbenaming",
|
||||
"$order": "darabszam DESC",
|
||||
"$limit": cls.BATCH_LIMIT,
|
||||
"$offset": offset
|
||||
}
|
||||
|
||||
data = await cls.fetch_with_retry(client, params)
|
||||
if not data:
|
||||
break # Ha üres a válasz, végeztünk a kategóriával
|
||||
|
||||
logger.info(f"📊 {v_class.upper()}: Feldolgozás {offset} - {offset + len(data)}...")
|
||||
|
||||
# Mivel ez tömeges mentés, egy közös tranzakciót használunk
|
||||
for item in data:
|
||||
make_name = str(item.get("merk", "")).upper().strip()
|
||||
model_name = str(item.get("handelsbenaming", "")).upper().strip()
|
||||
if not make_name or not model_name:
|
||||
continue
|
||||
|
||||
count = int(item.get("darabszam", 0))
|
||||
|
||||
try:
|
||||
async with db.begin_nested():
|
||||
# Ha még nincs ilyen (vagy ha van, frissítjük a prioritást)
|
||||
query = text("""
|
||||
INSERT INTO vehicle.catalog_discovery (make, model, vehicle_class, status, source, attempts, priority_score)
|
||||
VALUES (:make, :model, :class, 'pending', 'STRATEGIST-V3', 0, :score)
|
||||
ON CONFLICT (make, model, vehicle_class)
|
||||
DO UPDATE SET priority_score = GREATEST(vehicle.catalog_discovery.priority_score, :score)
|
||||
WHERE vehicle.catalog_discovery.status != 'processed'
|
||||
RETURNING xmax;
|
||||
""")
|
||||
res = await db.execute(query, {"make": make_name, "model": model_name, "class": v_class, "score": count})
|
||||
|
||||
# Logika a statisztikához: xmax = 0 ha új beszúrás, > 0 ha update
|
||||
row = res.fetchone()
|
||||
if row:
|
||||
if row[0] == 0: total_inserted += 1
|
||||
else: total_updated += 1
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Hiba a mentésnél ({make_name} {model_name}): {e}")
|
||||
|
||||
await db.commit()
|
||||
|
||||
resp = await cls.fetch_with_retry(client, "https://opendata.rdw.nl/resource/m9d7-ebf2.json", params)
|
||||
if not resp: break
|
||||
raw_data = resp.json()
|
||||
if not raw_data: break
|
||||
|
||||
logger.info(f"📊 Lapozás: {offset} - {offset + len(raw_data)} tételek analízise...")
|
||||
|
||||
async with AsyncSessionLocal() as db:
|
||||
for entry in raw_data:
|
||||
make = str(entry.get("merk", "")).upper().strip()
|
||||
model = str(entry.get("handelsbenaming", "")).upper().strip()
|
||||
v_kind = entry.get("voertuigsoort", "")
|
||||
total_count = int(entry.get("total", 0))
|
||||
|
||||
if not make or not model: continue
|
||||
|
||||
if "Personenauto" in v_kind: v_class = 'car'
|
||||
elif "Motorfiets" in v_kind: v_class = 'motorcycle'
|
||||
else: v_class = 'truck'
|
||||
|
||||
# A MÁGIA: Különbözeti Szinkronizáció SQL + Explicit Type Casting
|
||||
query = text("""
|
||||
INSERT INTO vehicle.catalog_discovery (make, model, vehicle_class, status, priority_score)
|
||||
SELECT
|
||||
CAST(:make AS VARCHAR),
|
||||
CAST(:model AS VARCHAR),
|
||||
CAST(:v_class AS VARCHAR),
|
||||
'pending',
|
||||
:priority
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM vehicle.vehicle_model_definitions
|
||||
WHERE make = CAST(:make AS VARCHAR)
|
||||
AND marketing_name = CAST(:model AS VARCHAR)
|
||||
AND status = 'gold_enriched'
|
||||
)
|
||||
ON CONFLICT (make, model)
|
||||
DO UPDATE SET priority_score = EXCLUDED.priority_score
|
||||
WHERE vehicle.catalog_discovery.status != 'processed'
|
||||
RETURNING xmax;
|
||||
""")
|
||||
|
||||
result = await db.execute(query, {
|
||||
"make": make, "model": model, "v_class": v_class, "priority": total_count
|
||||
})
|
||||
|
||||
row = result.fetchone()
|
||||
if row:
|
||||
if row[0] == 0: inserted_count += 1 # Új beszúrás
|
||||
else: updated_count += 1 # Meglévő frissítése
|
||||
|
||||
await db.commit()
|
||||
offset += limit
|
||||
await asyncio.sleep(1)
|
||||
# Ha kevesebb adat jött vissza, mint a limit, akkor elértük az utolsó oldalt
|
||||
if len(data) < cls.BATCH_LIMIT:
|
||||
break
|
||||
|
||||
logger.info(f"✅ RDW Szinkron kész! Új modellek a listán: {inserted_count} | Frissített prioritások: {updated_count}")
|
||||
|
||||
# Sikeres futás regisztrálása a fájlrendszeren
|
||||
os.makedirs(os.path.dirname(cls.SYNC_STATE_FILE), exist_ok=True)
|
||||
with open(cls.SYNC_STATE_FILE, 'w') as f:
|
||||
f.write(datetime.now().isoformat())
|
||||
offset += cls.BATCH_LIMIT
|
||||
await asyncio.sleep(1) # API kímélése
|
||||
|
||||
logger.info(f"✅ {v_class.upper()} kész! Új felfedezett: {total_inserted} | Frissített prioritás: {total_updated}")
|
||||
|
||||
@classmethod
|
||||
async def run_watchdog(cls):
|
||||
""" Kiszabadítja azokat a Hunter feladatokat, amiknél a szerver esetleg újraindult. """
|
||||
logger.info("🐕 Őrkutya: Beragadt feladatok ellenőrzése...")
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
res1 = await db.execute(text("UPDATE vehicle.catalog_discovery SET status = 'pending' WHERE status = 'processing' RETURNING id;"))
|
||||
hunter_resets = len(res1.fetchall())
|
||||
if hunter_resets > 0:
|
||||
logger.warning(f"🔄 {hunter_resets} db beragadt Hunter feladat visszaállítva.")
|
||||
|
||||
res2 = await db.execute(text("""
|
||||
UPDATE vehicle.vehicle_model_definitions
|
||||
SET status = 'unverified'
|
||||
WHERE status IN ('research_in_progress', 'ai_synthesis_in_progress')
|
||||
AND updated_at < NOW() - INTERVAL '2 hours'
|
||||
RETURNING id;
|
||||
"""))
|
||||
ai_resets = len(res2.fetchall())
|
||||
if ai_resets > 0:
|
||||
logger.warning(f"🔄 {ai_resets} db beragadt AI/Kutató feladat visszaállítva.")
|
||||
await db.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Őrkutya hiba: {e}")
|
||||
|
||||
@classmethod
|
||||
def should_run_rdw_sync(cls) -> bool:
|
||||
""" Ellenőrzi, hogy eltelt-e 30 nap a legutóbbi sikeres RDW szinkronizáció óta. """
|
||||
if not os.path.exists(cls.SYNC_STATE_FILE):
|
||||
return True
|
||||
if not os.path.exists(cls.SYNC_STATE_FILE): return True
|
||||
try:
|
||||
with open(cls.SYNC_STATE_FILE, 'r') as f:
|
||||
last_sync = datetime.fromisoformat(f.read().strip())
|
||||
return datetime.now() - last_sync > timedelta(days=30)
|
||||
# Ha elmúlt 7 nap, újra felfedezi az RDW-t
|
||||
return datetime.now() - last_sync > timedelta(days=7)
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
async def run(cls):
|
||||
""" FŐ CIKLUS: Havi ütemező és Óránkénti Őrkutya """
|
||||
logger.info("🚀 ÉLES ÜZEM: Discovery Engine (Differential Sync) & Watchdog indítása...")
|
||||
await cls.seed_manual_bootstrap()
|
||||
logger.info("🚀 Robot 0 (Strategist & Discovery) ONLINE")
|
||||
|
||||
# 1. Adatbázis séma biztosítása a priority_score-hoz
|
||||
async with AsyncSessionLocal() as db:
|
||||
try:
|
||||
await db.execute(text("ALTER TABLE vehicle.catalog_discovery ADD COLUMN IF NOT EXISTS priority_score INTEGER DEFAULT 0;"))
|
||||
await db.commit()
|
||||
except Exception as e:
|
||||
await db.rollback()
|
||||
logger.error(f"⚠️ Séma hiba (ignorálható): {e}")
|
||||
|
||||
while True:
|
||||
# 1. Óránkénti takarítás
|
||||
await cls.run_watchdog()
|
||||
|
||||
# 2. Havi szinkronizáció ellenőrzése
|
||||
if cls.should_run_rdw_sync():
|
||||
await cls.seed_from_rdw()
|
||||
logger.info("🌍 Teljes RDW Hálózat Letapogatás Indul...")
|
||||
async with AsyncSessionLocal() as db:
|
||||
for category in cls.CATEGORIES:
|
||||
await cls.process_category(db, category["name"], category["rdw_types"])
|
||||
|
||||
os.makedirs(os.path.dirname(cls.SYNC_STATE_FILE), exist_ok=True)
|
||||
with open(cls.SYNC_STATE_FILE, 'w') as f:
|
||||
f.write(datetime.now().isoformat())
|
||||
logger.info("🏁 Letapogatás befejezve. Alvás a következő ellenőrzésig.")
|
||||
else:
|
||||
logger.info("🛌 Az RDW szinkronizáció már lefutott az elmúlt 30 napban. Ugrás...")
|
||||
logger.info("🛌 Az RDW szinkronizáció már lefutott a héten. Őrködés folytatása...")
|
||||
|
||||
# 3. Alvás 1 órát (Heartbeat)
|
||||
logger.info("⏱️ A Discovery Engine most 1 órát pihen a következő Őrkutya futásig.")
|
||||
await asyncio.sleep(3600)
|
||||
await asyncio.sleep(3600) # Óránként ellenőrzi, kell-e valamit tenni
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(DiscoveryEngine.run())
|
||||
@@ -1,4 +1,4 @@
|
||||
# /app/app/workers/vehicle/vehicle_robot_0_gb_discovery.py
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_0_gb_discovery.py
|
||||
import asyncio
|
||||
import logging
|
||||
import csv
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# /app/app/workers/vehicle/vehicle_robot_1_2_nhtsa_fetcher.py
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_2_nhtsa_fetcher.py
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
@@ -13,16 +13,14 @@ class NHTSAFetcher:
|
||||
|
||||
@classmethod
|
||||
async def get_eu_makes(cls):
|
||||
"""Lekéri azokat a márkákat, amik már benne vannak az adatbázisban EU-s forrásból."""
|
||||
async with AsyncSessionLocal() as db:
|
||||
# Csak azokat a márkákat keressük az USA-ban, amiket az EU-ban (RDW) már láttunk
|
||||
query = text("SELECT DISTINCT make FROM vehicle.catalog_discovery WHERE market = 'EU' OR source = 'RDW'")
|
||||
res = await db.execute(query)
|
||||
return [row[0] for row in res.fetchall()]
|
||||
|
||||
@classmethod
|
||||
async def run(cls):
|
||||
logger.info("🚀 Robot 1.2 (EU-Guided NHTSA) indítása...")
|
||||
logger.info("🚀 Robot 1.2 (EU-Guided NHTSA) indítása - Kötegelt mód...")
|
||||
|
||||
while True:
|
||||
target_makes = await cls.get_eu_makes()
|
||||
@@ -31,36 +29,39 @@ class NHTSAFetcher:
|
||||
await asyncio.sleep(60)
|
||||
continue
|
||||
|
||||
# 2026-tól megyünk vissza a múltba
|
||||
for year in range(2026, 1950, -1):
|
||||
async with AsyncSessionLocal() as db:
|
||||
for make in target_makes:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=20.0) as client:
|
||||
# A hálózati kliens a cikluson KÍVÜL van, így újrahasznosítja a kapcsolatokat!
|
||||
async with httpx.AsyncClient(timeout=20.0) as client:
|
||||
for year in range(2026, 1950, -1):
|
||||
async with AsyncSessionLocal() as db:
|
||||
for make in target_makes:
|
||||
try:
|
||||
url = cls.API_URL.format(make=make, year=year)
|
||||
resp = await client.get(url)
|
||||
if resp.status_code != 200: continue
|
||||
|
||||
models = resp.json().get("Results", [])
|
||||
inserted = 0
|
||||
if not models: continue
|
||||
|
||||
# Gyors lista generálás a kötegelt mentéshez
|
||||
insert_data = []
|
||||
for m in models:
|
||||
model_name = m.get("Model_Name").upper().strip()
|
||||
# USA_IMPORT jelölés, de csak EU-s márkákhoz!
|
||||
insert_data.append({"make": make, "model": model_name, "year": year})
|
||||
|
||||
if insert_data:
|
||||
query = text("""
|
||||
INSERT INTO vehicle.catalog_discovery
|
||||
(make, model, vehicle_class, status, market, model_year, priority_score, source)
|
||||
VALUES (:make, :model, 'car', 'pending', 'USA_IMPORT', :year, 5, 'NHTSA-EU-FILTERED')
|
||||
ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
|
||||
""")
|
||||
res = await db.execute(query, {"make": make, "model": model_name, "year": year})
|
||||
if res.rowcount > 0: inserted += 1
|
||||
|
||||
if inserted > 0:
|
||||
logger.info(f"✅ {make} ({year}): {inserted} variáns dúsítva az USA-ból.")
|
||||
await db.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Hiba: {make} {year}: {e}")
|
||||
await asyncio.sleep(0.5)
|
||||
# Egyetlen SQL hívás a teljes listára!
|
||||
await db.execute(query, insert_data)
|
||||
await db.commit()
|
||||
logger.info(f"✅ {make} ({year}): {len(insert_data)} variáns dúsítva az USA-ból.")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Hiba: {make} {year}: {e}")
|
||||
await asyncio.sleep(0.1) # Kisebb pihenő is elég, mert hatékonyabbak vagyunk
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(NHTSAFetcher.run())
|
||||
@@ -1,11 +1,16 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_4_bike_hunter.py
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import random
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Naplózás finomhangolása a duplázódás elkerülésére
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(name)s] %(message)s')
|
||||
logger = logging.getLogger("Robot-1-4-Bike")
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# SQLAlchemy zaj csökkentése
|
||||
logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
|
||||
|
||||
BIKE_MAKES = [
|
||||
"HONDA", "YAMAHA", "KAWASAKI", "SUZUKI", "HARLEY-DAVIDSON",
|
||||
@@ -17,40 +22,61 @@ class BikeHunter:
|
||||
|
||||
@classmethod
|
||||
async def run(cls):
|
||||
logger.info("🏍️ Robot 1.4 (Bike Hunter) indítása...")
|
||||
# 2026-tól 1970-ig pörgetjük a motorokat
|
||||
"""
|
||||
THOUGHT PROCESS:
|
||||
A robotot úgy alakítjuk át, hogy minden egyes gyártó/év kombinációt
|
||||
külön tranzakcióként kezeljen. Ha egy márka hibát dob, elvégezzük a
|
||||
rollback-et, így a következő márka tiszta lappal indulhat.
|
||||
"""
|
||||
logger.info("🏍️ Robot 1.4 (Bike Hunter) indítása - Tranzakció-biztos mód...")
|
||||
years = range(2026, 1969, -1)
|
||||
|
||||
async with AsyncSessionLocal() as db:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
for year in years:
|
||||
for make in BIKE_MAKES:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=20.0) as client:
|
||||
# Minden márkához új session-t nyitunk, vagy biztosítjuk a rollback-et
|
||||
async with AsyncSessionLocal() as db:
|
||||
try:
|
||||
resp = await client.get(cls.API_URL.format(make=make, year=year))
|
||||
if resp.status_code != 200: continue
|
||||
models = resp.json().get("Results", [])
|
||||
if resp.status_code != 200:
|
||||
logger.warning(f"⚠️ {make} ({year}) API hiba: {resp.status_code}")
|
||||
continue
|
||||
|
||||
inserted = 0
|
||||
models = resp.json().get("Results", [])
|
||||
if not models:
|
||||
continue
|
||||
|
||||
insert_data = []
|
||||
for m in models:
|
||||
model_name = m.get("Model_Name").upper().strip()
|
||||
# TISZTA SQL - Nincs Simon!
|
||||
m_name = m.get("Model_Name")
|
||||
if m_name:
|
||||
model_name = m_name.upper().strip()
|
||||
insert_data.append({"make": make, "model": model_name, "year": year})
|
||||
|
||||
if insert_data:
|
||||
# ON CONFLICT használata a CONSTRAINT alapján
|
||||
query = text("""
|
||||
INSERT INTO vehicle.catalog_discovery
|
||||
(make, model, vehicle_class, status, market, model_year, priority_score, source)
|
||||
VALUES (:make, :model, 'motorcycle', 'pending', 'USA_IMPORT', :year, 8, 'NHTSA-V1-BIKE')
|
||||
ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
|
||||
""")
|
||||
await db.execute(query, {"make": make, "model": model_name, "year": year})
|
||||
inserted += 1
|
||||
|
||||
if inserted > 0:
|
||||
logger.info(f"🏍️ {make} ({year}): {inserted} új motor rögzítve.")
|
||||
await db.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Bike Error {make} ({year}): {e}")
|
||||
|
||||
# Évjáratonként egy pici pihenő az API-nak
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
await db.execute(query, insert_data)
|
||||
await db.commit() # Itt véglegesítjük a sikeres köteget
|
||||
logger.info(f"✅ {make} ({year}): {len(insert_data)} motor feldolgozva.")
|
||||
|
||||
except Exception as e:
|
||||
# KRITIKUS: Hiba esetén visszaállítjuk a tranzakciót,
|
||||
# így a következő kör (következő márka) nem bukik el.
|
||||
await db.rollback()
|
||||
logger.error(f"❌ Bike Error {make} ({year}): {str(e)}")
|
||||
|
||||
# API kímélése (Rate limiting megelőzése)
|
||||
await asyncio.sleep(random.uniform(0.3, 0.6))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(BikeHunter.run())
|
||||
try:
|
||||
asyncio.run(BikeHunter.run())
|
||||
except KeyboardInterrupt:
|
||||
logger.info("🛑 Leállítás felhasználói kérésre.")
|
||||
# /app/app/workers/vehicle/vehicle_robot_1_5_heavy_eu.py
"""
Robot 1.5: EU heavy-duty discovery.

Downloads truck / bus / camper (make, model) pairs from the Dutch RDW open
data API and feeds them into vehicle.catalog_discovery as pending tasks.

FIX NOTES (reconstructed from a merged diff that contained both the old and
the new implementation interleaved): removed the duplicated basicConfig and
job_list definitions, the per-row insert that referenced loop variables
outside the loop, and the doubled ON CONFLICT clause (a syntax error) —
keeping the field-based conflict target per the in-file "JAVÍTÁS" note.
"""
import asyncio
import httpx
import logging
import sys
from sqlalchemy import text
from app.database import AsyncSessionLocal

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] R1.5-Heavy: %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger("Robot-1-5-Heavy-EU")


class HeavyEUHunter:
    # RDW Open Data — the Netherlands registry used as the EU gateway.
    RDW_URL = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"

    @classmethod
    async def fetch_rdw_heavy(cls, vehicle_type: str):
        """Return up to 10k distinct (merk, handelsbenaming) rows for one type.

        vehicle_type: 'Vrachtwagen' (truck), 'Bus', 'Kampeerauto' (RV).
        Returns [] on any HTTP or network failure.
        """
        query_url = f"{cls.RDW_URL}?voertuigsoort={vehicle_type}&$select=merk,handelsbenaming&$limit=10000"
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.get(query_url)
                return resp.json() if resp.status_code == 200 else []
            except Exception as e:
                logger.error(f"❌ RDW API Error: {e}")
                return []

    @classmethod
    async def run(cls):
        """Wait for the DB, then batch-insert discovery rows per category."""
        logger.info("🚛 Robot 1.5 (EU Heavy Duty) indítása...")

        # --- DB connection guard (retry) ---
        db_connected = False
        for i in range(12):  # tries for ~1 minute (12 * 5s)
            try:
                async with AsyncSessionLocal() as db:
                    await db.execute(text("SELECT 1"))
                db_connected = True
                logger.info("✅ Adatbázis kapcsolat aktív!")
                break
            except Exception:
                logger.warning(f"⏳ Adatbázis nem elérhető ({i+1}/12), várakozás 5mp...")
                await asyncio.sleep(5)

        if not db_connected:
            logger.error("💀 Nem sikerült kapcsolódni az adatbázishoz. Leállás.")
            return

        # RDW category name -> internal vehicle class
        job_list = {"Vrachtwagen": "truck", "Bus": "bus", "Kampeerauto": "rv"}

        async with AsyncSessionLocal() as db:
            for rdw_name, internal_class in job_list.items():
                logger.info(f"📥 {rdw_name} adatok letöltése...")
                data = await cls.fetch_rdw_heavy(rdw_name)
                if not data:
                    continue

                # Collect everything for a single executemany-style INSERT.
                insert_data = []
                for item in data:
                    make = item.get('merk', '').upper().strip()
                    model = item.get('handelsbenaming', '').upper().strip()
                    if make and model:
                        insert_data.append({"make": make, "model": model, "v_class": internal_class})

                if not insert_data:
                    continue

                # Field-based conflict target instead of a constraint name.
                query = text("""
                    INSERT INTO vehicle.catalog_discovery
                        (make, model, vehicle_class, status, market, priority_score, source)
                    VALUES (:make, :model, :v_class, 'pending', 'EU', 20, 'RDW-HEAVY')
                    ON CONFLICT (make, model, vehicle_class) DO NOTHING
                """)
                # A failed batch must not poison the session for the next category.
                try:
                    await db.execute(query, insert_data)
                    await db.commit()
                    logger.info(f"✅ {rdw_name}: {len(insert_data)} gép beküldve.")
                except Exception as e:
                    logger.error(f"❌ Mentési hiba ({rdw_name}): {e}")
                    await db.rollback()


if __name__ == "__main__":
    asyncio.run(HeavyEUHunter.run())
|
||||
62
backend/app/workers/vehicle/vehicle_robot_1_5_heavy_eu1.0.py
Normal file
62
backend/app/workers/vehicle/vehicle_robot_1_5_heavy_eu1.0.py
Normal file
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_5_heavy_eu1.0.py
"""
Robot 1.5: EU heavy-duty discovery — batched variant.

Pulls truck / bus / camper (make, model) pairs from the RDW open-data API and
inserts them into vehicle.catalog_discovery with one batched INSERT per
category.
"""
import asyncio
import httpx
import logging
from sqlalchemy import text
from app.database import AsyncSessionLocal

logger = logging.getLogger("Robot-1-5-Heavy-EU")
logging.basicConfig(level=logging.INFO)


class HeavyEUHunter:
    # RDW open data endpoint (vehicle master records).
    RDW_URL = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"

    @classmethod
    async def fetch_rdw_heavy(cls, vehicle_type: str):
        """Return up to 10k (merk, handelsbenaming) rows for one voertuigsoort.

        Returns [] on non-200 responses and on network errors.
        """
        query_url = f"{cls.RDW_URL}?voertuigsoort={vehicle_type}&$select=merk,handelsbenaming&$limit=10000"
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.get(query_url)
                return resp.json() if resp.status_code == 200 else []
            except Exception as e:
                logger.error(f"❌ RDW Error: {e}")
                return []

    @classmethod
    async def run(cls):
        """Fetch each heavy-duty category and batch-insert discovery rows."""
        logger.info("🚛 Robot 1.5 (EU Heavy Duty) indítása - Kötegelt mód...")
        # RDW category name -> internal vehicle_class
        job_list = {
            "Vrachtwagen": "truck",
            "Bus": "bus",
            "Kampeerauto": "rv"
        }

        async with AsyncSessionLocal() as db:
            for rdw_name, internal_class in job_list.items():
                logger.info(f"📥 {rdw_name} adatok letöltése...")
                data = await cls.fetch_rdw_heavy(rdw_name)

                if not data: continue

                # Gather the up-to-10k records into one list for a single
                # executemany-style INSERT.
                insert_data = []
                for item in data:
                    make = item.get('merk', '').upper().strip()
                    model = item.get('handelsbenaming', '').upper().strip()
                    if make and model:
                        insert_data.append({"make": make, "model": model, "v_class": internal_class})

                if insert_data:
                    query = text("""
                        INSERT INTO vehicle.catalog_discovery
                        (make, model, vehicle_class, status, market, priority_score, source)
                        VALUES (:make, :model, :v_class, 'pending', 'EU', 20, 'RDW-HEAVY')
                        ON CONFLICT ON CONSTRAINT _make_model_market_year_uc DO NOTHING
                    """)
                    # FIX: without a rollback a single failed batch left the
                    # session in an aborted state and killed the remaining
                    # categories — handle and continue instead.
                    try:
                        await db.execute(query, insert_data)
                        await db.commit()
                        logger.info(f"✅ {rdw_name}: {len(insert_data)} EU-s nagygép beküldve kötegelve.")
                    except Exception as e:
                        logger.error(f"❌ Mentési hiba ({rdw_name}): {e}")
                        await db.rollback()


if __name__ == "__main__":
    asyncio.run(HeavyEUHunter.run())
|
||||
@@ -1,207 +1,310 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Robot-1-Catalog-Hunter (Precíz Adattrezor + Szótár-vezérelt ETL)
|
||||
Felelősség: RDW API-k lekérdezése (SZŰRTEN: Csak Autó, Motor, Teherautó),
|
||||
mapping_config.json alapú adatkinyerés, teljesítmény kalkuláció és teljes értékű mentés.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from sqlalchemy import text, select
|
||||
import json
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
from app.models import VehicleModelDefinition
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-Hunter: %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Robot-1")
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] Robot-1-Nyers: %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
logger = logging.getLogger("Robot-1-Nyers")
|
||||
|
||||
class CatalogHunter:
    """
    Robot-1 raw harvester (dictionary-driven ETL).

    Queries the RDW APIs (filtered to cars, motorcycles and trucks), extracts
    fields through mapping_config.json, derives missing power figures, and
    upserts complete VehicleModelDefinition rows.

    FIX NOTES: this block was a merged diff containing both the old and the
    new implementation interleaved — unreachable second `return` statements in
    normalize/parse_int/parse_float, a stray second `except:` (syntax error),
    and two competing process/run loops. Collapsed into the newer
    "precision mapper" variant; the dead duplicates were removed.
    """
    RDW_MAIN = "https://opendata.rdw.nl/resource/m9d7-ebf2.json"
    RDW_FUEL = "https://opendata.rdw.nl/resource/8ys7-d773.json"
    RDW_ENGINE = "https://opendata.rdw.nl/resource/jh96-v4pq.json"

    RDW_TOKEN = os.getenv("RDW_APP_TOKEN")
    HEADERS = {"X-App-Token": RDW_TOKEN} if RDW_TOKEN else {}
    BATCH_SIZE = 50

    # Mapping dictionary loaded once at import time.
    CONFIG_PATH = os.path.join(os.path.dirname(__file__), "mapping_config.json")
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        MAPPING = json.load(f)["rdw"]

    @classmethod
    def normalize(cls, text_val: str) -> str:
        """Lowercase alphanumeric-only form of a name; "" for falsy input."""
        if not text_val: return ""
        return re.sub(r'[^a-zA-Z0-9]', '', text_val).lower()

    @classmethod
    def parse_int(cls, value) -> int:
        """Best-effort int conversion; 0 for None/blank/unparseable input."""
        try:
            if value is None or str(value).strip() == "": return 0
            return int(float(value))
        except (ValueError, TypeError): return 0

    @classmethod
    def parse_float(cls, value) -> float:
        """Best-effort float conversion; 0.0 for None/blank/unparseable input."""
        try:
            if value is None or str(value).strip() == "": return 0.0
            return float(value)
        except (ValueError, TypeError): return 0.0

    @classmethod
    async def fetch_with_retry(cls, client: httpx.AsyncClient, url: str, retries: int = 3):
        """Fault-tolerant GET with exponential backoff for outages and 429s."""
        for attempt in range(retries):
            try:
                resp = await client.get(url, headers=cls.HEADERS)
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 429:  # rate limited: back off 1, 2, 4 s
                    await asyncio.sleep(2 ** attempt)
                else:
                    return resp  # other status (e.g. 404): do not retry
            except httpx.RequestError as e:
                if attempt == retries - 1:
                    logger.debug(f"API Hiba végleges ({url}): {e}")
                    raise
                await asyncio.sleep(2 ** attempt)
        return None

    @classmethod
    async def fetch_raw_api_data(cls, client, plate: str) -> dict:
        """Pull the raw RDW records (main, fuel, engine) for one licence plate."""
        raw_data = {"rdw_main": [], "rdw_fuel": [], "rdw_engine": []}
        try:
            # 1. RDW Main
            main_resp = await client.get(f"{cls.RDW_MAIN}?kenteken={plate}", headers=cls.HEADERS)
            if main_resp.status_code == 200: raw_data["rdw_main"] = main_resp.json()

            # 2. RDW Fuel
            fuel_resp = await client.get(f"{cls.RDW_FUEL}?kenteken={plate}", headers=cls.HEADERS)
            if fuel_resp.status_code == 200: raw_data["rdw_fuel"] = fuel_resp.json()

            # 3. RDW Engine
            engine_resp = await client.get(f"{cls.RDW_ENGINE}?kenteken={plate}", headers=cls.HEADERS)
            if engine_resp.status_code == 200: raw_data["rdw_engine"] = engine_resp.json()
        except Exception as e:
            logger.error(f"Hiba a nyers adatok lekérése közben ({plate}): {e}")
        return raw_data

    @classmethod
    def apply_mapping(cls, raw_main: dict, raw_fuel: list, raw_engine: list) -> dict:
        """Extract and compute precise values from raw RDW records via the JSON dictionary."""
        tech = {
            "make": raw_main.get("merk", "UNKNOWN").strip().upper(),
            "marketing_name": raw_main.get("handelsbenaming", "UNKNOWN").upper(),
            "curb_weight": cls.parse_int(raw_main.get("massa_ledig_voertuig")),
            "max_weight": cls.parse_int(raw_main.get("technische_max_massa_voertuig")),
            "engine_capacity": cls.parse_int(raw_main.get("cilinderinhoud")),
            "cylinders": cls.parse_int(raw_main.get("aantal_cilinders")),
            "wheelbase": cls.parse_int(raw_main.get("wielbasis")),
            "doors": cls.parse_int(raw_main.get("aantal_deuren")),
            "seats": cls.parse_int(raw_main.get("aantal_zitplaatsen")),
            "list_price": cls.parse_int(raw_main.get("catalogusprijs")),
            "max_speed": cls.parse_int(raw_main.get("maximale_constructiesnelheid")),
            "year_from": 0,
            "power_kw": 0,
            "engine_code": None,
            "euro_class": None,
            "fuel_type": "Unknown",
            "co2": 0,
            "consumption": 0.0,
            "body_type": "UNKNOWN"
        }

        # First-registration date like "20240424" -> year 2024
        datum = str(raw_main.get("datum_eerste_toelating", ""))
        if len(datum) >= 4:
            tech["year_from"] = cls.parse_int(datum[:4])

        # Body-type translation through the dictionary
        raw_body = str(raw_main.get("inrichting", "")).lower().strip()
        tech["body_type"] = cls.MAPPING["body_type_translations"].get(raw_body, raw_body.upper())

        # Fuel record extraction
        if raw_fuel:
            f = raw_fuel[0]
            raw_fuel_type = f.get("brandstof_omschrijving", "Unknown")
            tech["fuel_type"] = cls.MAPPING["fuel_translations"].get(raw_fuel_type, raw_fuel_type)
            tech["euro_class"] = f.get("euro_klasse") or f.get("uitlaatemissieniveau")
            tech["co2"] = cls.parse_int(f.get("co2_uitstoot_gecombineerd"))
            tech["consumption"] = cls.parse_float(f.get("brandstofverbruik_gecombineerd"))

            # Power lookup: combustion, electric, then nominal continuous
            p_normal = cls.parse_float(f.get("nettomaximumvermogen"))
            p_elec = cls.parse_float(f.get("netto_max_vermogen_elektrisch"))
            p_nominal = cls.parse_float(f.get("nominaal_continu_maximumvermogen"))

            power = max(p_normal, p_elec, p_nominal)

            # Still zero: derive it from power/weight ratio * weight.
            if power == 0:
                ratio_key = cls.MAPPING["power_calculation"]["ratio_source"]
                weight_key = cls.MAPPING["power_calculation"]["weight_source"]
                ratio = cls.parse_float(raw_main.get(ratio_key))
                weight = cls.parse_float(raw_main.get(weight_key))
                if ratio > 0 and weight > 0:
                    power = ratio * weight
                    logger.info(f"⚡ Teljesítmény számolva arányból: {ratio} * {weight} = {power:.2f} kW")

            tech["power_kw"] = cls.parse_int(power)

        # Engine record extraction
        if raw_engine:
            tech["engine_code"] = raw_engine[0].get("motorcode")

        return tech

    @classmethod
    async def process_task(cls, db, task):
        """Harvest all RDW rows for one discovery task and upsert definitions.

        `task` is the row claimed by run(): (id, make, model, vehicle_class,
        priority_score). Pages through RDW_MAIN in BATCH_SIZE chunks, capped
        at 500 rows per task.
        """
        clean_make = task.make.strip().upper()
        clean_model = task.model.strip().upper()
        logger.info(f"🎯 PRECÍZIÓS ADATGYŰJTÉS INDUL: {clean_make} {clean_model}")

        async with httpx.AsyncClient(timeout=30.0) as client:
            offset = 0
            while True:
                # --- Filter: cars, motorcycles and trucks only ---
                allowed_types = "('Personenauto','Motorfiets','Vrachtwagen')"
                params = f"merk={clean_make}&$where=voertuigsoort IN {allowed_types}"

                if clean_model != 'ALL_VARIANTS':
                    params += f" AND handelsbenaming='{clean_model}'"

                params += f"&$limit={cls.BATCH_SIZE}&$offset={offset}&$order=kenteken DESC"

                try:
                    r = await client.get(f"{cls.RDW_MAIN}?{params}", headers=cls.HEADERS)
                    batch = r.json() if r.status_code == 200 else []
                except Exception as e:
                    logger.error(f"Hiba a batch lekérés közben: {e}")
                    break

                if not batch: break

                for item in batch:
                    plate = item.get("kenteken", "UNKNOWN")
                    try:
                        # Savepoint per row so one bad row cannot poison the
                        # whole batch transaction.
                        async with db.begin_nested():
                            raw_api_data = await cls.fetch_raw_api_data(client, plate)

                            # Apply the dictionary + the power maths.
                            tech = cls.apply_mapping(
                                raw_api_data.get("rdw_main", [{}])[0] if raw_api_data.get("rdw_main") else item,
                                raw_api_data.get("rdw_fuel", []),
                                raw_api_data.get("rdw_engine", [])
                            )

                            norm_name = cls.normalize(tech["marketing_name"].replace(clean_make, "").strip() or tech["marketing_name"])

                            # Routing: complete records go straight to AI synthesis.
                            has_power_and_ccm = tech["power_kw"] > 0 and tech["engine_capacity"] > 0
                            is_electric = "electric" in tech["fuel_type"].lower()

                            if has_power_and_ccm or (tech["power_kw"] > 0 and is_electric):
                                final_status = "awaiting_ai_synthesis"
                            else:
                                final_status = "unverified"

                            stmt = insert(VehicleModelDefinition).values(
                                market='EU',
                                make=tech["make"],
                                marketing_name=tech["marketing_name"],
                                normalized_name=norm_name,
                                variant_code=item.get("variant", "UNKNOWN"),
                                version_code=item.get("uitvoering", "UNKNOWN"),
                                technical_code=plate,
                                type_approval_number=item.get("typegoedkeuringsnummer"),
                                seats=tech["seats"],
                                doors=tech["doors"],
                                width=cls.parse_int(item.get("breedte")),
                                wheelbase=tech["wheelbase"],
                                list_price=tech["list_price"],
                                max_speed=tech["max_speed"],
                                curb_weight=tech["curb_weight"],
                                max_weight=tech["max_weight"],
                                fuel_consumption_combined=tech["consumption"],
                                co2_emissions_combined=tech["co2"],
                                vehicle_class=task.vehicle_class,
                                body_type=tech["body_type"],
                                fuel_type=tech["fuel_type"],
                                engine_capacity=tech["engine_capacity"],
                                power_kw=tech["power_kw"],
                                cylinders=tech["cylinders"],
                                engine_code=tech["engine_code"],
                                euro_classification=tech["euro_class"],
                                year_from=tech["year_from"],
                                priority_score=task.priority_score,
                                status=final_status,
                                source="ROBOT-1-PRECISION-MAPPER",
                                raw_search_context='',
                                raw_api_data=raw_api_data,
                                research_metadata={},
                                specifications={"fast_track": True} if final_status == "awaiting_ai_synthesis" else {},
                                marketing_name_aliases=[]
                            ).on_conflict_do_update(
                                index_elements=['make', 'normalized_name', 'variant_code', 'version_code', 'fuel_type', 'market', 'year_from'],
                                set_={
                                    'power_kw': tech["power_kw"],
                                    'engine_capacity': tech["engine_capacity"],
                                    'fuel_type': tech["fuel_type"],
                                    'body_type': tech["body_type"],
                                    'doors': tech["doors"],
                                    'seats': tech["seats"],
                                    'status': final_status,
                                    'raw_api_data': raw_api_data,
                                    'updated_at': datetime.utcnow()
                                }
                            ).returning(VehicleModelDefinition.id)

                            res = await db.execute(stmt)
                            vmd_id = res.scalar()

                            # Fast-track rows also land in the public catalog.
                            if final_status == "awaiting_ai_synthesis" and vmd_id:
                                cat_stmt = text("""
                                    INSERT INTO vehicle.vehicle_catalog (master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
                                    VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
                                    ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING;
                                """)
                                await db.execute(cat_stmt, {
                                    "m_id": vmd_id,
                                    "make": tech["make"],
                                    "model": tech["marketing_name"][:50],
                                    "kw": tech["power_kw"],
                                    "ccm": tech["engine_capacity"],
                                    "fuel": tech["fuel_type"],
                                    "factory": '{"source": "RDW Mapping System"}'
                                })

                    except Exception as e:
                        logger.warning(f"⚠️ Sor hiba ({plate}): {e}")

                await db.commit()
                offset += len(batch)
                if offset >= 500: break
                await asyncio.sleep(0.5)  # ease off the API a little

        await db.execute(
            text("UPDATE vehicle.catalog_discovery SET status = 'processed' WHERE id = :id"),
            {"id": task.id}
        )
        await db.commit()

    @classmethod
    async def run(cls):
        """Main loop: atomically claim one pending discovery task, process it."""
        logger.info("🤖 Robot-1-Nyers ONLINE (Precíz Szótár-alapú feldolgozás + Jármű szűrés)")
        while True:
            try:
                async with AsyncSessionLocal() as db:
                    # Atomic claim (FOR UPDATE SKIP LOCKED) so concurrent
                    # workers never pick the same row.
                    res = await db.execute(text("""
                        UPDATE vehicle.catalog_discovery
                        SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.catalog_discovery
                            WHERE status = 'pending'
                            ORDER BY priority_score DESC
                            FOR UPDATE SKIP LOCKED LIMIT 1
                        ) RETURNING id, make, model, vehicle_class, priority_score;
                    """))
                    task = res.fetchone()
                    await db.commit()

                    if task:
                        await cls.process_task(db, task)
                    else:
                        await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"Hiba a fő ciklusban: {e}")
                await asyncio.sleep(10)
|
||||
if __name__ == "__main__":
    # Consistent with the other vehicle workers: allow a clean Ctrl-C shutdown
    # instead of a KeyboardInterrupt traceback.
    try:
        asyncio.run(CatalogHunter.run())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás felhasználói kérésre.")
|
||||
@@ -1,3 +1,4 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_1_gb_hunter.py
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
@@ -7,7 +8,7 @@ import json
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text, func
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
from app.models import VehicleModelDefinition
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-1-GB: %(message)s', stream=sys.stdout)
|
||||
logger = logging.getLogger("Robot-1-GB-Hunter")
|
||||
|
||||
316
backend/app/workers/vehicle/vehicle_robot_2_1_rdw_enricher.py
Normal file
316
backend/app/workers/vehicle/vehicle_robot_2_1_rdw_enricher.py
Normal file
@@ -0,0 +1,316 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_2_1_rdw_enricher.py
|
||||
"""
|
||||
Robot 2.1: RDW Enricher (Holland rendszámok dúsítása) - INTEGRÁLT SZÓTÁR ÉS MATEK
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Importáljuk a márkaneveket normalizáló szótárt
|
||||
# Bring in the make-name normalizer; if the package-relative module is not
# importable in this context, fall back to a minimal local implementation.
try:
    from .mapping_dictionary import normalize_make
except (ImportError, ValueError):
    def normalize_make(make: str) -> str:
        """Fallback: uppercase/trim a make and resolve a few common synonyms."""
        cleaned = make.upper().strip()
        known_synonyms = {"MERCEDES": "MERCEDES-BENZ", "VW": "VOLKSWAGEN", "ALFA": "ALFA ROMEO"}
        return known_synonyms.get(cleaned, cleaned)
||||
|
||||
logger = logging.getLogger("Robot-2-1-RDW-Enricher")
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] R2.1-RDW: %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
|
||||
RDW_API_URL = "https://opendata.rdw.nl/resource/m9d7-ebf2.json?kenteken={license_plate}"
|
||||
BATCH_SIZE = 10
|
||||
|
||||
class RDWEnricher:
    """Enriches incomplete vehicle definitions from the RDW registry.

    The field-mapping dictionary is loaded once, at class-definition time;
    a load failure is logged and leaves an empty config (extract_fields then
    becomes a no-op).
    """
    BASE_PATH = os.path.dirname(__file__)
    CONFIG_PATH = os.path.join(BASE_PATH, 'mapping_config.json')

    try:
        with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
            mapping_config = json.load(f)['rdw']
        logger.info("✅ mapping_config.json sikeresen betöltve.")
    except Exception as e:
        logger.error(f"❌ Hiba a mapping_config.json betöltésekor: {e}")
        mapping_config = {}
||||
|
||||
@staticmethod
def normalize_license_plate(technical_code: str) -> str:
    """Strip dashes, whitespace and dots from a plate and uppercase it."""
    if not technical_code:
        return ""
    cleaned = re.sub(r'[-\s\.]', '', technical_code)
    return cleaned.upper()
||||
|
||||
@classmethod
async def fetch_candidates(cls, db):
    """Lock (SKIP LOCKED) and return up to BATCH_SIZE enrichment candidates.

    Candidates are unverified/manual-review rows that have a technical code
    but are missing power or engine capacity, with fewer than 3 attempts.
    Each row is returned as a plain dict with safe defaults.
    """
    query = text("""
        SELECT id, make, marketing_name, technical_code, power_kw, engine_capacity,
               body_type, raw_api_data, attempts, fuel_type, vehicle_class
        FROM vehicle.vehicle_model_definitions
        WHERE (status = 'manual_review_needed' OR status = 'unverified')
          AND technical_code IS NOT NULL AND technical_code != ''
          AND (power_kw = 0 OR engine_capacity = 0)
          AND attempts < 3
        ORDER BY priority_score DESC NULLS LAST, id ASC
        FOR UPDATE SKIP LOCKED
        LIMIT :batch_size
    """)
    result = await db.execute(query, {"batch_size": BATCH_SIZE})
    return [
        {
            "id": row[0], "make": row[1], "marketing_name": row[2],
            "technical_code": row[3], "power_kw": row[4] or 0,
            "engine_capacity": row[5] or 0, "body_type": row[6],
            "raw_api_data": row[7] or {}, "attempts": row[8] or 0,
            "fuel_type": row[9] or "", "vehicle_class": row[10] or "",
        }
        for row in result.fetchall()
    ]
||||
|
||||
@classmethod
async def query_rdw_api(cls, license_plate: str, client: httpx.AsyncClient):
    """Fetch the RDW main record for a plate.

    Returns the first record dict, or None when the response is non-200,
    empty, or a network error occurs.
    """
    url = RDW_API_URL.format(license_plate=license_plate)
    try:
        resp = await client.get(url, timeout=10.0)
    except httpx.RequestError as e:
        logger.error(f"RDW hálózati hiba {license_plate}: {e}")
        return None
    if resp.status_code != 200:
        return None
    payload = resp.json()
    if isinstance(payload, list) and len(payload) > 0:
        return payload[0]
    return None
||||
|
||||
@classmethod
def extract_fields(cls, rdw_data: dict):
    """Extract and enrich DB fields from one RDW record.

    Driven by mapping_config ('field_map', 'body_type_translations',
    'power_calculation') and normalize_make. Returns a dict of db_field ->
    value; {} when no config is loaded.

    FIX: the three bare `except:` clauses swallowed *everything* (including
    SystemExit/KeyboardInterrupt) — narrowed to the conversion errors that
    can actually occur here.
    """
    updates = {}
    cfg = cls.mapping_config
    if not cfg:
        return {}

    # 1. Base fields + make-name normalization
    for r_key, db_key in cfg.get('field_map', {}).items():
        val = rdw_data.get(r_key)
        if not val: continue

        if db_key == "make":
            # Use the shared dictionary logic
            updates[db_key] = normalize_make(val)
        elif db_key == "body_type":
            # Body-type translation from the JSON dictionary
            trans = cfg.get('body_type_translations', {})
            updates[db_key] = trans.get(val.lower(), val.upper())
        else:
            updates[db_key] = val

    # 2. Combined power calculation
    p_cfg = cfg.get('power_calculation', {})
    power_kw = None

    # a) direct kW (petrol/diesel)
    p_val = rdw_data.get(p_cfg.get('primary_source'))
    # b) electric kW, if the previous one is missing
    if not p_val:
        p_val = rdw_data.get(p_cfg.get('electric_source'))

    if p_val:
        try:
            power_kw = int(float(p_val))
        except (ValueError, TypeError):
            pass

    # c) still zero: multiply power/weight ratio by curb weight
    if not power_kw or power_kw == 0:
        ratio = rdw_data.get(p_cfg.get('ratio_source'))
        mass = rdw_data.get(p_cfg.get('weight_source'))
        if ratio and mass:
            try:
                power_kw = int(float(ratio) * float(mass))
                logger.info(f"⚡ Kiszámolt teljesítmény: {power_kw} kW ({ratio} * {mass})")
            except (ValueError, TypeError):
                pass

    if power_kw:
        updates['power_kw'] = power_kw

    # Engine-capacity normalization to int
    if 'engine_capacity' in updates:
        try:
            updates['engine_capacity'] = int(float(updates['engine_capacity']))
        except (ValueError, TypeError):
            pass

    return updates
||||
|
||||
@classmethod
async def process_vehicle(cls, vehicle: dict, client: httpx.AsyncClient):
    """Enrich one vehicle row from RDW data and decide its "gold" status.

    Returns a ``(vehicle, updates, error)`` triple: ``updates`` is None on
    failure with ``error`` set to a short reason code; on success ``error``
    is None and ``updates`` carries DB column values plus the private
    ``_is_gold_ready`` / ``_new_attempts`` bookkeeping keys consumed by
    ``update_vehicle_batch``.
    """
    license_plate = cls.normalize_license_plate(vehicle['technical_code'])
    if not license_plate:
        return vehicle, None, "empty_license_plate"

    raw_api_data = vehicle['raw_api_data']
    # Defensive: the column may hold NULL or a non-dict JSON value.
    if not isinstance(raw_api_data, dict): raw_api_data = {}

    # Cache check: reuse a previously stored RDW payload instead of
    # re-querying the API for the same plate.
    rdw_data = None
    if 'rdw' in raw_api_data and len(raw_api_data['rdw']) > 0:
        rdw_data = raw_api_data['rdw'][0]['data']
    else:
        rdw_data = await cls.query_rdw_api(license_plate, client)

    if not rdw_data:
        return vehicle, None, "no_rdw_data"

    # Dictionary-driven enrichment (see extract_fields).
    extracted = cls.extract_fields(rdw_data)
    if not extracted:
        return vehicle, None, "no_useful_data"

    updates = {}
    # Only overwrite DB fields that are still missing (0 or empty string).
    for key, val in extracted.items():
        if key in ['power_kw', 'engine_capacity'] and val >= 0 and vehicle[key] == 0:
            updates[key] = val
        elif key in ['make', 'body_type', 'fuel_type'] and (not vehicle.get(key) or vehicle[key] == ""):
            updates[key] = val

    # Gatekeeper logic: decide whether the record is complete ("gold").
    # f_* are the final values after the pending updates are applied.
    f_kw = updates.get('power_kw', vehicle['power_kw'])
    f_ccm = updates.get('engine_capacity', vehicle['engine_capacity'])
    fuel = str(updates.get('fuel_type', vehicle['fuel_type'])).lower()
    v_class = str(vehicle['vehicle_class']).lower()

    is_electric = any(x in fuel for x in ['electr', 'elektri', 'hydrogen'])
    is_gold_ready = False

    if 'trailer' in v_class:
        # Trailers have no drivetrain data to wait for.
        is_gold_ready = True
    elif is_electric:
        if f_kw > 0: is_gold_ready = True
        # Electric vehicles legitimately have 0 displacement.
        if 'engine_capacity' not in updates and vehicle['engine_capacity'] != 0:
            updates['engine_capacity'] = 0
    else:
        if f_kw > 0 and f_ccm > 0: is_gold_ready = True

    # Private keys (underscore prefix) are popped by update_vehicle_batch
    # and never written as SQL columns.
    updates['_is_gold_ready'] = is_gold_ready
    updates['_new_attempts'] = vehicle['attempts'] + 1

    # When promoting to gold, guarantee power/displacement land in the
    # UPDATE even if they were already present in the DB row.
    if is_gold_ready:
        if 'power_kw' not in updates:
            updates['power_kw'] = f_kw
        if 'engine_capacity' not in updates:
            updates['engine_capacity'] = f_ccm

    # Persist the raw payload only when it was freshly fetched (no 'rdw'
    # key yet); cached payloads are left untouched.
    if 'rdw' not in raw_api_data:
        raw_api_data['rdw'] = [{'timestamp': asyncio.get_event_loop().time(), 'data': rdw_data}]
        updates['raw_api_data'] = raw_api_data

    return vehicle, updates, None
||||
@classmethod
async def update_vehicle_batch(cls, db, updates_list):
    """Persist a batch of per-vehicle updates, committing row by row.

    Each element of ``updates_list`` is ``(vehicle_id, updates)`` as
    produced by ``process_vehicle``. Rows are committed individually so
    that one failing vehicle does not lose the rest of the batch.

    Returns:
        Number of rows successfully updated.
    """
    # FIX: `json` is not imported at module level in this file, so
    # json.dumps below raised NameError at runtime. A function-scope
    # import is harmless even if a top-level import exists elsewhere.
    import json

    if not updates_list: return 0
    updated_count = 0

    for vehicle_id, updates in updates_list:
        try:
            set_clauses = []
            params = {"vehicle_id": vehicle_id}
            # Strip the private bookkeeping keys before building SQL.
            is_gold = updates.pop('_is_gold_ready', False)
            new_attempts = updates.pop('_new_attempts', 1)

            for key, value in updates.items():
                if key == 'raw_api_data':
                    set_clauses.append("raw_api_data = :raw_api_data")
                    params['raw_api_data'] = json.dumps(value)
                else:
                    # NOTE: column names are interpolated into the SQL
                    # string; keys come from extract_fields' fixed
                    # mapping, never from user input.
                    set_clauses.append(f"{key} = :{key}")
                    params[key] = value

            if is_gold:
                set_clauses.append("status = 'gold_enriched'")
                set_clauses.append("attempts = 0")
            else:
                set_clauses.append("attempts = :attempts")
                params['attempts'] = new_attempts
                # Three failed passes -> hand over to a human.
                if new_attempts >= 3:
                    set_clauses.append("status = 'manual_review_needed'")

            set_clauses.append("updated_at = NOW()")
            query = text(f"UPDATE vehicle.vehicle_model_definitions SET {', '.join(set_clauses)} WHERE id = :vehicle_id")

            # Execute and commit immediately so progress survives crashes.
            await db.execute(query, params)
            await db.commit()
            updated_count += 1

        except Exception as e:
            logger.error(f"❌ DB Mentési hiba az {vehicle_id} járműnél: {e}")
            await db.rollback()  # Drop only this problematic vehicle.
            continue

    return updated_count
||||
@classmethod
async def run(cls):
    """Main worker loop: wait for the DB, then enrich candidates forever."""
    logger.info("🚀 Robot 2.1 (RDW) indítása...")

    # --- DNS and connection guard: block until the database answers,
    # so a slow-starting Docker network does not kill the worker. ---
    db_ready = False
    while not db_ready:
        try:
            async with AsyncSessionLocal() as db:
                await db.execute(text("SELECT 1"))
                db_ready = True
                logger.info("✅ Adatbázis elérhető, indul a munka!")
        except Exception as e:
            logger.warning(f"⏳ Várakozás az adatbázisra (DNS/Hálózat hiba): {e}")
            await asyncio.sleep(5)

    while True:
        try:
            async with AsyncSessionLocal() as db:
                vehicles = await cls.fetch_candidates(db)
                if not vehicles:
                    # Nothing to do; idle briefly before polling again.
                    await asyncio.sleep(10)
                    continue

                # Process the whole batch concurrently over one HTTP client.
                async with httpx.AsyncClient(timeout=15.0) as client:
                    tasks = [cls.process_vehicle(v, client) for v in vehicles]
                    results = await asyncio.gather(*tasks)

                updates_list = []
                for vehicle, updates, error in results:
                    if updates:
                        updates_list.append((vehicle['id'], updates))
                        if updates.get('_is_gold_ready'):
                            logger.info(f"✨ ARANY: {vehicle['make']} {vehicle['marketing_name']}")
                    else:
                        # Failed enrichment: bump the attempt counter.
                        # NOTE(review): no explicit commit follows this
                        # execute — confirm the session commits on exit,
                        # otherwise the increment is lost.
                        await db.execute(
                            text("UPDATE vehicle.vehicle_model_definitions SET attempts = attempts + 1, updated_at = NOW() WHERE id = :id"),
                            {"id": vehicle['id']}
                        )

                if updates_list:
                    await cls.update_vehicle_batch(db, updates_list)

            await asyncio.sleep(2)
        except Exception as e:
            logger.error(f"⚠️ Hiba a főciklusban: {e}")
            await asyncio.sleep(5)
||||
# Entry point: start the enricher's main loop (RDWEnricher is defined
# earlier in this file, above the visible chunk).
if __name__ == "__main__":
    asyncio.run(RDWEnricher.run())
427
backend/app/workers/vehicle/vehicle_robot_2_1_ultima_scout.py
Normal file
427
backend/app/workers/vehicle/vehicle_robot_2_1_ultima_scout.py
Normal file
@@ -0,0 +1,427 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import urllib.parse
|
||||
import sys
|
||||
import signal
|
||||
import re
|
||||
from datetime import datetime
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# R2.3 - SENTINEL (Hardened, Drill-Up/Drill-Down & Omnivorous Parser Edition)
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R2.3-SENTINEL] %(message)s')
|
||||
logger = logging.getLogger("R2.3")
|
||||
|
||||
# --- 1. FILTERS AND BLOCKLISTS ---
# Makes that are trailers, caravans, agricultural or heavy-goods brands —
# the scout skips these because Ultimate Specs will not have car data.
JUNK_LIST = [
    'SARIS', 'ANSSEMS', 'HAPERT', 'HUMBAUR', 'EDUARD', 'IFOR WILLIAMS', 'FENDT',
    'HOBBY', 'ADRIA', 'PEECON', 'JAKO', 'KAWECO', 'POTTINGER', 'BOCKMANN',
    'JOHN DEERE', 'CLAAS', 'IVECO', 'SCANIA', 'MAN', 'DAF', 'KNAUS', 'PÖSSL',
    'HYMER', 'WESTFALIA', 'AGM', 'BRENDERUP', 'STEMA', 'DEBON', 'TEMARED',
    'MARTZ', 'NIEWIADOW', 'ZASLAW'
]

# --- 2. TRANSLATIONS ---
# German/Dutch model-name fragments -> English, applied before searching.
TRANSLATIONS = {
    "3ER REIHE": "3 Series", "5ER REIHE": "5 Series", "1ER REIHE": "1 Series", "7ER REIHE": "7 Series",
    "E-KLASSE": "E Class", "C-KLASSE": "C Class", "S-KLASSE": "S Class", "A-KLASSE": "A Class",
    "REIHE": "Series", "KLASSE": "Class", "BESTELWAGEN": "Van"
}
||||
|
||||
class RobotScout:
    """Playwright-based scout that enriches vehicle rows from Ultimate Specs.

    Strategy: search the site, drill up from a hit to its model generation,
    then drill down to harvest every variant page of that generation.
    """

    def __init__(self):
        # Desktop Chrome user agent so the site serves its normal layout.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        # Cooperative stop flag; checked at the top of the run() loop.
        self.running = True

    def clean_name(self, make, model):
        """Produce a standardized English display name for the model."""
        m = str(model).upper()
        for de, en in TRANSLATIONS.items():
            m = m.replace(de, en)
        # Strip a duplicated brand name (e.g. "VOLVO VOLVO V60").
        m = m.replace(make.upper(), "").strip()
        return f"{make} {m}"

    # Flexible keyword dictionary: DB column -> substrings to look for in
    # scraped spec-table labels (fuzzy match).
    FUZZY_MAPPING = {
        "power_kw": ["power", "horsepower", "output", "hp"],
        "engine_capacity": ["displacement", "capacity", "cm3", "cu-in"],
        "torque_nm": ["torque"],
        "max_speed": ["top speed", "maximum speed"],
        "curb_weight": ["curb weight", "weight"],
        "wheelbase": ["wheelbase"],
        "seats": ["seats", "num. of seats"]
    }

    def extract_fuzzy_metric(self, web_data: dict, keywords: list) -> str:
        """Return the first value whose key contains one of the keywords.

        Returns "" when nothing matches.
        """
        for key, val in web_data.items():
            k_lower = key.lower()
            for kw in keywords:
                if kw in k_lower:
                    return str(val)
        return ""

    def clean_number(self, val: str) -> int:
        """Extract the relevant integer from raw spec text.

        Handles combined power strings by preferring the kW figure;
        otherwise returns the first digit run. 0 on failure/empty.
        """
        if not val or val == "-" or val == "None": return 0
        try:
            val_lower = val.lower()
            if "kw" in val_lower:
                kw_match = re.search(r'(\d+)\s*kw', val_lower)
                if kw_match: return int(kw_match.group(1))

            # Strip thousands separators before taking the first number.
            nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
            return int(nums[0]) if nums else 0
        except:
            return 0

    async def _retry_with_backoff(self, func, max_attempts=3, base_delay=2, exception_message="Retry failed"):
        """Await *func* with retries and exponential backoff plus jitter.

        Re-raises the last exception after max_attempts failures.
        """
        for attempt in range(max_attempts):
            try:
                return await func()
            except Exception as e:
                if attempt == max_attempts - 1:
                    logger.error(f"{exception_message} ({max_attempts}. kísérlet után is): {str(e)[:100]}")
                    raise
                else:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    logger.warning(f"⚠️ Próba {attempt + 1} sikertelen: {str(e)[:50]}. Újrapróbálkozás {delay:.1f}mp múlva...")
                    await asyncio.sleep(delay)
        return None

    async def get_car_links(self, page, make, model, year, use_year=True):
        """Smart search: on a hit, drill up to the generation page, then
        drill down to collect all variant spec links of that generation."""
        clean_model = self.clean_name(make, model)
        search_query = f"{clean_model} {year}" if use_year else clean_model
        url = f"https://www.ultimatespecs.com/index.php?q={urllib.parse.quote(search_query)}"

        make_url_safe = str(make).replace(' ', '-').lower()
        model_keyword = str(model).strip().lower().split()[0] if str(model).strip() else ""

        # In-page JS that extracts every spec link AND generation link
        # from the current page, filtered by make (URL) and model (text).
        js_extractor = """
        (args) => {
            let targetMakeUrl = args.makeUrl;
            let targetModel = args.modelWord;
            let specs = [];
            let generations = [];
            let seenUrls = new Set();

            document.querySelectorAll('a').forEach(a => {
                let href = a.getAttribute('href') || '';
                let text = a.innerText.trim();
                let hrefLow = href.toLowerCase();
                let textLow = text.toLowerCase();

                if (hrefLow.includes('/car-specs/') || hrefLow.includes('/motorcycles-specs/')) {
                    // URL szintű Márka Szűrés!
                    if (hrefLow.includes('/' + targetMakeUrl + '/') || hrefLow.includes(targetMakeUrl + '-models')) {
                        // Modell Szűrés!
                        if (targetModel === '' || textLow.includes(targetModel) || hrefLow.includes(targetModel)) {
                            if (!seenUrls.has(href)) {
                                seenUrls.add(href);
                                if (hrefLow.endsWith('.html') && text.length > 1) {
                                    specs.push({ name: text, url: href });
                                } else if (href.includes('/M') && href.split('/').length >= 4) {
                                    // UltimateSpecs generáció linkek (pl. /car-specs/Jeep/M14489/Grand-Cherokee)
                                    generations.push({ name: text, url: href });
                                }
                            }
                        }
                    }
                }
            });
            return { specs: specs, generations: generations };
        }
        """

        async def _fetch_links():
            logger.info(f"🔎 KERESÉS: {search_query}")
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)

            data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})

            # --- CASE 1: the search redirected straight to one spec page ---
            if page.url.endswith('.html') and f"/{make_url_safe}/" in page.url.lower():
                logger.info("🎯 Direkt találat! Lépjünk VISSZA 1 szintet a teljes kategóriáért (Drill-Up)...")
                if data['generations']:
                    gen_url = data['generations'][0]['url']
                    if not gen_url.startswith('http'): gen_url = "https://www.ultimatespecs.com" + gen_url

                    logger.info(f"📂 Visszalépés ide: {gen_url}")
                    await page.goto(gen_url, wait_until="domcontentloaded", timeout=30000)
                    await asyncio.sleep(2)

                    gen_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                    return gen_data['specs']
                else:
                    # No generation link found: fall back to this one page.
                    return [{"name": await page.title(), "url": page.url}]

            # --- CASE 2: a list of search hits was returned ---
            if data['specs']:
                first_spec_url = data['specs'][0]['url']
                if not first_spec_url.startswith('http'): first_spec_url = "https://www.ultimatespecs.com" + first_spec_url

                logger.info(f"🕵️ Találatok megvannak. Belépés az első autóba, hogy megtaláljuk a Generációját: {first_spec_url}")
                await page.goto(first_spec_url, wait_until="domcontentloaded", timeout=30000)
                await asyncio.sleep(2)

                spec_page_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})

                if spec_page_data['generations']:
                    gen_url = spec_page_data['generations'][0]['url']
                    if not gen_url.startswith('http'): gen_url = "https://www.ultimatespecs.com" + gen_url

                    logger.info(f"📂 Generáció megtalálva! Visszalépés, hogy leszüreteljük a teljes családot: {gen_url}")
                    await page.goto(gen_url, wait_until="domcontentloaded", timeout=30000)
                    await asyncio.sleep(2)

                    final_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                    if final_data['specs']:
                        return final_data['specs']

                # Rare: no generation link — return the raw search hits.
                return data['specs']

            # --- CASE 3: the search landed directly on a generation page ---
            if not data['specs'] and data['generations']:
                gen_url = data['generations'][0]['url']
                if not gen_url.startswith('http'): gen_url = "https://www.ultimatespecs.com" + gen_url

                logger.info(f"📂 A keresés közvetlenül egy Kategóriát dobott ki. Belépés: {gen_url}")
                await page.goto(gen_url, wait_until="domcontentloaded", timeout=30000)
                await asyncio.sleep(2)
                final_data = await page.evaluate(js_extractor, {"makeUrl": make_url_safe, "modelWord": model_keyword})
                return final_data['specs']

            # Fallback: retry the whole search without the year.
            if not data['specs'] and use_year:
                logger.info(" ↳ Nincs találat évszámmal, próbálkozom évszám nélkül...")
                return await self.get_car_links(page, make, model, year, use_year=False)

            return data['specs']

        try:
            variants = await self._retry_with_backoff(
                _fetch_links,
                max_attempts=3,
                base_delay=2,
                exception_message=f"❌ Hálózati hiba a linkek keresésekor: {url}"
            )
            return variants if variants is not None else []
        except Exception as e:
            logger.error(f"❌ Keresési hiba (végleges): {str(e)[:50]}")
            return []

    async def scrape_car_details(self, page, url):
        """Omnivorous parser: harvest every key/value table on the page.

        Returns the scraped dict (with a nested '_sections' key), or None
        after all retries fail.
        """
        async def _scrape():
            await page.goto(url, wait_until="networkidle", timeout=30000)

            full_specs = await page.evaluate("""
                () => {
                    let results = {};

                    document.querySelectorAll('table').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let cells = row.querySelectorAll('td, th');
                            if(cells.length >= 2) {
                                let k = cells[0].innerText.replace(/:/g,'').trim().toLowerCase();
                                let v = cells[1].innerText.trim();
                                if(k && v && v !== "-") {
                                    results[k] = v;
                                }
                            }
                        });
                    });

                    const sections = {};
                    document.querySelectorAll('h2, h3, h4, .section-title, .specs-header').forEach(header => {
                        const title = header.innerText.trim();
                        if (title && title.length > 0) {
                            let nextElement = header.nextElementSibling;
                            let sectionData = {};
                            for (let i = 0; i < 5 && nextElement; i++) {
                                if (nextElement.tagName === 'TABLE') {
                                    nextElement.querySelectorAll('tr').forEach(row => {
                                        let cells = row.querySelectorAll('td');
                                        if(cells.length >= 2) {
                                            let k = cells[0].innerText.replace(/:/g,'').trim().toLowerCase();
                                            let val = cells[1].innerText.trim();
                                            if(k && val && val !== "-") {
                                                sectionData[k] = val;
                                                results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
                                            }
                                        }
                                    });
                                }
                                nextElement = nextElement.nextElementSibling;
                            }
                            sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
                        }
                    });

                    results['_sections'] = sections;
                    return results;
                }
            """)
            return full_specs

        try:
            logger.info(f"🌐 Scraping: {url}")
            full_specs = await self._retry_with_backoff(
                _scrape,
                max_attempts=3,
                base_delay=2,
                exception_message=f"❌ Scrape hiba az oldalon: {url}"
            )
            return full_specs
        except Exception as e:
            logger.error(f"❌ Scrape hiba (végleges): {str(e)[:100]}...")
            return None

    async def run(self):
        """Main loop: pick one pending row, enrich it, store its variants."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=self.user_agent)
            page = await context.new_page()

            while self.running:
                # Polite crawl delay at the top of every cycle.
                wait = random.uniform(3, 6)
                logger.info(f"💤 Várakozás {wait:.1f} mp...")
                await asyncio.sleep(wait)

                async with AsyncSessionLocal() as db:
                    # Highest-priority pending car/motorcycle not on the blocklist.
                    target = (await db.execute(text("""
                        SELECT id, make, marketing_name, year_from FROM vehicle.vehicle_model_definitions
                        WHERE status IN ('pending', 'manual_review_needed')
                        AND (vehicle_class IN ('car', 'motorcycle') OR vehicle_class IS NULL)
                        AND NOT (UPPER(make) = ANY(:junks))
                        ORDER BY priority_score DESC LIMIT 1
                    """), {"junks": JUNK_LIST})).fetchone()

                    if not target:
                        logger.info("✨ Minden tétel feldolgozva.")
                        break

                    t_id, make, model, year = target
                    logger.info(f"🚀 CÉLPONT: {make} {model} ({year}) [ID: {t_id}]")

                    try:
                        links = await self.get_car_links(page, make, model, year)
                    except Exception as e:
                        logger.error(f"❌ Hálózati hiba linkek lekérésekor: {str(e)[:100]}")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_network' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        continue

                    if not links:
                        logger.warning(f"❌ Nem található adatlap a '{make} {model}' típushoz. research_failed_empty rögzítése.")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_empty', updated_at=NOW() WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        continue

                    # --- 1. Enrich the target row from the first link ---
                    first_link = links[0]
                    full_url = first_link['url'] if first_link['url'].startswith('http') else f"https://www.ultimatespecs.com{first_link['url']}"
                    logger.info(f"⚡ Azonnali adatgyűjtés a letöltött listából: {full_url}")

                    web_data = await self.scrape_car_details(page, full_url)
                    is_enriched = False

                    if web_data is None:
                        logger.error(f"❌ Scraping sikertelen minden próbálkozás után. research_failed_parsing rögzítése.")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_parsing' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        web_data = {}

                    elif len(web_data) >= 5:
                        # Enough scraped fields: map them via the fuzzy keywords.
                        updates = {}
                        for key, keywords in self.FUZZY_MAPPING.items():
                            raw_val = self.extract_fuzzy_metric(web_data, keywords)
                            updates[key] = self.clean_number(raw_val)

                        fuel_type = self.extract_fuzzy_metric(web_data, ["fuel type", "fuel"]) or 'Unknown'
                        transmission = self.extract_fuzzy_metric(web_data, ["transmission", "gearbox"]) or 'Unknown'
                        body_type = self.extract_fuzzy_metric(web_data, ["body", "type"]) or 'Unknown'
                        drive_type = self.extract_fuzzy_metric(web_data, ["drive", "traction"]) or 'Unknown'

                        power_kw = updates.get('power_kw', 0)
                        ccm = updates.get('engine_capacity', 0)

                        await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions
                            SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                torque_nm = :torque_nm, max_speed = :max_speed,
                                curb_weight = :curb_weight,
                                wheelbase = :wheelbase, seats = :seats,
                                fuel_type = :fuel_type, transmission_type = :transmission_type,
                                drive_type = :drive_type, body_type = :body_type,
                                specifications = specifications || :full_json,
                                status = 'awaiting_ai_synthesis', updated_at = NOW()
                            WHERE id = :id
                        """), {
                            **updates,
                            "id": t_id,
                            "fuel_type": fuel_type,
                            "transmission_type": transmission,
                            "drive_type": drive_type,
                            "body_type": body_type,
                            "full_json": json.dumps(web_data)
                        })
                        is_enriched = True
                        logger.info(f"✅ SIKERES DÚSÍTÁS: {make} {model} ({power_kw} kW, {ccm} ccm) -> Awaiting AI")
                    else:
                        logger.warning("⚠️ Scraping kevés adatot talált, csak a linkeket mentjük.")

                    # --- 2. Store each variant link as a new row for robot R3 ---
                    added = 0
                    for l in links:
                        v_url = l['url'] if l['url'].startswith('http') else f"https://www.ultimatespecs.com{l['url']}"

                        # Dedup by the stored source URL.
                        check = (await db.execute(text("SELECT id FROM vehicle.vehicle_model_definitions WHERE raw_api_data->>'url' = :u"), {"u": v_url})).fetchone()

                        if not check:
                            normalized = l['name'].lower().replace(' ', '_').replace('-', '_').replace('.', '').replace(',', '')[:200]
                            await db.execute(text("""
                                INSERT INTO vehicle.vehicle_model_definitions
                                (make, marketing_name, normalized_name, year_from, status,
                                 raw_api_data, priority_score, source, market,
                                 technical_code, variant_code, version_code,
                                 specifications, marketing_name_aliases, raw_search_context)
                                VALUES (:make, :name, :normalized, :year, 'awaiting_ai_synthesis',
                                        :raw, 30, 'ultimatespecs', 'EU',
                                        'UNKNOWN', 'UNKNOWN', 'UNKNOWN',
                                        '{}'::jsonb, '[]'::jsonb, '')
                            """), {
                                "make": make, "name": l['name'], "normalized": normalized,
                                "year": year, "raw": json.dumps({"url": v_url})
                            })
                            added += 1

                    if not is_enriched:
                        # Target itself stayed unenriched; mark it as expanded.
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='expanded_to_variants', updated_at=NOW() WHERE id=:id"), {"id": t_id})

                    await db.commit()
                    logger.info(f"✅ Variációk kezelve: {added} új rekord.")

            await browser.close()
|
||||
# Entry point: run the scout until done or until SIGINT flips the
# cooperative `running` flag and exits.
if __name__ == "__main__":
    scout = RobotScout()

    def stop_signal(sig, frame):
        # Graceful shutdown handler for Ctrl+C / docker stop (SIGINT).
        logger.info("🛑 LEÁLLÍTÁS (Kérés érzékelve)...")
        scout.running = False
        sys.exit(0)

    signal.signal(signal.SIGINT, stop_signal)

    try:
        asyncio.run(scout.run())
    except KeyboardInterrupt:
        pass
|
||||
@@ -0,0 +1,387 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import urllib.parse
|
||||
import sys
|
||||
import signal
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# R2.3 - SENTINEL (Hardened & Obedient Edition)
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R2.3-SENTINEL] %(message)s')
|
||||
logger = logging.getLogger("R2.3")
|
||||
|
||||
# --- 1. FILTERS AND BLOCKLISTS ---
# Only look up types that are not trailers or working machines.
JUNK_LIST = [
    'SARIS', 'ANSSEMS', 'HAPERT', 'HUMBAUR', 'EDUARD', 'IFOR WILLIAMS', 'FENDT',
    'HOBBY', 'ADRIA', 'PEECON', 'JAKO', 'KAWECO', 'POTTINGER', 'BOCKMANN',
    'JOHN DEERE', 'CLAAS', 'IVECO', 'SCANIA', 'MAN', 'DAF', 'KNAUS', 'PÖSSL', 'HYMER', 'WESTFALIA'
]

# --- 2. TRANSLATIONS (DE/NL -> EN) ---
# Applied to model names before searching Ultimate Specs.
TRANSLATIONS = {
    "3ER REIHE": "3 Series", "5ER REIHE": "5 Series", "1ER REIHE": "1 Series", "7ER REIHE": "7 Series",
    "E-KLASSE": "E Class", "C-KLASSE": "C Class", "S-KLASSE": "S Class", "A-KLASSE": "A Class",
    "REIHE": "Series", "KLASSE": "Class", "BESTELWAGEN": "Van"
}
||||
|
||||
class RobotScout:
|
||||
def __init__(self):
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
||||
self.running = True
|
||||
|
||||
def clean_name(self, make, model):
    """Build a standardized English display name for a model.

    Applies the module-level DE/NL -> EN TRANSLATIONS table, then strips
    a duplicated brand name (e.g. "VOLVO VOLVO V60" -> "VOLVO V60").
    """
    normalized = model.upper()
    for foreign in TRANSLATIONS:
        normalized = normalized.replace(foreign, TRANSLATIONS[foreign])
    # Drop the make if it is repeated inside the model string.
    normalized = normalized.replace(make.upper(), "").strip()
    return f"{make} {normalized}"
|
||||
# --- COLUMN MAPPING for scraping ---
# Ultimate Specs row label -> DB column name.
# NOTE(review): not referenced by any method visible in this chunk —
# confirm it is used elsewhere before relying on or removing it.
COLUMN_MAPPING = {
    "horsepower": "power_kw",
    "engine displacement": "engine_capacity",
    "maximum torque": "torque_nm",
    "top speed": "max_speed",
    "curb weight": "curb_weight",
    "wheelbase": "wheelbase",
    "num. of seats": "seats"
}
||||
|
||||
def clean_number(self, val: str, key: str = "") -> int:
    """Extract the relevant integer from a raw spec string.

    Combined power strings ("184 hp / 135 kW") prefer the kW figure;
    otherwise the first digit run is returned after stripping thousands
    separators. Returns 0 for empty/placeholder/unparseable input.

    Args:
        val: Raw text from a spec table cell.
        key: Unused; kept for interface compatibility with callers.
    """
    if not val or val == "-": return 0
    try:
        if "hp" in val.lower() or "kw" in val.lower():
            kw_match = re.search(r'(\d+)\s*kw', val.lower())
            if kw_match: return int(kw_match.group(1))
        nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
        return int(nums[0]) if nums else 0
    except (TypeError, AttributeError, ValueError):
        # FIX: was a bare `except:`; catch only parse failures so real
        # errors (KeyboardInterrupt, SystemExit) propagate.
        return 0
|
||||
async def get_car_links(self, page, make, model, year, use_year=True):
    """Collect all car spec links for a model, with year-less fallback
    and network retry logic."""
    clean_model = self.clean_name(make, model)
    search_query = f"{clean_model} {year}" if use_year else clean_model
    url = f"https://www.ultimatespecs.com/index.php?q={urllib.parse.quote(search_query)}"

    logger.info(f"🔎 KERESÉS: {search_query}")

    async def _fetch_links():
        await page.goto(url, wait_until="domcontentloaded", timeout=25000)

        # 1. The search redirected us straight onto a spec page.
        if any(x in page.url for x in ['/car-specs/', '/motorcycles-specs/']):
            logger.info("🎯 Direkt találat!")
            return [{"name": await page.title(), "url": page.url}]

        # 2. Let the result list render, then harvest the links in-page.
        await asyncio.sleep(2)
        variants = await page.evaluate("""
            () => {
                let results = [];
                document.querySelectorAll('a').forEach(a => {
                    let href = a.getAttribute('href') || '';
                    let text = a.innerText.trim();
                    // Csak technikai adatlapokat gyűjtünk, reklámokat/kategóriákat nem
                    if ((href.includes('/car-specs/') || href.includes('/motorcycles-specs/'))
                        && href.includes('.html') && text.length > 3) {
                        results.push({ name: text, url: href });
                    }
                });
                return results;
            }
        """)

        # 3. Fallback: no hits with the year -> retry without it.
        if not variants and use_year:
            logger.info(" ↳ Nincs találat évszámmal, próbálkozom évszám nélkül...")
            return await self.get_car_links(page, make, model, year, use_year=False)

        return variants

    try:
        variants = await self._retry_with_backoff(
            _fetch_links,
            max_attempts=3,
            base_delay=2,
            exception_message=f"❌ Hálózati hiba a(z) {url} oldalon"
        )
        return variants if variants is not None else []
    except Exception as e:
        logger.error(f"❌ Hálózati hiba (végleges): {str(e)[:50]}")
        return []
|
||||
async def _retry_with_backoff(self, func, max_attempts=3, base_delay=2,
|
||||
exception_message="Retry failed", retry_exceptions=True):
|
||||
"""Helper function for retry logic with exponential backoff."""
|
||||
for attempt in range(max_attempts):
|
||||
try:
|
||||
return await func()
|
||||
except Exception as e:
|
||||
if attempt == max_attempts - 1:
|
||||
logger.error(f"{exception_message} after {max_attempts} attempts: {str(e)[:100]}")
|
||||
raise
|
||||
else:
|
||||
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
|
||||
logger.warning(f"⚠️ Attempt {attempt + 1} failed: {str(e)[:50]}. Retrying in {delay:.1f}s...")
|
||||
await asyncio.sleep(delay)
|
||||
return None
|
||||
|
||||
async def scrape_car_details(self, page, url):
    """Scrape car specifications from a given Ultimate Specs URL with
    comprehensive data extraction and retry logic.

    Returns the scraped dict (including a nested '_sections' key), or
    None after all retries fail.
    """
    async def _scrape():
        await page.goto(url, wait_until="networkidle", timeout=30000)

        # Parsing all specification tables and sections in one in-page pass.
        full_specs = await page.evaluate("""
            () => {
                let results = {};

                // 1. Collect all specification tables (existing logic)
                document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                    table.querySelectorAll('tr').forEach(row => {
                        let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                        let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                        if(t && v) {
                            let k = t.innerText.replace(':','').trim().toLowerCase();
                            let val = v.innerText.trim();
                            if(k && val && val !== "-") results[k] = val;
                        }
                    });
                });

                // 2. Collect section headers and their content for additional technical data
                // Look for h2, h3, h4 elements that might contain section titles
                const sections = {};
                const headers = document.querySelectorAll('h2, h3, h4, .section-title, .specs-header');

                headers.forEach(header => {
                    const title = header.innerText.trim();
                    if (title && title.length > 0) {
                        // Find the next table or div with specs after this header
                        let nextElement = header.nextElementSibling;
                        let sectionData = {};

                        // Look for tables or lists in the next few siblings
                        for (let i = 0; i < 5 && nextElement; i++) {
                            if (nextElement.tagName === 'TABLE') {
                                nextElement.querySelectorAll('tr').forEach(row => {
                                    let t = row.querySelector('td:first-child');
                                    let v = row.querySelector('td:last-child');
                                    if(t && v) {
                                        let k = t.innerText.replace(':','').trim().toLowerCase();
                                        let val = v.innerText.trim();
                                        if(k && val && val !== "-") {
                                            sectionData[k] = val;
                                            // Also add to main results with section prefix
                                            results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
                                        }
                                    }
                                });
                            }
                            nextElement = nextElement.nextElementSibling;
                        }

                        sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
                    }
                });

                // 3. Extract specific known sections by looking for text patterns
                const pageText = document.body.innerText.toLowerCase();

                // Check for electric/hybrid sections
                if (pageText.includes('electric engine') || pageText.includes('battery')) {
                    // Try to find battery voltage, capacity, etc.
                    const batteryRegex = /battery\s*voltage[:\s]*([\d\.]+)\s*v/gi;
                    const match = batteryRegex.exec(document.body.innerText);
                    if (match) results['battery_voltage_v'] = match[1];
                }

                // 4. Extract dimensions data
                const dimensionPatterns = {
                    'wheelbase': /wheelbase[:\s]*([\d\.]+)\s*cm/gi,
                    'length': /length[:\s]*([\d\.]+)\s*cm/gi,
                    'width': /width[:\s]*([\d\.]+)\s*cm/gi,
                    'height': /height[:\s]*([\d\.]+)\s*cm/gi,
                    'curb_weight': /curb\s*weight[:\s]*([\d\.]+)\s*kg/gi,
                    'towing_capacity': /towing\s*capacity[:\s]*([\d\.]+)\s*kg/gi
                };

                for (const [key, regex] of Object.entries(dimensionPatterns)) {
                    const match = regex.exec(document.body.innerText);
                    if (match) results[key] = match[1];
                }

                // 5. Add sections data as a nested object
                results['_sections'] = sections;

                return results;
            }
        """)
        return full_specs

    try:
        logger.info(f"🌐 Scraping: {url}")
        full_specs = await self._retry_with_backoff(
            _scrape,
            max_attempts=3,
            base_delay=2,
            exception_message=f"❌ Scrape hiba a(z) {url} oldalon"
        )
        return full_specs
    except Exception as e:
        logger.error(f"❌ Scrape hiba (végleges): {str(e)[:100]}...")
        return None
|
||||
async def run(self):
    """Main scout loop: pick pending models, scrape ultimatespecs.com and
    fan each model out into per-variant records.

    Per round:
      1. strict random 3-6 s sleep (rate limiting toward the target site),
      2. select the highest-priority pending model (junk makes excluded),
      3. fetch its variant links; on failure mark research_failed_* and continue,
      4. scrape the first link to enrich the original row immediately,
      5. insert every variant link as a new definition row (deduped by URL),
      6. archive the original row when it was not enriched in step 4.

    Runs until ``self.running`` is cleared or no pending work remains.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=self.user_agent)
        page = await context.new_page()

        while self.running:
            # --- THE BRAKE: strict 3-6 s rest at the start of every round ---
            wait = random.uniform(3, 6)
            logger.info(f"💤 Várakozás {wait:.1f} mp...")
            await asyncio.sleep(wait)

            async with AsyncSessionLocal() as db:
                # Next unprocessed car (junk makes such as John Deere, Iveco excluded)
                target = (await db.execute(text("""
                    SELECT id, make, marketing_name, year_from FROM vehicle.vehicle_model_definitions
                    WHERE status IN ('pending', 'manual_review_needed')
                    AND NOT (make = ANY(:junks))
                    ORDER BY priority_score DESC LIMIT 1
                """), {"junks": JUNK_LIST})).fetchone()

                if not target:
                    logger.info("✨ Minden tétel feldolgozva.")
                    break

                t_id, make, model, year = target
                logger.info(f"🚀 CÉLPONT: {make} {model} ({year}) [ID: {t_id}]")

                try:
                    # NOTE(review): get_car_links is defined outside this view —
                    # assumed to return a list of {'url', 'name'} dicts; confirm.
                    links = await self.get_car_links(page, make, model, year)
                except Exception as e:
                    logger.error(f"❌ Hálózati hiba linkek lekérésekor: {str(e)[:100]}")
                    await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_network' WHERE id=:id"), {"id": t_id})
                    await db.commit()
                    continue

                if not links:
                    logger.warning(f"❌ Nem található adatlap. research_failed_empty rögzítése.")
                    await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_empty' WHERE id=:id"), {"id": t_id})
                    await db.commit()
                    continue

                # --- 1. SCRAPE THE FIRST LINK FOR IMMEDIATE ENRICHMENT ---
                first_link = None
                if links:
                    first_link = links[0]
                    # Relative links from the listing page need the site prefix.
                    full_url = first_link['url'] if first_link['url'].startswith('http') else f"https://www.ultimatespecs.com{first_link['url']}"
                    logger.info(f"⚡ Azonnali adatgyűjtés: {full_url}")
                    web_data = await self.scrape_car_details(page, full_url)

                    if web_data is None:
                        # Scraping failed after all retries
                        logger.error(f"❌ Scraping sikertelen minden próbálkozás után. research_failed_parsing rögzítése.")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='research_failed_parsing' WHERE id=:id"), {"id": t_id})
                        await db.commit()
                        # Continue to save links as variants anyway
                        web_data = {}
                    elif len(web_data) >= 5:
                        # Map scraped data to columns.
                        # NOTE(review): COLUMN_MAPPING is defined outside this view;
                        # assumed to cover the :torque_nm/:max_speed/:curb_weight/
                        # :wheelbase/:seats binds used below — confirm.
                        updates = {col: self.clean_number(web_data.get(k)) for k, col in self.COLUMN_MAPPING.items()}
                        # Also extract fuel_type, transmission, etc. if possible
                        fuel_type = web_data.get('fuel type', 'Unknown')
                        transmission_type = web_data.get('transmission', 'Unknown')
                        drive_type = web_data.get('drive type', 'Unknown')
                        body_type = web_data.get('body type', 'Unknown')
                        # NOTE(review): engine_capacity is read but only power_kw is
                        # used below (for logging); the SQL binds come from `updates`.
                        engine_capacity = updates.get('engine_capacity', 0)
                        power_kw = updates.get('power_kw', 0)

                        # Update the original record with scraped data
                        await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions
                            SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                torque_nm = :torque_nm, max_speed = :max_speed,
                                curb_weight = :curb_weight,
                                wheelbase = :wheelbase, seats = :seats,
                                fuel_type = :fuel_type, transmission_type = :transmission_type,
                                drive_type = :drive_type, body_type = :body_type,
                                specifications = specifications || :full_json,
                                status = 'awaiting_ai_synthesis', updated_at = NOW()
                            WHERE id = :id
                        """), {
                            **updates,
                            "id": t_id,
                            "fuel_type": fuel_type,
                            "transmission_type": transmission_type,
                            "drive_type": drive_type,
                            "body_type": body_type,
                            "full_json": json.dumps(web_data)
                        })
                        logger.info(f"✅ AZONNALI PUBLIKÁLÁS: {make} {model} ({power_kw} kW)")
                    else:
                        logger.warning("⚠️ Scraping kevés adatot talált, csak linkek mentve.")

                # --- 2. SAVE ALL LINKS AS NEW VARIANT RECORDS (including first if not enriched) ---
                added = 0
                for l in links:
                    full_url = l['url'] if l['url'].startswith('http') else f"https://www.ultimatespecs.com{l['url']}"

                    # FIX: dedupe on raw_api_data->>'url' because the table has no
                    # "source_url" column (avoids the previous column error).
                    check_query = text("SELECT id FROM vehicle.vehicle_model_definitions WHERE raw_api_data->>'url' = :u")
                    exists = (await db.execute(check_query, {"u": full_url})).fetchone()

                    if not exists:
                        # Create normalized name from marketing name
                        normalized = l['name'].lower().replace(' ', '_').replace('-', '_').replace('.', '').replace(',', '')[:200]

                        # NOTE(review): the "priority" bind below is not referenced
                        # by the statement (priority_score is hard-coded to 30).
                        await db.execute(text("""
                            INSERT INTO vehicle.vehicle_model_definitions
                            (make, marketing_name, normalized_name, year_from, status,
                             raw_api_data, priority_score, source, market,
                             technical_code, variant_code, version_code,
                             specifications, marketing_name_aliases, raw_search_context)
                            VALUES (:make, :name, :normalized, :year, 'awaiting_ai_synthesis',
                                    :raw, 30, 'ultimatespecs', 'EU',
                                    'UNKNOWN', 'UNKNOWN', 'UNKNOWN',
                                    '{}'::jsonb, '[]'::jsonb, '')
                        """), {
                            "make": make, "name": l['name'], "normalized": normalized,
                            "year": year, "raw": json.dumps({"url": full_url}), "priority": 30
                        })
                        added += 1

                # Archive the original record (if it was not published above)
                if not web_data:
                    await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status='expanded_to_variants', updated_at=NOW() WHERE id=:id"), {"id": t_id})

                await db.commit()
                logger.info(f"✅ SIKER: {added} új variáció mentve. R4-R5 robotok értesítve.")

        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    scout = RobotScout()

    # Handle CTRL+C: clear the run flag so the loop can stop, then exit.
    # NOTE(review): relies on module-level `signal`/`sys` imports — confirm.
    def stop_signal(sig, frame):
        logger.info("🛑 LEÁLLÍTÁS (Kérés érzékelve)...")
        scout.running = False
        sys.exit(0)

    signal.signal(signal.SIGINT, stop_signal)

    try:
        asyncio.run(scout.run())
    except KeyboardInterrupt:
        pass
|
||||
156
backend/app/workers/vehicle/vehicle_robot_2_auto_data_net.py
Normal file
156
backend/app/workers/vehicle/vehicle_robot_2_auto_data_net.py
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R2-MASTER] %(message)s')
|
||||
logger = logging.getLogger("R2-AutoData")
|
||||
|
||||
class AutoDataMaster:
    """Crawler for auto-data.net.

    Walks the site hierarchy brands -> models -> generations -> engine pages
    and upserts each engine's spec sheet into
    ``vehicle.external_reference_library`` (keyed on source URL).
    """

    def __init__(self):
        self.base_url = "https://www.auto-data.net"

    def clean_key(self, key):
        """Normalize an auto-data.net spec-row label into a short capitalized key.

        Keeps only the part after the last comma and strips the site's
        question-style prefixes ("What is the ...?", "How much ...?", ...).
        """
        if "," in key:
            key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        key = key.split("?")[0].strip()
        return key.capitalize()

    async def get_soup(self, page, url):
        """Fetch *url* on the shared Playwright page and return a BeautifulSoup tree.

        Sleeps a random 2-5 s first to stay under the site's rate limits.
        """
        delay = random.uniform(2, 5)
        await asyncio.sleep(delay)
        # Wait until the network settles for a more reliable full page load.
        await page.goto(url, wait_until="networkidle", timeout=60000)
        content = await page.content()
        return BeautifulSoup(content, 'html.parser')

    async def scrape_engine_details(self, page, url):
        """Scrape one engine/modification page into a flat record dict.

        Returns None on any scraping error so the caller can skip the page.
        """
        try:
            soup = await self.get_soup(page, url)
            data = {
                "make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "year_to": None, "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url
            }
            # The spec sheet is a plain <tr><th>label</th><td>value</td> table.
            rows = soup.find_all('tr')
            for row in rows:
                th, td = row.find('th'), row.find('td')
                if not th or not td:
                    continue
                raw_k, val = th.get_text(strip=True), td.get_text(strip=True)
                k_low = raw_k.lower()
                if "brand" == k_low:
                    data["make"] = val
                elif "model" == k_low:
                    data["model"] = val
                elif "generation" == k_low:
                    data["generation"] = val
                elif "modification" == k_low:
                    data["modification"] = val
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', val)
                    if m:
                        data["year_from"] = int(m.group(1))
                elif "end of production" in k_low:
                    m = re.search(r'(\d{4})', val)
                    if m:
                        data["year_to"] = int(m.group(1))
                elif "power" == k_low:
                    hp_m = re.search(r'(\d+)\s*Hp', val, re.I)
                    if hp_m:
                        # Rough Hp -> kW conversion (1 kW ~ 1.36 Hp).
                        data["power_kw"] = int(int(hp_m.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc_m = re.search(r'(\d+)\s*cm3', val)
                    if cc_m:
                        data["engine_cc"] = int(cc_m.group(1))
                # Every labelled row also goes into the raw specifications map.
                clean_k = self.clean_key(raw_k)
                if clean_k and val:
                    data["specifications"][clean_k] = val
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None

    async def save_to_db(self, data):
        """Upsert one scraped record into vehicle.external_reference_library.

        Keyed on source_url; re-scrapes refresh the stored specifications.
        Records without a make are silently skipped.
        """
        if not data or not data["make"]:
            return
        async with AsyncSessionLocal() as db:
            try:
                await db.execute(text("""
                    INSERT INTO vehicle.external_reference_library
                    (source_name, make, model, generation, modification, year_from, year_to, power_kw, engine_cc, specifications, source_url)
                    VALUES ('auto-data.net', :make, :model, :gen, :mod, :y_f, :y_t, :p_kw, :e_cc, :specs, :url)
                    ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                """), {
                    "make": data["make"], "model": data["model"], "gen": data["generation"],
                    "mod": data["modification"], "y_f": data["year_from"], "y_t": data["year_to"],
                    "p_kw": data["power_kw"], "e_cc": data["engine_cc"],
                    "specs": json.dumps(data["specifications"]), "url": data["source_url"]
                })
                await db.commit()
                logger.info(f"✅ MENTVE: {data['make']} {data['model']} {data['modification']}")
            except Exception as e:
                # FIX: roll back the failed transaction — without this the async
                # session is left in an aborted state and later statements on it
                # would also fail.
                await db.rollback()
                logger.error(f"DB Hiba: {e}")

    async def crawl(self):
        """Full site walk: brands -> models -> generations -> engine pages."""
        logger.info("🚀 Porszívózás indul...")
        async with async_playwright() as p:
            # Realistic user agent and viewport to reduce bot detection.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                viewport={'width': 1920, 'height': 1080}
            )
            page = await context.new_page()

            # LEVEL 1: brand list.
            logger.info(f"Szint 1: Márkák betöltése...")
            soup = await self.get_soup(page, f"{self.base_url}/en/allbrands")

            # Brand links carry class 'brandi' (or 'brand' on some layouts).
            brand_elements = soup.select('a.brandi') or soup.select('a.brand')
            brand_links = []
            for a in brand_elements:
                href = a.get('href')
                if href and 'brand' in href:
                    full_url = href if href.startswith('http') else f"{self.base_url}/{href.lstrip('/')}"
                    brand_links.append(full_url)

            if not brand_links:
                logger.error(f"❌ 0 márkát találtam! Oldalcím: {soup.title.string if soup.title else 'Nincs'}")
                # Debug: log the start of the HTML to see what was actually served.
                logger.info(f"HTML debug (első 500 karakter): {str(soup)[:500]}")
                await browser.close()
                return

            logger.info(f"🎯 Talált márkák: {len(brand_links)}")

            for b_link in brand_links:
                try:
                    logger.info(f"Szint 2: Modellek keresése itt: {b_link}")
                    soup = await self.get_soup(page, b_link)
                    # Model links carry class 'modeli'.
                    model_links = [self.base_url + '/' + a['href'].lstrip('/') for a in soup.select('a.modeli')]

                    logger.info(f" -> {len(model_links)} modellt találtam.")

                    for m_link in model_links:
                        logger.info(f"Szint 3: Generációk itt: {m_link}")
                        soup = await self.get_soup(page, m_link)
                        # Generation links carry class 'generation'.
                        gen_links = [self.base_url + '/' + a['href'].lstrip('/') for a in soup.select('a.generation')]

                        for g_link in gen_links:
                            logger.info(f"Szint 4: Motorváltozatok itt: {g_link}")
                            soup = await self.get_soup(page, g_link)
                            # Engine/modification links carry class 'car_specs'.
                            engine_links = [self.base_url + '/' + a['href'].lstrip('/') for a in soup.select('a.car_specs')]

                            for e_link in engine_links:
                                data = await self.scrape_engine_details(page, e_link)
                                if data:
                                    await self.save_to_db(data)
                except Exception as e:
                    # One broken brand must not stop the whole crawl.
                    logger.error(f"Hiba a folyamatban: {e}")

            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one full crawl of auto-data.net.
    asyncio.run(AutoDataMaster().crawl())
|
||||
@@ -1,238 +1,203 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import logging
|
||||
import warnings
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text, update, func
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
|
||||
import httpx
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from duckduckgo_search import DDGS
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# MB 2.0 Szabvány naplózás
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] Robot-2-Researcher: %(message)s')
|
||||
logger = logging.getLogger("Vehicle-Robot-2-Researcher")
|
||||
# Figyelmeztetések némítása (a csomag átnevezése miatti zaj elkerülésére)
|
||||
warnings.filterwarnings("ignore", category=RuntimeWarning, module='duckduckgo_search')
|
||||
|
||||
class QuotaManager:
    """Strict daily request-limit tracker for paid/government APIs.

    The state is a tiny JSON file ({"date": "YYYY-MM-DD", "count": N}) so the
    counter survives container restarts; it resets when the date changes.
    """

    def __init__(self, service_name: str, daily_limit: int):
        self.service_name = service_name
        self.daily_limit = daily_limit
        self.state_file = f"/app/temp/.quota_{service_name}.json"
        self._ensure_file()

    def _ensure_file(self):
        """Create the state directory and a zeroed counter file if missing."""
        os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
        if os.path.exists(self.state_file):
            return
        fresh = {"date": datetime.now().strftime("%Y-%m-%d"), "count": 0}
        with open(self.state_file, 'w') as handle:
            json.dump(fresh, handle)

    def can_make_request(self) -> bool:
        """Consume one unit of today's quota.

        Returns False once the daily limit is reached; otherwise bumps the
        persisted counter and returns True. A date change resets the counter
        before the check.
        """
        with open(self.state_file, 'r') as handle:
            state = json.load(handle)

        today = datetime.now().strftime("%Y-%m-%d")
        if state["date"] != today:
            # A new day has started: the quota begins from zero.
            state = {"date": today, "count": 0}

        if state["count"] >= self.daily_limit:
            return False

        # Record the consumed unit before allowing the call.
        state["count"] += 1
        with open(self.state_file, 'w') as handle:
            json.dump(state, handle)
        return True
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R2-MASTER-EDITION] %(message)s'
|
||||
)
|
||||
logger = logging.getLogger("R2-Researcher")
|
||||
|
||||
class VehicleResearcher:
|
||||
"""
|
||||
Vehicle Robot 2.5: Sniper Researcher (Mesterlövész Adatgyűjtő)
|
||||
Célzott keresésekkel és strukturált aktakészítéssel dolgozik az AI kímélése érdekében.
|
||||
"""
|
||||
def __init__(self):
|
||||
self.max_attempts = 5
|
||||
self.search_timeout = 15.0
|
||||
def __init__(self, concurrency=5):
|
||||
# Egyszerre 5 böngésző fület kezelünk a sebesség érdekében
|
||||
self.semaphore = asyncio.Semaphore(concurrency)
|
||||
self.ollama_url = "http://sf_ollama:11434/api/generate"
|
||||
|
||||
# Kvóta menedzserek beállítása (.env-ből olvasva)
|
||||
dvla_limit = int(os.getenv("DVLA_DAILY_LIMIT", "1000"))
|
||||
self.dvla_quota = QuotaManager("dvla", dvla_limit)
|
||||
self.dvla_token = os.getenv("DVLA_API_KEY")
|
||||
# FORDÍTÓ SZÓTÁR: Holland RDW -> Nemzetközi keresési nevek
|
||||
self.translation_map = {
|
||||
"ER REIHE": "Series",
|
||||
"T-MODELL": "Estate",
|
||||
"KLASSE": "Class",
|
||||
"PERSONENAUTO": "Car",
|
||||
"STATIONWAGEN": "Estate",
|
||||
"MERCEDES-BENZ": "Mercedes",
|
||||
"Vrachtwagen": "Truck",
|
||||
"Oplegger": "Trailer"
|
||||
}
|
||||
|
||||
async def fetch_ddg_targeted(self, label: str, query: str) -> str:
|
||||
""" Célzott keresés szálbiztosan a DuckDuckGo-n. """
|
||||
def clean_name(self, make, model):
|
||||
"""Lefordítja a holland modellneveket, hogy a Google/Bing megtalálja őket."""
|
||||
name = f"{make} {model}".upper()
|
||||
for dutch, eng in self.translation_map.items():
|
||||
name = name.replace(dutch, eng)
|
||||
return name.title()
|
||||
|
||||
async def get_url(self, make, model, year, kw):
|
||||
"""Keresés a DuckDuckGo-val. JAVÍTVA: 0kW fix és több találat."""
|
||||
clean_n = self.clean_name(make, model)
|
||||
|
||||
# Ha a kW 0, None vagy érvénytelen, kihagyjuk a keresésből a találati arány javítására
|
||||
kw_val = 0
|
||||
try:
|
||||
def search():
|
||||
if kw and str(kw).replace('.','').isdigit():
|
||||
kw_val = int(float(kw))
|
||||
except: pass
|
||||
|
||||
kw_part = f"{kw_val}kW" if kw_val > 0 else ""
|
||||
query = f"site:auto-data.net {clean_n} {year} {kw_part} specifications"
|
||||
|
||||
try:
|
||||
def _search():
|
||||
with DDGS() as ddgs:
|
||||
# max_results=2: Nem kell sok zaj, csak a legrelevánsabb 2 találat
|
||||
results = ddgs.text(query, max_results=2)
|
||||
return [f"- {r.get('body', '')}" for r in results] if results else []
|
||||
|
||||
results = await asyncio.wait_for(asyncio.to_thread(search), timeout=self.search_timeout)
|
||||
|
||||
if not results:
|
||||
return f"[SOURCE: {label}]\nNincs érdemi találat.\n"
|
||||
|
||||
content = f"[SOURCE: {label} | KERESÉS: {query}]\n"
|
||||
content += "\n".join(results) + "\n"
|
||||
return content
|
||||
# Megnézzük az első 3 találatot, hátha az első nem direkt link
|
||||
res = ddgs.search(query, max_results=3)
|
||||
return [r.get('link', r.get('href', '')) for r in res if 'auto-data.net' in r.get('link', r.get('href', ''))]
|
||||
|
||||
links = await asyncio.to_thread(_search)
|
||||
return links[0] if links else None
|
||||
except Exception as e:
|
||||
logger.debug(f"Keresési hiba ({label}): {e}")
|
||||
return f"[SOURCE: {label}]\nKERESÉSI HIBA.\n"
|
||||
logger.warning(f"Keresési hiba ({query}): {e}")
|
||||
return None
|
||||
|
||||
def extract_specs_from_text(self, text: str) -> dict:
|
||||
""" Regex alapú kinyerés a nyers szövegből: ccm, kW, motoradatok. """
|
||||
import re
|
||||
async def scrape_auto_data(self, url, browser):
|
||||
"""Letölti az oldalt és kinyeri az összes technikai adatot."""
|
||||
specs = {}
|
||||
|
||||
# CCM (köbcentiméter) minta: 1998 cc, 2.0 L, 2000 cm³
|
||||
ccm_pattern = r'(\d{3,4})\s*(?:cc|ccm|cm³|cm3|cc\.)'
|
||||
match = re.search(ccm_pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
specs['ccm'] = int(match.group(1))
|
||||
else:
|
||||
# Alternatív minta: 2.0 liter -> 2000 cc
|
||||
liter_pattern = r'(\d+\.?\d*)\s*(?:L|liter|ℓ)'
|
||||
match = re.search(liter_pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
liters = float(match.group(1))
|
||||
specs['ccm'] = int(liters * 1000)
|
||||
|
||||
# KW (kilowatt) minta: 150 kW, 150kW, 150 KW
|
||||
kw_pattern = r'(\d{2,4})\s*(?:kW|kw|KW)'
|
||||
match = re.search(kw_pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
specs['kw'] = int(match.group(1))
|
||||
else:
|
||||
# Le (lóerő) átváltás: 150 LE -> 110 kW (kb)
|
||||
hp_pattern = r'(\d{2,4})\s*(?:HP|hp|LE|le|Ps)'
|
||||
match = re.search(hp_pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
hp = int(match.group(1))
|
||||
specs['kw'] = int(hp * 0.7355) # hozzávetőleges átváltás
|
||||
|
||||
# Motor kód minta: motor kód: 1.8 TSI, engine code: N47
|
||||
engine_pattern = r'(?:motor\s*kód|engine\s*code|motor\s*code)[:\s]+([A-Z0-9\.\- ]+)'
|
||||
match = re.search(engine_pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
specs['engine_code'] = match.group(1).strip()
|
||||
|
||||
return specs
|
||||
|
||||
async def research_vehicle(self, db, vehicle_id: int, make: str, model: str, engine: str, year: str, current_attempts: int):
|
||||
""" Egy jármű átvilágítása és a strukturált 'Akta' elkészítése a GPU számára. """
|
||||
engine_safe = engine or ""
|
||||
year_safe = str(year) if year else ""
|
||||
|
||||
logger.info(f"🔎 Mesterlövész Kutatás: {make} {model} (Motor: {engine_safe})")
|
||||
|
||||
# 1. TIER: Ingyenes, Célzott Keresések (A legmegbízhatóbb források)
|
||||
queries = [
|
||||
("ULTIMATE_SPECS", f"{make} {model} {engine_safe} {year_safe} site:ultimatespecs.com"),
|
||||
("AUTO_DATA", f"{make} {model} {engine_safe} {year_safe} site:auto-data.net"),
|
||||
("COMMON_ISSUES", f"{make} {model} {engine_safe} reliability common problems")
|
||||
]
|
||||
|
||||
tasks = [self.fetch_ddg_targeted(label, q) for label, q in queries]
|
||||
search_results = await asyncio.gather(*tasks)
|
||||
|
||||
# 2. TIER: Fizetős / Kvótás API-k (Példa a DVLA helyére)
|
||||
# Ha a jövőben bejön brit rendszám, itt hívjuk meg a DVLA-t:
|
||||
# if has_uk_plate and self.dvla_quota.can_make_request():
|
||||
# uk_data = await self.fetch_dvla_data(plate)
|
||||
# search_results.append(uk_data)
|
||||
|
||||
# 3. ÖSSZESÍTÉS (Az Akta összeállítása)
|
||||
# Maximalizáljuk a szöveg hosszát, hogy az AI GPU ne fulladjon le!
|
||||
full_context = "\n".join(search_results)
|
||||
if len(full_context) > 2500:
|
||||
full_context = full_context[:2500] + "\n...[TRUNCATED TO SAVE GPU TOKENS]"
|
||||
|
||||
# Regex alapú specifikáció kinyerés
|
||||
extracted_specs = self.extract_specs_from_text(full_context)
|
||||
|
||||
full_text = ""
|
||||
try:
|
||||
if len(full_context.strip()) > 150: # Csökkentettük az elvárást, mert a célzott keresés tömörebb
|
||||
await db.execute(
|
||||
update(VehicleModelDefinition)
|
||||
.where(VehicleModelDefinition.id == vehicle_id)
|
||||
.values(
|
||||
raw_search_context=full_context,
|
||||
research_metadata=extracted_specs,
|
||||
status='awaiting_ai_synthesis', # Kész az Akta, mehet az Alkimistának!
|
||||
last_research_at=func.now(),
|
||||
attempts=current_attempts + 1
|
||||
)
|
||||
)
|
||||
logger.info(f"✅ Akta rögzítve ({len(full_context)} karakter): {make} {model}")
|
||||
else:
|
||||
new_status = 'suspended_research' if current_attempts + 1 >= self.max_attempts else 'unverified'
|
||||
await db.execute(
|
||||
update(VehicleModelDefinition)
|
||||
.where(VehicleModelDefinition.id == vehicle_id)
|
||||
.values(
|
||||
status=new_status,
|
||||
attempts=current_attempts + 1,
|
||||
last_research_at=func.now()
|
||||
)
|
||||
)
|
||||
if new_status == 'suspended_research':
|
||||
logger.warning(f"🛑 Felfüggesztve (Nincs nyom a weben): {make} {model}")
|
||||
else:
|
||||
logger.warning(f"⚠️ Kevés adat: {make} {model}, visszatéve a sorba.")
|
||||
page = await browser.new_page()
|
||||
# Gyorsítás: képek, videók és stíluslapok tiltása
|
||||
await page.route("**/*.{png,jpg,jpeg,gif,css,woff2}", lambda r: r.abort())
|
||||
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=20000)
|
||||
html = await page.content()
|
||||
# Kimentjük a tiszta szöveget is, ha az AI-nak kellene később
|
||||
full_text = await page.evaluate("() => document.body.innerText")
|
||||
await page.close()
|
||||
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
# Végigfutunk minden táblázat soron
|
||||
for row in soup.find_all('tr'):
|
||||
th = row.find('th')
|
||||
td = row.find('td')
|
||||
if th and td:
|
||||
k, v = th.get_text(strip=True).lower(), td.get_text(strip=True)
|
||||
|
||||
await db.commit()
|
||||
# Minden fontos mező kinyerése
|
||||
if "engine model/code" in k: specs["engine_code"] = v
|
||||
elif "engine oil capacity" in k: specs["oil_l"] = v
|
||||
elif "acceleration 0 - 100" in k: specs["acc_0_100"] = v
|
||||
elif "maximum speed" in k: specs["max_speed"] = v
|
||||
elif "fuel consumption" in k and "combined" in k: specs["cons_avg"] = v
|
||||
elif "co2 emissions" in k: specs["co2"] = v
|
||||
elif "generation" in k: specs["generation"] = v
|
||||
elif "tires size" in k: specs["tires"] = v
|
||||
elif "trunk (boot) space" in k: specs["trunk_l"] = v
|
||||
elif "kerb weight" in k: specs["weight_kg"] = v
|
||||
elif "drivetrain" in k: specs["drivetrain"] = v
|
||||
elif "number of gears" in k: specs["transmission"] = v
|
||||
|
||||
return specs, full_text
|
||||
except Exception as e:
|
||||
await db.rollback()
|
||||
logger.error(f"🚨 Adatbázis hiba az eredmény mentésénél ({vehicle_id}): {e}")
|
||||
logger.error(f"Scraping hiba az oldalon ({url}): {e}")
|
||||
return {}, ""
|
||||
|
||||
@classmethod
|
||||
async def run(cls):
|
||||
self_instance = cls()
|
||||
logger.info("🚀 Vehicle Researcher 2.5 ONLINE (Sniper & Quota Manager)")
|
||||
|
||||
while True:
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
# ATOMI ZÁROLÁS
|
||||
query = text("""
|
||||
UPDATE vehicle.vehicle_model_definitions
|
||||
SET status = 'research_in_progress'
|
||||
WHERE id = (
|
||||
SELECT id FROM vehicle.vehicle_model_definitions
|
||||
WHERE status IN ('unverified', 'awaiting_research', 'ACTIVE')
|
||||
AND attempts < :max_attempts
|
||||
AND is_manual = FALSE
|
||||
ORDER BY
|
||||
CASE WHEN make = 'TOYOTA' THEN 1 ELSE 2 END,
|
||||
attempts ASC
|
||||
FOR UPDATE SKIP LOCKED
|
||||
LIMIT 1
|
||||
)
|
||||
RETURNING id, make, marketing_name, engine_code, year_from, attempts;
|
||||
""")
|
||||
|
||||
result = await db.execute(query, {"max_attempts": self_instance.max_attempts})
|
||||
task = result.fetchone()
|
||||
await db.commit()
|
||||
async def ask_ai_fallback(self, raw_text):
    """Ask the local Ollama model to extract specs when no table was parsed.

    Best-effort: returns a dict of extracted specs, or {} on any ordinary
    failure (too little input, HTTP error, invalid JSON in the response).
    """
    # Too little text to be worth a GPU round-trip.
    if not raw_text or len(raw_text) < 200: return {}
    prompt = f"Extract vehicle specs (engine_code, oil_capacity, tires, generation) as JSON from this text: {raw_text[:2500]}"
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            r = await client.post(self.ollama_url, json={
                "model": "qwen2.5-coder:14b",
                "prompt": prompt,
                "stream": False,
                "format": "json"
            })
            return json.loads(r.json().get("response", "{}"))
    except Exception:
        # FIX: was a bare `except:`, which also swallows BaseExceptions such
        # as asyncio.CancelledError and breaks task cancellation. Keep the
        # best-effort empty-dict fallback for ordinary failures only.
        return {}
|
||||
|
||||
if task:
|
||||
v_id, v_make, v_model, v_engine, v_year, v_attempts = task
|
||||
async with AsyncSessionLocal() as process_db:
|
||||
await self_instance.research_vehicle(process_db, v_id, v_make, v_model, v_engine, v_year, v_attempts)
|
||||
|
||||
await asyncio.sleep(2) # Rate limit védelem a DDG felé
|
||||
async def process_vehicle(self, v_id, make, model, year, kw, browser):
|
||||
"""Egy jármű dúsításának teljes folyamata."""
|
||||
async with self.semaphore:
|
||||
logger.info(f"🔍 Kutatás: {make} {model} ({year}) | kW: {kw}")
|
||||
url = await self.get_url(make, model, year, kw)
|
||||
|
||||
specs = {}
|
||||
if url:
|
||||
logger.info(f"🔗 Találat: {url}")
|
||||
specs, raw_text = await self.scrape_auto_data(url, browser)
|
||||
|
||||
# Ha a táblázatból nem jött ki elég adat, jöhet az AI fallback
|
||||
if len(specs) < 3:
|
||||
ai_specs = await self.ask_ai_fallback(raw_text)
|
||||
specs.update(ai_specs)
|
||||
|
||||
# MENTÉS: Minden szál saját adatbázis kapcsolatot használ a biztonság érdekében
|
||||
async with AsyncSessionLocal() as db:
|
||||
# Csak akkor validation_ready, ha találtunk adatot. Ha nem, külön státuszba tesszük.
|
||||
new_status = 'validation_ready' if len(specs) > 0 else 'research_failed_empty'
|
||||
|
||||
update_query = text("""
|
||||
UPDATE vehicle.vehicle_model_definitions
|
||||
SET specifications = specifications || CAST(:specs AS JSONB),
|
||||
status = :status,
|
||||
last_research_at = now()
|
||||
WHERE id = :id
|
||||
""")
|
||||
await db.execute(update_query, {
|
||||
"specs": json.dumps(specs),
|
||||
"status": new_status,
|
||||
"id": v_id
|
||||
})
|
||||
await db.commit()
|
||||
|
||||
if len(specs) > 0:
|
||||
logger.info(f"✅ SIKER: {make} {model} ({len(specs)} adat kinyerve)")
|
||||
else:
|
||||
await asyncio.sleep(30)
|
||||
logger.warning(f"❌ SIKERTELEN: {make} {model} (nem találtunk adatot a neten)")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
|
||||
await asyncio.sleep(10)
|
||||
async def run(self):
|
||||
logger.info("🚀 R2-Kutató MASTER-EDITION (0kW fix + AI Fallback) ONLINE")
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
while True:
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
# 10 autó bekérése párhuzamos feldolgozásra
|
||||
res = await db.execute(text("""
|
||||
UPDATE vehicle.vehicle_model_definitions SET status = 'research_in_progress'
|
||||
WHERE id IN (
|
||||
SELECT id FROM vehicle.vehicle_model_definitions
|
||||
WHERE status = 'enrich_ready'
|
||||
LIMIT 10
|
||||
) RETURNING id, make, marketing_name, year_from, power_kw
|
||||
"""))
|
||||
rows = res.fetchall()
|
||||
await db.commit()
|
||||
|
||||
if not rows:
|
||||
await asyncio.sleep(15)
|
||||
continue
|
||||
|
||||
tasks = [self.process_vehicle(r[0], r[1], r[2], r[3], r[4], browser) for r in rows]
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
|
||||
await asyncio.sleep(10)
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(VehicleResearcher.run())
|
||||
except KeyboardInterrupt:
|
||||
logger.info("🛑 Kutató robot leállítva.")
|
||||
asyncio.run(VehicleResearcher().run())
|
||||
@@ -1,224 +1,232 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_3_alchemist_pro.py
|
||||
"""
|
||||
Robot 3: Alchemist Pro - AI Szintézis és Kapuőr
|
||||
Javítások:
|
||||
- Batch Size: 3 (Stabilitás a 14b modellhez)
|
||||
- Szigorú Gatekeeper (Arany státusz ellenőrzés)
|
||||
- Adatmegőrzés: Az AI nem bírálja felül a szótár alapú RDW adatokat (kW/ccm).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import datetime
|
||||
import random
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
from sqlalchemy import text, func, update, case
|
||||
import re
|
||||
from sqlalchemy import text, update, func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
import httpx
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle_definitions import VehicleModelDefinition
|
||||
from app.models.asset import AssetCatalog
|
||||
from app.services.ai_service import AIService
|
||||
from app.models import VehicleModelDefinition
|
||||
|
||||
# Worker-wide logging: the merged diff left two competing basicConfig calls
# and two logger names; keep the current revision's configuration only
# (the second basicConfig call was a no-op anyway once handlers existed).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] R3-Alchemist-Pro: %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger("Robot-3-Alchemist-Pro")
|
||||
|
||||
# Module-level configuration for the Alchemist worker.
# These names are read as bare globals by the AlchemistPro methods
# (call_ollama, fetch_vehicle_batch_for_processing, update_vehicle_record),
# so they must live at module scope. In the merged diff they were stranded
# as attributes of the removed TechEnricher class, which would have raised
# NameError at runtime.
OLLAMA_URL = "http://sf_ollama:11434/api/generate"
OLLAMA_MODEL = "qwen2.5-coder:14b"  # the 14b-parameter model is the brain
MAX_ATTEMPTS = 3
TIMEOUT_SECONDS = 45  # raised timeout for the slower 14b model's responses
BATCH_SIZE = 3        # at most 3 parallel AI calls, to avoid starving the CPU
|
||||
|
||||
class AlchemistPro:
    """Vehicle Robot 3: AI synthesis and gatekeeper.

    Merges researcher output with AI analysis, applies sanity checks, and
    promotes complete records toward 'gold' status. Keeps a daily AI-call
    budget so the Ollama backend is never hammered indefinitely.
    """

    def __init__(self):
        # Per-record retry budget before a row is sent to manual moderation.
        self.max_attempts = 5
        # Daily AI-call budget; overridable via the AI_DAILY_LIMIT env var.
        self.daily_ai_limit = int(os.getenv("AI_DAILY_LIMIT", "10000"))
        self.ai_calls_today = 0
        self.last_reset_date = datetime.date.today()
        # Shared HTTP client for the Ollama server (long timeout for the 14b model).
        self.client = httpx.AsyncClient(timeout=TIMEOUT_SECONDS)

    def check_budget(self) -> bool:
        """Return True while today's AI-call budget is not exhausted.

        The counter resets automatically on the first call of a new day.
        """
        if datetime.date.today() > self.last_reset_date:
            self.ai_calls_today = 0
            self.last_reset_date = datetime.date.today()
        return self.ai_calls_today < self.daily_ai_limit

    async def close(self):
        """Release the shared HTTP client."""
        await self.client.aclose()
|
||||
|
||||
def validate_merged_data(self, merged_kw: int, merged_ccm: int, v_class: str, fuel: str, current_attempts: int) -> tuple[bool, str]:
|
||||
""" Intelligens validáció a MERGE után. Visszaadja a státuszt és a hiba okát. """
|
||||
if merged_ccm > 18000:
|
||||
return False, f"Irreális CCM érték ({merged_ccm})"
|
||||
if merged_kw > 1500 and v_class != "truck":
|
||||
return False, f"Irreális KW érték ({merged_kw})"
|
||||
|
||||
# Ha hiányzik a KW
|
||||
if merged_kw == 0:
|
||||
if current_attempts < 3:
|
||||
return False, "Hiányzó KW adat. Újrakutatás javasolt."
|
||||
else:
|
||||
logger.warning("Sane-check: Többszöri próbálkozás után sincs KW, de átengedjük részlegesként.")
|
||||
|
||||
# Ha hiányzik a CCM (és belsőégésű)
|
||||
if merged_ccm == 0 and "electric" not in fuel and "elektric" not in fuel and v_class != "trailer":
|
||||
if current_attempts < 3:
|
||||
return False, "Hiányzó CCM (belsőégésű motornál). Újrakutatás javasolt."
|
||||
else:
|
||||
logger.warning("Sane-check: Többszöri próbálkozás után sincs CCM, átengedjük részlegesként.")
|
||||
|
||||
return True, "OK"
|
||||
|
||||
async def process_single_record(self, db, record_id: int, base_info: dict, current_attempts: int):
|
||||
# Pontos azonosító a logokhoz (Márka, Modell, ID, RDW adatok)
|
||||
v_ident = f"{base_info['make'].upper()} {base_info['m_name']} (ID: {record_id}, RDW: {base_info['rdw_ccm']}ccm, KW: {base_info['rdw_kw']})"
|
||||
attempt_str = f"[Próba: {current_attempts + 1}/{self.max_attempts}]"
|
||||
async def fetch_vehicle_batch_for_processing(self, db: AsyncSession):
|
||||
"""Kiválasztja azokat a járműveket, ahol a 2.1-es robot végzett, de még nem 'Arany'."""
|
||||
query = text("""
|
||||
SELECT id, make, marketing_name, power_kw, engine_capacity,
|
||||
fuel_type, raw_api_data, raw_search_context, attempts,
|
||||
vehicle_class, trim_level, transmission_type, body_type
|
||||
FROM vehicle.vehicle_model_definitions
|
||||
WHERE status = 'awaiting_ai_synthesis'
|
||||
AND attempts < :max_attempts
|
||||
AND is_manual = FALSE
|
||||
ORDER BY priority_score DESC NULLS LAST, id ASC
|
||||
FOR UPDATE SKIP LOCKED
|
||||
LIMIT :batch_size
|
||||
""")
|
||||
result = await db.execute(query, {"max_attempts": MAX_ATTEMPTS, "batch_size": BATCH_SIZE})
|
||||
rows = result.fetchall()
|
||||
|
||||
ai_data = {} # Üres dict, ha az AI hívás elszállna
|
||||
|
||||
try:
|
||||
logger.info(f"🧠 AI dúsítás indul: {v_ident} {attempt_str}")
|
||||
|
||||
# 1. LÉPÉS: AI Hívás (Rábízzuk az adatokat a modellre)
|
||||
ai_data = await AIService.get_clean_vehicle_data(
|
||||
base_info['make'],
|
||||
base_info['m_name'],
|
||||
base_info
|
||||
)
|
||||
|
||||
if not ai_data:
|
||||
raise ValueError("Teljesen üres AI válasz (API hiba vagy extrém hallucináció).")
|
||||
|
||||
# 2. LÉPÉS: HIBRID MERGE (Még a validáció előtt!)
|
||||
# Az RDW adatok felülbírálják az AI-t a hatósági paramétereknél
|
||||
final_kw = base_info['rdw_kw'] if base_info['rdw_kw'] > 0 else int(ai_data.get("kw", 0) or 0)
|
||||
final_ccm = base_info['rdw_ccm'] if base_info['rdw_ccm'] > 0 else int(ai_data.get("ccm", 0) or 0)
|
||||
|
||||
# Üzemanyag tisztítása
|
||||
fuel_rdw = base_info.get('rdw_fuel', '')
|
||||
final_fuel = fuel_rdw if fuel_rdw and fuel_rdw != "Unknown" else ai_data.get("fuel_type", "petrol")
|
||||
|
||||
final_engine = base_info['rdw_engine'] if base_info['rdw_engine'] else ai_data.get("engine_code", "Unknown")
|
||||
final_euro = base_info['rdw_euro'] or ai_data.get("euro_classification")
|
||||
final_cylinders = base_info['rdw_cylinders'] or ai_data.get("cylinders")
|
||||
|
||||
# 3. LÉPÉS: Intelligens Validáció
|
||||
is_valid, error_msg = self.validate_merged_data(final_kw, final_ccm, base_info['v_type'], final_fuel.lower(), current_attempts)
|
||||
if not is_valid:
|
||||
raise ValueError(f"Validációs hiba: {error_msg}")
|
||||
|
||||
# 4. LÉPÉS: Mentés az Arany Katalógusba
|
||||
clean_model = str(ai_data.get("marketing_name", base_info['m_name']))[:50].upper()
|
||||
|
||||
cat_stmt = text("""
|
||||
INSERT INTO vehicle.vehicle_catalog
|
||||
(master_definition_id, make, model, power_kw, engine_capacity, fuel_type, factory_data)
|
||||
VALUES (:m_id, :make, :model, :kw, :ccm, :fuel, :factory)
|
||||
ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full DO NOTHING
|
||||
RETURNING id;
|
||||
""")
|
||||
|
||||
await db.execute(cat_stmt, {
|
||||
"m_id": record_id,
|
||||
"make": base_info['make'].upper(),
|
||||
"model": clean_model,
|
||||
"kw": final_kw,
|
||||
"ccm": final_ccm,
|
||||
"fuel": final_fuel,
|
||||
"factory": json.dumps(ai_data)
|
||||
vehicles = []
|
||||
for row in rows:
|
||||
vehicles.append({
|
||||
"id": row[0], "make": row[1], "marketing_name": row[2],
|
||||
"power_kw": row[3] or 0, "engine_capacity": row[4] or 0,
|
||||
"fuel_type": row[5] or "Unknown", "raw_api_data": row[6] or {},
|
||||
"raw_search_context": row[7] or "", "attempts": row[8] or 0,
|
||||
"vehicle_class": row[9], "trim_level": row[10],
|
||||
"transmission_type": row[11], "body_type": row[12]
|
||||
})
|
||||
return vehicles
|
||||
|
||||
# 5. LÉPÉS: Staging tábla (VMD) lezárása
|
||||
await db.execute(
|
||||
update(VehicleModelDefinition)
|
||||
.where(VehicleModelDefinition.id == record_id)
|
||||
.values(
|
||||
status="gold_enriched",
|
||||
engine_capacity=final_ccm,
|
||||
power_kw=final_kw,
|
||||
fuel_type=final_fuel,
|
||||
engine_code=final_engine,
|
||||
euro_classification=final_euro,
|
||||
cylinders=final_cylinders,
|
||||
specifications=ai_data, # Elmentjük az AI teljes outputját a mestertáblába is
|
||||
updated_at=func.now()
|
||||
)
|
||||
)
|
||||
await db.commit()
|
||||
logger.info(f"✨ ARANY REKORD KÉSZ: {v_ident}")
|
||||
self.ai_calls_today += 1
|
||||
    def build_prompt(self, vehicle_data: dict) -> str:
        """Compose the enrichment task prompt for the AI model.

        Truncates the raw payloads so the request fits the model's context
        window, then asks for the missing fields as a strict JSON object.
        """
        make = vehicle_data["make"]
        model = vehicle_data["marketing_name"]

        # Shorten the context so it fits into the model's context window.
        raw_api = json.dumps(vehicle_data["raw_api_data"], ensure_ascii=False)[:1000]
        raw_context = (vehicle_data["raw_search_context"] or "")[:2000]

        prompt = f"""
Analyze the vehicle data and return missing information in valid JSON format.
Vehicle: {make} {model}
Current Specs:
- Power: {vehicle_data['power_kw']} kW (0 means missing)
- Engine: {vehicle_data['engine_capacity']} ccm (0 means missing)
- Fuel: {vehicle_data['fuel_type']}

Context Data: {raw_api}
Search Snippets: {raw_context}

INSTRUCTIONS:
1. Identify trim_level (e.g., GTI, AMG, Highline, Titanium).
2. Identify transmission (MANUAL, AUTOMATIC, CVT, DCT).
3. Identify body_type (SEDAN, SUV, HATCHBACK, ESTATE, COUPE).
4. If Power is 0, estimate it based on the engine size and fuel in context.
5. If Engine is 0, estimate it based on model name.

Return ONLY a JSON object:
{{
"trim_level": "string",
"transmission": "string",
"body_type": "string",
"estimated_kw": integer_or_null,
"estimated_ccm": integer_or_null
}}
"""
        return prompt.strip()
|
||||
|
||||
async def call_ollama(self, prompt: str) -> dict:
|
||||
"""Kommunikáció az Ollama szerverrel."""
|
||||
payload = {
|
||||
"model": OLLAMA_MODEL,
|
||||
"prompt": prompt,
|
||||
"format": "json",
|
||||
"stream": False,
|
||||
"options": {"temperature": 0.1, "top_p": 0.9}
|
||||
}
|
||||
try:
|
||||
response = await self.client.post(OLLAMA_URL, json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return json.loads(data.get("response", "{}"))
|
||||
except Exception as e:
|
||||
await db.rollback()
|
||||
logger.warning(f"⚠️ Alkimista hiba - {v_ident}: {e}")
|
||||
raise ValueError(f"Ollama hiba: {str(e)}")
|
||||
|
||||
def merge_vehicle_data(self, vehicle: dict, ai_result: dict) -> dict:
|
||||
"""Összefésüli a meglévő adatokat az AI eredményeivel, prioritást adva a meglévőnek."""
|
||||
merged = vehicle.copy()
|
||||
|
||||
# A szöveges mezőket frissítjük, ha az AI talált jobbat
|
||||
for field, ai_key in [("trim_level", "trim_level"), ("transmission_type", "transmission"), ("body_type", "body_type")]:
|
||||
if not merged.get(field) and ai_result.get(ai_key):
|
||||
merged[field] = str(ai_result[ai_key]).upper() if field != "trim_level" else ai_result[ai_key]
|
||||
|
||||
# MATEK VÉDELEM: Csak akkor írjuk be az AI becslését, ha a 2.1-es robot nem talált adatot (még mindig 0)
|
||||
if merged["power_kw"] == 0 and ai_result.get("estimated_kw"):
|
||||
merged["power_kw"] = int(ai_result["estimated_kw"])
|
||||
|
||||
if merged["engine_capacity"] == 0 and ai_result.get("estimated_ccm"):
|
||||
merged["engine_capacity"] = int(ai_result["estimated_ccm"])
|
||||
|
||||
# Ha elértük a limitet, KÉZI MODERÁCIÓRA küldjük, egyébként vissza a Kutatónak
|
||||
new_status = 'manual_review_needed' if current_attempts + 1 >= self.max_attempts else 'unverified'
|
||||
|
||||
# Elmentjük az AI részleges válaszát (vagy a hibát), hogy az admin lássa, mit rontott el a gép
|
||||
review_data = ai_data if ai_data else {"error": "Nincs értékelhető JSON adat az AI-tól", "raw_context": base_info['web_context']}
|
||||
|
||||
await db.execute(
|
||||
update(VehicleModelDefinition)
|
||||
.where(VehicleModelDefinition.id == record_id)
|
||||
.values(
|
||||
attempts=current_attempts + 1,
|
||||
last_error=str(e)[:200],
|
||||
status=new_status,
|
||||
specifications=review_data, # Kézi ellenőrzéshez beírjuk a törött adatot!
|
||||
return merged
|
||||
|
||||
    async def update_vehicle_record(self, db: AsyncSession, vehicle_id: int, merged_data: dict):
        """Persist the merged record and apply the gatekeeper ("gold") logic.

        Gold promotion rules: trailers always pass; electric/hydrogen need
        only power; combustion vehicles need both power and displacement.
        Rejected rows are retried until MAX_ATTEMPTS, then escalated to
        manual review. Does not commit — the caller owns the transaction.
        """
        kw = merged_data.get("power_kw", 0)
        ccm = merged_data.get("engine_capacity", 0)
        fuel = str(merged_data.get("fuel_type", "")).lower()
        v_class = str(merged_data.get("vehicle_class", "")).lower()

        # Gatekeeper rules
        is_electric = any(x in fuel for x in ['electr', 'elektri', 'hydrogen'])
        is_trailer = 'trailer' in v_class

        is_gold = False
        if is_trailer: is_gold = True
        elif is_electric: is_gold = kw > 0
        else: is_gold = (kw > 0 and ccm > 0)

        if is_gold:
            new_status = "gold_enriched"
            new_attempts = 0  # success resets the retry counter
            msg = "✨ ARANY"
        else:
            # Bounce back for another research round, or escalate after the budget.
            new_attempts = merged_data["attempts"] + 1
            new_status = "manual_review_needed" if new_attempts >= MAX_ATTEMPTS else "unverified"
            msg = "🔄 VISSZADOBVA"

        update_values = {
            "trim_level": merged_data.get("trim_level"),
            "transmission_type": merged_data.get("transmission_type"),
            "body_type": merged_data.get("body_type"),
            "power_kw": kw,
            "engine_capacity": ccm,
            "status": new_status,
            "attempts": new_attempts,
            "updated_at": func.now()
        }

        stmt = update(VehicleModelDefinition).where(VehicleModelDefinition.id == vehicle_id).values(**update_values)
        await db.execute(stmt)
        logger.info(f"{msg}: {merged_data['make']} {merged_data['marketing_name']} (Státusz: {new_status})")
||||
|
||||
async def process_ai_task(self, vehicle: dict):
|
||||
"""AI feldolgozás párhuzamosítható része."""
|
||||
try:
|
||||
prompt = self.build_prompt(vehicle)
|
||||
ai_result = await self.call_ollama(prompt)
|
||||
return vehicle, ai_result, None
|
||||
except Exception as e:
|
||||
return vehicle, None, e
|
||||
|
||||
async def process_batch(self, db: AsyncSession, vehicles: list):
|
||||
"""Batch feldolgozás: Párhuzamos AI, majd szekvenciális DB mentés."""
|
||||
# 1. AI kérések párhuzamosan (CPU kímélő batch mérettel)
|
||||
tasks = [self.process_ai_task(v) for v in vehicles]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# 2. Mentés szekvenciálisan a DB lakatok elkerülésére
|
||||
for vehicle, ai_result, error in results:
|
||||
if error:
|
||||
logger.error(f"Hiba {vehicle['id']}: {error}")
|
||||
# Hiba esetén növeljük a próbálkozások számát
|
||||
stmt = update(VehicleModelDefinition).where(VehicleModelDefinition.id == vehicle['id']).values(
|
||||
attempts=vehicle['attempts'] + 1,
|
||||
updated_at=func.now()
|
||||
)
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
if new_status == 'unverified':
|
||||
logger.info(f"♻️ Akta visszaküldve a Robot-2-nek (Kutató). {attempt_str}")
|
||||
await db.execute(stmt)
|
||||
else:
|
||||
logger.error(f"🛑 Max próbálkozás elérve! Kézi moderációra küldve: {v_ident}")
|
||||
merged = self.merge_vehicle_data(vehicle, ai_result)
|
||||
await self.update_vehicle_record(db, vehicle['id'], merged)
|
||||
|
||||
await db.commit()
|
||||
|
||||
async def run(self):
|
||||
logger.info(f"🚀 Alchemist Pro HIBRID ONLINE (Atomi Zárolás + Moderáció Patch)")
|
||||
logger.info(f"🚀 Robot 3 indítva. Modell: {OLLAMA_MODEL}, Batch: {BATCH_SIZE}")
|
||||
while True:
|
||||
if not self.check_budget():
|
||||
logger.warning("💸 Napi AI limit kimerítve! Pihenés...")
|
||||
await asyncio.sleep(3600); continue
|
||||
|
||||
try:
|
||||
async with AsyncSessionLocal() as db:
|
||||
# ATOMI ZÁROLÁS (A "Szent Grál" a race condition ellen)
|
||||
query = text("""
|
||||
UPDATE vehicle.vehicle_model_definitions
|
||||
SET status = 'ai_synthesis_in_progress'
|
||||
WHERE id = (
|
||||
SELECT id FROM vehicle.vehicle_model_definitions
|
||||
WHERE status IN ('awaiting_ai_synthesis', 'ACTIVE')
|
||||
AND attempts < :max_attempts
|
||||
AND is_manual = FALSE
|
||||
ORDER BY
|
||||
CASE WHEN status = 'awaiting_ai_synthesis' THEN 1 ELSE 2 END,
|
||||
priority_score DESC
|
||||
FOR UPDATE SKIP LOCKED
|
||||
LIMIT 1
|
||||
)
|
||||
RETURNING id, make, marketing_name, vehicle_class, power_kw, engine_capacity,
|
||||
fuel_type, engine_code, euro_classification, cylinders, raw_search_context, attempts;
|
||||
""")
|
||||
|
||||
result = await db.execute(query, {"max_attempts": self.max_attempts})
|
||||
task = result.fetchone()
|
||||
await db.commit()
|
||||
|
||||
if task:
|
||||
# Szétbontjuk a lekérdezett rekordot a base_info dict-be
|
||||
r_id = task[0]
|
||||
base_info = {
|
||||
"make": task[1], "m_name": task[2], "v_type": task[3] or "car",
|
||||
"rdw_kw": task[4] or 0, "rdw_ccm": task[5] or 0,
|
||||
"rdw_fuel": task[6] or "petrol", "rdw_engine": task[7] or "",
|
||||
"rdw_euro": task[8], "rdw_cylinders": task[9],
|
||||
"web_context": task[10] or ""
|
||||
}
|
||||
attempts = task[11]
|
||||
|
||||
# Külön adatbázis kapcsolat a feldolgozáshoz (hosszú AI hívás miatt)
|
||||
async with AsyncSessionLocal() as process_db:
|
||||
await self.process_single_record(process_db, r_id, base_info, attempts)
|
||||
|
||||
# GPU hűtés / Ollama rate limit
|
||||
await asyncio.sleep(random.uniform(1.5, 3.5))
|
||||
else:
|
||||
logger.info("😴 Nincs feldolgozandó akta, az Alkimista pihen...")
|
||||
await asyncio.sleep(15)
|
||||
|
||||
vehicles = await self.fetch_vehicle_batch_for_processing(db)
|
||||
if vehicles:
|
||||
logger.info(f"📦 Feldolgozás: {len(vehicles)} jármű...")
|
||||
await self.process_batch(db, vehicles)
|
||||
await asyncio.sleep(1)
|
||||
else:
|
||||
await asyncio.sleep(10)
|
||||
except Exception as e:
|
||||
logger.error(f"💀 Kritikus hiba a főciklusban: {e}")
|
||||
await asyncio.sleep(10)
|
||||
logger.error(f"Főciklus hiba: {e}")
|
||||
await asyncio.sleep(5)
|
||||
|
||||
if __name__ == "__main__":
    # Single entry point. The merged diff also carried the old
    # TechEnricher().run() runner, which would have started a second,
    # removed implementation — only the AlchemistPro daemon is launched.
    robot = AlchemistPro()
    asyncio.run(robot.run())
|
||||
261
backend/app/workers/vehicle/vehicle_robot_4_validator.py
Normal file
261
backend/app/workers/vehicle/vehicle_robot_4_validator.py
Normal file
@@ -0,0 +1,261 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/vehicle_robot_4_validator.py
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Robot-4-Validator (Publisher / Gépágyú)
|
||||
|
||||
Az MDM csővezeték utolsó eleme. Feladata:
|
||||
1. Kivesz 50 darab gold_enriched státuszú járművet a VMD táblából (FOR UPDATE SKIP LOCKED)
|
||||
2. Validálja az alapvető mezőket (make, marketing_name, power_kw, engine_capacity)
|
||||
3. Ha sikeres, összeállít egy factory_data JSON-t és UPSERT-et végez a vehicle.vehicle_catalog táblába
|
||||
(ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full)
|
||||
4. Állítja a VMD státuszt published-re
|
||||
5. Ha sikertelen, manual_review_needed státuszt állít
|
||||
|
||||
AI-mentes, tisztán adatbázis logika.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text, update, func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] R4-Publisher: %(message)s',
|
||||
stream=sys.stdout
|
||||
)
|
||||
logger = logging.getLogger("Robot-4-Publisher")
|
||||
|
||||
BATCH_SIZE = 50
|
||||
|
||||
class VehicleRobot4Validator:
    """Robot 4: Publisher ("Gépágyú") — final stage of the MDM pipeline.

    Pulls batches of 'gold_enriched' rows from vehicle_model_definitions,
    validates the mandatory fields, and either UPSERTs them into
    vehicle.vehicle_catalog (status -> 'published') or flags them for manual
    review. Pure database logic, no AI.

    (The previous empty ``__init__`` was removed — the implicit default is
    identical, so ``VehicleRobot4Validator()`` still works unchanged.)
    """
    # NOTE: signature annotations are strings so the class can be defined
    # even when SQLAlchemy types are imported lazily / unavailable.

    async def fetch_gold_enriched_batch(self, db: "AsyncSession"):
        """Fetch and lock up to BATCH_SIZE 'gold_enriched' rows.

        FOR UPDATE SKIP LOCKED keeps concurrent publisher instances from
        processing the same rows twice; the lock lives for the duration of
        the caller's transaction.
        """
        query = text("""
            SELECT id, make, marketing_name, power_kw, engine_capacity,
                   fuel_type, year_from, trim_level, transmission_type,
                   body_type, specifications, status
            FROM vehicle.vehicle_model_definitions
            WHERE status = 'gold_enriched'
            ORDER BY priority_score DESC NULLS LAST, id ASC
            FOR UPDATE SKIP LOCKED
            LIMIT :batch_size
        """)
        result = await db.execute(query, {"batch_size": BATCH_SIZE})
        return result.fetchall()

    def validate_vehicle(self, row):
        """Validate one row; return (is_valid, reason_code).

        Hard rules: make + marketing_name present, power_kw > 0, and
        engine_capacity > 0 unless the drivetrain is electric/hydrogen.
        Fuel type and model year are soft checks: they only log a warning.
        """
        make = row.make
        marketing_name = row.marketing_name
        power_kw = row.power_kw
        engine_capacity = row.engine_capacity
        fuel_type = (row.fuel_type or "").lower()

        # 1. Identity fields are mandatory.
        if not make or not marketing_name:
            logger.warning(f"ID {row.id}: Hiányzó make vagy marketing_name")
            return False, "missing_make_or_name"

        # 2. Power must be strictly positive — for EVs as well.
        if power_kw is None or power_kw <= 0:
            logger.warning(f"ID {row.id}: Érvénytelen power_kw ({power_kw})")
            return False, "invalid_power"

        # 3. Displacement must be non-negative...
        if engine_capacity is None or engine_capacity < 0:
            logger.warning(f"ID {row.id}: Érvénytelen engine_capacity ({engine_capacity})")
            return False, "invalid_engine_capacity"

        # ...and zero is acceptable only for electric/hydrogen drivetrains.
        is_electric = any(x in fuel_type for x in ['electr', 'elektri', 'hydrogen'])
        if not is_electric and engine_capacity == 0:
            logger.warning(f"ID {row.id}: Nem elektromos jármű engine_capacity 0 (fuel: {fuel_type})")
            return False, "zero_engine_capacity_non_electric"

        # 4-5. Soft checks: warn only, never reject.
        if not fuel_type or fuel_type == "unknown":
            logger.warning(f"ID {row.id}: Ismeretlen fuel_type")
        if row.year_from is None or row.year_from <= 1900:
            logger.warning(f"ID {row.id}: Érvénytelen year_from ({row.year_from})")

        return True, "valid"

    async def publish_to_catalog(self, db: "AsyncSession", row):
        """UPSERT one validated row into vehicle.vehicle_catalog and mark it published.

        Does not commit — the caller owns the batch transaction.
        """
        # Local import: the module header only pulls in `datetime` itself.
        from datetime import timezone

        factory_data = {
            "trim_level": row.trim_level or "",
            "transmission_type": row.transmission_type or "",
            "body_type": row.body_type or "",
            "specifications": row.specifications or {},
            "source": "robot_4_publisher",
            # Timezone-aware UTC stamp; datetime.utcnow() is naive and deprecated.
            "published_at": datetime.now(timezone.utc).isoformat()
        }

        # UPSERT keyed on uix_vehicle_catalog_full; `model` carries the marketing_name.
        upsert_query = text("""
            INSERT INTO vehicle.vehicle_catalog
                (make, model, year_from, fuel_type, power_kw, engine_capacity, factory_data, master_definition_id)
            VALUES
                (:make, :model, :year_from, :fuel_type, :power_kw, :engine_capacity, :factory_data, :master_definition_id)
            ON CONFLICT ON CONSTRAINT uix_vehicle_catalog_full
            DO UPDATE SET
                power_kw = EXCLUDED.power_kw,
                engine_capacity = EXCLUDED.engine_capacity,
                factory_data = EXCLUDED.factory_data,
                master_definition_id = EXCLUDED.master_definition_id
            RETURNING id
        """)
        params = {
            "make": row.make,
            "model": row.marketing_name,
            "year_from": row.year_from if row.year_from else 0,
            "fuel_type": row.fuel_type or "Unknown",
            "power_kw": row.power_kw,
            "engine_capacity": row.engine_capacity,
            "factory_data": json.dumps(factory_data),
            "master_definition_id": row.id
        }
        result = await db.execute(upsert_query, params)
        catalog_id = result.scalar()
        logger.info(f"ID {row.id}: Sikeres publikálás a katalógusba (catalog_id: {catalog_id})")

        # Close out the staging (VMD) row.
        update_query = text("""
            UPDATE vehicle.vehicle_model_definitions
            SET status = 'published',
                updated_at = NOW()
            WHERE id = :id
        """)
        await db.execute(update_query, {"id": row.id})
        logger.info(f"ID {row.id}: Státusz frissítve published-re")

    async def mark_for_manual_review(self, db: "AsyncSession", row, reason):
        """Flag a failed row for human moderation, recording the rejection reason."""
        update_query = text("""
            UPDATE vehicle.vehicle_model_definitions
            SET status = 'manual_review_needed',
                last_error = :reason,
                updated_at = NOW()
            WHERE id = :id
        """)
        await db.execute(update_query, {"id": row.id, "reason": reason})
        logger.warning(f"ID {row.id}: Átállítva manual_review_needed-re, ok: {reason}")

    async def process_batch(self):
        """Process one batch inside a single transaction; return the publish count.

        FIX: the previous version issued raw BEGIN/COMMIT/ROLLBACK via
        db.execute(text(...)). The AsyncSession already begins a transaction
        implicitly on the first statement, so a raw BEGIN conflicts with the
        session-managed transaction and COMMIT bypassed the session's state —
        use db.commit()/db.rollback() instead. The FOR UPDATE locks taken in
        fetch_gold_enriched_batch() still cover the whole batch because
        everything happens in the one implicit transaction.
        """
        async with AsyncSessionLocal() as db:
            try:
                rows = await self.fetch_gold_enriched_batch(db)
                if not rows:
                    logger.info("Nincs gold_enriched státuszú jármű a feldolgozáshoz.")
                    await db.commit()
                    return 0

                logger.info(f"{len(rows)} gold_enriched jármű lekérdezve.")

                published_count = 0
                manual_review_count = 0

                for row in rows:
                    is_valid, reason = self.validate_vehicle(row)
                    if is_valid:
                        await self.publish_to_catalog(db, row)
                        published_count += 1
                    else:
                        await self.mark_for_manual_review(db, row, reason)
                        manual_review_count += 1

                await db.commit()
                logger.info(f"Köteg feldolgozva. Publikálva: {published_count}, Kézi ellenőrzés: {manual_review_count}")
                return published_count

            except Exception as e:
                await db.rollback()
                logger.error(f"Hiba a köteg feldolgozásában: {e}", exc_info=True)
                raise

    async def run(self, max_iterations=None):
        """Daemon loop: process batches until max_iterations (None = forever).

        Returns the total number of published vehicles when the loop ends.
        """
        iteration = 0
        total_published = 0

        while True:
            if max_iterations is not None and iteration >= max_iterations:
                logger.info(f"Elérte a maximális iterációt ({max_iterations}).")
                break

            iteration += 1
            logger.info(f"--- Iteráció {iteration} ---")
            published = await self.process_batch()
            total_published += published

            if published == 0:
                # Nothing published this round: either the queue was empty OR
                # every fetched row failed validation — the old log claimed
                # "no gold_enriched data" even in the latter case.
                logger.info("Nincs publikált jármű ebben a körben. Várakozás 30 másodperc...")
                await asyncio.sleep(30)
                continue

            # Small pause before the next batch.
            await asyncio.sleep(1)

        logger.info(f"Robot leállt. Összesen publikálva: {total_published} jármű.")
        return total_published
||||
|
||||
|
||||
async def main():
    """Entry point: run the publisher robot in continuous (daemon) mode.

    Exits the process with status 1 if the robot dies with an unhandled error.
    """
    publisher = VehicleRobot4Validator()
    try:
        # max_iterations=None -> loop forever.
        published_total = await publisher.run(max_iterations=None)
        logger.info(f"Robot sikeresen lefutott. Publikálva: {published_total}")
    except Exception as e:
        logger.error(f"Robot futás közben hiba történt: {e}", exc_info=True)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the publisher daemon.
    asyncio.run(main())
|
||||
@@ -4,7 +4,7 @@ import sys
|
||||
from sqlalchemy import select, and_, text, update
|
||||
from sqlalchemy.orm import joinedload
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.asset import Asset, AssetCatalog
|
||||
from app.models import Asset, AssetCatalog
|
||||
from app.services.ai_service import AIService
|
||||
|
||||
logging.basicConfig(
|
||||
|
||||
Reference in New Issue
Block a user