átlagos kiegészítések, jó sok

This commit is contained in:
Roo
2026-03-22 11:02:05 +00:00
parent f53e0b53df
commit 5d44339f21
249 changed files with 20922 additions and 2253 deletions

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
import asyncio
import json
import re
import logging
import random
import urllib.parse
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Configure root logging once at import time: INFO level, every record tagged
# with the "[R5-SENTINEL]" marker so this worker's output is easy to filter.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] [R5-SENTINEL] %(message)s')
# Logger used by the harvester code in this file.
logger = logging.getLogger("R5")
# Maps a scraped spec label to the column name it is stored under.
# Keys are lower-case because the in-page extraction script lower-cases every
# label (and strips the first ':') before returning it. Values are used as
# bind-parameter names in the UPDATE statement issued by R5Harvester.run.
# NOTE(review): "seats" is collected here but the UPDATE statement in this
# file has no matching column assignment, so the value is currently unused.
COLUMN_MAPPING = {
"horsepower": "power_kw",
"engine displacement": "engine_capacity",
"maximum torque": "torque_nm",
"top speed": "max_speed",
"acceleration 0 to 100 km/h": "acceleration_0_100",
"curb weight": "curb_weight",
"wheelbase": "wheelbase",
"num. of seats": "seats"
}
class R5Harvester:
    """Enrich vehicle model rows with technical specs scraped from ultimatespecs.com.

    Rows are pulled one at a time from ``vehicle.vehicle_model_definitions``
    (those without a usable ``power_kw`` and in one of the pending statuses),
    looked up on the site with a headless Chromium page, and then either
    published with the scraped numbers or marked ``research_failed_empty``.
    """

    _BASE_URL = "https://www.ultimatespecs.com"

    # Mechanical horsepower -> kilowatt conversion factor.
    _HP_TO_KW = 0.7457

    # Both patterns are applied to lower-cased text. The optional decimal part
    # matters: without it "110.4 kw" would match only the trailing "4 kw".
    _KW_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*kw")
    _HP_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*b?hp")

    # SQL is kept as plain strings and wrapped in text() at call time.
    _NEXT_TARGET_SQL = """
        SELECT id, make, marketing_name, year_from
        FROM vehicle.vehicle_model_definitions
        WHERE (power_kw IS NULL OR power_kw = 0)
        AND status IN ('manual_review_needed', 'pending', 'enrich_ready')
        ORDER BY priority_score DESC LIMIT 1
    """

    _MARK_FAILED_SQL = (
        "UPDATE vehicle.vehicle_model_definitions "
        "SET status = 'research_failed_empty' WHERE id = :id"
    )

    # NOTE(review): `specifications || :full_json` evaluates to NULL when the
    # existing column value is NULL - confirm the column has a non-NULL default.
    _PUBLISH_SQL = """
        UPDATE vehicle.vehicle_model_definitions
        SET power_kw = :power_kw, engine_capacity = :engine_capacity,
        torque_nm = :torque_nm, max_speed = :max_speed,
        acceleration_0_100 = :acceleration_0_100, curb_weight = :curb_weight,
        wheelbase = :wheelbase, specifications = specifications || :full_json,
        status = 'published', updated_at = NOW()
        WHERE id = :id
    """

    # Runs inside the page: walks every spec table and returns a
    # {lower-cased label: raw value text} object, skipping empty and "-" values.
    _EXTRACT_SPECS_JS = """
        () => {
            let results = {};
            document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                table.querySelectorAll('tr').forEach(row => {
                    let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                    let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                    if(t && v) {
                        let k = t.innerText.replace(':','').trim().toLowerCase();
                        let val = v.innerText.trim();
                        if(k && val && val !== "-") results[k] = val;
                    }
                });
            });
            return results;
        }
    """

    def __init__(self):
        # Desktop Chrome user agent presented by the browser context.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    def clean_number(self, val: str, key: str = "") -> int:
        """Extract the leading integer figure from a raw spec value.

        Args:
            val: Raw cell text, e.g. ``"150 PS / 148 HP / 110 kW @ 6000 rpm"``.
            key: Unused; kept so existing callers keep working.

        Returns:
            The parsed number, or 0 when ``val`` is empty, ``"-"``, not a
            string, or contains no digits. For power values the kW figure is
            preferred; if only a horsepower figure is present it is converted
            to kW.
        """
        if not val or val == "-":
            return 0
        try:
            lowered = val.lower()
            if "hp" in lowered or "kw" in lowered:
                kw_match = self._KW_RE.search(lowered)
                if kw_match:
                    return round(float(kw_match.group(1).replace(",", ".")))
                hp_match = self._HP_RE.search(lowered)
                if hp_match:
                    # The result is stored as kW, so a bare hp figure must be
                    # converted rather than stored as-is.
                    horsepower = float(hp_match.group(1).replace(",", "."))
                    return round(horsepower * self._HP_TO_KW)
            # NOTE(review): separators are stripped before matching, so a
            # decimal such as "263.0" is read as 2630 - confirm that this
            # scaling is what the target columns expect.
            digits = re.findall(r"\d+", val.replace(" ", "").replace(",", "").replace(".", ""))
            return int(digits[0]) if digits else 0
        except (AttributeError, TypeError, ValueError):
            # Non-string input or an unparsable number counts as "no value".
            return 0

    async def scrape_car_details(self, page, make, model, year):
        """Search the site for a vehicle and return its spec table as a dict.

        Args:
            page: Playwright page used for navigation.
            make: Brand name, sent as the ``brand`` query parameter.
            model: Model name; forms the ``q`` parameter together with ``year``.
            year: Model year; omitted from the query when ``None``.

        Returns:
            A ``{label: value}`` dict of the specs found on the detail page,
            or ``None`` when no result link is found or any step fails.
        """
        try:
            # 1. Site-internal search. None parts are skipped so that a missing
            #    year does not put the literal text "None" into the query.
            search_terms = " ".join(str(part) for part in (model, year) if part is not None)
            search_url = (
                f"{self._BASE_URL}/index.php"
                f"?brand={urllib.parse.quote(make or '')}"
                f"&q={urllib.parse.quote(search_terms)}"
            )
            logger.info("🔍 Keresés indítása...")
            await page.goto(search_url, wait_until="networkidle", timeout=30000)

            # 2. Locate the first spec-page link but do NOT click it; only its
            #    URL is needed. The loose selector copes with long result lists.
            link_element = await page.wait_for_selector("a[href*='/car-specs/']", timeout=15000)
            if not link_element:
                return None
            href = await link_element.get_attribute("href")
            if not href:
                return None
            # urljoin leaves absolute URLs untouched and resolves relative or
            # protocol-relative ones against the site root.
            target_url = urllib.parse.urljoin(self._BASE_URL + "/", href)

            # 3. Direct jump to the detail page - this sidesteps the ads that
            #    a click could trigger.
            logger.info(f"🚀 Közvetlen ugrás az adatlapra: {target_url}")
            await page.goto(target_url, wait_until="networkidle", timeout=30000)

            # 4. Parse every spec table on the page.
            return await page.evaluate(self._EXTRACT_SPECS_JS)
        except Exception as e:
            # Best-effort boundary: any navigation or parsing failure is logged
            # and reported to the caller as "no data".
            logger.error(f"❌ Scrape hiba: {str(e)[:100]}...")
            return None

    async def _mark_failed(self, db, target_id):
        """Set the row's status to 'research_failed_empty' and commit."""
        await db.execute(text(self._MARK_FAILED_SQL), {"id": target_id})
        await db.commit()

    async def _process_target(self, db, page, target):
        """Scrape one (id, make, model, year) row and publish it or mark it failed."""
        t_id, make, model, year = target
        logger.info(f"🚜 Feldolgozás: {make} {model} ({year})")
        web_data = await self.scrape_car_details(page, make, model, year)

        # Fewer than five spec rows is treated as a failed scrape.
        if not web_data or len(web_data) < 5:
            logger.warning("⚠️ Sikertelen gyűjtés, státusz: research_failed_empty")
            await self._mark_failed(db, t_id)
            return

        updates = {col: self.clean_number(web_data.get(k)) for k, col in COLUMN_MAPPING.items()}
        # A row is only publishable when a power figure was actually found.
        if updates.get("power_kw", 0) <= 0:
            await self._mark_failed(db, t_id)
            return

        await db.execute(
            text(self._PUBLISH_SQL),
            {**updates, "id": t_id, "full_json": json.dumps(web_data)},
        )
        await db.commit()
        logger.info(f"✅ PUBLIKÁLVA: {make} {model} ({updates['power_kw']} kW)")

    async def run(self):
        """Process pending rows one by one until none are left.

        A random 3-6 second pause follows every processed row, including
        failed ones, so the site is never hit back-to-back. The browser is
        closed even if processing raises.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=self.user_agent)
                page = await context.new_page()
                while True:
                    async with AsyncSessionLocal() as db:
                        target = (await db.execute(text(self._NEXT_TARGET_SQL))).fetchone()
                        if not target:
                            logger.info("✨ Pipeline üres.")
                            break
                        await self._process_target(db, page, target)
                    # Throttle on every iteration, including the failure paths.
                    await asyncio.sleep(random.uniform(3, 6))
            finally:
                await browser.close()
if __name__ == "__main__":
    # Script entry point: build a harvester and drive its async loop to completion.
    asyncio.run(R5Harvester().run())