átlagos kiegészítések jó sok
This commit is contained in:
138
backend/app/workers/vehicle/r5_ultimate_harvester.py
Normal file
138
backend/app/workers/vehicle/r5_ultimate_harvester.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
import random
|
||||
import urllib.parse
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Process-wide logging setup: INFO level, every record tagged [R5-SENTINEL].
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] [R5-SENTINEL] %(message)s',
)
logger = logging.getLogger("R5")

# Maps a lowercased spec label, as extracted from the scraped spec tables,
# to the column name used when building the database update parameters.
COLUMN_MAPPING = {
    "horsepower": "power_kw",
    "engine displacement": "engine_capacity",
    "maximum torque": "torque_nm",
    "top speed": "max_speed",
    "acceleration 0 to 100 km/h": "acceleration_0_100",
    "curb weight": "curb_weight",
    "wheelbase": "wheelbase",
    "num. of seats": "seats",
}
|
||||
|
||||
class R5Harvester:
    """Enrich vehicle model rows with specs scraped from ultimatespecs.com.

    ``run`` repeatedly selects the highest-priority row of
    ``vehicle.vehicle_model_definitions`` that has no ``power_kw`` yet,
    scrapes its spec sheet with a headless Chromium page and either
    publishes the parsed values or marks the row ``research_failed_empty``.
    """

    # Root against which hrefs found on the search-result page are resolved.
    _BASE_URL = "https://www.ultimatespecs.com/"

    # A kW figure such as "110 kw" or "110.5 kw" (applied to lowercased text).
    _KW_RE = re.compile(r'(\d+(?:[.,]\d+)*)\s*kw')

    # First numeric token, including any "," / "." separators inside it.
    _NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)*')

    # SQL is kept as plain strings and wrapped with text() at call time.
    _NEXT_TARGET_SQL = """
        SELECT id, make, marketing_name, year_from
        FROM vehicle.vehicle_model_definitions
        WHERE (power_kw IS NULL OR power_kw = 0)
        AND status IN ('manual_review_needed', 'pending', 'enrich_ready')
        ORDER BY priority_score DESC LIMIT 1
    """

    # NOTE(review): in PostgreSQL `specifications || :full_json` is NULL when
    # `specifications` is NULL -- confirm the column has a non-NULL default.
    _PUBLISH_SQL = """
        UPDATE vehicle.vehicle_model_definitions
        SET power_kw = :power_kw, engine_capacity = :engine_capacity,
        torque_nm = :torque_nm, max_speed = :max_speed,
        acceleration_0_100 = :acceleration_0_100, curb_weight = :curb_weight,
        wheelbase = :wheelbase, specifications = specifications || :full_json,
        status = 'published', updated_at = NOW()
        WHERE id = :id
    """

    _FAILED_SQL = "UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"

    def __init__(self):
        # Desktop Chrome user agent passed to the Playwright browser context.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    @staticmethod
    def _parse_token(token: str) -> int:
        """Convert one numeric token ("1,998", "7.5", "250") to an int.

        A separator followed by exactly three digits is read as a thousands
        separator ("2,570" -> 2570); any other separator is read as a decimal
        point and the value is rounded half-up ("7.5" -> 8).
        """
        groups = re.split(r'[.,]', token)
        if len(groups) == 1:
            return int(token)

        head, *tail = groups
        looks_grouped = (
            1 <= len(head) <= 3
            and not head.startswith("0")
            and all(len(group) == 3 for group in tail)
        )
        if looks_grouped:
            return int("".join(groups))

        # The last separator is the decimal point; earlier ones are grouping.
        value = float(f"{''.join(groups[:-1])}.{groups[-1]}")
        return int(value + 0.5)

    def clean_number(self, val: str, key: str = "") -> int:
        """Extract the leading numeric value of a scraped spec string.

        Args:
            val: Raw cell text, e.g. ``"150 PS / 148 HP / 110 kW"`` or
                ``"7.5 s"``. Falsy values and ``"-"`` yield 0.
            key: Unused; kept for interface compatibility.

        Returns:
            The parsed value as an int, or 0 when nothing can be parsed.
            For power strings the kW figure is preferred when present.
        """
        if not val or val == "-":
            return 0
        try:
            lowered = val.lower()
            if "hp" in lowered or "kw" in lowered:
                kw_match = self._KW_RE.search(lowered)
                if kw_match:
                    return self._parse_token(kw_match.group(1))
                # NOTE(review): without a kW figure the first number is
                # returned unchanged, even if it is a horsepower value.
            number = self._NUMBER_RE.search(val.replace(' ', ''))
            return self._parse_token(number.group(0)) if number else 0
        except (AttributeError, TypeError, ValueError, OverflowError):
            # Non-string input or an unparseable/oversized token.
            return 0

    async def scrape_car_details(self, page, make, model, year):
        """Search the site for a vehicle and scrape its spec tables.

        Args:
            page: Playwright page used for navigation.
            make: Brand name, sent as the ``brand`` query parameter.
            model: Model name, combined with ``year`` into the ``q`` parameter.
            year: Model year appended to the search query.

        Returns:
            A dict mapping lowercased spec labels to their raw text values,
            or ``None`` when no result link is found or any step fails.
        """
        try:
            # 1. Site-internal search
            search_url = f"https://www.ultimatespecs.com/index.php?brand={urllib.parse.quote(make)}&q={urllib.parse.quote(model + ' ' + str(year))}"
            logger.info("🔍 Keresés indítása...")
            await page.goto(search_url, wait_until="networkidle", timeout=30000)

            # 2. Locate the result link but do NOT click it; only read its URL.
            # A loose selector is used so it copes with long result lists.
            link_element = await page.wait_for_selector("a[href*='/car-specs/']", timeout=15000)
            if not link_element:
                return None

            href = await link_element.get_attribute("href")
            # urljoin keeps absolute URLs as they are and also resolves
            # relative and protocol-relative hrefs correctly.
            target_url = urllib.parse.urljoin(self._BASE_URL, href)

            # 3. Direct jump to the spec page -- this avoids the ads.
            logger.info(f"🚀 Közvetlen ugrás az adatlapra: {target_url}")
            await page.goto(target_url, wait_until="networkidle", timeout=30000)

            # 4. Parsing (every spec table on the page is inspected)
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                            let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                            if(t && v) {
                                let k = t.innerText.replace(':','').trim().toLowerCase();
                                let val = v.innerText.trim();
                                if(k && val && val !== "-") results[k] = val;
                            }
                        });
                    });
                    return results;
                }
            """)
            return full_specs
        except Exception as e:
            # Best-effort boundary: any scrape failure is logged and reported
            # to the caller as "no data" instead of aborting the whole run.
            logger.error(f"❌ Scrape hiba: {str(e)[:100]}...")
            return None

    async def _mark_failed(self, db, t_id):
        """Set the row's status to 'research_failed_empty' and commit."""
        await db.execute(text(self._FAILED_SQL), {"id": t_id})
        await db.commit()

    async def run(self):
        """Process pending rows one at a time until none are left.

        Each iteration opens a DB session, selects one target row, scrapes it
        and commits either the published specs or the failure status, then
        sleeps 3-6 seconds before the next row.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=self.user_agent)
                page = await context.new_page()

                while True:
                    async with AsyncSessionLocal() as db:
                        target = (await db.execute(text(self._NEXT_TARGET_SQL))).fetchone()

                        if not target:
                            logger.info("✨ Pipeline üres.")
                            break

                        t_id, make, model, year = target
                        logger.info(f"🚜 Feldolgozás: {make} {model} ({year})")

                        web_data = await self.scrape_car_details(page, make, model, year)

                        if not web_data or len(web_data) < 5:
                            logger.warning("⚠️ Sikertelen gyűjtés, státusz: research_failed_empty")
                            await self._mark_failed(db, t_id)
                        else:
                            # NOTE(review): `updates` also holds "seats", which
                            # the publish statement does not reference.
                            updates = {col: self.clean_number(web_data.get(k)) for k, col in COLUMN_MAPPING.items()}

                            if updates.get('power_kw', 0) > 0:
                                await db.execute(
                                    text(self._PUBLISH_SQL),
                                    {**updates, "id": t_id, "full_json": json.dumps(web_data)},
                                )
                                await db.commit()
                                logger.info(f"✅ PUBLIKÁLVA: {make} {model} ({updates['power_kw']} kW)")
                            else:
                                # Without a power figure the row is not published.
                                await self._mark_failed(db, t_id)

                    # Throttle after every processed row, including failed
                    # ones, so repeated failures do not hammer the site.
                    await asyncio.sleep(random.uniform(3, 6))
            finally:
                # Close the browser even when the loop raises.
                await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build a harvester and drive its loop to completion.
    asyncio.run(R5Harvester().run())
|
||||
Reference in New Issue
Block a user