#!/usr/bin/env python3
import asyncio
import json
import logging
import random
import re
import urllib.parse

from playwright.async_api import async_playwright
from sqlalchemy import text

from app.database import AsyncSessionLocal
|
|
|
|
# Root logging configuration; the [R5-SENTINEL] tag makes these lines easy
# to grep out of aggregated logs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] [R5-SENTINEL] %(message)s')

# Module-wide logger used by the harvester below.
logger = logging.getLogger("R5")
|
|
|
|
# Maps spec-table labels (lowercased, as scraped from ultimatespecs.com)
# to the matching columns of vehicle.vehicle_model_definitions.
COLUMN_MAPPING = {
    "horsepower": "power_kw",
    "engine displacement": "engine_capacity",
    "maximum torque": "torque_nm",
    "top speed": "max_speed",
    "acceleration 0 to 100 km/h": "acceleration_0_100",
    "curb weight": "curb_weight",
    "wheelbase": "wheelbase",
    "num. of seats": "seats",
}
|
|
|
|
class R5Harvester:
    """Fills in missing technical specs in vehicle.vehicle_model_definitions
    by scraping ultimatespecs.com with a headless Playwright browser.

    Per-vehicle workflow: internal site search -> read the first
    ``/car-specs/`` link's URL (no click) -> navigate straight to the
    detail page (skips ad interstitials) -> parse every spec table ->
    write the mapped numeric columns plus the full raw JSON back to the
    database and mark the row 'published'.
    """

    def __init__(self):
        # Current desktop-Chrome UA string; lowers the chance of the site
        # serving a bot-block page.
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    def clean_number(self, val: str, key: str = "") -> int:
        """Extract a single integer from a raw spec string; 0 on failure.

        Power strings prefer the kW figure ("150 Hp (110 kW)" -> 110).
        Otherwise the first run of digits wins after stripping spaces,
        thousands separators and dots ("1,998 cm3" -> 1998).

        NOTE(review): stripping '.' collapses decimals, so "4.5 s"
        becomes 45 — presumably intended for integer DB columns, but
        confirm for acceleration_0_100.

        :param val: raw cell text from the spec table (may be None/empty).
        :param key: unused; kept for interface compatibility with callers.
        """
        if not val or val == "-":
            return 0
        try:
            lowered = val.lower()
            if "hp" in lowered or "kw" in lowered:
                # Prefer the kW figure when both Hp and kW are present.
                kw_match = re.search(r'(\d+)\s*kw', lowered)
                if kw_match:
                    return int(kw_match.group(1))
            nums = re.findall(r'\d+', val.replace(' ', '').replace(',', '').replace('.', ''))
            return int(nums[0]) if nums else 0
        except (ValueError, TypeError, AttributeError):
            # Narrowed from a bare `except:` so unexpected programming
            # errors are no longer silently swallowed.
            return 0

    async def scrape_car_details(self, page, make, model, year):
        """Search ultimatespecs.com for one vehicle and return its specs.

        :param page: an open Playwright page reused across vehicles.
        :returns: dict of {lowercased label: raw value} scraped from every
            spec table on the detail page, or None on any failure.
        """
        try:
            # 1. Internal site search.
            search_url = f"https://www.ultimatespecs.com/index.php?brand={urllib.parse.quote(make)}&q={urllib.parse.quote(model + ' ' + str(year))}"
            logger.info("🔍 Keresés indítása...")
            await page.goto(search_url, wait_until="networkidle", timeout=30000)

            # 2. Locate the first spec-sheet link but do NOT click it —
            # we only read its URL. Loose selector so any result row
            # qualifies.
            link_element = await page.wait_for_selector("a[href*='/car-specs/']", timeout=15000)
            if not link_element:
                # NOTE(review): wait_for_selector raises on timeout rather
                # than returning None, so this guard looks dead — kept as a
                # belt-and-braces check.
                return None

            href = await link_element.get_attribute("href")
            target_url = href if href.startswith("http") else f"https://www.ultimatespecs.com{href}"

            # 3. Direct jump to the detail page — bypasses the ads shown
            # on the click-through path.
            logger.info(f"🚀 Közvetlen ugrás az adatlapra: {target_url}")
            await page.goto(target_url, wait_until="networkidle", timeout=30000)

            # 4. Parse: scan every spec table on the page in one evaluate.
            full_specs = await page.evaluate("""
                () => {
                    let results = {};
                    document.querySelectorAll('table.table_specs, table.responsive').forEach(table => {
                        table.querySelectorAll('tr').forEach(row => {
                            let t = row.querySelector('.table_specs_title, .td_title, td:first-child');
                            let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                            if(t && v) {
                                let k = t.innerText.replace(':','').trim().toLowerCase();
                                let val = v.innerText.trim();
                                if(k && val && val !== "-") results[k] = val;
                            }
                        });
                    });
                    return results;
                }
            """)
            return full_specs
        except Exception as e:
            # Any navigation/selector/evaluation failure is logged and
            # reported to the caller as "no data".
            logger.error(f"❌ Scrape hiba: {str(e)[:100]}...")
            return None

    async def run(self):
        """Main loop: repeatedly pick the highest-priority vehicle with a
        missing power figure, scrape it, then either publish the specs or
        mark the row 'research_failed_empty'. Exits when no candidate
        remains.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=self.user_agent)
            page = await context.new_page()

            while True:
                async with AsyncSessionLocal() as db:
                    # Exactly one candidate per iteration, best first.
                    query = text("""
                        SELECT id, make, marketing_name, year_from
                        FROM vehicle.vehicle_model_definitions
                        WHERE (power_kw IS NULL OR power_kw = 0)
                        AND status IN ('manual_review_needed', 'pending', 'enrich_ready')
                        ORDER BY priority_score DESC LIMIT 1
                    """)
                    target = (await db.execute(query)).fetchone()

                    if not target:
                        logger.info("✨ Pipeline üres.")
                        break

                    t_id, make, model, year = target
                    logger.info(f"🚜 Feldolgozás: {make} {model} ({year})")

                    web_data = await self.scrape_car_details(page, make, model, year)

                    # Fewer than 5 parsed rows counts as a failed scrape.
                    if not web_data or len(web_data) < 5:
                        logger.warning("⚠️ Sikertelen gyűjtés, státusz: research_failed_empty")
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        continue

                    updates = {col: self.clean_number(web_data.get(k)) for k, col in COLUMN_MAPPING.items()}

                    # Publish only when a usable power figure was found.
                    if updates.get('power_kw', 0) > 0:
                        # NOTE(review): 'seats' is computed in `updates` but
                        # the UPDATE below never writes it — confirm whether
                        # that is intentional.
                        await db.execute(text("""
                            UPDATE vehicle.vehicle_model_definitions
                            SET power_kw = :power_kw, engine_capacity = :engine_capacity,
                                torque_nm = :torque_nm, max_speed = :max_speed,
                                acceleration_0_100 = :acceleration_0_100, curb_weight = :curb_weight,
                                wheelbase = :wheelbase, specifications = specifications || :full_json,
                                status = 'published', updated_at = NOW()
                            WHERE id = :id
                        """), {**updates, "id": t_id, "full_json": json.dumps(web_data)})
                        await db.commit()
                        logger.info(f"✅ PUBLIKÁLVA: {make} {model} ({updates['power_kw']} kW)")
                    else:
                        await db.execute(text("UPDATE vehicle.vehicle_model_definitions SET status = 'research_failed_empty' WHERE id = :id"), {"id": t_id})
                        await db.commit()

                # Polite jitter between vehicles to avoid rate limiting.
                await asyncio.sleep(random.uniform(3, 6))

            await browser.close()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the harvester and drive its async loop.
    asyncio.run(R5Harvester().run())