átlagos kiegészítések, jó sok
This commit is contained in:
@@ -0,0 +1,425 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r0_spider
|
||||
Producer-Consumer lánc első eleme. Kivesz egy autót a vehicle.vehicle_model_definitions táblából,
|
||||
keres az UltimateSpecs oldalán, és a talált .html linkeket beszúrja a vehicle.auto_data_crawler_queue táblába.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import urllib.parse
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Browser, BrowserContext
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
|
||||
from app.models.vehicle.vehicle_definitions import VehicleModelDefinition
|
||||
|
||||
# Logging konfiguráció
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R0-SPIDER] %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger("R0-SPIDER")
|
||||
|
||||
# Configuration
# NOTE(review): random.uniform() is evaluated once at import time, so the
# "3-6 s" delay is constant for the whole process lifetime rather than
# re-rolled each iteration — confirm whether per-cycle jitter was intended.
SLEEP_INTERVAL = random.uniform(3, 6)  # wait 3-6 seconds between cycles
MAX_RETRIES = 3
BASE_URL = "https://www.ultimatespecs.com/index.php?q={query}"
|
||||
|
||||
|
||||
class UltimateSpecsSpider:
    """R0 spider: takes a vehicle from vehicle.vehicle_model_definitions,
    searches UltimateSpecs for it and enqueues the found .html spec links
    into the crawler queue (see module docstring)."""

    def __init__(self):
        # Main-loop flag; flipped to False by stop() / the signal handler.
        self.running = True
        # Playwright handles; populated by init_browser().
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # Desktop Chrome user agent to reduce bot detection.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
|
||||
|
||||
    async def init_browser(self):
        """Start Playwright and open a headless Chromium context.

        Raises:
            Exception: re-raised after logging if startup fails.
        """
        try:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    # Hide the automation fingerprint; the other two flags
                    # keep Chromium stable inside containers.
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                ]
            )
            self.context = await self.browser.new_context(
                user_agent=self.user_agent,
                viewport={'width': 1920, 'height': 1080},
                java_script_enabled=True
            )
            logger.info("Playwright böngésző inicializálva")
        except Exception as e:
            logger.error(f"Hiba a böngésző inicializálásakor: {e}")
            raise
|
||||
|
||||
async def close_browser(self):
|
||||
"""Playwright böngésző lezárása"""
|
||||
if self.context:
|
||||
await self.context.close()
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
logger.info("Playwright böngésző lezárva")
|
||||
|
||||
    async def fetch_next_vehicle(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
        """Pop the next vehicle to process from vehicle.vehicle_model_definitions.

        Uses FOR UPDATE SKIP LOCKED so parallel workers never grab the same
        row.  NOTE(review): the row lock only lasts until this session's
        transaction ends — confirm the caller keeps the transaction open
        until the status update, otherwise two workers can still race.

        Args:
            session: open async DB session.

        Returns:
            Dict with id/make/marketing_name/year_from/vehicle_class,
            or None when nothing is pending or the query failed.
        """
        query = text("""
            SELECT id, make, marketing_name, year_from, vehicle_class
            FROM vehicle.vehicle_model_definitions
            WHERE status IN ('pending', 'manual_review_needed')
            AND vehicle_class IN ('car', 'motorcycle')
            ORDER BY priority_score DESC, updated_at ASC
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        """)

        try:
            result = await session.execute(query)
            row = result.fetchone()
            if row:
                return {
                    'id': row[0],
                    'make': row[1],
                    'marketing_name': row[2],
                    'year_from': row[3],
                    'vehicle_class': row[4]
                }
            return None
        except Exception as e:
            logger.error(f"Hiba a következő jármű lekérdezésekor: {e}")
            return None
|
||||
|
||||
def build_search_query(self, make: str, marketing_name: str, year_from: Optional[int]) -> str:
|
||||
"""
|
||||
Build search query for UltimateSpecs.
|
||||
"""
|
||||
# Clean and prepare the query
|
||||
make_clean = make.lower().replace(' ', '-').replace('.', '')
|
||||
model_clean = marketing_name.lower().replace(' ', '-').replace('.', '')
|
||||
|
||||
# Remove common suffixes
|
||||
for suffix in ['-', 'series', 'class', 'model']:
|
||||
if model_clean.endswith(suffix):
|
||||
model_clean = model_clean[:-len(suffix)].rstrip('-')
|
||||
|
||||
query_parts = [make_clean, model_clean]
|
||||
if year_from:
|
||||
query_parts.append(str(year_from))
|
||||
|
||||
return ' '.join(query_parts)
|
||||
|
||||
    async def extract_links_with_js(self, page: Page, make_url: str, model_word: str) -> List[Dict[str, str]]:
        """Collect spec-sheet links from the current page via in-page JS.

        The injected script keeps only <a> tags that point at /car-specs/
        or /motorcycles-specs/ pages, whose URL contains the target make
        (ad filtering), that match the model word (empty string disables
        the model filter), and that end in .html.

        Args:
            page: Playwright page already navigated to a results page.
            make_url: make slug as it appears in UltimateSpecs URLs.
            model_word: first word of the model name, lowercase.

        Returns:
            List of {'name': ..., 'url': ...} dicts (url may be relative);
            empty list if the JS evaluation fails.
        """
        js_code = """
            (args) => {
                let targetMakeUrl = args.makeUrl; // pl. 'honda' vagy 'alfa-romeo'
                let targetModel = args.modelWord; // pl. 'civic'
                let specs = [];
                document.querySelectorAll('a').forEach(a => {
                    let href = a.getAttribute('href') || '';
                    let text = a.innerText.trim();
                    let hrefLow = href.toLowerCase();
                    let textLow = text.toLowerCase();
                    if (hrefLow.includes('/car-specs/') || hrefLow.includes('/motorcycles-specs/')) {
                        // SZIGORÚ MÁRKA SZŰRŐ AZ URL-BEN (Reklámok ellen)
                        if (hrefLow.includes('/' + targetMakeUrl + '/') || hrefLow.includes(targetMakeUrl + '-models')) {
                            // MODELL SZŰRŐ A SZÖVEGBEN VAGY URL-BEN
                            if (targetModel === '' || textLow.includes(targetModel) || hrefLow.includes(targetModel)) {
                                if (hrefLow.endsWith('.html') && text.length > 1) {
                                    specs.push({ name: text, url: href });
                                }
                            }
                        }
                    }
                });
                return specs;
            }
        """

        try:
            # Arguments passed into the JS arrow function above.
            args = {
                'makeUrl': make_url.lower(),
                'modelWord': model_word.lower()
            }

            # Run the filter inside the page.
            specs = await page.evaluate(js_code, args)
            return specs
        except Exception as e:
            logger.error(f"Hiba a JS szűrő futtatásakor: {e}")
            return []
|
||||
|
||||
    async def search_and_extract_links(self, vehicle: Dict[str, Any]) -> List[Dict[str, str]]:
        """Run the UltimateSpecs search for *vehicle* and harvest spec links.

        Two-step drill-down: load the search page and filter links in-page;
        if nothing matched and we are still on the search results page
        (URL contains index.php), click the first spec link and filter
        again on the landing page.  Relative URLs are made absolute.

        Returns:
            List of {'name', 'url'} dicts; empty list on any error.
        """
        search_query = self.build_search_query(
            vehicle['make'],
            vehicle['marketing_name'],
            vehicle['year_from']
        )

        # Slug used by the in-page make filter (must match the URL form).
        make_url = vehicle['make'].lower().replace(' ', '-').replace('.', '')
        model_word = vehicle['marketing_name'].lower().split()[0] if vehicle['marketing_name'] else ''

        encoded_query = urllib.parse.quote(search_query)
        search_url = BASE_URL.format(query=encoded_query)

        logger.info(f"Keresés: {search_query} | URL: {search_url}")

        page = None
        try:
            page = await self.context.new_page()

            # Step 1: open the search page.
            await page.goto(search_url, wait_until='networkidle', timeout=30000)

            # Remember where we landed (category page vs. search results).
            current_url = page.url

            # Step 2: extract links with the JS filter.
            all_links = await self.extract_links_with_js(page, make_url, model_word)

            # No links on the results page: drill into the first hit.
            if not all_links and 'index.php' in current_url:
                first_link = await page.query_selector('a[href*="/car-specs/"], a[href*="/motorcycles-specs/"]')
                if first_link:
                    await first_link.click()
                    await page.wait_for_load_state('networkidle')

                    # Re-run the filter on the landing page.
                    all_links = await self.extract_links_with_js(page, make_url, model_word)

            # Normalise relative URLs to absolute ones.
            for link in all_links:
                if not link['url'].startswith('http'):
                    link['url'] = f"https://www.ultimatespecs.com{link['url']}"

            logger.info(f"{len(all_links)} link találva")
            return all_links

        except Exception as e:
            logger.error(f"Hiba a keresés során: {e}")
            return []
        finally:
            # Always release the tab, even on error.
            if page:
                await page.close()
|
||||
|
||||
    async def save_links_to_queue(self, session: AsyncSession, links: List[Dict[str, str]],
                                  vehicle: Dict[str, Any]) -> int:
        """Insert harvested links into the external reference queue.

        Commits per link on purpose: a duplicate (IntegrityError) or other
        failure on one URL must not roll back links saved before it.

        Args:
            session: open async DB session.
            links: {'name', 'url'} dicts from the search step.
            vehicle: the source vehicle row (supplies category/parent_id).

        Returns:
            Number of newly inserted rows; already-known URLs are skipped.
        """
        saved_count = 0

        for link in links:
            try:
                # Skip URLs already queued.  A concurrent worker may still
                # insert between this check and ours; the IntegrityError
                # branch below covers that race.
                existing_query = select(ExternalReferenceQueue).where(
                    ExternalReferenceQueue.url == link['url']
                )
                existing_result = await session.execute(existing_query)
                if existing_result.scalar_one_or_none():
                    logger.debug(f"URL már létezik: {link['url']}")
                    continue

                # New queue entry for the R1 scraper (level='engine').
                queue_entry = ExternalReferenceQueue(
                    url=link['url'],
                    level='engine',
                    category=vehicle['vehicle_class'] or 'car',
                    name=link['name'][:255],  # respect column length limit
                    parent_id=vehicle['id'],
                    status='pending'
                )

                session.add(queue_entry)
                await session.commit()
                saved_count += 1
                logger.debug(f"URL mentve: {link['url']}")

            except IntegrityError:
                # Unique constraint hit by a concurrent insert — not an error.
                await session.rollback()
                logger.debug(f"URL már létezik (integrity): {link['url']}")
            except Exception as e:
                await session.rollback()
                logger.error(f"Hiba a URL mentésekor: {e}")

        return saved_count
|
||||
|
||||
    async def update_vehicle_status(self, session: AsyncSession, vehicle_id: int,
                                    status: str, error_msg: Optional[str] = None):
        """Set status/last_error on a vehicle row and bump its attempt count.

        Swallows DB errors after logging — a failed status write must not
        kill the worker loop.

        Args:
            session: open async DB session.
            vehicle_id: primary key of the vehicle row.
            status: new status value.
            error_msg: optional error text stored in last_error.
        """
        try:
            query = text("""
                UPDATE vehicle.vehicle_model_definitions
                SET status = :status,
                    last_error = :error_msg,
                    updated_at = NOW(),
                    attempts = attempts + 1
                WHERE id = :id
            """)

            await session.execute(
                query,
                {'status': status, 'error_msg': error_msg, 'id': vehicle_id}
            )
            await session.commit()
            logger.info(f"Jármű státusz frissítve: {vehicle_id} -> {status}")

        except Exception as e:
            await session.rollback()
            logger.error(f"Hiba a státusz frissítésekor: {e}")
|
||||
|
||||
async def process_single_vehicle(self):
|
||||
"""
|
||||
Process a single vehicle: fetch, search, extract links, save to queue.
|
||||
"""
|
||||
async with AsyncSessionLocal() as session:
|
||||
try:
|
||||
# 1. Fetch next vehicle
|
||||
vehicle = await self.fetch_next_vehicle(session)
|
||||
if not vehicle:
|
||||
logger.info("Nincs feldolgozandó jármű")
|
||||
return False
|
||||
|
||||
logger.info(f"Feldolgozás: {vehicle['make']} {vehicle['marketing_name']} "
|
||||
f"(ID: {vehicle['id']})")
|
||||
|
||||
# 2. Search and extract links
|
||||
links = await self.search_and_extract_links(vehicle)
|
||||
|
||||
if not links:
|
||||
# No links found
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'research_failed_empty',
|
||||
'No links found on UltimateSpecs'
|
||||
)
|
||||
logger.warning(f"Nem található link: {vehicle['make']} {vehicle['marketing_name']}")
|
||||
return True
|
||||
|
||||
# 3. Save links to queue
|
||||
saved_count = await self.save_links_to_queue(session, links, vehicle)
|
||||
|
||||
# 4. Update vehicle status
|
||||
if saved_count > 0:
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'spider_dispatched',
|
||||
f'{saved_count} links added to queue'
|
||||
)
|
||||
logger.info(f"{saved_count} link mentve a queue-ba")
|
||||
else:
|
||||
# All links already existed
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'spider_dispatched',
|
||||
'All links already in queue'
|
||||
)
|
||||
logger.info("Minden link már szerepel a queue-ban")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a jármű feldolgozása során: {e}")
|
||||
# Try to update status with error
|
||||
try:
|
||||
if 'vehicle' in locals():
|
||||
await self.update_vehicle_status(
|
||||
session, vehicle['id'],
|
||||
'research_failed_network',
|
||||
str(e)[:500]
|
||||
)
|
||||
except:
|
||||
pass
|
||||
return True
|
||||
|
||||
    async def run(self):
        """Main loop: init the browser, process vehicles until stopped.

        NOTE(review): SLEEP_INTERVAL is drawn once at import time, so the
        per-iteration delay is constant for the process — confirm whether
        fresh jitter per cycle was intended.
        """
        logger.info("UltimateSpecs R0 Spider indítása...")

        try:
            await self.init_browser()

            while self.running:
                try:
                    # Handle one vehicle per iteration.
                    processed = await self.process_single_vehicle()

                    if not processed:
                        # Queue is empty — back off twice as long.
                        await asyncio.sleep(SLEEP_INTERVAL * 2)
                    else:
                        # Normal pacing between vehicles.
                        await asyncio.sleep(SLEEP_INTERVAL)

                except KeyboardInterrupt:
                    logger.info("Keyboard interrupt, leállítás...")
                    self.running = False
                    break
                except Exception as e:
                    # Keep the worker alive on unexpected errors.
                    logger.error(f"Hiba a fő ciklusban: {e}")
                    await asyncio.sleep(SLEEP_INTERVAL)

        finally:
            await self.close_browser()
            logger.info("UltimateSpecs R0 Spider leállt")
|
||||
|
||||
    def stop(self):
        """Request a graceful shutdown: the main loop exits after the
        current iteration (no in-flight work is cancelled)."""
        self.running = False
        logger.info("Leállítás kérése érkezett")
|
||||
|
||||
|
||||
async def main():
    """Entry point: wire SIGINT/SIGTERM to a graceful stop, then run."""
    spider = UltimateSpecsSpider()

    # Signal handling for graceful shutdown.
    def signal_handler(signum, frame):
        logger.info(f"Signal {signum} received, stopping...")
        spider.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        await spider.run()
    except Exception as e:
        logger.error(f"Váratlan hiba: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,355 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r1_scraper
|
||||
Producer-Consumer lánc második eleme (A Nyers Letöltő).
|
||||
Kivesz egy feldolgozandó linket a vehicle.auto_data_crawler_queue táblából (level='engine'),
|
||||
letölti a HTML tartalmat Playwright böngészővel, kinyeri a specifikációkat JS parserrel,
|
||||
és elmenti a vehicle.external_reference_library táblába.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
from playwright.async_api import async_playwright, Page, Browser, BrowserContext, TimeoutError as PlaywrightTimeoutError
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal, ensure_models_loaded
|
||||
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
|
||||
from app.models.vehicle.external_reference import ExternalReferenceLibrary
|
||||
|
||||
# Logging konfiguráció
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R1-SCRAPER] %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger("R1-SCRAPER")
|
||||
|
||||
# Configuration
# NOTE(review): random.uniform() is evaluated once at import time — the
# delay is fixed for the whole process; confirm per-cycle jitter intent.
SLEEP_INTERVAL = random.uniform(3, 6)  # wait 3-6 seconds between cycles
MAX_RETRIES = 3
# Page-title fragments that indicate a Cloudflare challenge page.
CLOUDFLARE_KEYWORDS = ["just a moment", "cloudflare", "checking your browser"]
|
||||
|
||||
|
||||
class UltimateSpecsScraper:
    """R1 scraper: pulls pending links from vehicle.auto_data_crawler_queue,
    downloads each spec page with Playwright, parses the tables in-page and
    stores the raw result in vehicle.external_reference_library."""

    def __init__(self):
        # Main-loop flag; flipped to False by stop() / the signal handler.
        self.running = True
        # Playwright handles; populated by init_browser().
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # Desktop Chrome user agent to reduce bot detection.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
|
||||
|
||||
    async def init_browser(self):
        """Start Playwright and open a headless Chromium context.

        NOTE(review): duplicated verbatim from the R0 spider — a shared
        base class or helper module would remove the copy.

        Raises:
            Exception: re-raised after logging if startup fails.
        """
        try:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    # Hide the automation fingerprint; container stability.
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                ]
            )
            self.context = await self.browser.new_context(
                user_agent=self.user_agent,
                viewport={'width': 1920, 'height': 1080},
                java_script_enabled=True
            )
            logger.info("Playwright böngésző inicializálva")
        except Exception as e:
            logger.error(f"Hiba a böngésző inicializálásakor: {e}")
            raise
|
||||
|
||||
async def close_browser(self):
|
||||
"""Playwright böngésző lezárása"""
|
||||
if self.context:
|
||||
await self.context.close()
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
logger.info("Playwright böngésző lezárva")
|
||||
|
||||
    async def fetch_next_queue_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
        """Pop the next pending 'engine' link from the crawler queue.

        Uses FOR UPDATE SKIP LOCKED so parallel scrapers never grab the
        same row; the lock lasts until the session's transaction ends.

        Returns:
            Dict with id/url/category/parent_id, or None when nothing is
            pending or the query failed.
        """
        query = text("""
            SELECT id, url, category, parent_id
            FROM vehicle.auto_data_crawler_queue
            WHERE level = 'engine' AND status = 'pending'
            FOR UPDATE SKIP LOCKED LIMIT 1
        """)

        try:
            result = await session.execute(query)
            row = result.fetchone()
            if row:
                return {
                    "id": row[0],
                    "url": row[1],
                    "category": row[2],
                    "parent_id": row[3]
                }
            return None
        except Exception as e:
            logger.error(f"Hiba a queue lekérdezésekor: {e}")
            return None
|
||||
|
||||
async def scrape_with_retry(self, url: str, max_retries: int = MAX_RETRIES) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Playwright böngészővel letölti a HTML tartalmat, retry logikával.
|
||||
"""
|
||||
for attempt in range(1, max_retries + 1):
|
||||
try:
|
||||
logger.info(f"Próbálkozás {attempt}/{max_retries}: {url}")
|
||||
page = await self.context.new_page()
|
||||
|
||||
# Navigáció
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
|
||||
# Várjunk a táblázatokra
|
||||
try:
|
||||
await page.wait_for_selector('table', timeout=5000)
|
||||
except PlaywrightTimeoutError:
|
||||
logger.warning("Nem található táblázat 5 másodpercen belül, de folytatjuk")
|
||||
|
||||
# Ellenőrizzük Cloudflare blokkolást
|
||||
title = await page.title()
|
||||
title_lower = title.lower()
|
||||
if any(keyword in title_lower for keyword in CLOUDFLARE_KEYWORDS):
|
||||
raise Exception(f"Cloudflare blokkolás észlelve: {title}")
|
||||
|
||||
# JS parser futtatása
|
||||
specs = await page.evaluate("""() => {
|
||||
let results = {};
|
||||
// 1. ÖSSZES táblázat letapogatása
|
||||
document.querySelectorAll('table').forEach(table => {
|
||||
table.querySelectorAll('tr').forEach(row => {
|
||||
let t = row.querySelector('.table_specs_title, .td_title, td:first-child, th:first-child');
|
||||
let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
|
||||
if(t && v) {
|
||||
let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
|
||||
let val = v.innerText.trim();
|
||||
if(k && val && val !== "-") { results[k] = val; }
|
||||
}
|
||||
});
|
||||
});
|
||||
// 2. Extra szekciók és dimenziók mentése
|
||||
const sections = {};
|
||||
document.querySelectorAll('h2, h3, h4, .section-title, .specs-header').forEach(header => {
|
||||
const title = header.innerText.trim();
|
||||
if (title && title.length > 0) {
|
||||
let nextElement = header.nextElementSibling;
|
||||
let sectionData = {};
|
||||
for (let i = 0; i < 5 && nextElement; i++) {
|
||||
if (nextElement.tagName === 'TABLE') {
|
||||
nextElement.querySelectorAll('tr').forEach(row => {
|
||||
let t = row.querySelector('td:first-child, th:first-child');
|
||||
let v = row.querySelector('td:last-child');
|
||||
if(t && v) {
|
||||
let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
|
||||
let val = v.innerText.trim();
|
||||
if(k && val && val !== "-") {
|
||||
sectionData[k] = val;
|
||||
results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
nextElement = nextElement.nextElementSibling;
|
||||
}
|
||||
sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
|
||||
}
|
||||
});
|
||||
results['_sections'] = sections;
|
||||
return results;
|
||||
}""")
|
||||
|
||||
await page.close()
|
||||
|
||||
if specs and len(specs) > 0:
|
||||
logger.info(f"Sikeres letöltés, {len(specs)} specifikáció kinyerve")
|
||||
return specs
|
||||
else:
|
||||
logger.warning("Üres specifikációk, újrapróbálkozás")
|
||||
raise Exception("Üres specifikációk")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a {attempt}. próbálkozásnál: {e}")
|
||||
if attempt < max_retries:
|
||||
backoff = random.uniform(2, 5)
|
||||
logger.info(f"Várakozás {backoff:.1f} másodpercet...")
|
||||
await asyncio.sleep(backoff)
|
||||
else:
|
||||
logger.error(f"Összes próbálkozás sikertelen: {e}")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
async def process_queue_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Feldolgoz egy queue tételt: letölti, kinyeri, elmenti.
|
||||
"""
|
||||
queue_id = item["id"]
|
||||
url = item["url"]
|
||||
category = item["category"]
|
||||
|
||||
try:
|
||||
# 1. Letöltés
|
||||
specs = await self.scrape_with_retry(url)
|
||||
|
||||
if not specs:
|
||||
# Hiba esetén frissítjük a queue-t
|
||||
await session.execute(
|
||||
text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue
|
||||
SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
|
||||
WHERE id = :id
|
||||
"""),
|
||||
{"error_msg": "Sikertelen letöltés (üres specifikációk vagy Cloudflare)", "id": queue_id}
|
||||
)
|
||||
await session.commit()
|
||||
logger.error(f"Queue {queue_id} sikertelen, státusz: error")
|
||||
return False
|
||||
|
||||
# 2. Új rekord létrehozása az external_reference_library táblában (nyers SQL)
|
||||
# A specifications dict-et JSON stringgé alakítjuk
|
||||
import json
|
||||
specs_json = json.dumps(specs)
|
||||
insert_query = text("""
|
||||
INSERT INTO vehicle.external_reference_library
|
||||
(source_name, source_url, category, specifications, pipeline_status, created_at, last_scraped_at)
|
||||
VALUES (:source_name, :source_url, :category, CAST(:specifications AS jsonb), :pipeline_status, NOW(), NOW())
|
||||
RETURNING id
|
||||
""")
|
||||
result = await session.execute(
|
||||
insert_query,
|
||||
{
|
||||
"source_name": "ultimatespecs",
|
||||
"source_url": url,
|
||||
"category": category,
|
||||
"specifications": specs_json,
|
||||
"pipeline_status": "pending_enrich"
|
||||
}
|
||||
)
|
||||
new_id = result.scalar()
|
||||
|
||||
# 3. Queue tétel frissítése completed-re
|
||||
await session.execute(
|
||||
text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue
|
||||
SET status = 'completed', updated_at = NOW()
|
||||
WHERE id = :id
|
||||
"""),
|
||||
{"id": queue_id}
|
||||
)
|
||||
|
||||
await session.commit()
|
||||
logger.info(f"Queue {queue_id} sikeresen feldolgozva, library ID: {new_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hiba a queue {queue_id} feldolgozásakor: {e}")
|
||||
await session.rollback()
|
||||
|
||||
# Hiba esetén error státusz
|
||||
try:
|
||||
await session.execute(
|
||||
text("""
|
||||
UPDATE vehicle.auto_data_crawler_queue
|
||||
SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
|
||||
WHERE id = :id
|
||||
"""),
|
||||
{"error_msg": str(e)[:500], "id": queue_id}
|
||||
)
|
||||
await session.commit()
|
||||
except Exception as update_err:
|
||||
logger.error(f"Hiba a queue frissítésekor: {update_err}")
|
||||
|
||||
return False
|
||||
|
||||
    async def run_once(self):
        """Single processing cycle: pick one queue item and handle it.

        Returns:
            True on successful processing, False when idle or on error.
        """
        # Make sure ORM models are registered before the session is used.
        ensure_models_loaded()

        async with AsyncSessionLocal() as session:
            try:
                # NOTE(review): session.begin() opens a transaction that is
                # committed on exit, but process_queue_item() also calls
                # session.commit()/rollback() itself — in SQLAlchemy 2.x
                # async these two owners can conflict; confirm the intended
                # transaction boundary.
                async with session.begin():
                    item = await self.fetch_next_queue_item(session)
                    if not item:
                        logger.info("Nincs feldolgozandó queue tétel")
                        return False

                    logger.info(f"Feldolgozás: {item['url']}")
                    success = await self.process_queue_item(session, item)
                    return success

            except Exception as e:
                logger.error(f"Hiba a run_once-ban: {e}")
                return False
|
||||
|
||||
    async def run_loop(self):
        """Main loop: one queue item per cycle with randomized pacing.

        NOTE(review): SLEEP_INTERVAL is drawn once at import — the idle
        backoff is constant per process; confirm per-cycle jitter intent.
        """
        await self.init_browser()

        try:
            while self.running:
                success = await self.run_once()

                if not success:
                    # Idle (or failed) cycle: back off for the interval.
                    sleep_time = SLEEP_INTERVAL
                    logger.debug(f"Várakozás {sleep_time:.1f} másodpercet...")
                    await asyncio.sleep(sleep_time)
                else:
                    # Short pause after a successfully processed item.
                    await asyncio.sleep(random.uniform(1, 2))

        except KeyboardInterrupt:
            logger.info("Keyboard interrupt, leállítás...")
        except Exception as e:
            logger.error(f"Váratlan hiba a fő ciklusban: {e}")
        finally:
            # Always release the browser, whatever ended the loop.
            await self.close_browser()
|
||||
|
||||
    def stop(self):
        """Request a graceful shutdown: the main loop exits after the
        current iteration (no in-flight work is cancelled)."""
        self.running = False
        logger.info("Leállítási jelzés küldve")
|
||||
|
||||
|
||||
async def main():
    """Entry point: install signal handlers, then run the scraper loop."""
    scraper = UltimateSpecsScraper()

    # Graceful shutdown on SIGINT/SIGTERM.
    def signal_handler(signum, frame):
        scraper.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        await scraper.run_loop()
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r2_enricher
|
||||
Producer-Consumer lánc harmadik eleme (Az Elemző). Offline adattisztítást és strukturálást végez.
|
||||
Kivesz egy feldolgozandó sort a vehicle.external_reference_library táblából (pipeline_status='pending_enrich'),
|
||||
hozzácsatolja a vehicle.auto_data_crawler_queue adatait, kinyeri a standard értékeket a nyers JSON-ből,
|
||||
és strukturált JSON-be csomagolja (standardized + _raw).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Logging konfiguráció
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R2-ENRICHER] %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger("R2-ENRICHER")
|
||||
|
||||
# Configuration
# NOTE(review): evaluated once at import — the delay is fixed per process.
SLEEP_INTERVAL = random.uniform(1, 3)  # wait 1-3 seconds between cycles

# Fuzzy keyword mapping for numeric metrics: target field -> candidate
# spec-key substrings, tried in priority order (first parseable match wins).
FUZZY_MAPPING = {
    "power_kw": ["horsepower", "total electric power", "engine power", "maximum power", "power"],
    "engine_capacity": ["engine displacement", "displacement", "capacity", "cm3", "cu-in"],
    "torque_nm": ["maximum torque", "total electric torque", "torque"],
    "max_speed": ["top speed", "maximum speed"],
    "curb_weight": ["curb weight", "weight"],
    "wheelbase": ["wheelbase"],
    "seats": ["num. of seats", "seats"]
}

# Candidate spec-key substrings for free-text fields.
TEXT_FIELD_KEYWORDS = {
    "fuel_type": ["fuel type", "fuel", "engine fuel", "fuel system"],
    "transmission_type": ["transmission", "gear", "gearbox"],
    "drive_type": ["drive type", "drive", "drivetrain"],
    "body_type": ["body type", "body", "car body"]
}
|
||||
|
||||
|
||||
class UltimateSpecsEnricher:
    """R2 enricher: offline cleaning/structuring of raw scraped specs
    (see module docstring).  Pure CPU work — no browser needed."""

    def __init__(self):
        # Main-loop flag; flipped to False on shutdown request.
        self.running = True
||||
|
||||
    async def fetch_next_library_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
        """Pop the next 'pending_enrich' row from the reference library.

        Uses FOR UPDATE SKIP LOCKED so concurrent enrichers don't collide.

        NOTE(review): a non-dict `specifications` value (e.g. a JSON
        string) is silently replaced with {} — confirm the jsonb column
        always deserializes to a dict, otherwise raw data is dropped here.

        Returns:
            Dict with id/specifications/make/model/year_from, or None when
            nothing is pending or the query failed.
        """
        query = text("""
            SELECT id, specifications, make, model, year_from
            FROM vehicle.external_reference_library
            WHERE pipeline_status = 'pending_enrich'
            FOR UPDATE SKIP LOCKED LIMIT 1
        """)

        try:
            result = await session.execute(query)
            row = result.fetchone()
            if row:
                return {
                    "id": row[0],
                    "specifications": row[1] if isinstance(row[1], dict) else {},
                    "make": row[2],
                    "model": row[3],
                    "year_from": row[4]
                }
            return None
        except SQLAlchemyError as e:
            logger.error(f"SQL hiba a lekérdezés során: {e}")
            return None
|
||||
|
||||
def extract_fuzzy_metric(self, specifications: Dict[str, Any], target_key: str, keywords: List[str]) -> Optional[float]:
|
||||
"""
|
||||
Keres a specifications szótárban a megadott kulcsszavak alapján, és számot próbál kinyerni.
|
||||
"""
|
||||
if not specifications:
|
||||
return None
|
||||
|
||||
# Először próbáljuk meg a kulcsokat (case-insensitive)
|
||||
spec_lower = {k.lower(): v for k, v in specifications.items()}
|
||||
|
||||
for keyword in keywords:
|
||||
for key, value in spec_lower.items():
|
||||
if keyword.lower() in key:
|
||||
# Ha a érték szám vagy string, próbáljuk kinyerni a számot
|
||||
num = self.clean_number(value)
|
||||
if num is not None:
|
||||
# Ha a kulcs tartalmazza a "hp" vagy "horsepower" és a cél kW, konvertáljuk
|
||||
if target_key == "power_kw" and ("hp" in key or "horsepower" in key):
|
||||
# hp -> kW konverzió (1 hp = 0.7457 kW)
|
||||
num = num * 0.7457
|
||||
return num
|
||||
return None
|
||||
|
||||
def clean_number(self, value: Any) -> Optional[float]:
|
||||
"""
|
||||
Kinyeri a számot egy stringből vagy más típusból.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
# Távolítsuk el a nem szám karaktereket, kivéve pont és mínusz
|
||||
# Keresünk mintákat mint "120 kW" vagy "120kW"
|
||||
match = re.search(r'([-+]?\d*\.?\d+)\s*(?:kW|hp|cc|Nm|kg|km/h|mph)?', value, re.IGNORECASE)
|
||||
if match:
|
||||
try:
|
||||
return float(match.group(1))
|
||||
except ValueError:
|
||||
pass
|
||||
# Ha nincs specifikus egység, próbáljunk meg bármilyen számot kinyerni
|
||||
matches = re.findall(r'[-+]?\d*\.?\d+', value)
|
||||
if matches:
|
||||
try:
|
||||
return float(matches[0])
|
||||
except ValueError:
|
||||
pass
|
||||
return None
|
||||
|
||||
def extract_text_field(self, specifications: Dict[str, Any], keywords: List[str]) -> Optional[str]:
|
||||
"""
|
||||
Kinyer egy szöveges mezőt a specifications-ből a kulcsszavak alapján.
|
||||
"""
|
||||
if not specifications:
|
||||
return None
|
||||
|
||||
spec_lower = {k.lower(): v for k, v in specifications.items()}
|
||||
|
||||
for keyword in keywords:
|
||||
for key, value in spec_lower.items():
|
||||
if keyword.lower() in key:
|
||||
if isinstance(value, str):
|
||||
return value.strip()
|
||||
elif isinstance(value, (int, float)):
|
||||
return str(value)
|
||||
return None
|
||||
|
||||
def enrich_specifications(self, raw_specs: Dict[str, Any], make: str, model: str, year_from: int) -> Dict[str, Any]:
    """Build the enriched specifications JSON for one library row.

    Runs the fuzzy metric extractor for every FUZZY_MAPPING target and the
    text extractor for every TEXT_FIELD_KEYWORDS field, then wraps the
    results together with the untouched raw R1 payload.

    NOTE(review): make/model/year_from are currently unused here; kept for
    interface compatibility with callers.
    """
    # Numeric metrics (power, capacity, ...) via fuzzy key matching.
    standardized = {
        target: self.extract_fuzzy_metric(raw_specs, target, keys)
        for target, keys in FUZZY_MAPPING.items()
    }

    # Free-text fields (fuel type, body type, ...).
    for field, keys in TEXT_FIELD_KEYWORDS.items():
        standardized[field] = self.extract_text_field(raw_specs, keys)

    return {
        "standardized": standardized,
        "_raw": raw_specs,  # the original R1 payload is kept verbatim
    }
|
||||
|
||||
async def process_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
    """Process one library row: structure its specs and persist the result.

    Extracts standardized metrics from item['specifications'], writes them to
    the physical columns (power_kw, engine_cc) plus the rebuilt specifications
    JSON, and advances pipeline_status to 'pending_match' for the next worker.

    Returns True on success, False after logging + rollback on any error.

    NOTE(review): this commits/rolls back the session directly, while the
    caller (run_once) wraps the call in `async with session.begin()` — an
    explicit commit inside an active begin() block looks conflicting; confirm
    against the SQLAlchemy version in use.
    """
    try:
        logger.info(f"Feldolgozás: ID={item['id']}, {item['make']} {item['model']} ({item['year_from']})")

        # Extract and structure the data (standardized values + raw payload).
        updated_specs = self.enrich_specifications(
            item['specifications'],
            item['make'],
            item['model'],
            item['year_from']
        )

        # Values promoted to physical columns for fast querying.
        power_kw = updated_specs['standardized'].get('power_kw')
        engine_cc = updated_specs['standardized'].get('engine_capacity')

        # Run the UPDATE: store the metrics and JSON, hand the row to R3.
        update_query = text("""
            UPDATE vehicle.external_reference_library
            SET power_kw = :power_kw,
                engine_cc = :engine_cc,
                make = :make,
                model = :model,
                year_from = :year_from,
                specifications = :updated_specifications,
                pipeline_status = 'pending_match'
            WHERE id = :id
        """)

        params = {
            # Floats are truncated to int for the integer columns; NULLs kept.
            "power_kw": int(power_kw) if power_kw is not None else None,
            "engine_cc": int(engine_cc) if engine_cc is not None else None,
            "make": item['make'],
            "model": item['model'],
            "year_from": item['year_from'],
            "updated_specifications": json.dumps(updated_specs),
            "id": item['id']
        }

        await session.execute(update_query, params)
        await session.commit()

        logger.info(f"Sikeres frissítés: ID={item['id']}, power_kw={power_kw}, engine_cc={engine_cc}")
        return True

    except Exception as e:
        logger.error(f"Hiba a feldolgozás során ID={item['id']}: {e}")
        await session.rollback()
        return False
|
||||
|
||||
async def run_once(self):
    """Run a single processing cycle.

    Opens a fresh session, fetches the next queued library item and hands it
    to process_item(). Returns True when an item was processed successfully,
    False when there is no work or a database error occurred.
    """
    async with AsyncSessionLocal() as session:
        try:
            # Start a transaction for the fetch + processing pass.
            async with session.begin():
                item = await self.fetch_next_library_item(session)
                if not item:
                    logger.debug("Nincs feldolgozandó elem")
                    return False

                # NOTE(review): process_item() commits inside this begin()
                # block — verify that interplay against the SQLAlchemy
                # session-transaction semantics in use.
                success = await self.process_item(session, item)
                return success
        except SQLAlchemyError as e:
            logger.error(f"Adatbázis hiba: {e}")
            return False
|
||||
|
||||
async def run_loop(self):
    """Main worker loop.

    Repeats run_once() until the running flag is cleared. When a pass finds
    no work, sleeps SLEEP_INTERVAL before polling again; unexpected errors
    are logged and followed by the same back-off so the loop survives
    transient failures.
    """
    logger.info("R2 Enricher indítva...")

    while self.running:
        try:
            # Back off only when the pass produced no work.
            if not await self.run_once():
                await asyncio.sleep(SLEEP_INTERVAL)
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt, leállítás...")
            self.running = False
            break
        except Exception as e:
            logger.error(f"Váratlan hiba a ciklusban: {e}")
            await asyncio.sleep(SLEEP_INTERVAL)

    logger.info("R2 Enricher leállt")
|
||||
|
||||
def stop(self):
    """Signal the worker loop to shut down after its current pass."""
    self.running = False
|
||||
|
||||
|
||||
async def main():
    """Entry point: wire up signal handlers and run the enricher loop."""
    enricher = UltimateSpecsEnricher()

    # Signal handling: SIGINT/SIGTERM clear the running flag so the loop
    # exits after its current pass (graceful shutdown).
    def signal_handler(signum, frame):
        logger.info(f"Signal {signum} fogadva, leállítás...")
        enricher.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        await enricher.run_loop()
    except asyncio.CancelledError:
        logger.info("Task cancelled")
    finally:
        logger.info("R2 Enricher befejezte a munkát.")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Worker: vehicle_ultimate_r3_finalizer
|
||||
Producer-Consumer lánc negyedik, utolsó eleme (Az Összevezető).
|
||||
Offline dolgozik egy végtelen while ciklusban (1-3 mp delay), és a meglévő adatbázis-táblákat szinkronizálja.
|
||||
|
||||
1. Lekérdezés (JOIN a Queue-val): Kivesz egy `pending_match` sort a Library-ből, és a Queue-ból lekéri az eredeti `parent_id`-t és a link nevét.
|
||||
2. Szülő (Base VMD) ellenőrzése: Lekérdezi az eredeti szülő rekordot a VMD táblából a parent_id alapján.
|
||||
3. Összevezetés (UPDATE vagy INSERT): A letisztított adatok a lib.specifications['standardized'] dict-ből jönnek.
|
||||
- A ÁG: Ha a szülő status értéke IN ('pending', 'manual_review_needed'): UPDATE a szülő (VMD) rekordon
|
||||
- B ÁG: Ha a szülő status MÁR NEM 'pending': INSERT új variációként a VMD táblába
|
||||
4. Library lezárása: Frissíti a Library táblát pipeline_status = 'completed', matched_vmd_id beállítása.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import sys
|
||||
import signal
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List, Tuple
|
||||
|
||||
from sqlalchemy import text, select, and_, or_
|
||||
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Logging konfiguráció
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R3-FINALIZER] %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger("R3-FINALIZER")
|
||||
|
||||
# Configuration
# NOTE(review): random.uniform() runs once at import time, so every idle wait
# reuses this same fixed delay for the whole process lifetime — presumably a
# per-wait random delay was intended; confirm before changing.
SLEEP_INTERVAL = random.uniform(1, 3)  # 1-3 s wait between idle polls
|
||||
|
||||
|
||||
class UltimateSpecsFinalizer:
    """R3 finalizer: merges enriched library rows back into the VMD table.

    Last worker of the producer-consumer chain. One cycle:
      1. fetch one 'pending_match' row from external_reference_library,
         joined with the crawler queue for parent_id / variant name;
      2. load the parent VMD record (row-locked);
      3. branch A — parent status in ('pending', 'manual_review_needed'):
         UPDATE the parent with the standardized values;
         branch B — otherwise: INSERT a new variant VMD row;
      4. close the library row: pipeline_status='completed', matched_vmd_id set.
    """

    def __init__(self):
        # Cooperative shutdown flag; cleared by stop() / the signal handler.
        self.running = True

    async def fetch_pending_match(self, session: "AsyncSession") -> Optional[Dict[str, Any]]:
        """Fetch one 'pending_match' library row joined with its queue entry.

        FOR UPDATE OF lib SKIP LOCKED ensures concurrent workers never pick
        the same row. Returns a plain dict, or None when nothing is pending.
        """
        query = text("""
            SELECT lib.id, lib.source_url, lib.make, lib.model, lib.year_from,
                   lib.power_kw, lib.engine_cc, lib.specifications, lib.category,
                   q.parent_id, q.name AS variant_name
            FROM vehicle.external_reference_library lib
            JOIN vehicle.auto_data_crawler_queue q ON lib.source_url = q.url
            WHERE lib.pipeline_status = 'pending_match'
            FOR UPDATE OF lib SKIP LOCKED LIMIT 1
        """)
        result = await session.execute(query)
        row = result.fetchone()
        if not row:
            return None

        return {
            "lib_id": row[0],
            "source_url": row[1],
            "make": row[2],
            "model": row[3],
            "year_from": row[4],
            "power_kw": row[5],
            "engine_cc": row[6],
            "specifications": row[7] if row[7] else {},
            "category": row[8],
            "parent_id": row[9],
            "variant_name": row[10],
        }

    async def get_parent_vmd(self, session: "AsyncSession", parent_id: int) -> Optional[Dict[str, Any]]:
        """Load the parent VMD row (id, status) by parent_id.

        Uses FOR UPDATE so the status check and the later UPDATE happen under
        the same row lock, avoiding concurrent finalizers racing on the parent.
        """
        query = text("""
            SELECT id, status FROM vehicle.vehicle_model_definitions
            WHERE id = :parent_id FOR UPDATE
        """)
        result = await session.execute(query, {"parent_id": parent_id})
        row = result.fetchone()
        if not row:
            return None

        return {
            "id": row[0],
            "status": row[1],
        }

    def extract_standardized_data(self, specifications: Dict[str, Any]) -> Dict[str, Any]:
        """Pull the standardized values out of specifications['standardized'].

        Text fields are truncated to 50 characters to fit the VMD columns;
        keys whose value is None are dropped from the result.
        """
        standardized = specifications.get('standardized', {})

        extracted = {
            "power_kw": standardized.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
        }

        # Clamp the free-text columns to the VMD schema's 50-char limit.
        for field in ("fuel_type", "transmission_type", "drive_type", "body_type"):
            value = extracted.get(field)
            if isinstance(value, str) and len(value) > 50:
                extracted[field] = value[:50]

        # Drop unset values so later "or" fallbacks and dynamic SQL stay clean.
        return {k: v for k, v in extracted.items() if v is not None}

    async def update_parent_vmd(self, session: "AsyncSession", parent_id: int,
                                lib_data: Dict[str, Any], standardized: Dict[str, Any]) -> int:
        """Branch A: update the parent VMD row with the standardized values.

        Library-level columns act as fallbacks when the standardized dict
        lacks power/capacity. Status moves to 'awaiting_ai_synthesis'.
        Returns the parent id (used as matched_vmd_id).
        """
        update_fields = {
            "power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
            "status": "awaiting_ai_synthesis",
            "updated_at": datetime.utcnow(),  # NOTE: naive UTC, matches existing rows
            "source": "ultimatespecs",
            "priority_score": 30,
        }

        # Never overwrite existing columns with NULL.
        update_fields = {k: v for k, v in update_fields.items() if v is not None}

        # Dynamic SET clause is safe here: the keys come from the fixed dict
        # above, never from user input, so there is no injection surface.
        set_clause = ", ".join(f"{k} = :{k}" for k in update_fields)

        query = text(f"""
            UPDATE vehicle.vehicle_model_definitions
            SET {set_clause}
            WHERE id = :parent_id
            RETURNING id
        """)

        result = await session.execute(query, {"parent_id": parent_id, **update_fields})
        updated_id = result.scalar()

        logger.info(f"UPDATE parent VMD {parent_id} with {len(update_fields)} fields")
        return updated_id

    async def insert_variant_vmd(self, session: "AsyncSession", lib_data: Dict[str, Any],
                                 standardized: Dict[str, Any], variant_name: str) -> int:
        """Branch B: insert a new variant row into the VMD table.

        Identity is (lib.make, variant_name, lib.year_from); status is
        'awaiting_ai_synthesis', source 'ultimatespecs', priority_score 30.
        Returns the new id. On a duplicate-key violation the transaction is
        rolled back and the existing row's id is returned instead.

        NOTE(review): the rollback in the duplicate path also releases the
        FOR UPDATE locks taken earlier in this transaction; the subsequent
        library close then runs in a fresh transaction — confirm acceptable.
        """
        insert_data = {
            "make": lib_data["make"],
            "marketing_name": variant_name,
            "official_marketing_name": variant_name,
            "year_from": lib_data["year_from"],
            "power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
            "engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
            "torque_nm": standardized.get("torque_nm"),
            "max_speed": standardized.get("max_speed"),
            "curb_weight": standardized.get("curb_weight"),
            "wheelbase": standardized.get("wheelbase"),
            "seats": standardized.get("seats"),
            "fuel_type": standardized.get("fuel_type"),
            "transmission_type": standardized.get("transmission_type"),
            "drive_type": standardized.get("drive_type"),
            "body_type": standardized.get("body_type"),
            "status": "awaiting_ai_synthesis",
            "vehicle_class": lib_data.get("category"),
            "source": "ultimatespecs",
            "priority_score": 30,
            "created_at": datetime.utcnow(),
            "updated_at": datetime.utcnow(),
            "market": "EU",
            "normalized_name": f"{lib_data['make']} {variant_name}",
            "technical_code": "UNKNOWN",
            "variant_code": "UNKNOWN",
            "version_code": "UNKNOWN",
            "specifications": json.dumps({}),    # empty JSON: column is NOT NULL
            "raw_api_data": json.dumps({}),      # empty JSON
            "research_metadata": json.dumps({}), # empty JSON
            "raw_search_context": "",            # empty string
        }

        # Never insert explicit NULLs; let column defaults apply instead.
        insert_data = {k: v for k, v in insert_data.items() if v is not None}

        columns = ", ".join(insert_data)
        placeholders = ", ".join(f":{k}" for k in insert_data)

        try:
            # Attempt the insert; identity columns come from fixed keys above.
            query = text(f"""
                INSERT INTO vehicle.vehicle_model_definitions ({columns})
                VALUES ({placeholders})
                RETURNING id
            """)

            result = await session.execute(query, insert_data)
            new_id = result.scalar()

            logger.info(f"INSERT new variant VMD {new_id} for {lib_data['make']} {variant_name}")
            return new_id

        except IntegrityError as e:
            # Duplicate key violation: the row already exists.
            logger.warning(f"Duplicate key violation for {lib_data['make']} {variant_name}: {e}. Rolling back and looking for existing record...")

            # Roll back the aborted transaction before issuing new statements.
            await session.rollback()

            # Look up the existing record in a fresh transaction.
            find_query = text("""
                SELECT id FROM vehicle.vehicle_model_definitions
                WHERE make = :make
                  AND marketing_name = :marketing_name
                  AND year_from = :year_from
                LIMIT 1
            """)

            find_params = {
                "make": lib_data["make"],
                "marketing_name": variant_name,
                "year_from": lib_data["year_from"],
            }

            result = await session.execute(find_query, find_params)
            existing_id = result.scalar()

            if existing_id:
                logger.info(f"Found existing VMD {existing_id} for {lib_data['make']} {variant_name}")
                return existing_id

            # Duplicate reported but no matching row found: re-raise.
            logger.error(f"Duplicate key but could not find existing record for {lib_data['make']} {variant_name}")
            raise

    async def close_library_entry(self, session: "AsyncSession", lib_id: int, matched_vmd_id: int):
        """Close the library row: pipeline_status='completed', matched_vmd_id set."""
        query = text("""
            UPDATE vehicle.external_reference_library
            SET pipeline_status = 'completed',
                matched_vmd_id = :matched_vmd_id
            WHERE id = :lib_id
        """)
        await session.execute(query, {"lib_id": lib_id, "matched_vmd_id": matched_vmd_id})
        logger.info(f"Library {lib_id} closed with matched_vmd_id {matched_vmd_id}")

    async def process_one(self):
        """Process a single pending_match record end to end.

        Returns True on success, False when there is no work or an error was
        handled (logged + rolled back).
        """
        # BUG FIX: defined up front so the except block can always reference it.
        # Previously, if fetch_pending_match() itself raised, lib_data was
        # unbound and the handler crashed with NameError, masking the real error.
        lib_data = None
        async with AsyncSessionLocal() as session:
            try:
                # 1. Pull the next pending_match row (row-locked).
                lib_data = await self.fetch_pending_match(session)
                if not lib_data:
                    return False

                logger.info(f"Processing library ID {lib_data['lib_id']} for {lib_data['make']} {lib_data['model']}")

                # 2. Inspect the parent VMD, if the queue recorded one.
                parent_vmd = None
                if lib_data['parent_id']:
                    parent_vmd = await self.get_parent_vmd(session, lib_data['parent_id'])

                # 3. Extract the standardized values.
                standardized = self.extract_standardized_data(lib_data['specifications'])

                # 4. Decide: UPDATE the parent or INSERT a new variant.
                matched_vmd_id = None

                if parent_vmd and parent_vmd['status'] in ('pending', 'manual_review_needed'):
                    # Branch A: refresh the parent row in place.
                    matched_vmd_id = await self.update_parent_vmd(
                        session, parent_vmd['id'], lib_data, standardized
                    )
                else:
                    # Branch B: record a brand-new variant.
                    matched_vmd_id = await self.insert_variant_vmd(
                        session, lib_data, standardized, lib_data['variant_name']
                    )

                # 5. Close the library entry.
                await self.close_library_entry(session, lib_data['lib_id'], matched_vmd_id)

                await session.commit()
                logger.info(f"Successfully finalized library {lib_data['lib_id']} -> VMD {matched_vmd_id}")
                return True

            except Exception as e:
                await session.rollback()
                lib_id = lib_data.get('lib_id', 'unknown') if lib_data else 'unknown'
                logger.error(f"Error processing library {lib_id}: {e}")
                return False

    async def run(self, max_iterations: int = 10):
        """Main loop: up to max_iterations cycles with short pauses.

        Idle or failed cycles back off for SLEEP_INTERVAL seconds; successful
        ones pause 0.5 s. The counter advances on every cycle — including
        error cycles — so the worker always terminates.

        Args:
            max_iterations: Maximum number of processing cycles (default: 10)
        """
        logger.info(f"R3 Finalizer started. Max iterations: {max_iterations}. Waiting for pending_match entries...")

        iteration = 0
        while self.running and iteration < max_iterations:
            try:
                processed = await self.process_one()
                if not processed:
                    # No work (or a handled failure): back off before retrying.
                    await asyncio.sleep(SLEEP_INTERVAL)
                else:
                    # Short breather after a successful cycle.
                    await asyncio.sleep(0.5)

                # Count every cycle, successful or not.
                iteration += 1
                logger.info(f"Iteration {iteration}/{max_iterations} completed.")
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Unexpected error in main loop: {e}")
                await asyncio.sleep(5)
                # Count error cycles too, so the loop cannot spin forever.
                iteration += 1
                logger.info(f"Iteration {iteration}/{max_iterations} completed after error.")

        logger.info(f"R3 Finalizer completed {iteration} iterations. Stopping.")
        self.stop()

    def stop(self):
        """Signal the loop to terminate after the current cycle."""
        self.running = False
        logger.info("R3 Finalizer stopping...")
|
||||
|
||||
|
||||
def main():
    """Entry point: install signal handlers and run the finalizer loop."""
    finalizer = UltimateSpecsFinalizer()

    # SIGINT/SIGTERM clear the running flag so run() exits gracefully
    # after its current cycle.
    def signal_handler(signum, frame):
        logger.info(f"Received signal {signum}, shutting down...")
        finalizer.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Start the main loop — iteration count capped for testing.
    try:
        # For testing: at most 5 iterations.
        asyncio.run(finalizer.run(max_iterations=5))
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt received, shutting down...")
    finally:
        logger.info("R3 Finalizer stopped.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user