átlagos kiegészítések, jó sok
This commit is contained in:
59
backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
Normal file
59
backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R0_brand_hunter.py
|
||||
import asyncio, logging
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [BIKE-R0] %(message)s')
|
||||
logger = logging.getLogger("R0")
|
||||
|
||||
SOURCES = [
|
||||
{
|
||||
"name": "AutoEvolution",
|
||||
"url": "https://www.autoevolution.com/moto/",
|
||||
# Robusztusabb szelektor a márkákhoz
|
||||
"selector": ".brand a, .all-brands a, .moto-brand a",
|
||||
"category": "bike"
|
||||
}
|
||||
]
|
||||
|
||||
async def run_r0():
    """Seed the bike crawler queue with brand-level AutoEvolution URLs.

    Loads each configured source page, gathers every '/moto/' anchor and
    keeps only the clean brand links (exactly 5 slashes, not '.html'),
    inserting them as 'brand'-level pending tasks.  Duplicate URLs are
    ignored via ON CONFLICT DO NOTHING.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0")

        async with AsyncSessionLocal() as db:
            for source in SOURCES:
                page = await context.new_page()
                try:
                    logger.info(f"Márkák kinyerése: {source['name']}...")
                    await page.goto(source['url'], wait_until="networkidle", timeout=60000)

                    # Cast a wide net: every '/moto/' anchor on the page.
                    # NOTE(review): the per-source 'selector' config is not
                    # used here — this broad query replaces it.
                    anchors = await page.eval_on_selector_all(
                        "a[href*='/moto/']",
                        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))")

                    # Brand pages look like https://domain/moto/<brand>/ — that
                    # is exactly 5 slashes; '.html' pages are spec sheets.
                    brand_anchors = [a for a in anchors
                                     if a['url'].count('/') == 5 and not a['url'].endswith('.html')]

                    saved = 0
                    for brand in brand_anchors:
                        if len(brand['name']) < 2:
                            continue  # too short to be a real brand name
                        await db.execute(text("""
                            INSERT INTO vehicle.auto_data_crawler_queue (url, level, name, status, category)
                            VALUES (:url, 'brand', :name, 'pending', 'bike')
                            ON CONFLICT (url) DO NOTHING
                        """), {"url": brand['url'], "name": brand['name']})
                        # NOTE(review): counts attempted inserts, including
                        # duplicates skipped by ON CONFLICT.
                        saved += 1

                    await db.commit()
                    logger.info(f"✅ [{source['name']}] kész: {saved} márkát találtam.")
                except Exception as e:
                    logger.error(f"❌ Hiba: {e}")
                finally:
                    await page.close()
        await browser.close()

if __name__ == "__main__":
    asyncio.run(run_r0())
|
||||
171
backend/app/workers/vehicle/bike/bike_R1_model_scout.py
Normal file
171
backend/app/workers/vehicle/bike/bike_R1_model_scout.py
Normal file
@@ -0,0 +1,171 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
|
||||
# Megtartjuk a részletes naplózást minden eseményhez
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [BIKE-R1-AUTOEVO] %(message)s'
|
||||
)
|
||||
logger = logging.getLogger("R1")
|
||||
|
||||
async def analyze_and_extract_links(page, current_url):
    """Classify every '/moto/' link on *page* into crawl levels.

    Returns a list of {'name', 'url', 'level'} dicts where level is
    'engine' for '.html' spec sheets and 'model' for deep (>= 6 path
    segment) sub-model/generation pages.  Language shield: links with
    non-ASCII text and navigation noise are dropped; '#anchors' and
    '?query' strings are stripped so the same page is not queued twice.
    """
    candidates = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    noise_words = (
        'privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising',
        'about us', 'copyright', 'login', 'registration'
    )

    classified = []
    for item in candidates:
        # Strip the fragment and query so we queue the page itself.
        url = item['url'].split('#')[0].split('?')[0].rstrip('/')
        label = item['name']

        # --- Language shield & noise filtering ---
        if not label or len(label) < 2:
            continue
        if re.search(r'[^\x00-\x7F]+', label):
            continue  # non-ASCII text (Greek/Cyrillic/accented) — skip
        if any(word in label.lower() for word in noise_words):
            continue  # navigation chrome, not vehicle data

        # --- AutoEvolution depth logic ---
        if "autoevolution.com/moto/" not in url:
            continue
        # Skip self-references and the /moto/ landing page (back-navigation).
        if url == current_url.rstrip('/') or url.endswith('/moto'):
            continue

        segments = url.strip('/').split('/')
        if url.endswith(".html"):
            # Spec sheet page -> deepest (ENGINE) level.
            classified.append({'name': label, 'url': url, 'level': 'engine'})
        elif len(segments) >= 6:
            # Deep non-.html path -> sub-model / generation (MODEL) level.
            classified.append({'name': label, 'url': url, 'level': 'model'})

    return classified
|
||||
|
||||
async def get_next_task(db):
    """Atomically claim the next pending AutoEvolution bike task.

    Brand rows are preferred over model rows (CASE ordering) and
    SKIP LOCKED keeps concurrent workers from grabbing the same row.
    Returns the claimed (id, url, name, level) row, or None when the
    queue is drained.
    """
    claim_sql = text("""
        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
        WHERE id = (
            SELECT id FROM vehicle.auto_data_crawler_queue
            WHERE status = 'pending'
            AND category = 'bike'
            AND url LIKE '%autoevolution.com%'
            AND level IN ('brand', 'model')
            ORDER BY
                CASE WHEN level = 'brand' THEN 0 ELSE 1 END ASC,
                id ASC
            LIMIT 1 FOR UPDATE SKIP LOCKED
        ) RETURNING id, url, name, level
    """)
    claimed = await db.execute(claim_sql)
    return claimed.fetchone()
|
||||
|
||||
async def main():
    """Main control loop: claim a task, crawl it, persist discovered links.

    Transaction safety: claiming and saving each use their own session;
    a failed save rolls back and re-raises so the outer handler marks
    the task as 'error' with the message stored for triage.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )

        logger.info("🤖 R1 AutoEvolution Specialist elindult...")

        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    target = await get_next_task(db)
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ Adatbázis hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue

            if not target:
                # Queue drained -> idle-poll once a minute.
                logger.info("🏁 Nincs több AutoEvolution feladat. Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            t_id, t_url, t_name, t_level = target
            page = await context.new_page()

            try:
                logger.info(f"🚀 Felderítés ({t_level}): {t_name} -> {t_url}")
                # domcontentloaded is faster than networkidle; the short random
                # sleep afterwards gives client-side JS time to render links.
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(random.uniform(2, 3))

                links = await analyze_and_extract_links(page, t_url)

                async with AsyncSessionLocal() as db:
                    try:
                        new_links_count = 0
                        for link in links:
                            # Queue every discovered variation; duplicate URLs
                            # are silently skipped by ON CONFLICT DO NOTHING.
                            # NOTE(review): the counter counts attempts, not
                            # actual inserts, so duplicates inflate the log.
                            await db.execute(text("""
                                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                                ON CONFLICT (url) DO NOTHING
                            """), {"url": link['url'], "level": link['level'], "p_id": t_id, "name": link['name']})
                            new_links_count += 1

                        # Close out the finished task.
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ {t_name} kész. Talált AutoEvolution linkek: {new_links_count}")
                    except Exception as inner_db_error:
                        await db.rollback()
                        logger.error(f"❌ Belső mentési hiba: {inner_db_error}")
                        raise inner_db_error

            except Exception as e:
                logger.error(f"❌ Kritikus hiba a navigáció során: {t_name} -> {e}")
                async with AsyncSessionLocal() as db:
                    await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error', error_msg = :msg, updated_at = NOW() WHERE id = :id"),
                                     {"msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                # Throttle between tasks to avoid getting banned by the server.
                await asyncio.sleep(random.uniform(3, 5))

        await browser.close()  # NOTE(review): unreachable — the loop above never breaks

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")
|
||||
173
backend/app/workers/vehicle/bike/bike_R2_generation_scout.py
Normal file
173
backend/app/workers/vehicle/bike/bike_R2_generation_scout.py
Normal file
@@ -0,0 +1,173 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- NAPLÓZÁS ---
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [R2-BIKE-DEPTH] %(message)s',
|
||||
handlers=[logging.StreamHandler()]
|
||||
)
|
||||
logger = logging.getLogger("R2")
|
||||
|
||||
async def get_page_safe(page, url, min_delay=4.0, max_delay=7.0):
    """Navigate *page* to *url* with bot-protection countermeasures.

    A random human-like delay — uniform between *min_delay* and
    *max_delay* seconds (previously hard-coded as 4–7 s; the defaults
    preserve that behaviour) — is inserted before the request, then the
    page title is checked for Cloudflare challenge markers.

    Args:
        page: Playwright page to navigate.
        url: Target URL.
        min_delay: Lower bound of the pre-request sleep, in seconds.
        max_delay: Upper bound of the pre-request sleep, in seconds.

    Returns:
        The same *page* object, loaded, on success.

    Raises:
        Exception: when the Cloudflare interstitial is detected, or
        whatever ``page.goto``/``page.title`` raised (logged, re-raised).
    """
    await asyncio.sleep(random.uniform(min_delay, max_delay))

    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        title = await page.title()
        # Cloudflare serves a "Just a moment..." interstitial to suspected bots.
        if "Just a moment" in title or "Cloudflare" in title:
            logger.error(f"Bot védelem észlelve: {url}")
            raise Exception("Bot védelem (CF) megállította a robotot.")
        return page
    except Exception as e:
        logger.error(f"Hiba az oldal betöltésekor: {url} -> {e}")
        raise
|
||||
|
||||
async def extract_scoped_links(page, p_id, current_url):
    """DEPTH CRAWL: extract generation -> engine variation links.

    Scope-lock: only links staying inside the current brand (slug taken
    from *current_url*) are followed.  Discovered links are inserted
    straight into the crawler queue as children of *p_id*.  Returns the
    number of rows offered for insert (duplicates hit ON CONFLICT DO
    NOTHING but are still counted).
    """
    # Brand slug for the scope lock: after strip('/').split('/') a URL like
    # https://www.autoevolution.com/moto/<brand> yields
    # ['https:', '', 'www...', 'moto', '<brand>'] -> index 4 is the brand.
    path_segments = current_url.strip('/').split('/')
    if len(path_segments) < 5:
        return 0
    brand_anchor = path_segments[4]

    hrefs = await page.eval_on_selector_all(
        "a[href*='/moto/']",
        "nodes => nodes.map(n => ({ 'name': n.innerText.trim(), 'url': n.href }))"
    )

    junk = ['privacy', 'cookie', 'settings', 'contact', 'terms', 'advertising', 'login', 'about', 'copyright']
    found_count = 0

    async with AsyncSessionLocal() as db:
        for link in hrefs:
            # Strip the #anchor / ?query so we queue the spec page itself.
            clean_url = link['url'].split('#')[0].split('?')[0].rstrip('/')
            name = link['name'].replace('\n', ' ').strip()

            # Basic filters: empty/short labels, non-ASCII text, nav noise.
            if not name or len(name) < 2: continue
            if re.search(r'[^\x00-\x7F]+', name): continue
            if any(k in name.lower() for k in junk): continue

            # SCOPE LOCK: only follow links belonging to the current brand.
            if brand_anchor not in clean_url.lower():
                continue

            # Navigation filter. NOTE(review): '/moto/' is contained in every
            # candidate URL here, so this branch always fires and effectively
            # just enforces a minimum depth of 5 slashes.
            if any(x in clean_url for x in ['-brand-', 'allbrands', 'en/brands', '/moto/']):
                if clean_url.count('/') < 5: continue

            # Avoid queuing the page we are currently on.
            if clean_url == current_url.rstrip('/'):
                continue

            # Level classification: '.html' = spec sheet (engine level),
            # deep non-.html path = generation; anything else is skipped.
            if clean_url.endswith(".html"):
                target_level = 'engine'
            elif clean_url.count('/') >= 6:
                target_level = 'generation'
            else:
                continue

            # Persist into the crawl queue as a child of the current task.
            await db.execute(text("""
                INSERT INTO vehicle.auto_data_crawler_queue (url, level, parent_id, name, status, category)
                VALUES (:url, :level, :p_id, :name, 'pending', 'bike')
                ON CONFLICT (url) DO NOTHING
            """), {"url": clean_url, "level": target_level, "p_id": p_id, "name": name})
            found_count += 1

        await db.commit()
    return found_count
|
||||
|
||||
async def process_target(context, t_id, t_url, t_name, t_level):
    """Fully process one queued URL: load it, harvest child links, update status.

    Status outcome: 'completed' when children were found,
    'completed_leaf' when the page yielded nothing (terminal node),
    'error' on any failure (message stored in error_msg for triage).
    """
    page = await context.new_page()
    try:
        logger.info(f"🚀 Mélységi fúrás [{t_level}]: {t_name}")
        await get_page_safe(page, t_url)

        # Harvest variations and generations from the loaded page.
        found = await extract_scoped_links(page, t_id, t_url)

        async with AsyncSessionLocal() as db:
            # No children -> leaf node; distinguish it from normal completion.
            new_status = 'completed' if found > 0 else 'completed_leaf'
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = :s, error_msg = NULL, updated_at = NOW()
                WHERE id = :id
            """), {"s": new_status, "id": t_id})
            await db.commit()
            logger.info(f"✅ Befejezve: {t_name} -> {found} új variáció rögzítve.")

    except Exception as e:
        logger.error(f"❌ Kritikus hiba feldolgozás közben ({t_name}): {e}")
        async with AsyncSessionLocal() as db:
            await db.execute(text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'error', error_msg = :msg, updated_at = NOW()
                WHERE id = :id
            """), {"msg": str(e), "id": t_id})
            await db.commit()
    finally:
        await page.close()
|
||||
|
||||
async def main():
    """Main loop with a depth-first strategy (ORDER BY level ASC).

    'generation' sorts alphabetically before 'model', so deeper pages are
    drained first before new model pages widen the frontier.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0",
            viewport={'width': 1920, 'height': 1080}
        )

        logger.info("🤖 R2 Motoros Mélységi Felderítő aktív.")

        while True:
            # Claim the next pending row atomically; SKIP LOCKED makes this
            # safe to run in several worker processes at once.
            async with AsyncSessionLocal() as db:
                res = await db.execute(text("""
                    UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                    WHERE id = (
                        SELECT id FROM vehicle.auto_data_crawler_queue
                        WHERE status = 'pending'
                        AND level IN ('model', 'generation')
                        AND category = 'bike'
                        AND url LIKE '%autoevolution.com%'
                        ORDER BY level ASC, id ASC
                        LIMIT 1 FOR UPDATE SKIP LOCKED
                    ) RETURNING id, url, name, level
                """))
                target = res.fetchone()
                await db.commit()

            if not target:
                logger.info("🏁 Minden variáció felderítve. Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            await process_target(context, target[0], target[1], target[2], target[3])

        await browser.close()  # NOTE(review): unreachable — the loop never breaks

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Leállítás.")
|
||||
95
backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
Normal file
95
backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# /opt/docker/dev/service_finder/backend/app/workers/vehicle/bike/bike_R3_engine_scout.py
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR] %(message)s')
|
||||
logger = logging.getLogger("R3")
|
||||
|
||||
class R3DataMiner:
    """Drains 'engine'-level queue rows, scrapes their spec sheets and
    upserts the results into the external reference library."""

    def clean_key(self, key):
        """Normalise a spec-table label: keep the part after the last
        comma, drop question-phrase prefixes, cut at '?', capitalize."""
        if "," in key:
            key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        return key.split("?")[0].strip().capitalize()

    async def scrape_specs(self, context, url):
        """Load one spec page and parse its <tr><th>/<td> rows into a dict.
        Returns None on any scraping error."""
        page = await context.new_page()
        try:
            # Randomised politeness delay before each request.
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            soup = BeautifulSoup(await page.content(), 'html.parser')

            record = {"make": "", "model": "", "generation": "", "modification": "",
                      "year_from": None, "power_kw": 0, "engine_cc": 0,
                      "specifications": {}, "source_url": url}

            # Labels that map 1:1 onto record fields.
            direct_fields = {"brand": "make", "model": "model",
                             "generation": "generation", "modification": "modification"}

            for row in soup.find_all('tr'):
                header_cell, value_cell = row.find('th'), row.find('td')
                if not header_cell or not value_cell:
                    continue
                label = header_cell.get_text(strip=True)
                value = value_cell.get_text(strip=True)
                label_lc = label.lower()

                if label_lc in direct_fields:
                    record[direct_fields[label_lc]] = value
                elif "start of production" in label_lc:
                    year_match = re.search(r'(\d{4})', value)
                    record["year_from"] = int(year_match.group(1)) if year_match else None
                elif label_lc == "power":
                    hp_match = re.search(r'(\d+)\s*Hp', value, re.I)
                    if hp_match:
                        # ~1.36 hp per kW; truncated to int.
                        record["power_kw"] = int(int(hp_match.group(1)) / 1.36)
                elif "displacement" in label_lc:
                    cc_match = re.search(r'(\d+)\s*cm3', value)
                    if cc_match:
                        record["engine_cc"] = int(cc_match.group(1))

                # Every row also lands in the raw specifications blob.
                record["specifications"][self.clean_key(label)] = value
            return record
        except Exception as e:
            logger.error(f"Hiba az adatlapon: {e}")
            return None
        finally:
            await page.close()

    async def run(self):
        """Worker loop: claim engine tasks until the queue is empty."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent="Mozilla/5.0...")
            while True:
                # Atomically claim the next pending engine-level row.
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (SELECT id FROM vehicle.auto_data_crawler_queue
                                    WHERE level = 'engine' AND status = 'pending'
                                    ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED)
                        RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()
                if not target:
                    break

                data = await self.scrape_specs(context, target[1])
                if data and data["make"]:
                    # Upsert the parsed record and close out the task.
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                        """), {"make": data["make"], "model": data["model"], "gen": data["generation"], "mod": data["modification"],
                               "y": data["year_from"], "p": data["power_kw"], "e": data["engine_cc"],
                               "s": json.dumps(data["specifications"]), "u": data["source_url"]})
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                else:
                    # Scrape failed or returned no brand -> mark as error.
                    async with AsyncSessionLocal() as db:
                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'error' WHERE id = :id"), {"id": target[0]})
                        await db.commit()
            await browser.close()

if __name__ == "__main__":
    asyncio.run(R3DataMiner().run())
|
||||
218
backend/app/workers/vehicle/bike/bike_R4_final_extractor.py
Normal file
218
backend/app/workers/vehicle/bike/bike_R4_final_extractor.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import sys
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- NAPLÓZÁS KONFIGURÁCIÓ ---
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-HARVESTER-v1.2] %(message)s')
|
||||
logger = logging.getLogger("R4")
|
||||
|
||||
# --- KONFIGURÁCIÓS PARAMÉTEREK ---
|
||||
MAX_RETRY_LIMIT = 5 # Max 5 próbálkozás járművenként
|
||||
|
||||
async def parse_specs(page):
    """
    The winning DOM-parsing logic (complete).

    Runs a four-strategy JS extractor in the page context (legacy
    td.left/td.right tables, modern dt/dd lists, .spec-row label/value
    pairs, and a bold-text fallback for very old pages), then filters
    the harvested dict down to known technical keys.  Returns the
    filtered dict, the raw dict when nothing matched the filter, or
    None when the page yielded no data or evaluation failed.
    """
    script = """
    () => {
        let results = {};

        // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
        let leftCells = document.querySelectorAll('td.left');
        leftCells.forEach(cell => {
            let key = cell.innerText.replace(/:$/, '').trim();
            let rightCell = cell.nextElementSibling;
            if(rightCell && rightCell.classList.contains('right')) {
                results[key] = rightCell.innerText.trim();
            }
        });

        // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
        let dts = document.querySelectorAll('dt');
        dts.forEach(dt => {
            let key = dt.innerText.replace(/:$/, '').trim();
            let dd = dt.nextElementSibling;
            if(dd && dd.tagName.toLowerCase() === 'dd') {
                results[key] = dd.innerText.trim();
            }
        });

        // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
        let specRows = document.querySelectorAll('.spec-row');
        specRows.forEach(row => {
            let label = row.querySelector('.label');
            let value = row.querySelector('.value');
            if(label && value) {
                let key = label.innerText.replace(/:$/, '').trim();
                if (!results[key]) {
                    results[key] = value.innerText.trim();
                }
            }
        });

        // 4. MÓDSZER: Veterán ("Adler") fallback -> Vastagított szöveg
        if (Object.keys(results).length === 0) {
            document.querySelectorAll('b, strong').forEach(b => {
                let key = b.innerText.replace(/:$/, '').trim();
                if(key.length > 2 && key.length < 30) {
                    let val = "";
                    if(b.nextSibling && b.nextSibling.nodeType === 3) {
                        val = b.nextSibling.textContent.trim();
                    }
                    else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                        val = b.nextElementSibling.innerText.trim();
                    }
                    if(val && !results[key]) {
                        results[key] = val;
                    }
                }
            });
        }

        return results;
    }
    """
    try:
        harvested = await page.evaluate(script)

        if harvested and len(harvested) > 0:
            # Whitelist of technical labels worth keeping (substring match).
            relevant_keys = [
                "Production", "Year", "Segment",
                "Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                "Wet Weight", "Front", "Rear"
            ]

            technical = {k: v for k, v in harvested.items()
                         if any(rk.lower() in k.lower() for rk in relevant_keys)}
            # Fall back to the unfiltered dict rather than losing everything.
            return technical if len(technical) > 0 else harvested

        return None

    except Exception as e:
        logger.error(f"❌ Parszolási hiba a JS kiértékeléskor: {e}")
        return None
|
||||
|
||||
async def main():
    """R4 harvester main loop: claim engine-level tasks, parse specs, persist.

    Retry policy: rows in 'pending' or 'error' with retry_count < 5 are
    eligible; after the 5th empty/failed attempt the row is parked as
    'manual_review_needed' instead of looping forever.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            viewport={'width': 1920, 'height': 1080}
        )
        logger.info("🤖 R4 Motor Adat-Arató v1.2 elindult.")

        while True:
            target = None
            try:
                async with AsyncSessionLocal() as db:
                    # FIX: 'completed_empty' was removed from the selectable statuses.
                    # Only 'pending' and 'error' qualify, and only while retry_count < 5.
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE status IN ('pending', 'error')
                            AND retry_count < 5
                            AND level = 'engine' AND category = 'bike'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name, retry_count
                    """))
                    target = res.fetchone()
                    await db.commit()
            except Exception as e:
                logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                await asyncio.sleep(5)
                continue

            if not target:
                logger.info("🏁 Minden motor feldolgozva vagy manuális felülvizsgálatra vár. Alvás 60mp...")
                await asyncio.sleep(60)
                continue

            t_id, t_url, t_name, t_retry_count = target
            # retry_count may be NULL for legacy rows -> treat as zero.
            if t_retry_count is None: t_retry_count = 0

            page = await context.new_page()

            try:
                logger.info(f"📊 [{t_retry_count + 1}/5] Adatbányászat: {t_name}")
                await page.goto(t_url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)

                data = await parse_specs(page)

                async with AsyncSessionLocal() as db:
                    if data and len(data) > 0:
                        # Successful harvest -> upsert the raw spec payload.
                        await db.execute(text("""
                            INSERT INTO vehicle.motorcycle_specs (crawler_id, full_name, raw_data, url)
                            VALUES (:cid, :name, :data, :url)
                            ON CONFLICT (crawler_id) DO UPDATE SET raw_data = :data, updated_at = NOW()
                        """), {"cid": t_id, "name": t_name, "data": json.dumps(data), "url": t_url})

                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        await db.commit()
                        logger.info(f"✅ Mentve: {t_name} ({len(data)} paraméter)")
                    else:
                        # Empty page or missing data.
                        new_retry_count = t_retry_count + 1

                        if new_retry_count >= 5:
                            # Retry limit hit -> park for manual review.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 5 próbálkozás után (üres oldal)',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manuális javításra jelölve.")
                        else:
                            # Attempts remain -> put it back as 'error' for a retry.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry_count, "id": t_id})
                            logger.warning(f"⚠️ Üres maradt: {t_name} (Próbálkozás: {new_retry_count}/5)")

                        await db.commit()

            except Exception as e:
                logger.error(f"❌ Hiba a feldolgozás során: {t_name} -> {e}")
                async with AsyncSessionLocal() as db:
                    new_retry_count = t_retry_count + 1
                    status = 'error' if new_retry_count < 5 else 'manual_review_needed'
                    await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue
                        SET status = :st,
                            retry_count = :rc,
                            error_msg = :msg,
                            updated_at = NOW()
                        WHERE id = :id
                    """), {"st": status, "rc": new_retry_count, "msg": str(e), "id": t_id})
                    await db.commit()
            finally:
                await page.close()
                await asyncio.sleep(random.uniform(2.0, 4.0))

        await browser.close()  # NOTE(review): unreachable — the loop never breaks

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")
|
||||
113
backend/app/workers/vehicle/bike/test_aprilia.py
Normal file
113
backend/app/workers/vehicle/bike/test_aprilia.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import asyncio
|
||||
import json
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def test_scraper():
    """Manual smoke test: run the AutoEvolution DOM parser against two
    known-problematic spec pages and print the harvested data."""
    # Two problem-focused URLs: the modern Aprilia and the old, broken-HTML BMW
    test_urls = [
        "https://www.autoevolution.com/moto/aprilia-rs-660-factory-2025.html",
        "https://www.autoevolution.com/moto/bmw-f-650-gs-2011.html"
    ]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        for url in test_urls:
            print(f"\n{'='*60}")
            print(f"🌍 MEGNYITÁS: {url}")
            print(f"{'='*60}")

            # Wait for the DOM, then give client-side JS a moment to run.
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            await asyncio.sleep(2)

            # The refined AutoEvolution parser (same 4-strategy script as R4).
            script = """
            () => {
                let results = {};

                // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
                let leftCells = document.querySelectorAll('td.left');
                leftCells.forEach(cell => {
                    let key = cell.innerText.replace(/:$/, '').trim();
                    let rightCell = cell.nextElementSibling;
                    if(rightCell && rightCell.classList.contains('right')) {
                        results[key] = rightCell.innerText.trim();
                    }
                });

                // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
                let dts = document.querySelectorAll('dt');
                dts.forEach(dt => {
                    let key = dt.innerText.replace(/:$/, '').trim();
                    let dd = dt.nextElementSibling;
                    if(dd && dd.tagName.toLowerCase() === 'dd') {
                        results[key] = dd.innerText.trim();
                    }
                });

                // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
                let specRows = document.querySelectorAll('.spec-row');
                specRows.forEach(row => {
                    let label = row.querySelector('.label');
                    let value = row.querySelector('.value');
                    if(label && value) {
                        let key = label.innerText.replace(/:$/, '').trim();
                        if (!results[key]) {
                            results[key] = value.innerText.trim();
                        }
                    }
                });

                // 4. MÓDSZER: "Adler" típusú elavult leírások fallbackje -> Vastagított szöveg
                if (Object.keys(results).length === 0) {
                    document.querySelectorAll('b, strong').forEach(b => {
                        let key = b.innerText.replace(/:$/, '').trim();
                        if(key.length > 2 && key.length < 30) {
                            let val = "";
                            // Ha a szöveg közvetlenül a tag után van (Text Node)
                            if(b.nextSibling && b.nextSibling.nodeType === 3) {
                                val = b.nextSibling.textContent.trim();
                            }
                            // Ha egy másik elemben van
                            else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                                val = b.nextElementSibling.innerText.trim();
                            }
                            if(val && !results[key]) {
                                results[key] = val;
                            }
                        }
                    });
                }

                return results;
            }
            """

            data = await page.evaluate(script)

            if data and len(data) > 0:
                # Filter out the noise; keep only relevant technical keys.
                relevant_keys = ["Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                                 "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                                 "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                                 "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                                 "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                                 "Wet Weight", "Front", "Rear"]

                filtered_data = {k: v for k, v in data.items() if any(rk.lower() in k.lower() for rk in relevant_keys)}

                print("\n🟢 KINYERT ADATOK (DOM PARSZOLÓ):")
                print(json.dumps(filtered_data if filtered_data else data, indent=2, ensure_ascii=False))
                print(f"\n✅ Összesen {len(filtered_data if filtered_data else data)} műszaki paramétert találtam.")
            else:
                print("\n🔴 NULLA ADAT - A DOM parszoló nem talált egyezést.")

        await browser.close()

if __name__ == "__main__":
    asyncio.run(test_scraper())
|
||||
Reference in New Issue
Block a user