# 113 lines, 5.3 KiB, Python
import asyncio
import json

from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright


# JavaScript evaluated inside the page; it returns a flat {label: value} object.
# Four extraction strategies run in order (see the numbered notes inside):
#   1. <td class="left"> / <td class="right"> pairs (older pages).
#   2. <dt> / <dd> pairs (modern pages); overwrites duplicates from step 1.
#   3. .spec-row containers holding .label / .value; only fills missing keys.
#   4. Fallback, used only when nothing was found: <b>/<strong> labels followed
#      by a text node or a non-<b> sibling element.
# Defined once at module level so it is not rebuilt for every URL.
_SPEC_EXTRACTION_JS = """
() => {
    let results = {};

    // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
    let leftCells = document.querySelectorAll('td.left');
    leftCells.forEach(cell => {
        let key = cell.innerText.replace(/:$/, '').trim();
        let rightCell = cell.nextElementSibling;
        if(rightCell && rightCell.classList.contains('right')) {
            results[key] = rightCell.innerText.trim();
        }
    });

    // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
    let dts = document.querySelectorAll('dt');
    dts.forEach(dt => {
        let key = dt.innerText.replace(/:$/, '').trim();
        let dd = dt.nextElementSibling;
        if(dd && dd.tagName.toLowerCase() === 'dd') {
            results[key] = dd.innerText.trim();
        }
    });

    // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
    let specRows = document.querySelectorAll('.spec-row');
    specRows.forEach(row => {
        let label = row.querySelector('.label');
        let value = row.querySelector('.value');
        if(label && value) {
            let key = label.innerText.replace(/:$/, '').trim();
            if (!results[key]) {
                results[key] = value.innerText.trim();
            }
        }
    });

    // 4. MÓDSZER: "Adler" típusú elavult leírások fallbackje -> Vastagított szöveg
    if (Object.keys(results).length === 0) {
        document.querySelectorAll('b, strong').forEach(b => {
            let key = b.innerText.replace(/:$/, '').trim();
            if(key.length > 2 && key.length < 30) {
                let val = "";
                // Ha a szöveg közvetlenül a tag után van (Text Node)
                if(b.nextSibling && b.nextSibling.nodeType === 3) {
                    val = b.nextSibling.textContent.trim();
                }
                // Ha egy másik elemben van
                else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                    val = b.nextElementSibling.innerText.trim();
                }
                if(val && !results[key]) {
                    results[key] = val;
                }
            }
        });
    }

    return results;
}
"""

# Labels considered relevant technical data. Matching is a case-insensitive
# substring test, so e.g. "Front" also matches "Front Suspension".
_RELEVANT_KEYS = (
    "Type", "Displacement", "Bore X Stroke", "Compression Ratio",
    "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
    "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
    "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
    "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
    "Wet Weight", "Front", "Rear",
)
# Lower-cased once instead of on every comparison.
_RELEVANT_KEYS_LOWER = tuple(key.lower() for key in _RELEVANT_KEYS)


def _filter_relevant_specs(data):
    """Return the entries of *data* whose label contains a relevant key."""
    return {
        label: value
        for label, value in data.items()
        if any(wanted in label.lower() for wanted in _RELEVANT_KEYS_LOWER)
    }


async def _extract_specs(page, url):
    """Navigate *page* to *url* and return the label/value dict from the DOM."""
    # Wait only for the DOM to be parsed, not for every resource to load.
    await page.goto(url, wait_until="domcontentloaded", timeout=60000)
    await asyncio.sleep(2)  # Give page scripts a moment to run.
    return await page.evaluate(_SPEC_EXTRACTION_JS)


def _report_specs(data):
    """Print the extracted specs, preferring the filtered view when non-empty."""
    if not data:
        print("\n🔴 NULLA ADAT - A DOM parszoló nem talált egyezést.")
        return

    # Drop the noise; fall back to the raw data if the filter removes everything.
    shown = _filter_relevant_specs(data) or data

    print("\n🟢 KINYERT ADATOK (DOM PARSZOLÓ):")
    print(json.dumps(shown, indent=2, ensure_ascii=False))
    print(f"\n✅ Összesen {len(shown)} műszaki paramétert találtam.")


async def test_scraper():
    """Scrape the spec tables of two sample motorcycle pages and print them.

    Launches headless Chromium, visits each test URL, runs the DOM parser in
    the page and prints the result as JSON. A Playwright failure on one URL
    (for example a navigation timeout) is reported and the remaining URLs are
    still processed. The browser is always closed, even if an error escapes.
    """
    # Two problem-focused URLs: the modern Aprilia and the old BMW page with
    # broken HTML.
    test_urls = [
        "https://www.autoevolution.com/moto/aprilia-rs-660-factory-2025.html",
        "https://www.autoevolution.com/moto/bmw-f-650-gs-2011.html",
    ]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
            page = await context.new_page()

            for url in test_urls:
                print(f"\n{'='*60}")
                print(f"🌍 MEGNYITÁS: {url}")
                print(f"{'='*60}")

                try:
                    data = await _extract_specs(page, url)
                except PlaywrightError as exc:
                    # Playwright's TimeoutError derives from Error, so this
                    # also covers navigation timeouts; keep going with the
                    # next URL instead of aborting the whole run.
                    print(f"\n🔴 HIBA - Az oldal betöltése vagy feldolgozása sikertelen: {exc}")
                    continue

                _report_specs(data)
        finally:
            # Release the browser process even when something above raised.
            await browser.close()


if __name__ == "__main__":
    # Script entry point: build the scraper coroutine and drive it to
    # completion on a fresh event loop.
    scraper_coroutine = test_scraper()
    asyncio.run(scraper_coroutine)