183 lines
6.8 KiB
Python
183 lines
6.8 KiB
Python
"""
|
|
DeduplicationService - Explicit deduplikáció a márka, technikai kód és jármű típus alapján.
|
|
Integrálja a mapping_rules.py és mapping_dictionary.py fájlokat.
|
|
"""
|
|
import logging
|
|
from typing import Optional, Dict, Any
|
|
from sqlalchemy import select, and_, or_
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.models import VehicleModelDefinition
|
|
from app.workers.vehicle.mapping_rules import SOURCE_MAPPINGS, unify_data
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Fallback when no mapping_dictionary is available: a simple synonym dictionary.
#
# Each table maps a canonical value to the list of spellings accepted for it.
# The normalizers in this file upper-case their input before looking it up,
# so every entry must be written in upper case or it can never match.
MAPPING_DICTIONARY = {
    "make_synonyms": {
        "BMW": ["BMW", "BAYERISCHE MOTOREN WERKE"],
        "MERCEDES": ["MERCEDES", "MERCEDES-BENZ", "MERCEDES BENZ"],
        "VOLKSWAGEN": ["VOLKSWAGEN", "VW"],
        "AUDI": ["AUDI"],
        "TOYOTA": ["TOYOTA"],
        "FORD": ["FORD"],
        # More makes...
    },
    "technical_code_synonyms": {
        # Example: "1.8 TSI" -> ["1.8 TSI", "1.8TSI", "1.8 TSI 180"]
    },
    "vehicle_class_synonyms": {
        "SUV": ["SUV", "SPORT UTILITY VEHICLE"],
        "SEDAN": ["SEDAN", "SALOON"],
        "HATCHBACK": ["HATCHBACK", "HATCH"],
        "COUPE": ["COUPE", "COUPÉ"],
    },
}
|
|
|
|
class DeduplicationService:
    """Service for identifying and handling duplicate vehicle model records.

    Make, technical code and vehicle class are normalized first (the make and
    class through the module-level ``MAPPING_DICTIONARY`` synonym tables) and
    the normalized values are then searched for in the table mapped by
    ``VehicleModelDefinition``.
    """

    # Escape character used in LIKE patterns. "/" is used rather than "\" so
    # the pattern does not depend on backend-specific backslash quoting.
    _LIKE_ESCAPE = "/"

    @staticmethod
    def _canonicalize(value: Optional[str], table_name: str) -> str:
        """Map *value* to its canonical form using one synonym table.

        Args:
            value: raw input; ``None`` and the empty string yield ``""``.
            table_name: key of the table inside ``MAPPING_DICTIONARY``.

        Returns:
            The canonical key when the stripped, upper-cased value equals the
            key or one of its synonyms (compared case-insensitively);
            otherwise the stripped, upper-cased value itself.
        """
        if not value:
            return ""
        candidate = value.strip().upper()
        for canonical, synonyms in MAPPING_DICTIONARY[table_name].items():
            # Upper-case both sides so a mixed-case table entry still matches.
            if candidate == canonical.upper():
                return canonical
            if any(candidate == synonym.upper() for synonym in synonyms):
                return canonical
        return candidate

    @classmethod
    def _contains_pattern(cls, text: str) -> str:
        """Return a ``%text%`` LIKE pattern with wildcards in *text* escaped."""
        escape = cls._LIKE_ESCAPE
        # The escape character itself must be escaped first, otherwise the
        # escapes added for "%" and "_" would be doubled.
        for special in (escape, "%", "_"):
            text = text.replace(special, escape + special)
        return f"%{text}%"

    @staticmethod
    def normalize_make(make: str) -> str:
        """Normalize a make name using the ``make_synonyms`` table.

        Returns the canonical make for a known synonym, otherwise the
        stripped, upper-cased input. Empty or ``None`` input yields ``""``.
        """
        return DeduplicationService._canonicalize(make, "make_synonyms")

    @staticmethod
    def normalize_technical_code(technical_code: Optional[str]) -> str:
        """Normalize a technical code (e.g. an engine code).

        The value is stripped and upper-cased, then every character outside
        ``A-Z`` and ``0-9`` is removed. Empty or ``None`` input yields ``""``.
        """
        if not technical_code:
            return ""
        import re

        return re.sub(r'[^A-Z0-9]', '', technical_code.strip().upper())

    @staticmethod
    def normalize_vehicle_class(vehicle_class: Optional[str]) -> str:
        """Normalize a vehicle class using the ``vehicle_class_synonyms`` table.

        Returns the canonical class for a known synonym, otherwise the
        stripped, upper-cased input. Empty or ``None`` input yields ``""``.
        """
        return DeduplicationService._canonicalize(
            vehicle_class, "vehicle_class_synonyms"
        )

    @classmethod
    async def find_duplicate(
        cls,
        session: AsyncSession,
        make: str,
        technical_code: str,
        vehicle_class: str,
        exclude_id: Optional[int] = None
    ) -> Optional[VehicleModelDefinition]:
        """Look for an existing record matching the normalized values.

        Each of the three columns is compared with a case-insensitive
        "contains" match against the normalized input.

        Args:
            session: SQLAlchemy async session
            make: make (e.g. "BMW")
            technical_code: technical code (e.g. "N47")
            vehicle_class: vehicle class (e.g. "SEDAN")
            exclude_id: record ID to leave out of the search (e.g. on update)

        Returns:
            The matching VehicleModelDefinition with the lowest id, or None
            when nothing matches.
        """
        norm_make = cls.normalize_make(make)
        norm_technical_code = cls.normalize_technical_code(technical_code)
        norm_vehicle_class = cls.normalize_vehicle_class(vehicle_class)

        # NOTE(review): an empty normalized value produces the pattern "%%",
        # which matches every non-NULL value in that column - confirm that
        # this "no filter" behaviour is intended.
        # NOTE(review): the technical code is stripped of separators before
        # the search, so this presumably relies on stored technical_code
        # values already being in that form - confirm.
        escape = cls._LIKE_ESCAPE
        stmt = select(VehicleModelDefinition).where(
            and_(
                VehicleModelDefinition.make.ilike(
                    cls._contains_pattern(norm_make), escape=escape
                ),
                VehicleModelDefinition.technical_code.ilike(
                    cls._contains_pattern(norm_technical_code), escape=escape
                ),
                VehicleModelDefinition.vehicle_class.ilike(
                    cls._contains_pattern(norm_vehicle_class), escape=escape
                ),
            )
        )
        # "is not None" so that an id of 0 is still excluded.
        if exclude_id is not None:
            stmt = stmt.where(VehicleModelDefinition.id != exclude_id)

        # A substring match can hit several rows; scalar_one_or_none() would
        # raise MultipleResultsFound in that case, so take the first row in a
        # deterministic order instead.
        stmt = stmt.order_by(VehicleModelDefinition.id).limit(1)

        result = await session.execute(stmt)
        duplicate = result.scalars().first()

        if duplicate:
            logger.info(
                "Duplikátum találva: ID %s - %s %s %s",
                duplicate.id,
                duplicate.make,
                duplicate.technical_code,
                duplicate.vehicle_class,
            )
        return duplicate

    @classmethod
    async def ensure_no_duplicate(
        cls,
        session: AsyncSession,
        make: str,
        technical_code: str,
        vehicle_class: str,
        exclude_id: Optional[int] = None
    ) -> bool:
        """Check that no duplicate exists.

        Returns:
            True when ``find_duplicate`` finds nothing, False when it finds a
            duplicate.
        """
        duplicate = await cls.find_duplicate(
            session, make, technical_code, vehicle_class, exclude_id
        )
        return duplicate is None

    @classmethod
    async def deduplicate_and_merge(
        cls,
        session: AsyncSession,
        new_record: Dict[str, Any],
        source_name: str = "manual"
    ) -> Dict[str, Any]:
        """Check a new record for duplicates and report the outcome.

        If a duplicate is found, the existing record's data is returned;
        otherwise the normalized form of the new record is returned.

        Args:
            session: SQLAlchemy async session
            new_record: data of the new record (make, technical_code,
                vehicle_class, etc.)
            source_name: data source name passed to ``unify_data``

        Returns:
            Dict with keys:
                - is_duplicate: bool
                - existing_id: int if duplicate else None
                - normalized_data: dict with make, technical_code and
                  vehicle_class
        """
        # Normalize through mapping_rules first.
        unified = unify_data(new_record, source_name)

        # Fall back to the raw make (and finally "") when the unified record
        # has no usable "normalized_make", so None never reaches the
        # normalizers.
        make = unified.get("normalized_make") or new_record.get("make") or ""
        technical_code = new_record.get("technical_code", "")
        vehicle_class = new_record.get("vehicle_class", "")

        duplicate = await cls.find_duplicate(
            session, make, technical_code, vehicle_class
        )

        if duplicate:
            return {
                "is_duplicate": True,
                "existing_id": duplicate.id,
                "normalized_data": {
                    "make": duplicate.make,
                    "technical_code": duplicate.technical_code,
                    "vehicle_class": duplicate.vehicle_class,
                },
            }

        # No duplicate: return the normalized data of the new record.
        return {
            "is_duplicate": False,
            "existing_id": None,
            "normalized_data": {
                "make": cls.normalize_make(make),
                "technical_code": cls.normalize_technical_code(technical_code),
                "vehicle_class": cls.normalize_vehicle_class(vehicle_class),
            },
        }