Files
service-finder/backend/app/scripts/audit_scanner.py
2026-03-22 18:59:27 +00:00

236 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""
Audit Scanner for Codebase Analysis (#42)
This script performs a comprehensive audit of the Python codebase:
1. Recursively scans the backend/app directory for .py files
2. Excludes __init__.py files and alembic/versions directory
3. Groups files by directory structure (api, services, models, etc.)
4. Extracts docstrings and class/function names from each file
5. Generates a Markdown audit ledger with checkboxes for tracking
"""
import os
import re
import ast
from pathlib import Path
from typing import Dict, List, Tuple, Set
import datetime
# Project root (absolute path inside the container; this script is expected
# to run inside the backend container, not on the host).
PROJECT_ROOT = Path("/app")
BACKEND_DIR = PROJECT_ROOT / "app" # /app/app is the backend root in container
# Destination of the generated Markdown audit ledger (ticket #42).
OUTPUT_FILE = Path("/app/.roo/audit_ledger_94.md")
# Path fragments to exclude from the scan — matched as substrings of each
# file's full path, so "alembic/versions" skips migration revision files.
EXCLUDE_DIRS = {"__pycache__", ".git", "alembic/versions", "migrations"}
# File names skipped entirely (package markers carry no audit content).
EXCLUDE_FILES = {"__init__.py"}
def extract_python_info(file_path: Path) -> Tuple[str, List[str], List[str]]:
    """
    Extract the module docstring and class/function names from a Python file.

    Falls back to regex scraping when the file does not parse as valid
    Python (e.g. templates or partially written modules).

    Returns: (docstring, class_names, function_names)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to parse with AST first; it is exact where regex is heuristic.
        try:
            tree = ast.parse(content)
            # Extract module docstring ("" when absent).
            docstring = ast.get_docstring(tree) or ""
            # All classes, including nested ones, are reported.
            class_names = [
                node.name for node in ast.walk(tree)
                if isinstance(node, ast.ClassDef)
            ]
            # Only top-level functions: methods live inside ClassDef bodies,
            # so scanning tree.body directly skips them.  (The previous
            # version checked a non-existent `node.parent` attribute, which
            # raised AttributeError for every file containing a function and
            # made the scan report "Error reading file" instead.)
            function_names = [
                node.name for node in tree.body
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
            ]
            return docstring, class_names, function_names
        except (SyntaxError, ValueError):
            # If AST parsing fails, use simple regex extraction.
            docstring_match = re.search(r'"""(.*?)"""', content, re.DOTALL)
            docstring = docstring_match.group(1).strip() if docstring_match else ""
            # Simple regex for class and function definitions (top of line).
            class_matches = re.findall(r'^class\s+(\w+)', content, re.MULTILINE)
            func_matches = re.findall(r'^def\s+(\w+)', content, re.MULTILINE)
            return docstring, class_matches, func_matches
    except Exception as e:
        # Unreadable file (permissions, encoding, ...): report inline rather
        # than aborting the whole scan.
        return f"Error reading file: {e}", [], []
def get_file_summary(docstring: str, class_names: List[str], function_names: List[str]) -> str:
    """Build a one-line human-readable summary of a scanned file.

    Combines the first docstring line (truncated to 100 characters) with up
    to five class and five function names; overflow is noted as "(+N more)".
    Returns a fixed placeholder when nothing was extracted.
    """
    pieces: List[str] = []
    if docstring:
        # Only the headline of the docstring, capped at 100 chars.
        headline = docstring.split('\n')[0].strip()
        if len(headline) > 100:
            headline = headline[:97] + "..."
        pieces.append(f'"{headline}"')
    # Classes and functions share the same "first five + overflow" format.
    for label, names in (("Classes", class_names), ("Functions", function_names)):
        if not names:
            continue
        entry = f"{label}: {', '.join(names[:5])}"
        if len(names) > 5:
            entry += f" (+{len(names) - 5} more)"
        pieces.append(entry)
    if not pieces:
        return "No docstring or definitions found"
    return " - ".join(pieces)
# Maps the first path component below the scan root to a ledger category.
# Anything not listed (and anything directly under the root) is "Other".
_CATEGORY_BY_DIR: Dict[str, str] = {
    "api": "API Endpoints",
    "services": "Services",
    "models": "Models",
    "core": "Core",
    "workers": "Workers",
    "scripts": "Scripts",
    "tests": "Tests",
    "tests_internal": "Tests",
    "test_outside": "Tests",
    "crud": "CRUD",
    "schemas": "Schemas",
    "templates": "Templates",
    "static": "Static",
}
def scan_python_files(root_dir: Path) -> Dict[str, List[Tuple[Path, str]]]:
    """
    Scan for Python files and group them by directory category.

    Skips paths containing any EXCLUDE_DIRS fragment and files named in
    EXCLUDE_FILES. Categories come from the first directory level under
    root_dir via _CATEGORY_BY_DIR.

    Returns: {category: [(relative_path, summary), ...]}
    """
    categories: Dict[str, List[Tuple[Path, str]]] = {}
    for py_file in root_dir.rglob("*.py"):
        # Match exclusions against the POSIX form of the path so the
        # multi-segment entry "alembic/versions" works regardless of the
        # OS path separator (str(path) uses "\\" on Windows).
        posix_path = py_file.as_posix()
        if any(excluded in posix_path for excluded in EXCLUDE_DIRS):
            continue
        if py_file.name in EXCLUDE_FILES:
            continue
        rel_path = py_file.relative_to(root_dir)
        parts = rel_path.parts
        # Files sitting directly under the root have no category directory
        # and land in "Other".
        category = "Other"
        if len(parts) >= 2:
            category = _CATEGORY_BY_DIR.get(parts[0], "Other")
        docstring, class_names, function_names = extract_python_info(py_file)
        summary = get_file_summary(docstring, class_names, function_names)
        categories.setdefault(category, []).append((rel_path, summary))
    return categories
def generate_markdown(categories: Dict[str, List[Tuple[Path, str]]]) -> str:
    """Generate Markdown audit-ledger content from categorized files.

    Output: header with timestamp and totals, one checklist section per
    category (files sorted alphabetically), a statistics table, and a
    next-steps footer.

    Does NOT mutate `categories`: the previous version sorted the
    per-category lists in place, silently reordering the caller's data.
    """
    lines: List[str] = []
    # Header
    lines.append("# Codebase Audit Ledger (#42)")
    lines.append("")
    lines.append(f"*Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*")
    lines.append(f"*Total files scanned: {sum(len(files) for files in categories.values())}*")
    lines.append("")
    lines.append("## 📋 Audit Checklist")
    lines.append("")
    lines.append("Check each file after audit completion. Use this ledger to track progress.")
    lines.append("")
    # Sort categories for consistent output
    sorted_categories = sorted(categories.items(), key=lambda x: x[0])
    for category, files in sorted_categories:
        # NOTE(review): the directory hint below is derived from the category
        # label, not the real directory name (e.g. "API Endpoints" renders as
        # "api_endpoints" although the directory is "api"). Kept as-is to
        # preserve the report output; confirm before relying on these paths.
        lines.append(f"## {category} (`backend/app/{category.lower().replace(' ', '_')}/...`)")
        lines.append("")
        # Sorted copy: do not reorder the caller's lists in place.
        for file_path, summary in sorted(files, key=lambda x: str(x[0])):
            # Create checkbox and file entry
            lines.append(f"- [ ] `{file_path}` - {summary}")
        lines.append("")
    # Add statistics
    lines.append("## 📊 Statistics")
    lines.append("")
    lines.append("| Category | File Count |")
    lines.append("|----------|------------|")
    for category, files in sorted_categories:
        lines.append(f"| {category} | {len(files)} |")
    lines.append("")
    lines.append("## 🎯 Next Steps")
    lines.append("")
    lines.append("1. **Review each file** for functionality and dependencies")
    lines.append("2. **Document findings** in individual audit reports")
    lines.append("3. **Identify gaps** in test coverage and documentation")
    lines.append("4. **Prioritize refactoring** based on complexity and criticality")
    lines.append("")
    lines.append("*This ledger is automatically generated by `audit_scanner.py`*")
    return "\n".join(lines)
def main():
    """Run the audit scan end-to-end and write the Markdown ledger.

    Returns 0 on success, 1 when BACKEND_DIR does not exist.
    """
    print("🔍 Starting codebase audit scan...")
    print(f"Scanning directory: {BACKEND_DIR}")
    # Bail out early if the scan root is missing (e.g. run outside container).
    if not BACKEND_DIR.exists():
        print(f"Error: Directory {BACKEND_DIR} does not exist!")
        return 1

    # Scan, render, and persist the report.
    grouped = scan_python_files(BACKEND_DIR)
    report = generate_markdown(grouped)
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        out.write(report)

    file_count = sum(len(entries) for entries in grouped.values())
    print(f"✅ Scan complete! Found {file_count} Python files.")
    print(f"📄 Report generated: {OUTPUT_FILE}")

    # Per-category breakdown, alphabetical for stable output.
    print("\n📊 Category breakdown:")
    for category, files in sorted(grouped.items(), key=lambda x: x[0]):
        print(f" {category}: {len(files)} files")
    return 0
if __name__ == "__main__":
    # raise SystemExit instead of calling the site-provided exit() helper,
    # which is not guaranteed to exist when Python runs without the site
    # module (python -S); the exit status semantics are identical.
    raise SystemExit(main())