#!/usr/bin/env python3
"""
Audit Scanner for Codebase Analysis (#42)

This script performs a comprehensive audit of the Python codebase:
1. Recursively scans the backend/app directory for .py files
2. Excludes __init__.py files and alembic/versions directory
3. Groups files by directory structure (api, services, models, etc.)
4. Extracts docstrings and class/function names from each file
5. Generates a Markdown audit ledger with checkboxes for tracking
"""
import os
import re
import ast
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Set
import datetime

# Project root (relative to script location in container)
PROJECT_ROOT = Path("/app")
BACKEND_DIR = PROJECT_ROOT / "app"  # /app/app is the backend root in container
OUTPUT_FILE = Path("/app/.roo/audit_ledger_94.md")

# Path fragments to exclude (substring-matched against the POSIX path, so
# the two-component "alembic/versions" entry works as intended).
EXCLUDE_DIRS = {"__pycache__", ".git", "alembic/versions", "migrations"}
EXCLUDE_FILES = {"__init__.py"}

# First path component under BACKEND_DIR -> human-readable audit category.
# Anything not listed (or files directly under the root) falls into "Other".
CATEGORY_BY_TOP_DIR = {
    "api": "API Endpoints",
    "services": "Services",
    "models": "Models",
    "core": "Core",
    "workers": "Workers",
    "scripts": "Scripts",
    "tests": "Tests",
    "tests_internal": "Tests",
    "test_outside": "Tests",
    "crud": "CRUD",
    "schemas": "Schemas",
    "templates": "Templates",
    "static": "Static",
}


def extract_python_info(file_path: Path) -> Tuple[str, List[str], List[str]]:
    """
    Extract docstring and class/function names from a Python file.

    Returns:
        (docstring, class_names, function_names)

    On an unreadable file the first element is an "Error reading file: ..."
    message and the name lists are empty; on a syntactically invalid file a
    best-effort regex extraction is used instead of the AST.
    """
    try:
        content = file_path.read_text(encoding='utf-8')
    except Exception as e:
        # Only the actual read is guarded; processing errors below should
        # surface rather than masquerade as read failures.
        return f"Error reading file: {e}", [], []

    try:
        tree = ast.parse(content)
    except (SyntaxError, ValueError):
        # AST parsing failed: fall back to simple regex extraction.
        docstring_match = re.search(r'"""(.*?)"""', content, re.DOTALL)
        docstring = docstring_match.group(1).strip() if docstring_match else ""
        class_matches = re.findall(r'^class\s+(\w+)', content, re.MULTILINE)
        func_matches = re.findall(r'^def\s+(\w+)', content, re.MULTILINE)
        return docstring, class_matches, func_matches

    docstring = ast.get_docstring(tree) or ""
    # All class definitions, including nested ones.
    class_names = [n.name for n in ast.walk(tree) if isinstance(n, ast.ClassDef)]
    # BUG FIX: the original accessed `node.parent`, an attribute ast nodes do
    # not have; the resulting AttributeError caused every file containing a
    # function to be reported as "Error reading file: ...".  Top-level
    # functions are simply the (async) function defs directly in the module
    # body — no parent tracking needed.
    function_names = [
        n.name
        for n in tree.body
        if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    return docstring, class_names, function_names


def get_file_summary(docstring: str, class_names: List[str],
                     function_names: List[str]) -> str:
    """Create a one-line summary string from extracted file information."""
    parts: List[str] = []

    if docstring:
        # Take the first line of the docstring, truncated to 100 chars.
        first_line = docstring.split('\n')[0].strip()
        if len(first_line) > 100:
            first_line = first_line[:97] + "..."
        parts.append(f'"{first_line}"')

    if class_names:
        parts.append(f"Classes: {', '.join(class_names[:5])}")
        if len(class_names) > 5:
            parts[-1] += f" (+{len(class_names)-5} more)"

    if function_names:
        parts.append(f"Functions: {', '.join(function_names[:5])}")
        if len(function_names) > 5:
            parts[-1] += f" (+{len(function_names)-5} more)"

    return " - ".join(parts) if parts else "No docstring or definitions found"


def scan_python_files(root_dir: Path) -> Dict[str, List[Tuple[Path, str]]]:
    """
    Scan for Python files and group them by directory category.

    Returns:
        {category: [(path_relative_to_root, summary), ...]}
    """
    categories: Dict[str, List[Tuple[Path, str]]] = {}

    for py_file in root_dir.rglob("*.py"):
        # Skip excluded directories and files.
        posix_path = py_file.as_posix()
        if any(excluded in posix_path for excluded in EXCLUDE_DIRS):
            continue
        if py_file.name in EXCLUDE_FILES:
            continue

        rel_path = py_file.relative_to(root_dir)
        path_parts = rel_path.parts

        # Files directly under the root (fewer than 2 components) stay in
        # "Other"; otherwise the first directory determines the category.
        category = "Other"
        if len(path_parts) >= 2:
            category = CATEGORY_BY_TOP_DIR.get(path_parts[0], "Other")

        docstring, class_names, function_names = extract_python_info(py_file)
        summary = get_file_summary(docstring, class_names, function_names)
        categories.setdefault(category, []).append((rel_path, summary))

    return categories


def generate_markdown(categories: Dict[str, List[Tuple[Path, str]]]) -> str:
    """Generate Markdown ledger content from categorized files."""
    total = sum(len(files) for files in categories.values())
    lines: List[str] = [
        "# Codebase Audit Ledger (#42)",
        "",
        f"*Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*",
        f"*Total files scanned: {total}*",
        "",
        "## šŸ“‹ Audit Checklist",
        "",
        "Check each file after audit completion. Use this ledger to track progress.",
        "",
    ]

    # Sort categories for consistent output.
    sorted_categories = sorted(categories.items())

    for category, files in sorted_categories:
        # BUG FIX: the original fabricated a directory path from the category
        # name (e.g. "backend/app/api_endpoints/...") that does not match any
        # real directory; show only the category name.
        lines.append(f"## {category}")
        lines.append("")
        # Sort alphabetically without mutating the caller's lists.
        for file_path, summary in sorted(files, key=lambda item: str(item[0])):
            lines.append(f"- [ ] `{file_path}` - {summary}")
        lines.append("")

    # Statistics table.
    lines.append("## šŸ“Š Statistics")
    lines.append("")
    lines.append("| Category | File Count |")
    lines.append("|----------|------------|")
    for category, files in sorted_categories:
        lines.append(f"| {category} | {len(files)} |")
    lines.append("")

    # Next-steps checklist.
    lines.append("## šŸŽÆ Next Steps")
    lines.append("")
    lines.append("1. **Review each file** for functionality and dependencies")
    lines.append("2. **Document findings** in individual audit reports")
    lines.append("3. **Identify gaps** in test coverage and documentation")
    lines.append("4. **Prioritize refactoring** based on complexity and criticality")
    lines.append("")
    lines.append("*This ledger is automatically generated by `audit_scanner.py`*")

    return "\n".join(lines)


def main() -> int:
    """Run the scan, write the Markdown ledger, and return an exit code."""
    print("šŸ” Starting codebase audit scan...")
    print(f"Scanning directory: {BACKEND_DIR}")

    if not BACKEND_DIR.exists():
        print(f"Error: Directory {BACKEND_DIR} does not exist!")
        return 1

    categories = scan_python_files(BACKEND_DIR)
    markdown_content = generate_markdown(categories)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_FILE.write_text(markdown_content, encoding='utf-8')

    total_files = sum(len(files) for files in categories.values())
    print(f"āœ… Scan complete! Found {total_files} Python files.")
    print(f"šŸ“„ Report generated: {OUTPUT_FILE}")

    # Per-category summary on stdout.
    print("\nšŸ“Š Category breakdown:")
    for category, files in sorted(categories.items()):
        print(f"  {category}: {len(files)} files")

    return 0


if __name__ == "__main__":
    sys.exit(main())