Files
service-finder/backend/app/scripts/audit_scanner.py
2026-03-22 18:59:27 +00:00

236 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""
Audit Scanner for Codebase Analysis (#42)
This script performs a comprehensive audit of the Python codebase:
1. Recursively scans the backend/app directory for .py files
2. Excludes __init__.py files and alembic/versions directory
3. Groups files by directory structure (api, services, models, etc.)
4. Extracts docstrings and class/function names from each file
5. Generates a Markdown audit ledger with checkboxes for tracking
"""
import os
import re
import ast
from pathlib import Path
from typing import Dict, List, Tuple, Set
import datetime
# Project root (absolute path inside the container; this script is expected
# to run inside the backend container, not on the host).
PROJECT_ROOT = Path("/app")
BACKEND_DIR = PROJECT_ROOT / "app" # /app/app is the backend root in container
# Destination of the generated Markdown audit ledger (ticket #42).
OUTPUT_FILE = Path("/app/.roo/audit_ledger_94.md")
# Path fragments to exclude from the scan — matched as substrings of each
# file's full path, so "alembic/versions" skips migration revision files.
EXCLUDE_DIRS = {"__pycache__", ".git", "alembic/versions", "migrations"}
# File names skipped entirely (package markers carry no audit content).
EXCLUDE_FILES = {"__init__.py"}
def extract_python_info(file_path: Path) -> Tuple[str, List[str], List[str]]:
    """
    Extract the module docstring and class/function names from a Python file.

    Falls back to regex scraping when the file does not parse as valid
    Python (e.g. templates or partially written modules).

    Returns: (docstring, class_names, function_names)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Try to parse with AST first; it is exact where regex is heuristic.
        try:
            tree = ast.parse(content)
            # Extract module docstring ("" when absent).
            docstring = ast.get_docstring(tree) or ""
            # All classes, including nested ones, are reported.
            class_names = [
                node.name for node in ast.walk(tree)
                if isinstance(node, ast.ClassDef)
            ]
            # Only top-level functions: methods live inside ClassDef bodies,
            # so scanning tree.body directly skips them.  (The previous
            # version checked a non-existent `node.parent` attribute, which
            # raised AttributeError for every file containing a function and
            # made the scan report "Error reading file" instead.)
            function_names = [
                node.name for node in tree.body
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
            ]
            return docstring, class_names, function_names
        except (SyntaxError, ValueError):
            # If AST parsing fails, use simple regex extraction.
            docstring_match = re.search(r'"""(.*?)"""', content, re.DOTALL)
            docstring = docstring_match.group(1).strip() if docstring_match else ""
            # Simple regex for class and function definitions (top of line).
            class_matches = re.findall(r'^class\s+(\w+)', content, re.MULTILINE)
            func_matches = re.findall(r'^def\s+(\w+)', content, re.MULTILINE)
            return docstring, class_matches, func_matches
    except Exception as e:
        # Unreadable file (permissions, encoding, ...): report inline rather
        # than aborting the whole scan.
        return f"Error reading file: {e}", [], []
def get_file_summary(docstring: str, class_names: List[str], function_names: List[str]) -> str:
    """Build a one-line human-readable summary of a scanned file.

    Combines the first docstring line (truncated to 100 characters) with up
    to five class and five function names; overflow is noted as "(+N more)".
    Returns a fixed placeholder when nothing was extracted.
    """
    pieces: List[str] = []
    if docstring:
        # Only the headline of the docstring, capped at 100 chars.
        headline = docstring.split('\n')[0].strip()
        if len(headline) > 100:
            headline = headline[:97] + "..."
        pieces.append(f'"{headline}"')
    # Classes and functions share the same "first five + overflow" format.
    for label, names in (("Classes", class_names), ("Functions", function_names)):
        if not names:
            continue
        entry = f"{label}: {', '.join(names[:5])}"
        if len(names) > 5:
            entry += f" (+{len(names) - 5} more)"
        pieces.append(entry)
    if not pieces:
        return "No docstring or definitions found"
    return " - ".join(pieces)
# Maps the first path component below the scan root to a ledger category.
# Anything not listed (and anything directly under the root) is "Other".
_CATEGORY_BY_DIR: Dict[str, str] = {
    "api": "API Endpoints",
    "services": "Services",
    "models": "Models",
    "core": "Core",
    "workers": "Workers",
    "scripts": "Scripts",
    "tests": "Tests",
    "tests_internal": "Tests",
    "test_outside": "Tests",
    "crud": "CRUD",
    "schemas": "Schemas",
    "templates": "Templates",
    "static": "Static",
}
def scan_python_files(root_dir: Path) -> Dict[str, List[Tuple[Path, str]]]:
    """
    Scan for Python files and group them by directory category.

    Skips paths containing any EXCLUDE_DIRS fragment and files named in
    EXCLUDE_FILES. Categories come from the first directory level under
    root_dir via _CATEGORY_BY_DIR.

    Returns: {category: [(relative_path, summary), ...]}
    """
    categories: Dict[str, List[Tuple[Path, str]]] = {}
    for py_file in root_dir.rglob("*.py"):
        # Match exclusions against the POSIX form of the path so the
        # multi-segment entry "alembic/versions" works regardless of the
        # OS path separator (str(path) uses "\\" on Windows).
        posix_path = py_file.as_posix()
        if any(excluded in posix_path for excluded in EXCLUDE_DIRS):
            continue
        if py_file.name in EXCLUDE_FILES:
            continue
        rel_path = py_file.relative_to(root_dir)
        parts = rel_path.parts
        # Files sitting directly under the root have no category directory
        # and land in "Other".
        category = "Other"
        if len(parts) >= 2:
            category = _CATEGORY_BY_DIR.get(parts[0], "Other")
        docstring, class_names, function_names = extract_python_info(py_file)
        summary = get_file_summary(docstring, class_names, function_names)
        categories.setdefault(category, []).append((rel_path, summary))
    return categories
def generate_markdown(categories: Dict[str, List[Tuple[Path, str]]]) -> str:
    """Generate Markdown audit-ledger content from categorized files.

    Output: header with timestamp and totals, one checklist section per
    category (files sorted alphabetically), a statistics table, and a
    next-steps footer.

    Does NOT mutate `categories`: the previous version sorted the
    per-category lists in place, silently reordering the caller's data.
    """
    lines: List[str] = []
    # Header
    lines.append("# Codebase Audit Ledger (#42)")
    lines.append("")
    lines.append(f"*Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*")
    lines.append(f"*Total files scanned: {sum(len(files) for files in categories.values())}*")
    lines.append("")
    lines.append("## 📋 Audit Checklist")
    lines.append("")
    lines.append("Check each file after audit completion. Use this ledger to track progress.")
    lines.append("")
    # Sort categories for consistent output
    sorted_categories = sorted(categories.items(), key=lambda x: x[0])
    for category, files in sorted_categories:
        # NOTE(review): the directory hint below is derived from the category
        # label, not the real directory name (e.g. "API Endpoints" renders as
        # "api_endpoints" although the directory is "api"). Kept as-is to
        # preserve the report output; confirm before relying on these paths.
        lines.append(f"## {category} (`backend/app/{category.lower().replace(' ', '_')}/...`)")
        lines.append("")
        # Sorted copy: do not reorder the caller's lists in place.
        for file_path, summary in sorted(files, key=lambda x: str(x[0])):
            # Create checkbox and file entry
            lines.append(f"- [ ] `{file_path}` - {summary}")
        lines.append("")
    # Add statistics
    lines.append("## 📊 Statistics")
    lines.append("")
    lines.append("| Category | File Count |")
    lines.append("|----------|------------|")
    for category, files in sorted_categories:
        lines.append(f"| {category} | {len(files)} |")
    lines.append("")
    lines.append("## 🎯 Next Steps")
    lines.append("")
    lines.append("1. **Review each file** for functionality and dependencies")
    lines.append("2. **Document findings** in individual audit reports")
    lines.append("3. **Identify gaps** in test coverage and documentation")
    lines.append("4. **Prioritize refactoring** based on complexity and criticality")
    lines.append("")
    lines.append("*This ledger is automatically generated by `audit_scanner.py`*")
    return "\n".join(lines)
def main():
    """Run the audit scan end-to-end and write the Markdown ledger.

    Returns 0 on success, 1 when BACKEND_DIR does not exist.
    """
    print("🔍 Starting codebase audit scan...")
    print(f"Scanning directory: {BACKEND_DIR}")
    # Bail out early if the scan root is missing (e.g. run outside container).
    if not BACKEND_DIR.exists():
        print(f"Error: Directory {BACKEND_DIR} does not exist!")
        return 1

    # Scan, render, and persist the report.
    grouped = scan_python_files(BACKEND_DIR)
    report = generate_markdown(grouped)
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        out.write(report)

    file_count = sum(len(entries) for entries in grouped.values())
    print(f"✅ Scan complete! Found {file_count} Python files.")
    print(f"📄 Report generated: {OUTPUT_FILE}")

    # Per-category breakdown, alphabetical for stable output.
    print("\n📊 Category breakdown:")
    for category, files in sorted(grouped.items(), key=lambda x: x[0]):
        print(f" {category}: {len(files)} files")
    return 0
if __name__ == "__main__":
    # raise SystemExit instead of calling the site-provided exit() helper,
    # which is not guaranteed to exist when Python runs without the site
    # module (python -S); the exit status semantics are identical.
    raise SystemExit(main())