CI: add code-size check for files crossing LOC threshold (#12810)

* CI: add code-size check for files crossing LOC threshold * feat(ci): add duplicate function detection to CI code-size check The --compare-to mode now also detects new duplicate function names introduced by a PR. Uses git diff to scope checks to changed files only, keeping CI fast. * fix(ci): address review feedback for code-size check - Validate git ref upfront; exit 2 if ref doesn't exist - Distinguish 'file missing at ref' from genuine git errors - Explicitly fetch base branch ref in CI workflow - Raise threshold from 700 to 1000 lines * fix(ci): exclude Swabble, skills, .pi from code analysis * update gitignore for pycache * ci: make code-size check informational (no failure on violations)
2026-03-07 22:44:16 +00:00 · 2026-02-09 11:34:18 -08:00
parent 268094938b
commit 50b3d32d3c
3 changed files with 268 additions and 8 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -386,6 +386,31 @@ jobs:
      - name: Run ${{ matrix.task }}
        run: ${{ matrix.command }}

+  # Check for files that grew past LOC threshold in this PR (delta-only).
+  code-size:
+    if: github.event_name == 'pull_request'
+    runs-on: blacksmith-4vcpu-ubuntu-2404
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: false
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Fetch base branch
+        run: git fetch origin ${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}
+
+      - name: Check code file sizes
+        run: |
+          python scripts/analyze_code_files.py \
+            --compare-to origin/${{ github.base_ref }} \
+            --threshold 1000
+
  secrets:
    runs-on: blacksmith-4vcpu-ubuntu-2404
    steps:
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@ pnpm-lock.yaml
 bun.lock
 bun.lockb
 coverage
+__pycache__/
+*.pyc
 .tsbuildinfo
 .pnpm-store
 .worktrees/
--- a/scripts/analyze_code_files.py
+++ b/scripts/analyze_code_files.py
@@ -2,13 +2,18 @@
 """
 Lists the longest and shortest code files in the project, and counts duplicated function names across files. Useful for identifying potential refactoring targets and enforcing code size guidelines.
 Threshold can be set to warn about files longer or shorter than a certain number of lines.
+
+CI mode (--compare-to): Only warns about files that grew past threshold compared to a base ref.
+Use --strict to exit non-zero on violations for CI gating.
 """

 import os
 import re
+import sys
+import subprocess
 import argparse
 from pathlib import Path
-from typing import List, Tuple, Dict, Set
+from typing import List, Tuple, Dict, Set, Optional
 from collections import defaultdict

 # File extensions to consider as code files
@@ -23,7 +28,10 @@ CODE_EXTENSIONS = {
 SKIP_DIRS = {
    'node_modules', '.git', 'dist', 'build', 'coverage',
    '__pycache__', '.turbo', 'out', '.worktrees', 'vendor',
-    'Pods', 'DerivedData', '.gradle', '.idea'
+    'Pods', 'DerivedData', '.gradle', '.idea',
+    'Swabble',  # Separate Swift package
+    'skills',   # Standalone skill scripts
+    '.pi',      # Pi editor extensions
 }

 # Filename patterns to skip in short-file warnings (barrel exports, stubs)
@@ -125,12 +133,7 @@ def extract_functions(file_path: Path) -> Set[str]:
    except Exception:
        return set()
    
-    functions = set()
-    for pattern in TS_FUNCTION_PATTERNS:
-        for match in pattern.finditer(content):
-            functions.add(match.group(1))
-    
-    return functions
+    return extract_functions_from_content(content)


 def find_duplicate_functions(files: List[Tuple[Path, int]], root_dir: Path) -> Dict[str, List[Path]]:
@@ -155,6 +158,170 @@ def find_duplicate_functions(files: List[Tuple[Path, int]], root_dir: Path) -> D
    return {name: paths for name, paths in function_locations.items() if len(paths) > 1}


+def validate_git_ref(root_dir: Path, ref: str) -> bool:
+    """Validate that a git ref exists. Exits with error if not."""
+    try:
+        result = subprocess.run(
+            ['git', 'rev-parse', '--verify', ref],
+            capture_output=True,
+            cwd=root_dir,
+            encoding='utf-8',
+        )
+        return result.returncode == 0
+    except Exception:
+        return False
+
+
+def get_file_content_at_ref(file_path: Path, root_dir: Path, ref: str) -> Optional[str]:
+    """Get content of a file at a specific git ref. Returns None if file doesn't exist at ref."""
+    try:
+        relative_path = file_path.relative_to(root_dir)
+        # Use forward slashes for git paths
+        git_path = str(relative_path).replace('\\', '/')
+        result = subprocess.run(
+            ['git', 'show', f'{ref}:{git_path}'],
+            capture_output=True,
+            cwd=root_dir,
+            encoding='utf-8',
+            errors='ignore',
+        )
+        if result.returncode != 0:
+            stderr = result.stderr.strip()
+            # "does not exist" or "exists on disk, but not in" = file missing at ref (OK)
+            if 'does not exist' in stderr or 'exists on disk' in stderr:
+                return None
+            # Other errors (bad ref, git broken) = genuine failure
+            if stderr:
+                print(f"⚠️  git show error for {git_path}: {stderr}", file=sys.stderr)
+            return None
+        return result.stdout
+    except Exception as e:
+        print(f"⚠️  failed to read {file_path} at {ref}: {e}", file=sys.stderr)
+        return None
+
+
+def get_line_count_at_ref(file_path: Path, root_dir: Path, ref: str) -> Optional[int]:
+    """Get line count of a file at a specific git ref. Returns None if file doesn't exist at ref."""
+    content = get_file_content_at_ref(file_path, root_dir, ref)
+    if content is None:
+        return None
+    return len(content.splitlines())
+
+
+def extract_functions_from_content(content: str) -> Set[str]:
+    """Extract function names from TypeScript content string."""
+    functions = set()
+    for pattern in TS_FUNCTION_PATTERNS:
+        for match in pattern.finditer(content):
+            functions.add(match.group(1))
+    return functions
+
+
+def get_changed_files(root_dir: Path, compare_ref: str) -> Set[str]:
+    """Get set of files changed between compare_ref and HEAD (relative paths with forward slashes)."""
+    try:
+        result = subprocess.run(
+            ['git', 'diff', '--name-only', compare_ref, 'HEAD'],
+            capture_output=True,
+            cwd=root_dir,
+            encoding='utf-8',
+            errors='ignore',
+        )
+        if result.returncode != 0:
+            return set()
+        return {line.strip() for line in result.stdout.splitlines() if line.strip()}
+    except Exception:
+        return set()
+
+
+def find_duplicate_regressions(
+    files: List[Tuple[Path, int]],
+    root_dir: Path,
+    compare_ref: str,
+) -> Dict[str, List[Path]]:
+    """
+    Find new duplicate function names that didn't exist at the base ref.
+    Only checks functions in files that changed to keep CI fast.
+    Returns dict of function_name -> list of current file paths, only for
+    duplicates that are new (weren't duplicated at compare_ref).
+    """
+    # Build current duplicate map
+    current_dupes = find_duplicate_functions(files, root_dir)
+    if not current_dupes:
+        return {}
+
+    # Get changed files to scope the comparison
+    changed_files = get_changed_files(root_dir, compare_ref)
+    if not changed_files:
+        return {}  # Nothing changed, no new duplicates possible
+
+    # Only check duplicate functions that involve at least one changed file
+    relevant_dupes: Dict[str, List[Path]] = {}
+    for func_name, paths in current_dupes.items():
+        involves_changed = any(
+            str(p.relative_to(root_dir)).replace('\\', '/') in changed_files
+            for p in paths
+        )
+        if involves_changed:
+            relevant_dupes[func_name] = paths
+
+    if not relevant_dupes:
+        return {}
+
+    # For relevant duplicates, check if they were already duplicated at base ref
+    # Only need to read base versions of files involved in these duplicates
+    files_to_check: Set[Path] = set()
+    for paths in relevant_dupes.values():
+        files_to_check.update(paths)
+
+    base_function_locations: Dict[str, List[Path]] = defaultdict(list)
+    for file_path in files_to_check:
+        if file_path.suffix.lower() not in {'.ts', '.tsx'}:
+            continue
+        content = get_file_content_at_ref(file_path, root_dir, compare_ref)
+        if content is None:
+            continue
+        functions = extract_functions_from_content(content)
+        for func in functions:
+            if func in SKIP_DUPLICATE_FUNCTIONS:
+                continue
+            if any(func.startswith(prefix) for prefix in SKIP_DUPLICATE_PREFIXES):
+                continue
+            base_function_locations[func].append(file_path)
+
+    base_dupes = {name for name, paths in base_function_locations.items() if len(paths) > 1}
+
+    # Return only new duplicates
+    return {name: paths for name, paths in relevant_dupes.items() if name not in base_dupes}
+
+
+def find_threshold_regressions(
+    files: List[Tuple[Path, int]],
+    root_dir: Path,
+    compare_ref: str,
+    threshold: int,
+) -> List[Tuple[Path, int, Optional[int]]]:
+    """
+    Find files that crossed the threshold compared to a base ref.
+    Returns list of (path, current_lines, base_lines) for files that:
+    - Were under threshold (or didn't exist) at compare_ref
+    - Are now at or over threshold
+    """
+    regressions = []
+    
+    for file_path, current_lines in files:
+        if current_lines < threshold:
+            continue  # Not over threshold now, skip
+        
+        base_lines = get_line_count_at_ref(file_path, root_dir, compare_ref)
+        
+        # Regression if: file is new OR was under threshold before
+        if base_lines is None or base_lines < threshold:
+            regressions.append((file_path, current_lines, base_lines))
+    
+    return regressions
+
+
 def main():
    parser = argparse.ArgumentParser(
        description='Analyze code files: list longest/shortest files, find duplicate function names'
@@ -189,10 +356,72 @@ def main():
        default='.',
        help='Directory to scan (default: current directory)'
    )
+    parser.add_argument(
+        '--compare-to',
+        type=str,
+        default=None,
+        help='Git ref to compare against (e.g., origin/main). Only warn about files that grew past threshold.'
+    )
+    parser.add_argument(
+        '--strict',
+        action='store_true',
+        help='Exit with non-zero status if any violations found (for CI)'
+    )
    
    args = parser.parse_args()
    
    root_dir = Path(args.directory).resolve()
+    
+    # CI delta mode: only show regressions
+    if args.compare_to:
+        print(f"\n📂 Scanning: {root_dir}")
+        print(f"🔍 Comparing to: {args.compare_to}\n")
+
+        if not validate_git_ref(root_dir, args.compare_to):
+            print(f"❌ Invalid git ref: {args.compare_to}", file=sys.stderr)
+            print("   Make sure the ref exists (e.g. run 'git fetch origin <branch>')", file=sys.stderr)
+            sys.exit(2)
+        
+        files = find_code_files(root_dir)
+        violations = False
+
+        # Check file length regressions
+        regressions = find_threshold_regressions(files, root_dir, args.compare_to, args.threshold)
+        
+        if regressions:
+            print(f"⚠️  {len(regressions)} file(s) crossed {args.threshold} line threshold:\n")
+            for file_path, current, base in regressions:
+                relative_path = file_path.relative_to(root_dir)
+                if base is None:
+                    print(f"   {relative_path}: {current:,} lines (new file)")
+                else:
+                    print(f"   {relative_path}: {base:,} → {current:,} lines (+{current - base:,})")
+            print()
+            violations = True
+        else:
+            print(f"✅ No files crossed {args.threshold} line threshold")
+
+        # Check new duplicate function names
+        new_dupes = find_duplicate_regressions(files, root_dir, args.compare_to)
+
+        if new_dupes:
+            print(f"⚠️  {len(new_dupes)} new duplicate function name(s):\n")
+            for func_name in sorted(new_dupes.keys()):
+                paths = new_dupes[func_name]
+                print(f"   {func_name}:")
+                for path in paths:
+                    print(f"       {path.relative_to(root_dir)}")
+            print()
+            violations = True
+        else:
+            print(f"✅ No new duplicate function names")
+
+        print()
+        if args.strict and violations:
+            sys.exit(1)
+        
+        return
+    
    print(f"\n📂 Scanning: {root_dir}\n")
    
    # Find and sort files by line count
@@ -306,6 +535,10 @@ def main():
        print(f"\n✅ No duplicate function names")
    
    print()
+    
+    # Exit with error if --strict and there are violations
+    if args.strict and long_warnings:
+        sys.exit(1)


 if __name__ == '__main__':