Delete docsgpt_scanner.py

This commit is contained in:
Pavel
2025-03-30 11:45:33 +01:00
committed by GitHub
parent 6d3134c944
commit c740933782

View File

@@ -1,636 +0,0 @@
import os
import argparse
from pathlib import Path
import datetime
import re
import json
class DocsGPTDocumentationGenerator:
def __init__(self, root_dir, config=None):
"""
Initialize the documentation generator with customized settings for DocsGPT.
Args:
root_dir (str): The path to the root directory of the project.
config (dict, optional): Configuration overrides.
"""
self.root_dir = os.path.abspath(root_dir)
# Default configuration optimized for DocsGPT
self.config = {
# Directories to exclude completely
'excluded_dirs': [
'__pycache__', 'venv', '.venv', 'node_modules', '.git', '.idea', '.vscode',
'dist', 'build', 'model', 'temp', 'indexes', 'model', 'postgres_data',
'logs', 'out', 'vectors'
],
# File patterns to exclude
'excluded_patterns': [
'*.pyc', '*.bin', '*.faiss', '*.pkl', '*.so', '*.o',
'*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.ico', '*.lock',
'*.pdf' # Exclude PDFs as they're just data in your project
],
# Files that should always be included despite other exclusions
'always_include': [
'README.md', 'LICENSE', 'CONTRIBUTING.md', 'requirements.txt',
'package.json', 'Dockerfile', 'docker-compose*.yaml', 'docker-compose*.yml'
],
# Core code directories to focus on
'core_dirs': [
'application', 'frontend/src', 'extensions', 'docs'
],
# File types to include content in documentation
'content_file_types': [
'.py', '.js', '.jsx', '.ts', '.tsx', '.md', '.txt',
'.yaml', '.yml', '.json', '.dockerfile'
],
# Max file size to include content (100KB)
'max_content_size': 100 * 1024,
# Max number of files to include full content for each directory
'max_files_per_dir': 5,
# Max characters to show in file previews
'preview_length': 500
}
# Override config with provided values
if config:
self.config.update(config)
def should_exclude(self, path):
"""
Determine if a path should be excluded from the documentation.
Args:
path (str): The path to check.
Returns:
bool: True if the path should be excluded, False otherwise.
"""
name = os.path.basename(path)
# Always include certain files
for pattern in self.config['always_include']:
if self._match_pattern(name, pattern):
return False
# Check if it's in excluded directories
parts = Path(path).relative_to(self.root_dir).parts
for part in parts:
for excluded_dir in self.config['excluded_dirs']:
if self._match_pattern(part, excluded_dir):
return True
# Check excluded patterns
for pattern in self.config['excluded_patterns']:
if self._match_pattern(name, pattern):
return True
# Exclude hidden files
if name.startswith('.') and name not in ['.env.example']:
return True
return False
def _match_pattern(self, name, pattern):
"""
Check if a name matches a pattern with simple wildcard support.
Args:
name (str): The name to check.
pattern (str): The pattern to match against.
Returns:
bool: True if the name matches the pattern, False otherwise.
"""
if pattern.startswith('*.'):
# Extension pattern
return name.endswith(pattern[1:])
elif '*' in pattern:
# Convert to regex pattern
regex_pattern = pattern.replace('.', r'\.').replace('*', '.*')
return bool(re.match(f"^{regex_pattern}$", name))
else:
# Exact match
return name == pattern
def scan_directory(self):
"""
Scan the directory and build a structure representation.
Returns:
dict: A dictionary representation of the project structure
"""
structure = {}
for root, dirs, files in os.walk(self.root_dir):
# Skip excluded directories
dirs[:] = [d for d in dirs if not self.should_exclude(os.path.join(root, d))]
# Get the relative path from the root directory
rel_path = os.path.relpath(root, self.root_dir)
if rel_path == '.':
rel_path = ''
# Filter files based on excluded patterns
filtered_files = [file for file in files if not self.should_exclude(os.path.join(root, file))]
# Add directory and its files to the structure
if rel_path:
current_level = structure
for part in rel_path.split(os.path.sep):
if part not in current_level:
current_level[part] = {}
current_level = current_level[part]
current_level['__files__'] = filtered_files
else:
structure['__files__'] = filtered_files
return structure
def print_structure(self, structure=None, indent=0, is_last=True, prefix="", file=None):
"""
Print the directory structure in a tree-like format.
Args:
structure (dict): Dictionary representing the directory structure.
indent (int): Current indentation level.
is_last (bool): Whether this is the last item in its parent.
prefix (str): Prefix for the current line.
file: File object to write to.
"""
if structure is None:
# First call, print the root directory name
structure = self.scan_directory()
root_name = os.path.basename(self.root_dir) + "/"
line = root_name
if file:
file.write(f"{line}\n")
print(line)
# Print files
if '__files__' in structure:
files = structure.pop('__files__')
for i, file_name in enumerate(sorted(files)):
is_last_file = (i == len(files) - 1) and len(structure) == 0
connector = "└── " if is_last_file else "├── "
line = f"{prefix}{connector}{file_name}"
if file:
file.write(f"{line}\n")
print(line)
# Process directories
items = list(sorted(structure.items()))
for i, (dir_name, contents) in enumerate(items):
is_last_dir = i == len(items) - 1
connector = "└── " if is_last_dir else "├── "
line = f"{prefix}{connector}{dir_name}/"
if file:
file.write(f"{line}\n")
print(line)
new_prefix = prefix + (" " if is_last_dir else "")
self.print_structure(contents, indent + 1, is_last_dir, new_prefix, file)
def _get_file_language(self, file_path):
"""
Determine the language of a file for code block formatting.
Args:
file_path (str): Path to the file.
Returns:
str: Language identifier for markdown code block.
"""
ext = os.path.splitext(file_path)[1].lower()
name = os.path.basename(file_path)
# Map file extensions to language identifiers
ext_to_lang = {
'.py': 'python',
'.js': 'javascript',
'.jsx': 'jsx',
'.ts': 'typescript',
'.tsx': 'tsx',
'.html': 'html',
'.css': 'css',
'.scss': 'scss',
'.md': 'markdown',
'.json': 'json',
'.yaml': 'yaml',
'.yml': 'yaml',
'.sh': 'bash'
}
# Special files
if name in ['Dockerfile']:
return 'dockerfile'
elif name in ['docker-compose.yml', 'docker-compose.yaml']:
return 'yaml'
elif name in ['Makefile']:
return 'makefile'
elif name in ['.gitignore', 'requirements.txt', '.env.example']:
return '' # Plain text
return ext_to_lang.get(ext, '')
def should_include_content(self, file_path):
"""
Check if a file's content should be included in the documentation.
Args:
file_path (str): The path to the file.
Returns:
bool: True if content should be included, False otherwise.
"""
# Check file size
if os.path.getsize(file_path) > self.config['max_content_size']:
return False
# Check file extension
ext = os.path.splitext(file_path)[1].lower()
if ext not in self.config['content_file_types']:
return False
# Check if file is in a core directory
rel_path = os.path.relpath(file_path, self.root_dir)
for core_dir in self.config['core_dirs']:
if rel_path.startswith(core_dir):
return True
# Include any README or key configuration files
name = os.path.basename(file_path)
if any(self._match_pattern(name, pattern) for pattern in self.config['always_include']):
return True
return False
def count_files_by_type(self):
"""
Count the number of files by type in the project.
Returns:
dict: A dictionary mapping file extensions to counts.
"""
ext_counts = {}
for root, _, files in os.walk(self.root_dir):
if self.should_exclude(root):
continue
for file in files:
file_path = os.path.join(root, file)
if self.should_exclude(file_path):
continue
ext = os.path.splitext(file)[1].lower()
if not ext:
ext = '(no extension)'
ext_counts[ext] = ext_counts.get(ext, 0) + 1
return ext_counts
def generate_code_snippets(self, structure=None, path="", snippets=None):
"""
Generate representative code snippets from the project.
Args:
structure (dict): Project structure dictionary.
path (str): Current path in the structure.
snippets (dict): Dictionary to store snippets by directory.
Returns:
dict: Dictionary mapping directories to lists of file snippets.
"""
if snippets is None:
snippets = {}
structure = self.scan_directory()
# Process files in the current directory
if '__files__' in structure:
files = structure.pop('__files__')
dir_snippets = []
# Sort files to prioritize key files
sorted_files = sorted(files, key=lambda f: f.startswith(('README', 'main', 'app')) and not f.startswith('.'), reverse=True)
for file in sorted_files[:self.config['max_files_per_dir']]:
file_path = os.path.join(self.root_dir, path, file)
if self.should_include_content(file_path):
try:
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read(self.config['preview_length'])
too_long = len(content) >= self.config['preview_length']
dir_snippets.append({
'name': file,
'path': os.path.join(path, file),
'language': self._get_file_language(file_path),
'content': content + ('...' if too_long else ''),
'full_path': file_path
})
except Exception as e:
# Skip files that can't be read
pass
if dir_snippets:
snippets[path or '.'] = dir_snippets
# Process subdirectories
for dir_name, contents in structure.items():
self.generate_code_snippets(contents, os.path.join(path, dir_name), snippets)
return snippets
def find_important_files(self):
"""
Find and return a list of important files in the project.
Returns:
list: List of important file paths.
"""
important_files = []
# Files to look for in any directory
common_important_files = [
'README.md', 'Dockerfile', 'docker-compose.yml', 'docker-compose.yaml',
'requirements.txt', 'setup.py', 'package.json', 'app.py', 'main.py',
'settings.py', 'config.py', 'wsgi.py', '.env.example'
]
for root, _, files in os.walk(self.root_dir):
if self.should_exclude(root):
continue
for file in files:
if file in common_important_files:
important_files.append(os.path.join(root, file))
return important_files
def generate_markdown(self, output_file):
"""
Generate a comprehensive markdown document for the DocsGPT project.
Args:
output_file (str): Path to the output markdown file.
"""
structure = self.scan_directory()
ext_counts = self.count_files_by_type()
important_files = self.find_important_files()
snippets = self.generate_code_snippets()
with open(output_file, 'w', encoding='utf-8') as md_file:
# Title and metadata
md_file.write(f"# DocsGPT Project Documentation\n\n")
md_file.write(f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")
# Project Overview
md_file.write("## 1. Project Overview\n\n")
# Try to include README content
readme_path = os.path.join(self.root_dir, "README.md")
if os.path.exists(readme_path):
try:
with open(readme_path, 'r', encoding='utf-8', errors='replace') as readme:
content = readme.read()
md_file.write("### From README.md\n\n")
md_file.write(f"{content}\n\n")
except Exception:
md_file.write("*Error reading README.md*\n\n")
# Project stats
md_file.write("### Project Statistics\n\n")
# Count directories and files
total_dirs = 0
total_files = 0
for root, dirs, files in os.walk(self.root_dir):
if not self.should_exclude(root):
total_dirs += sum(1 for d in dirs if not self.should_exclude(os.path.join(root, d)))
total_files += sum(1 for f in files if not self.should_exclude(os.path.join(root, f)))
md_file.write(f"- **Total Directories:** {total_dirs}\n")
md_file.write(f"- **Total Files:** {total_files}\n\n")
md_file.write("#### File Types\n\n")
for ext, count in sorted(ext_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
md_file.write(f"- **{ext}:** {count} files\n")
md_file.write("\n")
# Directory Structure
md_file.write("## 2. Directory Structure\n\n")
md_file.write("```\n")
self.print_structure(file=md_file)
md_file.write("```\n\n")
# Key Components
md_file.write("## 3. Key Components\n\n")
# Application component
md_file.write("### 3.1. Application Core\n\n")
if 'application' in snippets:
md_file.write("The application core contains the main backend logic for DocsGPT.\n\n")
for snippet in snippets['application'][:3]:
md_file.write(f"#### {snippet['path']}\n\n")
md_file.write(f"```{snippet['language']}\n{snippet['content']}\n```\n\n")
# Frontend component
md_file.write("### 3.2. Frontend\n\n")
frontend_snippets = [s for path, files in snippets.items()
for s in files if path.startswith('frontend/src')]
if frontend_snippets:
md_file.write("The frontend is built with React and provides the user interface.\n\n")
for snippet in frontend_snippets[:3]:
md_file.write(f"#### {snippet['path']}\n\n")
md_file.write(f"```{snippet['language']}\n{snippet['content']}\n```\n\n")
# Extensions
md_file.write("### 3.3. Extensions\n\n")
extension_snippets = [s for path, files in snippets.items()
for s in files if path.startswith('extensions')]
if extension_snippets:
md_file.write("DocsGPT includes various extensions for different platforms.\n\n")
for snippet in extension_snippets[:3]:
md_file.write(f"#### {snippet['path']}\n\n")
md_file.write(f"```{snippet['language']}\n{snippet['content']}\n```\n\n")
# Configuration Files
md_file.write("## 4. Configuration Files\n\n")
# Docker files
md_file.write("### 4.1. Docker Configuration\n\n")
docker_files = [f for f in important_files if os.path.basename(f) in
['Dockerfile', 'docker-compose.yml', 'docker-compose.yaml']]
for file_path in docker_files:
try:
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
rel_path = os.path.relpath(file_path, self.root_dir)
md_file.write(f"#### {rel_path}\n\n")
lang = 'dockerfile' if os.path.basename(file_path) == 'Dockerfile' else 'yaml'
md_file.write(f"```{lang}\n{content}\n```\n\n")
except Exception as e:
md_file.write(f"*Error reading {os.path.relpath(file_path, self.root_dir)}: {e}*\n\n")
# Requirements and package files
md_file.write("### 4.2. Dependencies\n\n")
dep_files = [f for f in important_files if os.path.basename(f) in
['requirements.txt', 'package.json']]
for file_path in dep_files:
try:
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
rel_path = os.path.relpath(file_path, self.root_dir)
md_file.write(f"#### {rel_path}\n\n")
lang = 'json' if file_path.endswith('.json') else ''
md_file.write(f"```{lang}\n{content}\n```\n\n")
except Exception as e:
md_file.write(f"*Error reading {os.path.relpath(file_path, self.root_dir)}: {e}*\n\n")
# Environment files
env_files = [f for f in important_files if os.path.basename(f) == '.env.example']
if env_files:
md_file.write("### 4.3. Environment Configuration\n\n")
for file_path in env_files:
try:
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
rel_path = os.path.relpath(file_path, self.root_dir)
md_file.write(f"#### {rel_path}\n\n")
md_file.write(f"```\n{content}\n```\n\n")
except Exception as e:
md_file.write(f"*Error reading {os.path.relpath(file_path, self.root_dir)}: {e}*\n\n")
# API Documentation (if we can find routes)
md_file.write("## 5. API Documentation\n\n")
api_files = []
for root, _, files in os.walk(os.path.join(self.root_dir, 'application/api')):
if self.should_exclude(root):
continue
for file in files:
if file == 'routes.py':
api_files.append(os.path.join(root, file))
if api_files:
md_file.write("### API Routes\n\n")
for file_path in api_files[:5]: # Limit to 5 route files
try:
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
rel_path = os.path.relpath(file_path, self.root_dir)
md_file.write(f"#### {rel_path}\n\n")
md_file.write(f"```python\n{content}\n```\n\n")
except Exception as e:
md_file.write(f"*Error reading {os.path.relpath(file_path, self.root_dir)}: {e}*\n\n")
# Conclusion
md_file.write("## 6. Additional Information\n\n")
md_file.write("This documentation provides an overview of the DocsGPT project structure and key components. "
"For more detailed information, please refer to the official documentation and code comments.\n\n")
md_file.write("### License\n\n")
license_path = os.path.join(self.root_dir, "LICENSE")
if os.path.exists(license_path):
try:
with open(license_path, 'r', encoding='utf-8', errors='replace') as f:
content = f.read(500) # Just read the beginning of the license
md_file.write(f"```\n{content}...\n```\n\n")
except Exception:
md_file.write("*Error reading LICENSE file*\n\n")
# Generation metadata
md_file.write("---\n\n")
md_file.write(f"*Documentation generated on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n")
md_file.write(f"*Generator: DocsGPT Project Documentation Generator*\n")
def main():
parser = argparse.ArgumentParser(description='DocsGPT Project Documentation Generator')
parser.add_argument('--root', '-r', type=str, default='.',
help='Root directory of the project (default: current directory)')
parser.add_argument('--output', '-o', type=str,
help='Output markdown file (default: project_name_docs.md in the root directory)')
parser.add_argument('--exclude-dirs', '-e', type=str, nargs='+',
help='Additional directories to exclude')
parser.add_argument('--exclude-files', '-ef', type=str, nargs='+',
help='Additional file patterns to exclude')
parser.add_argument('--include-files', '-if', type=str, nargs='+',
help='Files to always include despite exclusions')
parser.add_argument('--core-dirs', '-c', type=str, nargs='+',
help='Core directories to focus on for code snippets')
parser.add_argument('--config-file', '-cf', type=str,
help='Path to JSON configuration file')
parser.add_argument('--tree-only', action='store_true',
help='Only print the directory tree structure, do not generate documentation')
args = parser.parse_args()
# Get absolute path of the root directory
root_dir = os.path.abspath(args.root)
# Load configuration from file if provided
config = None
if args.config_file:
try:
with open(args.config_file, 'r') as f:
config = json.load(f)
except Exception as e:
print(f"Error loading configuration file: {e}")
return
else:
# Build configuration from command line arguments
config = {}
if args.exclude_dirs:
config['excluded_dirs'] = args.exclude_dirs
if args.exclude_files:
config['excluded_patterns'] = args.exclude_files
if args.include_files:
config['always_include'] = args.include_files
if args.core_dirs:
config['core_dirs'] = args.core_dirs
# Create the generator
generator = DocsGPTDocumentationGenerator(root_dir=root_dir, config=config)
if args.tree_only:
# Just print the tree structure
print(f"Directory structure for: {root_dir}\n")
generator.print_structure()
else:
# Generate full documentation
output_file = args.output
if not output_file:
project_name = os.path.basename(root_dir)
output_file = os.path.join(root_dir, f"{project_name}_documentation.md")
print(f"Generating documentation for {root_dir}...")
generator.generate_markdown(output_file)
print(f"Documentation saved to: {output_file}")
if __name__ == "__main__":
main()