"""Generate a Markdown overview of a DocsGPT-style project tree.

This module was recovered from the removed side of the patch
"Delete docsgpt_scanner.py" (commit c7409337, 2025-03-30) and re-indented,
because the patch text had lost its line structure.

Usage:
    python docsgpt_scanner.py --root . --output project_docs.md
    python docsgpt_scanner.py --tree-only
"""

import argparse
import datetime
import json
import os
import re
from pathlib import Path


class DocsGPTDocumentationGenerator:
    """Scan a project directory and render its layout and key files as Markdown.

    The generator walks ``root_dir``, skips build artefacts / caches / binary
    files according to ``self.config``, and can either print a directory tree
    or write a multi-section Markdown report (overview, statistics, tree, code
    previews, configuration files, API route files, license excerpt).
    """

    # Key under which scan_directory() stores the file list of a directory.
    _FILES_KEY = '__files__'

    # File extension -> Markdown code-fence language.
    _EXT_TO_LANG = {
        '.py': 'python',
        '.js': 'javascript',
        '.jsx': 'jsx',
        '.ts': 'typescript',
        '.tsx': 'tsx',
        '.html': 'html',
        '.css': 'css',
        '.scss': 'scss',
        '.md': 'markdown',
        '.json': 'json',
        '.yaml': 'yaml',
        '.yml': 'yaml',
        '.sh': 'bash',
    }

    # Exact file names that override the extension lookup ('' = plain text).
    _NAME_TO_LANG = {
        'Dockerfile': 'dockerfile',
        'docker-compose.yml': 'yaml',
        'docker-compose.yaml': 'yaml',
        'Makefile': 'makefile',
        '.gitignore': '',
        'requirements.txt': '',
        '.env.example': '',
    }

    # File names reported by find_important_files() wherever they occur.
    _IMPORTANT_FILE_NAMES = frozenset([
        'README.md', 'Dockerfile', 'docker-compose.yml', 'docker-compose.yaml',
        'requirements.txt', 'setup.py', 'package.json', 'app.py', 'main.py',
        'settings.py', 'config.py', 'wsgi.py', '.env.example',
    ])

    def __init__(self, root_dir, config=None):
        """
        Initialize the documentation generator with customized settings for DocsGPT.

        Args:
            root_dir (str): The path to the root directory of the project.
            config (dict, optional): Configuration overrides. Each key given
                here replaces the corresponding default wholesale.
        """
        self.root_dir = os.path.abspath(root_dir)

        # Default configuration optimized for DocsGPT
        self.config = self.default_config()

        # Override config with provided values
        if config:
            self.config.update(config)

    @staticmethod
    def default_config():
        """
        Build a fresh copy of the default configuration.

        Returns:
            dict: Default settings; safe for the caller to mutate.
        """
        return {
            # Directories to exclude completely
            'excluded_dirs': [
                '__pycache__', 'venv', '.venv', 'node_modules', '.git', '.idea', '.vscode',
                'dist', 'build', 'model', 'temp', 'indexes', 'postgres_data',
                'logs', 'out', 'vectors',
            ],

            # File patterns to exclude
            'excluded_patterns': [
                '*.pyc', '*.bin', '*.faiss', '*.pkl', '*.so', '*.o',
                '*.jpg', '*.jpeg', '*.png', '*.gif', '*.webp', '*.ico', '*.lock',
                '*.pdf',  # Exclude PDFs as they're just data in your project
            ],

            # Files that should always be included despite other exclusions
            'always_include': [
                'README.md', 'LICENSE', 'CONTRIBUTING.md', 'requirements.txt',
                'package.json', 'Dockerfile', 'docker-compose*.yaml', 'docker-compose*.yml',
            ],

            # Core code directories to focus on
            'core_dirs': [
                'application', 'frontend/src', 'extensions', 'docs',
            ],

            # File types to include content in documentation
            'content_file_types': [
                '.py', '.js', '.jsx', '.ts', '.tsx', '.md', '.txt',
                '.yaml', '.yml', '.json', '.dockerfile',
            ],

            # Max file size to include content (100KB)
            'max_content_size': 100 * 1024,

            # Max number of files to include full content for each directory
            'max_files_per_dir': 5,

            # Max characters to show in file previews
            'preview_length': 500,
        }

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def should_exclude(self, path):
        """
        Determine if a path should be excluded from the documentation.

        Args:
            path (str): The path to check.

        Returns:
            bool: True if the path should be excluded, False otherwise.
        """
        name = os.path.basename(path)

        try:
            parts = Path(path).relative_to(self.root_dir).parts
        except ValueError:
            # Not located under the root: judge the entry by its name alone.
            parts = (name,)

        # The root itself is always scanned, even when its own name is hidden
        # (e.g. ``~/.proj``); otherwise every walk would come back empty.
        if not parts:
            return False

        # Always include certain files
        if any(self._match_pattern(name, pattern) for pattern in self.config['always_include']):
            return False

        # Check if it's in excluded directories
        excluded_dirs = self.config['excluded_dirs']
        if any(self._match_pattern(part, excluded) for part in parts for excluded in excluded_dirs):
            return True

        # Check excluded patterns
        if any(self._match_pattern(name, pattern) for pattern in self.config['excluded_patterns']):
            return True

        # Exclude hidden files
        return name.startswith('.') and name != '.env.example'

    def _match_pattern(self, name, pattern):
        """
        Check if a name matches a pattern with simple wildcard support.

        Only ``*`` is special (it matches any run of characters); every other
        character, including regex metacharacters, is matched literally.

        Args:
            name (str): The name to check.
            pattern (str): The pattern to match against.

        Returns:
            bool: True if the name matches the pattern, False otherwise.
        """
        if '*' not in pattern:
            # Exact match
            return name == pattern

        # Escape the literal pieces so names such as "c++" cannot break the regex.
        regex_pattern = '.*'.join(re.escape(piece) for piece in pattern.split('*'))
        return re.fullmatch(regex_pattern, name, flags=re.DOTALL) is not None

    def _walk(self, top=None):
        """
        Walk the tree like ``os.walk`` but never descend into excluded directories.

        Args:
            top (str, optional): Directory to start from (default: the project root).

        Yields:
            tuple: ``(root, dirs, files)`` where ``dirs`` holds only the
            directories that will be visited and ``files`` is the sorted,
            unfiltered file list of ``root``.
        """
        top = self.root_dir if top is None else top
        if self.should_exclude(top):
            return

        for root, dirs, files in os.walk(top):
            # In-place assignment is what makes os.walk skip the pruned entries.
            dirs[:] = sorted(d for d in dirs if not self.should_exclude(os.path.join(root, d)))
            yield root, dirs, sorted(files)

    # ------------------------------------------------------------------
    # Structure
    # ------------------------------------------------------------------

    def scan_directory(self):
        """
        Scan the directory and build a structure representation.

        Returns:
            dict: Nested dictionaries keyed by directory name; each level
            stores its included file names under the ``'__files__'`` key.
        """
        structure = {}

        for root, _dirs, files in self._walk():
            # Descend to (creating if needed) the node for this directory.
            node = structure
            rel_path = os.path.relpath(root, self.root_dir)
            if rel_path != '.':
                for part in rel_path.split(os.path.sep):
                    node = node.setdefault(part, {})

            node[self._FILES_KEY] = [
                file for file in files
                if not self.should_exclude(os.path.join(root, file))
            ]

        return structure

    @staticmethod
    def _emit(line, file=None):
        """Print one tree line and mirror it to ``file`` when one is given."""
        if file is not None:
            file.write(f"{line}\n")
        print(line)

    def print_structure(self, structure=None, indent=0, is_last=True, prefix="", file=None):
        """
        Print the directory structure in a tree-like format.

        The passed ``structure`` is not modified.

        Args:
            structure (dict): Dictionary representing the directory structure.
                When omitted, the project is scanned and the root name is
                printed first.
            indent (int): Current indentation level (kept for compatibility).
            is_last (bool): Whether this is the last item in its parent
                (kept for compatibility).
            prefix (str): Prefix for the current line.
            file: File object to write to, in addition to stdout.
        """
        if structure is None:
            # First call, print the root directory name
            structure = self.scan_directory()
            self._emit(os.path.basename(self.root_dir) + "/", file)

        files = sorted(structure.get(self._FILES_KEY, []))
        subdirs = sorted(
            ((name, contents) for name, contents in structure.items() if name != self._FILES_KEY),
            key=lambda item: item[0],
        )

        # Print files
        for i, file_name in enumerate(files):
            is_last_file = i == len(files) - 1 and not subdirs
            connector = "└── " if is_last_file else "├── "
            self._emit(f"{prefix}{connector}{file_name}", file)

        # Process directories
        for i, (dir_name, contents) in enumerate(subdirs):
            is_last_dir = i == len(subdirs) - 1
            connector = "└── " if is_last_dir else "├── "
            self._emit(f"{prefix}{connector}{dir_name}/", file)

            # Keep the vertical rule only while siblings remain below.
            new_prefix = prefix + ("    " if is_last_dir else "│   ")
            self.print_structure(contents, indent + 1, is_last_dir, new_prefix, file)

    # ------------------------------------------------------------------
    # File inspection
    # ------------------------------------------------------------------

    def _get_file_language(self, file_path):
        """
        Determine the language of a file for code block formatting.

        Args:
            file_path (str): Path to the file.

        Returns:
            str: Language identifier for markdown code block ('' for plain text).
        """
        name = os.path.basename(file_path)
        if name in self._NAME_TO_LANG:
            return self._NAME_TO_LANG[name]

        ext = os.path.splitext(file_path)[1].lower()
        return self._EXT_TO_LANG.get(ext, '')

    def should_include_content(self, file_path):
        """
        Check if a file's content should be included in the documentation.

        A file qualifies when it is small enough and either matches an
        ``always_include`` pattern, or has a documented extension and lives
        inside one of the ``core_dirs``.

        Args:
            file_path (str): The path to the file.

        Returns:
            bool: True if content should be included, False otherwise.
        """
        # Check file size; unreadable entries (e.g. broken links) are skipped.
        try:
            if os.path.getsize(file_path) > self.config['max_content_size']:
                return False
        except OSError:
            return False

        # Include any README or key configuration files. This runs before the
        # extension check so extension-less names (Dockerfile, LICENSE) qualify.
        name = os.path.basename(file_path)
        if any(self._match_pattern(name, pattern) for pattern in self.config['always_include']):
            return True

        # Check file extension
        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self.config['content_file_types']:
            return False

        # Check if file is in a core directory. Whole path components are
        # compared so that 'docs' does not match 'docsgpt_scanner.py'.
        rel_parts = Path(os.path.relpath(file_path, self.root_dir)).parts
        for core_dir in self.config['core_dirs']:
            core_parts = tuple(p for p in core_dir.replace('\\', '/').split('/') if p)
            if core_parts and rel_parts[:len(core_parts)] == core_parts:
                return True

        return False

    def count_files_by_type(self):
        """
        Count the number of files by type in the project.

        Returns:
            dict: A dictionary mapping lower-cased file extensions (or
            ``'(no extension)'``) to counts.
        """
        ext_counts = {}

        for root, _dirs, files in self._walk():
            for file in files:
                if self.should_exclude(os.path.join(root, file)):
                    continue

                ext = os.path.splitext(file)[1].lower() or '(no extension)'
                ext_counts[ext] = ext_counts.get(ext, 0) + 1

        return ext_counts

    @staticmethod
    def _read_text(path, limit=-1):
        """Read up to ``limit`` characters (all when negative) of a UTF-8 text file."""
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read(limit)

    def generate_code_snippets(self, structure=None, path="", snippets=None):
        """
        Generate representative code snippets from the project.

        Args:
            structure (dict): Project structure dictionary (scanned when
                omitted). It is not modified.
            path (str): Current path in the structure, relative to the root.
            snippets (dict): Dictionary to store snippets by directory.

        Returns:
            dict: Dictionary mapping directories ('.' for the root) to lists
            of file snippets with the keys ``name``, ``path``, ``language``,
            ``content`` and ``full_path``.
        """
        if snippets is None:
            snippets = {}
        if structure is None:
            structure = self.scan_directory()

        limit = self.config['max_files_per_dir']
        preview_length = self.config['preview_length']

        # Key files first, then alphabetical so the result is reproducible.
        candidates = sorted(
            structure.get(self._FILES_KEY, []),
            key=lambda f: (not f.startswith(('README', 'main', 'app')), f),
        )

        dir_snippets = []
        for file in candidates:
            # The cap applies to files that are actually shown, not to candidates.
            if len(dir_snippets) >= limit:
                break

            file_path = os.path.join(self.root_dir, path, file)
            if not self.should_include_content(file_path):
                continue

            try:
                # One extra character tells "exactly fits" from "truncated".
                content = self._read_text(file_path, preview_length + 1)
            except OSError:
                # Skip files that can't be read
                continue

            if len(content) > preview_length:
                content = content[:preview_length] + '...'

            dir_snippets.append({
                'name': file,
                'path': os.path.join(path, file),
                'language': self._get_file_language(file_path),
                'content': content,
                'full_path': file_path,
            })

        if dir_snippets:
            snippets[path or '.'] = dir_snippets

        # Process subdirectories
        for dir_name in sorted(name for name in structure if name != self._FILES_KEY):
            self.generate_code_snippets(structure[dir_name], os.path.join(path, dir_name), snippets)

        return snippets

    def find_important_files(self):
        """
        Find and return a list of important files in the project.

        Returns:
            list: Paths of well-known entry-point and configuration files
            found in any non-excluded directory.
        """
        return [
            os.path.join(root, file)
            for root, _dirs, files in self._walk()
            for file in files
            if file in self._IMPORTANT_FILE_NAMES
        ]

    # ------------------------------------------------------------------
    # Markdown rendering
    # ------------------------------------------------------------------

    def _write_file_section(self, md_file, file_path, lang):
        """Write one file as a titled fenced block, or an error note if unreadable."""
        rel_path = os.path.relpath(file_path, self.root_dir)
        try:
            content = self._read_text(file_path)
        except OSError as e:
            md_file.write(f"*Error reading {rel_path}: {e}*\n\n")
            return

        md_file.write(f"#### {rel_path}\n\n")
        md_file.write(f"```{lang}\n{content}\n```\n\n")

    @staticmethod
    def _write_snippets(md_file, intro, found):
        """Write an intro sentence and up to three snippets; nothing when ``found`` is empty."""
        if not found:
            return

        md_file.write(f"{intro}\n\n")
        for snippet in found[:3]:
            md_file.write(f"#### {snippet['path']}\n\n")
            md_file.write(f"```{snippet['language']}\n{snippet['content']}\n```\n\n")

    @staticmethod
    def _snippets_under(snippets, top):
        """Collect snippets of ``top`` ('/'-separated) and all directories below it."""
        found = []
        for directory, items in snippets.items():
            normalized = directory.replace(os.sep, '/')
            if normalized == top or normalized.startswith(top + '/'):
                found.extend(items)
        return found

    def generate_markdown(self, output_file):
        """
        Generate a comprehensive markdown document for the DocsGPT project.

        Besides writing ``output_file`` this echoes the directory tree to
        stdout (a side effect of ``print_structure``).

        Args:
            output_file (str): Path to the output markdown file.
        """
        # Collect everything before the report is created so that a report
        # written inside the project does not end up describing itself.
        structure = self.scan_directory()
        ext_counts = self.count_files_by_type()
        important_files = self.find_important_files()
        snippets = self.generate_code_snippets(structure)
        total_dirs = sum(len(dirs) for _root, dirs, _files in self._walk())
        total_files = sum(ext_counts.values())
        api_files = [
            os.path.join(root, file)
            for root, _dirs, files in self._walk(os.path.join(self.root_dir, 'application', 'api'))
            for file in files
            if file == 'routes.py'
        ]

        def named(*names):
            return [f for f in important_files if os.path.basename(f) in names]

        with open(output_file, 'w', encoding='utf-8') as md_file:
            # Title and metadata
            md_file.write("# DocsGPT Project Documentation\n\n")
            md_file.write(f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")

            # Project Overview
            md_file.write("## 1. Project Overview\n\n")

            # Try to include README content
            readme_path = os.path.join(self.root_dir, "README.md")
            if os.path.exists(readme_path):
                try:
                    content = self._read_text(readme_path)
                except OSError:
                    md_file.write("*Error reading README.md*\n\n")
                else:
                    md_file.write("### From README.md\n\n")
                    md_file.write(f"{content}\n\n")

            # Project stats
            md_file.write("### Project Statistics\n\n")
            md_file.write(f"- **Total Directories:** {total_dirs}\n")
            md_file.write(f"- **Total Files:** {total_files}\n\n")

            md_file.write("#### File Types\n\n")
            for ext, count in sorted(ext_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
                md_file.write(f"- **{ext}:** {count} files\n")
            md_file.write("\n")

            # Directory Structure
            md_file.write("## 2. Directory Structure\n\n")
            md_file.write("```\n")
            self._emit(os.path.basename(self.root_dir) + "/", md_file)
            self.print_structure(structure, file=md_file)
            md_file.write("```\n\n")

            # Key Components
            md_file.write("## 3. Key Components\n\n")

            # Application component (top-level 'application' directory only)
            md_file.write("### 3.1. Application Core\n\n")
            self._write_snippets(
                md_file,
                "The application core contains the main backend logic for DocsGPT.",
                snippets.get('application', []),
            )

            # Frontend component
            md_file.write("### 3.2. Frontend\n\n")
            self._write_snippets(
                md_file,
                "The frontend is built with React and provides the user interface.",
                self._snippets_under(snippets, 'frontend/src'),
            )

            # Extensions
            md_file.write("### 3.3. Extensions\n\n")
            self._write_snippets(
                md_file,
                "DocsGPT includes various extensions for different platforms.",
                self._snippets_under(snippets, 'extensions'),
            )

            # Configuration Files
            md_file.write("## 4. Configuration Files\n\n")

            # Docker files
            md_file.write("### 4.1. Docker Configuration\n\n")
            for file_path in named('Dockerfile', 'docker-compose.yml', 'docker-compose.yaml'):
                lang = 'dockerfile' if os.path.basename(file_path) == 'Dockerfile' else 'yaml'
                self._write_file_section(md_file, file_path, lang)

            # Requirements and package files
            md_file.write("### 4.2. Dependencies\n\n")
            for file_path in named('requirements.txt', 'package.json'):
                lang = 'json' if file_path.endswith('.json') else ''
                self._write_file_section(md_file, file_path, lang)

            # Environment files
            env_files = named('.env.example')
            if env_files:
                md_file.write("### 4.3. Environment Configuration\n\n")
                for file_path in env_files:
                    self._write_file_section(md_file, file_path, '')

            # API Documentation (if we can find routes)
            md_file.write("## 5. API Documentation\n\n")
            if api_files:
                md_file.write("### API Routes\n\n")
                for file_path in api_files[:5]:  # Limit to 5 route files
                    self._write_file_section(md_file, file_path, 'python')

            # Conclusion
            md_file.write("## 6. Additional Information\n\n")
            md_file.write("This documentation provides an overview of the DocsGPT project structure and key components. "
                          "For more detailed information, please refer to the official documentation and code comments.\n\n")

            md_file.write("### License\n\n")
            license_path = os.path.join(self.root_dir, "LICENSE")
            if os.path.exists(license_path):
                try:
                    content = self._read_text(license_path, 500)  # Just read the beginning of the license
                except OSError:
                    md_file.write("*Error reading LICENSE file*\n\n")
                else:
                    md_file.write(f"```\n{content}...\n```\n\n")

            # Generation metadata
            md_file.write("---\n\n")
            md_file.write(f"*Documentation generated on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n")
            md_file.write("*Generator: DocsGPT Project Documentation Generator*\n")


def main():
    """Parse the command line, then print the tree or write the Markdown report.

    Settings are layered: built-in defaults, then the JSON ``--config-file``
    (each key replaces its default), then the command-line flags. The
    ``--exclude-dirs``, ``--exclude-files`` and ``--include-files`` flags add
    to the list in effect; ``--core-dirs`` replaces it.
    """
    parser = argparse.ArgumentParser(description='DocsGPT Project Documentation Generator')

    parser.add_argument('--root', '-r', type=str, default='.',
                        help='Root directory of the project (default: current directory)')

    parser.add_argument('--output', '-o', type=str,
                        help='Output markdown file (default: <project_name>_documentation.md '
                             'in the root directory)')

    parser.add_argument('--exclude-dirs', '-e', type=str, nargs='+',
                        help='Additional directories to exclude')

    parser.add_argument('--exclude-files', '-ef', type=str, nargs='+',
                        help='Additional file patterns to exclude')

    parser.add_argument('--include-files', '-if', type=str, nargs='+',
                        help='Files to always include despite exclusions')

    parser.add_argument('--core-dirs', '-c', type=str, nargs='+',
                        help='Core directories to focus on for code snippets')

    parser.add_argument('--config-file', '-cf', type=str,
                        help='Path to JSON configuration file')

    parser.add_argument('--tree-only', action='store_true',
                        help='Only print the directory tree structure, do not generate documentation')

    args = parser.parse_args()

    # Get absolute path of the root directory
    root_dir = os.path.abspath(args.root)

    # Load configuration from file if provided
    config = {}
    if args.config_file:
        try:
            with open(args.config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except (OSError, ValueError) as e:  # ValueError covers json.JSONDecodeError
            print(f"Error loading configuration file: {e}")
            return
        if not isinstance(config, dict):
            print("Error loading configuration file: top-level JSON value must be an object")
            return

    # Layer command line arguments on top of the file / default values
    defaults = DocsGPTDocumentationGenerator.default_config()
    additive_options = (
        ('excluded_dirs', args.exclude_dirs),
        ('excluded_patterns', args.exclude_files),
        ('always_include', args.include_files),
    )
    for key, extras in additive_options:
        if extras:
            base = list(config.get(key, defaults[key]))
            config[key] = base + [item for item in extras if item not in base]
    if args.core_dirs:
        config['core_dirs'] = args.core_dirs

    # Create the generator
    generator = DocsGPTDocumentationGenerator(root_dir=root_dir, config=config)

    if args.tree_only:
        # Just print the tree structure
        print(f"Directory structure for: {root_dir}\n")
        generator.print_structure()
    else:
        # Generate full documentation
        output_file = args.output
        if not output_file:
            # basename is empty for a filesystem root such as "/"
            project_name = os.path.basename(root_dir) or 'project'
            output_file = os.path.join(root_dir, f"{project_name}_documentation.md")

        print(f"Generating documentation for {root_dir}...")
        generator.generate_markdown(output_file)
        print(f"Documentation saved to: {output_file}")


if __name__ == "__main__":
    main()