Update documentation structure and enhance .gitignore
- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files. - Updated README links to reflect new documentation paths for better navigation. - Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
This commit is contained in:
213
scripts/analyze-markdown.py
Normal file
213
scripts/analyze-markdown.py
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Markdown Analysis Script
|
||||
Analyzes all Markdown files for duplicates and generates an index mapping content to files and line numbers.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Tuple, Set
|
||||
import json
|
||||
|
||||
class MarkdownAnalyzer:
    """Scan a directory tree for Markdown files and build a content index.

    Pipeline (call in order): :meth:`find_all_markdown`,
    :meth:`analyze_duplicates`, :meth:`index_content`,
    :meth:`categorize_files`, then :meth:`generate_report`.
    Detects byte-identical duplicates via MD5 content hashes (dedup only,
    not security) and records per-file titles, headings, links and sizes.
    """

    # Directory names whose contents are never analyzed.
    IGNORED_DIRS = frozenset({'node_modules', '.git', 'dist', 'build', '.next'})

    # Hoisted, pre-compiled patterns (were rebuilt per line in a hot loop).
    _HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
    _LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

    def __init__(self, root_dir: str = '.'):
        """Create an analyzer rooted at *root_dir* (defaults to CWD)."""
        self.root_dir = Path(root_dir)
        self.md_files: List[Path] = []
        # relative path -> metadata dict (title, headings, links, ...)
        self.content_index: Dict[str, Dict] = {}
        # content hash -> list of relative paths with identical bytes
        self.duplicates: Dict[str, List[str]] = defaultdict(list)
        # category name -> list of relative paths
        self.file_structure: Dict[str, List[str]] = defaultdict(list)

    def _rel(self, md_file: Path) -> str:
        """Path relative to the root, always with '/' separators.

        Using as_posix() keeps keys and category splitting stable on
        Windows, where str(relative_to(...)) would use backslashes.
        """
        return md_file.relative_to(self.root_dir).as_posix()

    def find_all_markdown(self):
        """Find all markdown files in the project, skipping ignored dirs."""
        for md_file in self.root_dir.rglob('*.md'):
            # Check only the components BELOW the root: the old code checked
            # md_file.parts, so a root path containing e.g. 'build' would
            # wrongly skip every file.
            rel_parts = md_file.relative_to(self.root_dir).parts
            if any(part in self.IGNORED_DIRS for part in rel_parts):
                continue
            self.md_files.append(md_file)

    def analyze_duplicates(self):
        """Group byte-identical files; populate ``self.duplicates``."""
        content_hashes = defaultdict(list)

        for md_file in self.md_files:
            try:
                content = md_file.read_bytes()
            except Exception as e:
                # Best-effort: report and keep scanning the remaining files.
                print(f"Error reading {md_file}: {e}")
                continue
            content_hash = hashlib.md5(content).hexdigest()
            content_hashes[content_hash].append(self._rel(md_file))

        # Only hashes shared by two or more files are duplicates.
        for content_hash, files in content_hashes.items():
            if len(files) > 1:
                self.duplicates[content_hash] = files

    def index_content(self):
        """Create a detailed index of markdown content with line numbers."""
        for md_file in self.md_files:
            rel_path = self._rel(md_file)

            try:
                with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
            except Exception as e:
                print(f"Error indexing {md_file}: {e}")
                continue

            title = None          # first H1 seen, if any
            headings = []         # every heading with level + line number
            code_blocks = []      # one entry per ``` fence line (open or close)
            links = []            # inline [text](url) links with line numbers

            for line_num, line in enumerate(lines, 1):
                stripped = line.strip()

                # Title = first H1.
                if not title and stripped.startswith('# '):
                    title = stripped[2:].strip()

                # All headings (H1-H6).
                heading_match = self._HEADING_RE.match(stripped)
                if heading_match:
                    headings.append({
                        'level': len(heading_match.group(1)),
                        'text': heading_match.group(2).strip(),
                        'line': line_num
                    })

                # Fenced code block markers.
                if stripped.startswith('```'):
                    code_blocks.append({'line': line_num, 'type': 'code_block'})

                # Inline links.
                for match in self._LINK_RE.finditer(line):
                    links.append({
                        'text': match.group(1),
                        'url': match.group(2),
                        'line': line_num
                    })

            self.content_index[rel_path] = {
                'path': rel_path,
                'title': title,
                'line_count': len(lines),
                'headings': headings,
                'code_blocks': len(code_blocks),
                'links': links,
                'size_bytes': md_file.stat().st_size
            }

    def categorize_files(self):
        """Categorize files by top-level location.

        Categories: 'root' for files at the root, 'docs' or
        'docs/<subdir>' under docs/, a fixed set of known top-level
        dirs, and 'other' for everything else.
        """
        known_dirs = ('api', 'portal', 'scripts', 'crossplane-provider-proxmox')
        for md_file in self.md_files:
            rel_path = self._rel(md_file)
            parts = rel_path.split('/')

            if len(parts) == 1:
                category = 'root'
            elif parts[0] == 'docs':
                # Fix: the old check `len(parts) > 1` was always true here,
                # so each file directly under docs/ became its own category
                # ("docs/README.md"). Only a real subdirectory qualifies.
                if len(parts) > 2:
                    category = f"docs/{parts[1]}"
                else:
                    category = 'docs'
            elif parts[0] in known_dirs:
                category = parts[0]
            else:
                category = 'other'

            self.file_structure[category].append(rel_path)

    def generate_report(self) -> Dict:
        """Generate a comprehensive analysis report as a JSON-able dict."""
        return {
            'total_files': len(self.md_files),
            'unique_files': len(self.content_index),
            'duplicate_groups': len(self.duplicates),
            'duplicates': dict(self.duplicates),
            'categories': {k: len(v) for k, v in self.file_structure.items()},
            'index': self.content_index
        }

    def find_similar_content(self) -> Dict[str, List[str]]:
        """Find files sharing a (case-folded) title — potential duplicates."""
        similar = defaultdict(list)

        for rel_path, data in self.content_index.items():
            if data['title']:
                similar[data['title'].lower().strip()].append(rel_path)

        return {k: v for k, v in similar.items() if len(v) > 1}
|
||||
|
||||
def main():
    """Run the full analysis pipeline against the current directory.

    Prints a human-readable summary, writes the detailed JSON index to
    ``docs/MARKDOWN_INDEX.json`` (creating ``docs/`` if needed), and
    returns ``(analyzer, report)`` for interactive use.
    """
    analyzer = MarkdownAnalyzer('.')

    print("Finding all Markdown files...")
    analyzer.find_all_markdown()
    print(f"Found {len(analyzer.md_files)} Markdown files\n")

    print("Analyzing duplicates...")
    analyzer.analyze_duplicates()
    print(f"Found {len(analyzer.duplicates)} duplicate groups\n")

    print("Indexing content...")
    analyzer.index_content()
    print(f"Indexed {len(analyzer.content_index)} files\n")

    print("Categorizing files...")
    analyzer.categorize_files()

    print("Finding similar content...")
    similar = analyzer.find_similar_content()

    # Generate report
    report = analyzer.generate_report()

    # Print summary
    print("\n" + "="*60)
    print("MARKDOWN ANALYSIS SUMMARY")
    print("="*60)
    print(f"Total Markdown files: {report['total_files']}")
    print(f"Unique files: {report['unique_files']}")
    print(f"Duplicate groups: {report['duplicate_groups']}")

    if report['duplicate_groups'] > 0:
        print("\nDuplicate files:")
        # Cap output at the first 10 groups to keep the summary readable.
        for hash_val, files in list(report['duplicates'].items())[:10]:
            print(f"\n  Hash: {hash_val[:16]}... ({len(files)} files)")
            for f in files:
                print(f"    - {f}")

    print(f"\nSimilar titles (potential duplicates): {len(similar)}")
    for title, files in list(similar.items())[:10]:
        print(f"\n  '{title}':")
        for f in files:
            print(f"    - {f}")

    print("\nFiles by category:")
    for category, count in sorted(report['categories'].items()):
        print(f"  {category}: {count} files")

    # Save detailed report.
    output_file = 'docs/MARKDOWN_INDEX.json'
    # Fix: previously crashed with FileNotFoundError when docs/ did not
    # exist (e.g. a fresh checkout) — ensure the output directory first.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nDetailed index saved to: {output_file}")

    return analyzer, report
|
||||
|
||||
# Script entry point: run the pipeline and keep the results bound at
# module level for interactive inspection (e.g. `python -i`).
if __name__ == '__main__':
    analyzer, report = main()
|
||||
|
||||
Reference in New Issue
Block a user