#!/usr/bin/env python3
"""
Markdown Analysis Script
Analyzes all Markdown files for duplicates and generates an index
mapping content to files and line numbers.
"""
import os
import hashlib
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Set
import json

# Compiled once at import time; previously re-evaluated for every line of
# every file inside the indexing loop.
HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

# Directory names that are never scanned (vendored / generated content).
IGNORED_DIRS = {'node_modules', '.git', 'dist', 'build', '.next'}


class MarkdownAnalyzer:
    """Scans a directory tree for Markdown files and builds a content index,
    a byte-identical duplicate report, and a category breakdown."""

    def __init__(self, root_dir: str = '.'):
        self.root_dir = Path(root_dir)
        # All discovered .md files under root_dir (sorted for determinism).
        self.md_files: List[Path] = []
        # rel-path -> per-file metadata (title, headings, links, ...).
        self.content_index: Dict[str, Dict] = {}
        # md5-hex -> list of rel-paths sharing identical byte content.
        self.duplicates: Dict[str, List[str]] = defaultdict(list)
        # category name -> list of rel-paths.
        self.file_structure: Dict[str, List[str]] = defaultdict(list)

    def find_all_markdown(self):
        """Find all markdown files in the project, skipping ignored dirs."""
        for md_file in self.root_dir.rglob('*.md'):
            # Skip node_modules, .git, and other ignored directories.
            if any(part in IGNORED_DIRS for part in md_file.parts):
                continue
            self.md_files.append(md_file)
        # Sort so reports are deterministic regardless of filesystem order.
        self.md_files.sort()

    def analyze_duplicates(self):
        """Group files whose byte content is identical.

        MD5 is used purely as a fast content fingerprint here, not for
        anything security-sensitive.
        """
        content_hashes: Dict[str, List[str]] = defaultdict(list)
        for md_file in self.md_files:
            try:
                content = md_file.read_bytes()
                content_hash = hashlib.md5(content).hexdigest()
                rel_path = str(md_file.relative_to(self.root_dir))
                content_hashes[content_hash].append(rel_path)
            except Exception as e:
                # Best-effort: report and keep scanning remaining files.
                print(f"Error reading {md_file}: {e}")

        # Keep only hashes shared by two or more files.
        for content_hash, files in content_hashes.items():
            if len(files) > 1:
                self.duplicates[content_hash] = files

    def index_content(self):
        """Create a detailed index of markdown content with line numbers.

        Per file records: title (first H1), every heading with level and
        line number, a count of ``` fence lines (opening AND closing fences
        both count), inline links, line count and size in bytes. Unreadable
        files are reported and skipped.
        """
        for md_file in self.md_files:
            rel_path = str(md_file.relative_to(self.root_dir))
            try:
                with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                title = None
                headings = []
                code_blocks = []
                links = []

                for line_num, line in enumerate(lines, 1):
                    stripped = line.strip()

                    # Title is the first H1 encountered.
                    if not title and stripped.startswith('# '):
                        title = stripped[2:].strip()

                    # All headings (H1-H6).
                    heading_match = HEADING_RE.match(stripped)
                    if heading_match:
                        headings.append({
                            'level': len(heading_match.group(1)),
                            'text': heading_match.group(2).strip(),
                            'line': line_num,
                        })

                    # Code fences (each ``` line, open or close).
                    if stripped.startswith('```'):
                        code_blocks.append({
                            'line': line_num,
                            'type': 'code_block',
                        })

                    # Inline [text](url) links.
                    for match in LINK_RE.finditer(line):
                        links.append({
                            'text': match.group(1),
                            'url': match.group(2),
                            'line': line_num,
                        })

                self.content_index[rel_path] = {
                    'path': rel_path,
                    'title': title,
                    'line_count': len(lines),
                    'headings': headings,
                    'code_blocks': len(code_blocks),
                    'links': links,
                    'size_bytes': md_file.stat().st_size,
                }
            except Exception as e:
                print(f"Error indexing {md_file}: {e}")

    def categorize_files(self):
        """Categorize files by top-level location.

        Uses ``Path.parts`` rather than splitting the string on '/', which
        broke on Windows (``str(Path)`` uses the OS separator). Files
        directly under docs/ now land in the 'docs' category; previously
        each such file produced its own 'docs/<filename>' category.
        """
        for md_file in self.md_files:
            rel = md_file.relative_to(self.root_dir)
            rel_path = str(rel)
            parts = rel.parts
            if len(parts) == 1:
                category = 'root'
            elif parts[0] == 'docs':
                # docs/sub/file.md -> 'docs/sub'; docs/file.md -> 'docs'.
                category = f"docs/{parts[1]}" if len(parts) > 2 else 'docs'
            elif parts[0] in ['api', 'portal', 'scripts', 'crossplane-provider-proxmox']:
                category = parts[0]
            else:
                category = 'other'
            self.file_structure[category].append(rel_path)

    def generate_report(self) -> Dict:
        """Generate a comprehensive analysis report as a plain dict."""
        return {
            'total_files': len(self.md_files),
            'unique_files': len(self.content_index),
            'duplicate_groups': len(self.duplicates),
            'duplicates': dict(self.duplicates),
            'categories': {k: len(v) for k, v in self.file_structure.items()},
            'index': self.content_index,
        }

    def find_similar_content(self) -> Dict[str, List[str]]:
        """Find files sharing a case-insensitive title (potential duplicates)."""
        similar: Dict[str, List[str]] = defaultdict(list)
        for rel_path, data in self.content_index.items():
            if data['title']:
                title_key = data['title'].lower().strip()
                similar[title_key].append(rel_path)
        return {k: v for k, v in similar.items() if len(v) > 1}


def main():
    """Run the full analysis on the current directory, print a summary,
    and save the detailed JSON index under docs/."""
    analyzer = MarkdownAnalyzer('.')

    print("Finding all Markdown files...")
    analyzer.find_all_markdown()
    print(f"Found {len(analyzer.md_files)} Markdown files\n")

    print("Analyzing duplicates...")
    analyzer.analyze_duplicates()
    print(f"Found {len(analyzer.duplicates)} duplicate groups\n")

    print("Indexing content...")
    analyzer.index_content()
    print(f"Indexed {len(analyzer.content_index)} files\n")

    print("Categorizing files...")
    analyzer.categorize_files()

    print("Finding similar content...")
    similar = analyzer.find_similar_content()

    report = analyzer.generate_report()

    # Print summary
    print("\n" + "=" * 60)
    print("MARKDOWN ANALYSIS SUMMARY")
    print("=" * 60)
    print(f"Total Markdown files: {report['total_files']}")
    print(f"Unique files: {report['unique_files']}")
    print(f"Duplicate groups: {report['duplicate_groups']}")

    if report['duplicate_groups'] > 0:
        print("\nDuplicate files:")
        for hash_val, files in list(report['duplicates'].items())[:10]:
            print(f"\n  Hash: {hash_val[:16]}... ({len(files)} files)")
            for f in files:
                print(f"    - {f}")

    print(f"\nSimilar titles (potential duplicates): {len(similar)}")
    for title, files in list(similar.items())[:10]:
        print(f"\n  '{title}':")
        for f in files:
            print(f"    - {f}")

    print("\nFiles by category:")
    for category, count in sorted(report['categories'].items()):
        print(f"  {category}: {count} files")

    # Save detailed report. Create the output directory first — previously
    # this crashed with FileNotFoundError when docs/ did not exist.
    output_file = 'docs/MARKDOWN_INDEX.json'
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\nDetailed index saved to: {output_file}")

    return analyzer, report


if __name__ == '__main__':
    analyzer, report = main()