Update documentation structure and enhance .gitignore
- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files. - Updated README links to reflect new documentation paths for better navigation. - Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
This commit is contained in:
213
scripts/analyze-markdown.py
Normal file
213
scripts/analyze-markdown.py
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Markdown Analysis Script
|
||||
Analyzes all Markdown files for duplicates and generates an index mapping content to files and line numbers.
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Tuple, Set
|
||||
import json
|
||||
|
||||
class MarkdownAnalyzer:
    """Scan a directory tree for Markdown files and build a content index.

    Pipeline (call in order): :meth:`find_all_markdown`,
    :meth:`analyze_duplicates`, :meth:`index_content`,
    :meth:`categorize_files`, then :meth:`generate_report`.
    Detects byte-identical duplicates via MD5 content hashes (dedup only,
    not security) and records per-file titles, headings, links and sizes.
    """

    # Directory names whose contents are never analyzed.
    IGNORED_DIRS = frozenset({'node_modules', '.git', 'dist', 'build', '.next'})

    # Hoisted, pre-compiled patterns (were rebuilt per line in a hot loop).
    _HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
    _LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

    def __init__(self, root_dir: str = '.'):
        """Create an analyzer rooted at *root_dir* (defaults to CWD)."""
        self.root_dir = Path(root_dir)
        self.md_files: List[Path] = []
        # relative path -> metadata dict (title, headings, links, ...)
        self.content_index: Dict[str, Dict] = {}
        # content hash -> list of relative paths with identical bytes
        self.duplicates: Dict[str, List[str]] = defaultdict(list)
        # category name -> list of relative paths
        self.file_structure: Dict[str, List[str]] = defaultdict(list)

    def _rel(self, md_file: Path) -> str:
        """Path relative to the root, always with '/' separators.

        Using as_posix() keeps keys and category splitting stable on
        Windows, where str(relative_to(...)) would use backslashes.
        """
        return md_file.relative_to(self.root_dir).as_posix()

    def find_all_markdown(self):
        """Find all markdown files in the project, skipping ignored dirs."""
        for md_file in self.root_dir.rglob('*.md'):
            # Check only the components BELOW the root: the old code checked
            # md_file.parts, so a root path containing e.g. 'build' would
            # wrongly skip every file.
            rel_parts = md_file.relative_to(self.root_dir).parts
            if any(part in self.IGNORED_DIRS for part in rel_parts):
                continue
            self.md_files.append(md_file)

    def analyze_duplicates(self):
        """Group byte-identical files; populate ``self.duplicates``."""
        content_hashes = defaultdict(list)

        for md_file in self.md_files:
            try:
                content = md_file.read_bytes()
            except Exception as e:
                # Best-effort: report and keep scanning the remaining files.
                print(f"Error reading {md_file}: {e}")
                continue
            content_hash = hashlib.md5(content).hexdigest()
            content_hashes[content_hash].append(self._rel(md_file))

        # Only hashes shared by two or more files are duplicates.
        for content_hash, files in content_hashes.items():
            if len(files) > 1:
                self.duplicates[content_hash] = files

    def index_content(self):
        """Create a detailed index of markdown content with line numbers."""
        for md_file in self.md_files:
            rel_path = self._rel(md_file)

            try:
                with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
            except Exception as e:
                print(f"Error indexing {md_file}: {e}")
                continue

            title = None          # first H1 seen, if any
            headings = []         # every heading with level + line number
            code_blocks = []      # one entry per ``` fence line (open or close)
            links = []            # inline [text](url) links with line numbers

            for line_num, line in enumerate(lines, 1):
                stripped = line.strip()

                # Title = first H1.
                if not title and stripped.startswith('# '):
                    title = stripped[2:].strip()

                # All headings (H1-H6).
                heading_match = self._HEADING_RE.match(stripped)
                if heading_match:
                    headings.append({
                        'level': len(heading_match.group(1)),
                        'text': heading_match.group(2).strip(),
                        'line': line_num
                    })

                # Fenced code block markers.
                if stripped.startswith('```'):
                    code_blocks.append({'line': line_num, 'type': 'code_block'})

                # Inline links.
                for match in self._LINK_RE.finditer(line):
                    links.append({
                        'text': match.group(1),
                        'url': match.group(2),
                        'line': line_num
                    })

            self.content_index[rel_path] = {
                'path': rel_path,
                'title': title,
                'line_count': len(lines),
                'headings': headings,
                'code_blocks': len(code_blocks),
                'links': links,
                'size_bytes': md_file.stat().st_size
            }

    def categorize_files(self):
        """Categorize files by top-level location.

        Categories: 'root' for files at the root, 'docs' or
        'docs/<subdir>' under docs/, a fixed set of known top-level
        dirs, and 'other' for everything else.
        """
        known_dirs = ('api', 'portal', 'scripts', 'crossplane-provider-proxmox')
        for md_file in self.md_files:
            rel_path = self._rel(md_file)
            parts = rel_path.split('/')

            if len(parts) == 1:
                category = 'root'
            elif parts[0] == 'docs':
                # Fix: the old check `len(parts) > 1` was always true here,
                # so each file directly under docs/ became its own category
                # ("docs/README.md"). Only a real subdirectory qualifies.
                if len(parts) > 2:
                    category = f"docs/{parts[1]}"
                else:
                    category = 'docs'
            elif parts[0] in known_dirs:
                category = parts[0]
            else:
                category = 'other'

            self.file_structure[category].append(rel_path)

    def generate_report(self) -> Dict:
        """Generate a comprehensive analysis report as a JSON-able dict."""
        return {
            'total_files': len(self.md_files),
            'unique_files': len(self.content_index),
            'duplicate_groups': len(self.duplicates),
            'duplicates': dict(self.duplicates),
            'categories': {k: len(v) for k, v in self.file_structure.items()},
            'index': self.content_index
        }

    def find_similar_content(self) -> Dict[str, List[str]]:
        """Find files sharing a (case-folded) title — potential duplicates."""
        similar = defaultdict(list)

        for rel_path, data in self.content_index.items():
            if data['title']:
                similar[data['title'].lower().strip()].append(rel_path)

        return {k: v for k, v in similar.items() if len(v) > 1}
|
||||
|
||||
def main():
    """Run the full analysis pipeline against the current directory.

    Prints a human-readable summary, writes the detailed JSON index to
    ``docs/MARKDOWN_INDEX.json`` (creating ``docs/`` if needed), and
    returns ``(analyzer, report)`` for interactive use.
    """
    analyzer = MarkdownAnalyzer('.')

    print("Finding all Markdown files...")
    analyzer.find_all_markdown()
    print(f"Found {len(analyzer.md_files)} Markdown files\n")

    print("Analyzing duplicates...")
    analyzer.analyze_duplicates()
    print(f"Found {len(analyzer.duplicates)} duplicate groups\n")

    print("Indexing content...")
    analyzer.index_content()
    print(f"Indexed {len(analyzer.content_index)} files\n")

    print("Categorizing files...")
    analyzer.categorize_files()

    print("Finding similar content...")
    similar = analyzer.find_similar_content()

    # Generate report
    report = analyzer.generate_report()

    # Print summary
    print("\n" + "="*60)
    print("MARKDOWN ANALYSIS SUMMARY")
    print("="*60)
    print(f"Total Markdown files: {report['total_files']}")
    print(f"Unique files: {report['unique_files']}")
    print(f"Duplicate groups: {report['duplicate_groups']}")

    if report['duplicate_groups'] > 0:
        print("\nDuplicate files:")
        # Cap output at the first 10 groups to keep the summary readable.
        for hash_val, files in list(report['duplicates'].items())[:10]:
            print(f"\n  Hash: {hash_val[:16]}... ({len(files)} files)")
            for f in files:
                print(f"    - {f}")

    print(f"\nSimilar titles (potential duplicates): {len(similar)}")
    for title, files in list(similar.items())[:10]:
        print(f"\n  '{title}':")
        for f in files:
            print(f"    - {f}")

    print("\nFiles by category:")
    for category, count in sorted(report['categories'].items()):
        print(f"  {category}: {count} files")

    # Save detailed report.
    output_file = 'docs/MARKDOWN_INDEX.json'
    # Fix: previously crashed with FileNotFoundError when docs/ did not
    # exist (e.g. a fresh checkout) — ensure the output directory first.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nDetailed index saved to: {output_file}")

    return analyzer, report
|
||||
|
||||
# Script entry point: run the pipeline and keep the results bound at
# module level for interactive inspection (e.g. `python -i`).
if __name__ == '__main__':
    analyzer, report = main()
|
||||
|
||||
Reference in New Issue
Block a user