Files
proxmox/scripts/check-content-inconsistencies.py
2026-01-06 01:42:29 -08:00

309 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Content Inconsistency Checker
Compares related markdown files for inconsistencies in:
- Dates
- Status information
- Configuration values
- References to other files
"""
import os
import re
import json
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Set, Tuple
from datetime import datetime
class ContentInconsistencyChecker:
    """Scan a tree of markdown files and collect content inconsistencies."""

    def __init__(self, root_dir: str):
        """Remember the project root and start with empty scan state."""
        self.root_dir = Path(root_dir)
        # Issue dicts appended by the individual _check_* passes.
        self.inconsistencies = []
        # rel-path -> {'content', 'path', 'lines'}; filled by _load_files().
        self.file_contents = {}
def check(self):
"""Run all consistency checks"""
print("🔍 Checking content inconsistencies...")
# Load file contents
self._load_files()
# Check for inconsistencies
print("\n📅 Checking date inconsistencies...")
self._check_dates()
print("\n📊 Checking status inconsistencies...")
self._check_status()
print("\n🔗 Checking cross-references...")
self._check_references()
print("\n⚙️ Checking configuration values...")
self._check_config_values()
print("\n📝 Checking duplicate content...")
self._check_duplicate_content()
return self._generate_report()
def _load_files(self):
"""Load markdown file contents"""
exclude_dirs = {'.git', 'node_modules', '__pycache__', '.next', 'dist', 'build', 'venv'}
for md_file in self.root_dir.rglob('*.md'):
if any(part in exclude_dirs for part in md_file.parts):
continue
try:
with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
rel_path = str(md_file.relative_to(self.root_dir))
self.file_contents[rel_path] = {
'content': content,
'path': rel_path,
'lines': content.split('\n')
}
except Exception as e:
pass
def _check_dates(self):
"""Check for inconsistent dates"""
date_patterns = [
r'(\d{4}-\d{2}-\d{2})', # YYYY-MM-DD
r'(\d{1,2}/\d{1,2}/\d{4})', # MM/DD/YYYY
r'Date[:\s]+(\d{4}-\d{2}-\d{2})',
r'Generated[:\s]+(\d{4}-\d{2}-\d{2})',
r'Last Updated[:\s]+(\d{4}-\d{2}-\d{2})',
]
# Group files by project/component
project_files = defaultdict(list)
for path in self.file_contents:
if 'rpc-translator-138' in path:
project_files['rpc-translator-138'].append(path)
elif path.startswith('docs/'):
project_files['docs'].append(path)
elif path.startswith('reports/'):
project_files['reports'].append(path)
elif '/' not in path or path.count('/') == 0:
project_files['root'].append(path)
# Check dates within each project
for project, files in project_files.items():
dates_found = []
for file_path in files:
content = self.file_contents[file_path]['content']
for pattern in date_patterns:
matches = re.findall(pattern, content)
for match in matches:
dates_found.append((file_path, match))
# Check for very old dates (>1 year)
now = datetime.now()
for file_path, date_str in dates_found:
try:
if '-' in date_str:
date_obj = datetime.strptime(date_str, '%Y-%m-%d')
elif '/' in date_str:
parts = date_str.split('/')
if len(parts) == 3:
date_obj = datetime.strptime(date_str, '%m/%d/%Y')
else:
continue
else:
continue
days_diff = (now - date_obj).days
if days_diff > 365:
self.inconsistencies.append({
'type': 'old_date',
'file': file_path,
'issue': f'Date {date_str} is {days_diff} days old',
'severity': 'medium'
})
except:
pass
def _check_status(self):
"""Check for inconsistent status information"""
status_patterns = [
r'Status[:\s]+([✅❌🔄⚠️]+|COMPLETE|INCOMPLETE|PENDING|ACTIVE|INACTIVE)',
r'\*\*Status\*\*[:\s]+([✅❌🔄⚠️]+|COMPLETE|INCOMPLETE|PENDING)',
]
# Group related status files
status_groups = defaultdict(list)
for path in self.file_contents:
filename = Path(path).name
if 'COMPLETE' in filename or 'STATUS' in filename or 'FINAL' in filename:
# Extract base name
base = re.sub(r'_(COMPLETE|FINAL|STATUS).*', '', filename)
base = re.sub(r'COMPLETE|FINAL|STATUS', '', base)
status_groups[base].append(path)
# Check for conflicting statuses
for base, files in status_groups.items():
if len(files) > 1:
statuses = []
for file_path in files:
content = self.file_contents[file_path]['content']
for pattern in status_patterns:
matches = re.findall(pattern, content, re.IGNORECASE)
statuses.extend([(file_path, m) for m in matches])
if len(set(s[1] for s in statuses)) > 1:
self.inconsistencies.append({
'type': 'conflicting_status',
'files': files,
'issue': f'Multiple status files for {base} with different statuses',
'severity': 'high'
})
def _check_references(self):
"""Check for broken or inconsistent cross-references"""
reference_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
for path, data in self.file_contents.items():
content = data['content']
matches = re.findall(reference_pattern, content)
for link_text, link_path in matches:
# Skip external links
if link_path.startswith('http'):
continue
# Check if referenced file exists
if '#' in link_path:
file_path, anchor = link_path.split('#', 1)
else:
file_path = link_path
anchor = None
# Resolve relative paths
if not file_path.startswith('/'):
current_dir = Path(path).parent
resolved = (current_dir / file_path).resolve()
try:
relative_resolved = resolved.relative_to(self.root_dir)
except ValueError:
# Path is outside project root, skip
continue
else:
relative_resolved = Path(file_path.lstrip('/'))
# Check if file exists
full_path = self.root_dir / relative_resolved
if not full_path.exists():
self.inconsistencies.append({
'type': 'broken_reference',
'file': path,
'issue': f'Broken link to {link_path}',
'severity': 'medium'
})
def _check_config_values(self):
"""Check for inconsistent configuration values"""
# Look for IP addresses, VMIDs, ports
ip_pattern = r'192\.168\.11\.(\d+)'
vmid_pattern = r'VMID[:\s]+(\d+)'
configs_by_component = defaultdict(lambda: defaultdict(set))
for path, data in self.file_contents.items():
content = data['content']
# Extract IPs
ips = re.findall(ip_pattern, content)
for ip in ips:
component = self._identify_component(path)
configs_by_component[component]['ips'].add(f'192.168.11.{ip}')
# Extract VMIDs
vmids = re.findall(vmid_pattern, content, re.IGNORECASE)
for vmid in vmids:
component = self._identify_component(path)
configs_by_component[component]['vmids'].add(vmid)
# Check for inconsistencies (same component, different values)
for component, configs in configs_by_component.items():
if len(configs['ips']) > 10: # Too many IPs might indicate inconsistency
self.inconsistencies.append({
'type': 'too_many_ips',
'component': component,
'issue': f'Component {component} references {len(configs["ips"])} different IPs',
'severity': 'low'
})
def _check_duplicate_content(self):
"""Check for duplicate or near-duplicate content"""
# Simple check: files with very similar first 10 lines
file_signatures = {}
for path, data in self.file_contents.items():
first_lines = '\n'.join(data['lines'][:10])
signature = hash(first_lines)
if signature in file_signatures:
self.inconsistencies.append({
'type': 'duplicate_intro',
'files': [file_signatures[signature], path],
'issue': 'Files have identical first 10 lines',
'severity': 'low'
})
else:
file_signatures[signature] = path
def _identify_component(self, path: str) -> str:
"""Identify component from file path"""
if 'rpc-translator' in path:
return 'rpc-translator-138'
elif 'besu' in path.lower():
return 'besu'
elif 'dbis' in path.lower():
return 'dbis'
elif 'firefly' in path.lower():
return 'firefly'
else:
return 'other'
def _generate_report(self) -> Dict:
"""Generate inconsistency report"""
report = {
'summary': {
'total_inconsistencies': len(self.inconsistencies),
'by_type': defaultdict(int),
'by_severity': defaultdict(int)
},
'inconsistencies': []
}
for inc in self.inconsistencies:
report['summary']['by_type'][inc['type']] += 1
report['summary']['by_severity'][inc['severity']] += 1
report['inconsistencies'].append(inc)
return report
def main():
    """Run the checker over the repository root and persist the JSON report."""
    root_dir = Path(__file__).parent.parent
    checker = ContentInconsistencyChecker(root_dir)
    report = checker.check()

    # Persist the full report at the repository root.
    json_file = root_dir / 'CONTENT_INCONSISTENCIES.json'
    with open(json_file, 'w') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\n✅ Report saved to: {json_file}")

    # Echo a short console summary.
    summary = report['summary']
    print("\n📊 Summary:")
    print(f" Total inconsistencies: {summary['total_inconsistencies']}")
    print(f" By type: {dict(summary['by_type'])}")
    print(f" By severity: {dict(summary['by_severity'])}")
    return report


if __name__ == '__main__':
    main()