"""Recursive semantic decomposition: split text into atomic units."""

from __future__ import annotations

import re
import uuid
from typing import Any

from fusionagi.reasoning.native import analyze_prompt
from fusionagi.schemas.atomic import (
    AtomicSemanticUnit,
    AtomicUnitType,
    DecompositionResult,
    RelationType,
    SemanticRelation,
)
from fusionagi._logger import logger

# Interrogative words matched as whole words only, so that e.g. "show" or
# "somewhat" do not count as "how" / "what".  "whom" and "whose" are listed
# explicitly because a plain substring test on "who" used to accept them.
_QUESTION_WORD_RE = re.compile(
    r"\b(?:how|what|why|when|where|who(?:m|se)?)\b", re.IGNORECASE
)

# One or more question marks; used to cut text into question segments.
_QUESTION_MARK_RE = re.compile(r"\?+")

# Each pattern captures the phrase following a constraint keyword, up to the
# next period or the end of the string.
_CONSTRAINT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"must\s+(\w[\w\s]+?)(?:\.|$)",
        r"should\s+(\w[\w\s]+?)(?:\.|$)",
        r"cannot\s+(\w[\w\s]+?)(?:\.|$)",
        r"require[sd]?\s+(\w[\w\s]+?)(?:\.|$)",
        r"constraint[s]?:\s*(\w[\w\s]+?)(?:\.|$)",
        r"assume[sd]?\s+(\w[\w\s]+?)(?:\.|$)",
    )
)

# Double-quoted phrases and runs of Capitalized words.
_QUOTED_RE = re.compile(r'"([^"]+)"')
_CAPITALIZED_RE = re.compile(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b")

# Sentence boundary: whitespace preceded by terminal punctuation.  The
# lookbehind keeps the punctuation attached to its sentence, so a recursed
# sub-sentence such as "What is X?" is still recognised as a question.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")


def _normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    return " ".join(text.split())


def _make_unit_id(prefix: str = "asu") -> str:
    """Return a fresh unit ID of the form ``<prefix>_<12 hex characters>``."""
    return f"{prefix}_{uuid.uuid4().hex[:12]}"


def _is_atomic(text: str, min_words: int = 3) -> bool:
    """Return True if *text* is considered irreducible (atomic).

    Text is atomic when, after whitespace normalization, it is empty, shorter
    than 10 characters, or contains at most *min_words* words.
    """
    content = _normalize_whitespace(text)
    if not content or len(content) < 10:
        return True
    return len(content.split()) <= min_words


def _extract_questions(text: str) -> list[str]:
    """Extract up to five questions from *text*.

    Every segment that ends in a question mark and is longer than 10
    characters is returned with a single trailing "?".  When no such segment
    exists but the text contains an interrogative word, the whole normalized
    text is returned as one question.
    """
    content = _normalize_whitespace(text)
    # The last split element is whatever follows the final "?", so it is not
    # itself terminated by a question mark and is skipped.
    segments = (seg.strip() for seg in _QUESTION_MARK_RE.split(content)[:-1])
    questions = [seg + "?" for seg in segments if len(seg) > 10]
    if not questions and _QUESTION_WORD_RE.search(content):
        questions.append(content)
    return questions[:5]


def _extract_constraints(text: str) -> list[str]:
    """Extract up to ten distinct constraint phrases from *text*.

    A phrase is whatever follows one of the keywords must / should / cannot /
    require(s|d) / constraint(s): / assume(s|d), up to the next period or the
    end of the text.  Results are grouped by keyword in that order; duplicates
    are dropped, keeping the first occurrence.
    """
    found = [
        match.group(1).strip()
        for pattern in _CONSTRAINT_PATTERNS
        for match in pattern.finditer(text)
    ]
    return list(dict.fromkeys(found))[:10]


def _extract_entities(text: str) -> list[str]:
    """Extract up to ten distinct entity-like phrases from *text*.

    Candidates are double-quoted phrases followed by runs of Capitalized
    words; candidates of two characters or fewer are discarded.
    """
    candidates = _QUOTED_RE.findall(text) + _CAPITALIZED_RE.findall(text)
    return list(dict.fromkeys(c for c in candidates if len(c) > 2))[:10]


def _build_children(
    contents: list[str],
    unit_type: AtomicUnitType,
    confidence: float,
    root_id: str,
    source_ref: str | None,
) -> tuple[list[AtomicSemanticUnit], list[SemanticRelation]]:
    """Create one child unit plus one root->child relation per content string."""
    child_units: list[AtomicSemanticUnit] = []
    child_relations: list[SemanticRelation] = []
    for item in contents:
        child_id = _make_unit_id()
        child_units.append(
            AtomicSemanticUnit(
                unit_id=child_id,
                content=item,
                type=unit_type,
                confidence=confidence,
                parent_id=root_id,
                source_ref=source_ref,
            )
        )
        child_relations.append(
            SemanticRelation(
                from_id=root_id,
                to_id=child_id,
                relation_type=RelationType.LOGICAL,
            )
        )
    return child_units, child_relations


def decompose_recursive(
    text: str,
    max_depth: int = 3,
    parent_id: str | None = None,
    current_depth: int = 0,
    source_ref: str | None = None,
) -> DecompositionResult:
    """
    Recursively decompose text into atomic semantic units.

    A root unit is always created for the (whitespace-normalized) text; its
    type is INTENT when ``analyze_prompt`` reports the intent "question" and
    FACT otherwise.  Questions, constraints (including "assume ..." phrases)
    and entities found in the text become child units of that root.  If the
    text is not atomic and the depth limit has not been reached, it is split
    into sentences and each sentence longer than 20 characters is decomposed
    recursively with the root as its parent.

    Args:
        text: Input text to decompose.
        max_depth: Maximum recursion depth.
        parent_id: Parent unit ID for decomposition tree.
        current_depth: Current recursion depth.
        source_ref: Optional source reference.

    Returns:
        DecompositionResult with units and relations.  ``depth`` is the
        ``current_depth`` of this call.  Empty or whitespace-only text yields
        a result with no units and no relations.
    """
    content = _normalize_whitespace(text)
    if not content:
        return DecompositionResult(units=[], relations=[], depth=current_depth)

    analysis = analyze_prompt(content)

    # Root unit for this segment; created unconditionally, with the stored
    # content capped at 500 characters (plus an ellipsis when truncated).
    root_id = _make_unit_id()
    root_content = content[:500] + ("..." if len(content) > 500 else "")
    root_type = (
        AtomicUnitType.INTENT
        if analysis.intent == "question"
        else AtomicUnitType.FACT
    )
    units: list[AtomicSemanticUnit] = [
        AtomicSemanticUnit(
            unit_id=root_id,
            content=root_content,
            type=root_type,
            confidence=0.8,
            parent_id=parent_id,
            source_ref=source_ref,
            metadata={"intent": analysis.intent},
        )
    ]
    relations: list[SemanticRelation] = []
    if parent_id:
        relations.append(
            SemanticRelation(
                from_id=parent_id,
                to_id=root_id,
                relation_type=RelationType.LOGICAL,
            )
        )

    # Questions, constraints and entities each become direct children of the
    # root, in that order.
    extraction_plan = (
        (_extract_questions(content), AtomicUnitType.QUESTION, 0.9),
        (_extract_constraints(content), AtomicUnitType.CONSTRAINT, 0.85),
        (_extract_entities(content), AtomicUnitType.FACT, 0.9),
    )
    for contents, unit_type, confidence in extraction_plan:
        child_units, child_relations = _build_children(
            contents, unit_type, confidence, root_id, source_ref
        )
        units.extend(child_units)
        relations.extend(child_relations)

    # If not atomic and depth allows, split into sentences and recurse.
    if current_depth < max_depth and not _is_atomic(content, min_words=8):
        sentences = _SENTENCE_SPLIT_RE.split(content)
        if len(sentences) > 1:
            for raw_sentence in sentences:
                sentence = raw_sentence.strip()
                # Very short fragments are not worth their own sub-tree.
                if len(sentence) <= 20:
                    continue
                sub = decompose_recursive(
                    sentence,
                    max_depth=max_depth,
                    parent_id=root_id,
                    current_depth=current_depth + 1,
                    source_ref=source_ref,
                )
                units.extend(sub.units)
                relations.extend(sub.relations)

    # Dedupe by unit_id, keeping the first occurrence in original order.
    units_by_id: dict[str, AtomicSemanticUnit] = {}
    for unit in units:
        units_by_id.setdefault(unit.unit_id, unit)
    unique_units = list(units_by_id.values())

    logger.debug(
        "Decomposition complete",
        extra={
            "depth": current_depth,
            "units": len(unique_units),
            "relations": len(relations),
        },
    )
    return DecompositionResult(
        units=unique_units,
        relations=relations,
        depth=current_depth,
    )