import re from dataclasses import dataclass from typing import List, Optional, Dict @dataclass class StructuredHeader: domain: Optional[str] assumptions: List[str] formula: Optional[str] body: str _DOMAIN_RE = re.compile(r"(?im)^\s*Domain\s*:\s*([a-zA-Z0-9_]+)\s*$") _ASSUME_RE = re.compile(r"(?im)^\s*Assumption(?:s)?\s*:\s*(.+?)\s*$") _FORMULA_RE = re.compile(r"(?im)^\s*Formula\s*:\s*(.+?)\s*$") _ASSUMPTION_MAP: Dict[str, str] = { "transitive": "transitive", "推移": "transitive", "reflexive": "reflexive", "反射": "reflexive", "symmetric": "symmetric", "対称": "symmetric", "serial": "serial", "連結": "serial", "euclidean": "euclidean", } def parse_structured_header(text: str) -> StructuredHeader: raw = text or "" domain = None m = _DOMAIN_RE.search(raw) if m: domain = m.group(1).strip().lower() assumptions: List[str] = [] for m in _ASSUME_RE.finditer(raw): parts = re.split(r"[,\s]+", m.group(1).strip().lower()) for p in parts: if not p: continue assumptions.append(_ASSUMPTION_MAP.get(p, p)) formula = None m = _FORMULA_RE.search(raw) if m: formula = m.group(1).strip() body = strip_header(raw) return StructuredHeader(domain=domain, assumptions=assumptions, formula=formula, body=body) def strip_header(text: str) -> str: lines = [] for ln in (text or "").splitlines(): if _DOMAIN_RE.match(ln) or _ASSUME_RE.match(ln) or _FORMULA_RE.match(ln): continue lines.append(ln) return "\n".join(lines).strip()