| from __future__ import annotations |
|
|
| import re |
| from dataclasses import dataclass |
| from typing import List, Optional |
|
|
| _QUOTED_RE = re.compile(r'"([^"]+)"') |
| |
| _UNQUOTED_LOGIC_RE = re.compile(r"([\[\]<>&|~()A-Za-z\->\s\\]{3,})") |
| _UNQUOTED_MATH_RE = re.compile(r"(∫|dx|dy|dz|sin|cos|tan|log|ln|exp|lim|sum|prod|sqrt|∞|dim|dimension|matrix|行列|次元|対称)", re.IGNORECASE) |
| _GREEK_SYMBOL_RE = re.compile(r"[φψχαβγδλμνπστω]") |
| _DIM_SYM_EN = re.compile(r"dimension of (the )?space of (n\s*x\s*n|n×n) symmetric matrices", re.IGNORECASE) |
| _DIM_SYM_JA = re.compile(r"(実|複素)?n次対称行列の空間の次元|対称行列の空間の次元") |
| _DIM_MAT_EN = re.compile(r"dimension of (the )?space of (n\s*x\s*n|n×n) matrices", re.IGNORECASE) |
| _DIM_MAT_JA = re.compile(r"(実|複素)?n次行列の空間の次元|行列の空間の次元") |
|
|
| _ARROW_FIX = [ |
| ("→", "->"), |
| ("⇒", "->"), |
| ("⟶", "->"), |
| ("⟹", "->"), |
| ("↦", "->"), |
| ] |
| _MISC_FIX = [ |
| ("\u200b", ""), |
| ("\ufeff", ""), |
| ("\u200c", ""), |
| ("\u200d", ""), |
| (r"\land", "&"), |
| (r"\lor", "|"), |
| (r"\neg", "~"), |
| (r"\to", "->"), |
| (r"\rightarrow", "->"), |
| (r"\leftrightarrow", "<->"), |
| (r"\box", "[]"), |
| (r"\diamond", "<>"), |
| ("¬", "~"), |
| ("∧", "&"), |
| ("∨", "|"), |
| ("□", "[]"), |
| ("◇", "<>"), |
| ] |
| _WORD_MODAL = [ |
| (re.compile(r"\bbox\b", re.IGNORECASE), "[]"), |
| (re.compile(r"\bdiamond\b", re.IGNORECASE), "<>"), |
| ] |
| _SPACE_AFTER_MODAL = re.compile(r"(\[\]|\<\>)\s+") |
| _HAS_LOGIC_TOKENS = re.compile(r"(\-\>|\&|\||~|\[\]|\<\>|\(|\))") |
| _INLINE_MODAL_RE = re.compile(r"(\[\].{1,240}?->.{1,240})", re.DOTALL) |
| _INLINE_DIA_RE = re.compile(r"(\<\>.{1,240}?->.{1,240})", re.DOTALL) |
| _INLINE_PROP_RE = re.compile(r"([A-Za-z0-9_()\s~&|]+->[\sA-Za-z0-9_()~&|]+)", re.DOTALL) |
| _INLINE_FORMULA_CHUNK_RE = re.compile(r"[A-Za-z0-9_\[\]<>~&|()*/=\+\-]+") |
| _HAS_JA_RE = re.compile(r"[一-龥ぁ-んァ-ン]") |
| _MATH_TOKENS = re.compile(r"(∫|dx|dy|dz|sin|cos|tan|log|ln|exp|lim|sum|prod|sqrt|\^|/|∞)", re.IGNORECASE) |
|
|
| _ASSUME_JA = { |
| "assume:transitive": ["推移", "推移性", "transitive"], |
| "assume:reflexive": ["反射", "反射性", "reflexive"], |
| "assume:symmetric": ["対称", "対称性", "symmetric"], |
| "assume:serial": ["逐次", "serial"], |
| "assume:euclidean": ["ユークリッド", "euclidean"], |
| } |
|
|
|
|
| @dataclass |
| class ParsedInput: |
| raw: str |
| core_formula: Optional[str] |
| extra_formulas: List[str] |
| context_text: str |
| parse_notes: List[str] |
|
|
|
|
| def _normalize_formula(s: str) -> str: |
| s = s.strip() |
| is_math = bool(_MATH_TOKENS.search(s)) |
| for a, b in _ARROW_FIX: |
| s = s.replace(a, b) |
| for a, b in _MISC_FIX: |
| s = s.replace(a, b) |
| for pat, repl in _WORD_MODAL: |
| s = pat.sub(repl, s) |
| s = s.replace("\n", " ").replace("\r", " ") |
| s = re.sub(r"\s+", " ", s).strip() |
| s = _SPACE_AFTER_MODAL.sub(r"\1", s) |
| s = re.sub(r"\s*->\s*", "->", s) |
| s = re.sub(r"\s*&\s*", "&", s) |
| s = re.sub(r"\s*\|\s*", "|", s) |
| s = re.sub(r"\s*~\s*", "~", s) |
| if not is_math: |
| |
| s = re.sub(r"\s+[a-zA-Z]\s*$", "", s).strip() |
| return s |
|
|
|
|
| def extract_quoted_formulas(text: str) -> List[str]: |
| out: List[str] = [] |
| for m in _QUOTED_RE.finditer(text or ""): |
| g = next((x for x in m.groups() if x), None) |
| if not g: |
| continue |
| f = _normalize_formula(g) |
| if f: |
| out.append(f) |
| seen = set() |
| uniq = [] |
| for f in out: |
| if f not in seen: |
| seen.add(f) |
| uniq.append(f) |
| return uniq |
|
|
|
|
| def extract_unquoted_formulas(text: str) -> List[str]: |
| out: List[str] = [] |
| raw = text or "" |
| |
| inline = extract_inline_formula(raw) |
|
|
| |
| if _DIM_SYM_EN.search(raw) or _DIM_SYM_JA.search(raw): |
| out.append(_normalize_formula("dim Sym(n,R)")) |
| if _DIM_MAT_EN.search(raw) or _DIM_MAT_JA.search(raw): |
| out.append(_normalize_formula("dim M(n,n,R)")) |
|
|
| |
| for line in raw.splitlines(): |
| if _UNQUOTED_MATH_RE.search(line): |
| |
| candidate = _normalize_formula(line) |
| if candidate: |
| |
| op_count = len(re.findall(r"[\\[\\]<>~&|()]", candidate)) + candidate.count("->") + candidate.count("<->") |
| word_count = len(re.findall(r"[A-Za-z一-龥ぁ-んァ-ン]", line)) |
| if len(candidate) > 120 and op_count < 2 and word_count > 10: |
| continue |
| out.append(candidate) |
|
|
| |
| for m in _UNQUOTED_LOGIC_RE.finditer(raw): |
| cand = m.group(1) or "" |
| if "->" in cand or "&" in cand or "|" in cand or "[]" in cand or "<>" in cand or "~" in cand: |
| f = _normalize_formula(cand) |
| if f: |
| out.append(f) |
|
|
| |
| if inline and inline not in out: |
| out.append(inline) |
|
|
| |
| if not out and _GREEK_SYMBOL_RE.search(raw) and _HAS_LOGIC_TOKENS.search(raw): |
| out.append(_normalize_formula("φ")) |
|
|
| seen = set() |
| uniq: List[str] = [] |
| for f in out: |
| if f not in seen: |
| seen.add(f) |
| uniq.append(f) |
| return uniq |
|
|
|
|
| def is_formula_like(s: str) -> bool: |
| if not s: |
| return False |
| return bool(_HAS_LOGIC_TOKENS.search(s) or _MATH_TOKENS.search(s)) |
|
|
|
|
| def _is_strong_formula(s: str) -> bool: |
| """ |
| Stronger signal used for inline extraction (avoid lone [] or single symbols). |
| """ |
| if not s: |
| return False |
| if _MATH_TOKENS.search(s): |
| return True |
| return bool(re.search(r"(->|<->|&|\||~)", s)) |
|
|
|
|
| def extract_inline_formula(text: str) -> Optional[str]: |
| """ |
| Extract a likely formula-like substring from a long sentence. |
| Priority: modal chain, then implication-like fragment. |
| """ |
| if not text: |
| return None |
| m = _INLINE_MODAL_RE.search(text) |
| if m: |
| if not _HAS_JA_RE.search(m.group(1)): |
| return _normalize_formula(m.group(1)) |
| m = _INLINE_DIA_RE.search(text) |
| if m: |
| if not _HAS_JA_RE.search(m.group(1)): |
| return _normalize_formula(m.group(1)) |
| m = _INLINE_PROP_RE.search(text) |
| if m: |
| if not _HAS_JA_RE.search(m.group(1)): |
| return _normalize_formula(m.group(1)) |
|
|
| |
| best = None |
| best_score = -1 |
| for chunk in _INLINE_FORMULA_CHUNK_RE.findall(text): |
| if _HAS_JA_RE.search(chunk): |
| continue |
| if not is_formula_like(chunk): |
| continue |
| score = 0 |
| if _is_strong_formula(chunk): |
| score += 3 |
| score += len(chunk) |
| if score > best_score: |
| best_score = score |
| best = chunk |
| if best: |
| return _normalize_formula(best) |
| return None |
|
|
|
|
| def pick_core_formula(text: str) -> ParsedInput: |
| notes: List[str] = [] |
| raw = text or "" |
|
|
| quoted = extract_quoted_formulas(raw) |
| if quoted: |
| notes.append(f"quoted_formulas={len(quoted)}") |
| core = quoted[0] |
| |
| if core and not is_formula_like(core): |
| inline = extract_inline_formula(raw) |
| if inline and is_formula_like(inline): |
| core = inline |
| notes.append("quoted_core_replaced_by_inline_formula") |
| |
| if core and is_formula_like(core): |
| inline = extract_inline_formula(core) |
| if inline and is_formula_like(inline) and len(inline) < len(core): |
| core = inline |
| notes.append("quoted_core_inline_subset") |
| extras = quoted[1:] |
| ctx = _QUOTED_RE.sub(" ", raw) |
| ctx = re.sub(r"\s+", " ", ctx).strip() |
| return ParsedInput( |
| raw=raw, |
| core_formula=core, |
| extra_formulas=extras, |
| context_text=ctx, |
| parse_notes=notes, |
| ) |
|
|
| notes.append("no_quoted_formula_detected") |
| unquoted = extract_unquoted_formulas(raw) |
| if unquoted: |
| notes.append(f"unquoted_candidates={len(unquoted)}") |
| core = unquoted[0] |
| |
| if core and len(core) > 120: |
| inline = extract_inline_formula(raw) |
| if inline and is_formula_like(inline): |
| core = inline |
| notes.append("unquoted_core_replaced_by_inline_formula") |
| extras = unquoted[1:] |
| |
| if _GREEK_SYMBOL_RE.fullmatch(core) and not _HAS_LOGIC_TOKENS.search(core): |
| notes.append("unquoted_core_is_trivial_symbol") |
| core = None |
| extras = [] |
| return ParsedInput( |
| raw=raw, |
| core_formula=core, |
| extra_formulas=extras, |
| context_text=raw.strip(), |
| parse_notes=notes, |
| ) |
|
|
| return ParsedInput( |
| raw=raw, |
| core_formula=None, |
| extra_formulas=[], |
| context_text=raw.strip(), |
| parse_notes=notes, |
| ) |
|
|
|
|
| def ui_rule_text_ja() -> str: |
| return ( |
| "【入力ルール】\n" |
| "・式らしい部分を自動抽出します(必要なら引用してもOK)。\n" |
| "・括弧・矢印・論理記号は正しく入力してください。\n" |
| '例: ((A -> B) & A) -> B / []p -> [][]p / dim Sym(n,R)' |
| ) |
|
|
|
|
| def ui_rule_text_en() -> str: |
| return ( |
| "Rule: We auto-extract formula-like parts (quoting is optional).\n" |
| "Use correct operators and parentheses.\n" |
| "Examples: ((A -> B) & A) -> B, []p -> [][]p, dim Sym(n,R)" |
| ) |
|
|
|
|
| def detect_modal_assumptions(text: str) -> List[str]: |
| t = (text or "").lower() |
| out: List[str] = [] |
| for tag, keys in _ASSUME_JA.items(): |
| for k in keys: |
| if k.lower() in t: |
| out.append(tag) |
| break |
| return sorted(set(out)) |
|
|