File size: 10,641 Bytes
29b87da | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | from __future__ import annotations
import re
from dataclasses import dataclass
from typing import List, Optional
_QUOTED_RE = re.compile(r'"([^"]+)"')
# 決定打:角括弧や矢印を正しく含む正規表現に修正 + LaTeXコマンド用バックスラッシュ追加
_UNQUOTED_LOGIC_RE = re.compile(r"([\[\]<>&|~()A-Za-z\->\s\\]{3,})")
_UNQUOTED_MATH_RE = re.compile(r"(∫|dx|dy|dz|sin|cos|tan|log|ln|exp|lim|sum|prod|sqrt|∞|dim|dimension|matrix|行列|次元|対称)", re.IGNORECASE)
_GREEK_SYMBOL_RE = re.compile(r"[φψχαβγδλμνπστω]")
_DIM_SYM_EN = re.compile(r"dimension of (the )?space of (n\s*x\s*n|n×n) symmetric matrices", re.IGNORECASE)
_DIM_SYM_JA = re.compile(r"(実|複素)?n次対称行列の空間の次元|対称行列の空間の次元")
_DIM_MAT_EN = re.compile(r"dimension of (the )?space of (n\s*x\s*n|n×n) matrices", re.IGNORECASE)
_DIM_MAT_JA = re.compile(r"(実|複素)?n次行列の空間の次元|行列の空間の次元")
_ARROW_FIX = [
("→", "->"),
("⇒", "->"),
("⟶", "->"),
("⟹", "->"),
("↦", "->"),
]
_MISC_FIX = [
("\u200b", ""),
("\ufeff", ""),
("\u200c", ""),
("\u200d", ""),
(r"\land", "&"),
(r"\lor", "|"),
(r"\neg", "~"),
(r"\to", "->"),
(r"\rightarrow", "->"),
(r"\leftrightarrow", "<->"),
(r"\box", "[]"),
(r"\diamond", "<>"),
("¬", "~"),
("∧", "&"),
("∨", "|"),
("□", "[]"),
("◇", "<>"),
]
_WORD_MODAL = [
(re.compile(r"\bbox\b", re.IGNORECASE), "[]"),
(re.compile(r"\bdiamond\b", re.IGNORECASE), "<>"),
]
_SPACE_AFTER_MODAL = re.compile(r"(\[\]|\<\>)\s+")
_HAS_LOGIC_TOKENS = re.compile(r"(\-\>|\&|\||~|\[\]|\<\>|\(|\))")
_INLINE_MODAL_RE = re.compile(r"(\[\].{1,240}?->.{1,240})", re.DOTALL)
_INLINE_DIA_RE = re.compile(r"(\<\>.{1,240}?->.{1,240})", re.DOTALL)
_INLINE_PROP_RE = re.compile(r"([A-Za-z0-9_()\s~&|]+->[\sA-Za-z0-9_()~&|]+)", re.DOTALL)
_INLINE_FORMULA_CHUNK_RE = re.compile(r"[A-Za-z0-9_\[\]<>~&|()*/=\+\-]+")
_HAS_JA_RE = re.compile(r"[一-龥ぁ-んァ-ン]")
_MATH_TOKENS = re.compile(r"(∫|dx|dy|dz|sin|cos|tan|log|ln|exp|lim|sum|prod|sqrt|\^|/|∞)", re.IGNORECASE)
_ASSUME_JA = {
"assume:transitive": ["推移", "推移性", "transitive"],
"assume:reflexive": ["反射", "反射性", "reflexive"],
"assume:symmetric": ["対称", "対称性", "symmetric"],
"assume:serial": ["逐次", "serial"],
"assume:euclidean": ["ユークリッド", "euclidean"],
}
@dataclass
class ParsedInput:
raw: str
core_formula: Optional[str]
extra_formulas: List[str]
context_text: str
parse_notes: List[str]
def _normalize_formula(s: str) -> str:
s = s.strip()
is_math = bool(_MATH_TOKENS.search(s))
for a, b in _ARROW_FIX:
s = s.replace(a, b)
for a, b in _MISC_FIX:
s = s.replace(a, b)
for pat, repl in _WORD_MODAL:
s = pat.sub(repl, s)
s = s.replace("\n", " ").replace("\r", " ")
s = re.sub(r"\s+", " ", s).strip()
s = _SPACE_AFTER_MODAL.sub(r"\1", s)
s = re.sub(r"\s*->\s*", "->", s)
s = re.sub(r"\s*&\s*", "&", s)
s = re.sub(r"\s*\|\s*", "|", s)
s = re.sub(r"\s*~\s*", "~", s)
if not is_math:
# Guard against UI noise for logic formulas only.
s = re.sub(r"\s+[a-zA-Z]\s*$", "", s).strip()
return s
def extract_quoted_formulas(text: str) -> List[str]:
out: List[str] = []
for m in _QUOTED_RE.finditer(text or ""):
g = next((x for x in m.groups() if x), None)
if not g:
continue
f = _normalize_formula(g)
if f:
out.append(f)
seen = set()
uniq = []
for f in out:
if f not in seen:
seen.add(f)
uniq.append(f)
return uniq
def extract_unquoted_formulas(text: str) -> List[str]:
out: List[str] = []
raw = text or ""
# If inline formula exists, prefer it over long sentence lines.
inline = extract_inline_formula(raw)
# Pattern-based linear algebra hints.
if _DIM_SYM_EN.search(raw) or _DIM_SYM_JA.search(raw):
out.append(_normalize_formula("dim Sym(n,R)"))
if _DIM_MAT_EN.search(raw) or _DIM_MAT_JA.search(raw):
out.append(_normalize_formula("dim M(n,n,R)"))
# Math-like detection: keep full lines if they contain math tokens.
for line in raw.splitlines():
if _UNQUOTED_MATH_RE.search(line):
# Avoid capturing long natural-language lines as formulas.
candidate = _normalize_formula(line)
if candidate:
# Heuristic: skip if long and operator density is low.
op_count = len(re.findall(r"[\\[\\]<>~&|()]", candidate)) + candidate.count("->") + candidate.count("<->")
word_count = len(re.findall(r"[A-Za-z一-龥ぁ-んァ-ン]", line))
if len(candidate) > 120 and op_count < 2 and word_count > 10:
continue
out.append(candidate)
# Logic-like fragments (simple heuristic).
for m in _UNQUOTED_LOGIC_RE.finditer(raw):
cand = m.group(1) or ""
if "->" in cand or "&" in cand or "|" in cand or "[]" in cand or "<>" in cand or "~" in cand:
f = _normalize_formula(cand)
if f:
out.append(f)
# Inline fallback: grab the first formula-like chunk.
if inline and inline not in out:
out.append(inline)
# Greek-symbol fallback only when logic tokens are present.
if not out and _GREEK_SYMBOL_RE.search(raw) and _HAS_LOGIC_TOKENS.search(raw):
out.append(_normalize_formula("φ"))
seen = set()
uniq: List[str] = []
for f in out:
if f not in seen:
seen.add(f)
uniq.append(f)
return uniq
def is_formula_like(s: str) -> bool:
if not s:
return False
return bool(_HAS_LOGIC_TOKENS.search(s) or _MATH_TOKENS.search(s))
def _is_strong_formula(s: str) -> bool:
"""
Stronger signal used for inline extraction (avoid lone [] or single symbols).
"""
if not s:
return False
if _MATH_TOKENS.search(s):
return True
return bool(re.search(r"(->|<->|&|\||~)", s))
def extract_inline_formula(text: str) -> Optional[str]:
"""
Extract a likely formula-like substring from a long sentence.
Priority: modal chain, then implication-like fragment.
"""
if not text:
return None
m = _INLINE_MODAL_RE.search(text)
if m:
if not _HAS_JA_RE.search(m.group(1)):
return _normalize_formula(m.group(1))
m = _INLINE_DIA_RE.search(text)
if m:
if not _HAS_JA_RE.search(m.group(1)):
return _normalize_formula(m.group(1))
m = _INLINE_PROP_RE.search(text)
if m:
if not _HAS_JA_RE.search(m.group(1)):
return _normalize_formula(m.group(1))
# Token-like fallback: pick the best formula-ish chunk (prefer strong operators).
best = None
best_score = -1
for chunk in _INLINE_FORMULA_CHUNK_RE.findall(text):
if _HAS_JA_RE.search(chunk):
continue
if not is_formula_like(chunk):
continue
score = 0
if _is_strong_formula(chunk):
score += 3
score += len(chunk)
if score > best_score:
best_score = score
best = chunk
if best:
return _normalize_formula(best)
return None
def pick_core_formula(text: str) -> ParsedInput:
notes: List[str] = []
raw = text or ""
quoted = extract_quoted_formulas(raw)
if quoted:
notes.append(f"quoted_formulas={len(quoted)}")
core = quoted[0]
# If quoted core looks like a sentence, prefer inline formula fragments.
if core and not is_formula_like(core):
inline = extract_inline_formula(raw)
if inline and is_formula_like(inline):
core = inline
notes.append("quoted_core_replaced_by_inline_formula")
# If quoted contains a mix of prose + formula, prefer the tighter inline formula.
if core and is_formula_like(core):
inline = extract_inline_formula(core)
if inline and is_formula_like(inline) and len(inline) < len(core):
core = inline
notes.append("quoted_core_inline_subset")
extras = quoted[1:]
ctx = _QUOTED_RE.sub(" ", raw)
ctx = re.sub(r"\s+", " ", ctx).strip()
return ParsedInput(
raw=raw,
core_formula=core,
extra_formulas=extras,
context_text=ctx,
parse_notes=notes,
)
notes.append("no_quoted_formula_detected")
unquoted = extract_unquoted_formulas(raw)
if unquoted:
notes.append(f"unquoted_candidates={len(unquoted)}")
core = unquoted[0]
# Prefer inline formula if the core looks like a full sentence.
if core and len(core) > 120:
inline = extract_inline_formula(raw)
if inline and is_formula_like(inline):
core = inline
notes.append("unquoted_core_replaced_by_inline_formula")
extras = unquoted[1:]
# Reject trivial single-symbol cores (e.g., φ) with no operators.
if _GREEK_SYMBOL_RE.fullmatch(core) and not _HAS_LOGIC_TOKENS.search(core):
notes.append("unquoted_core_is_trivial_symbol")
core = None
extras = []
return ParsedInput(
raw=raw,
core_formula=core,
extra_formulas=extras,
context_text=raw.strip(),
parse_notes=notes,
)
return ParsedInput(
raw=raw,
core_formula=None,
extra_formulas=[],
context_text=raw.strip(),
parse_notes=notes,
)
def ui_rule_text_ja() -> str:
return (
"【入力ルール】\n"
"・式らしい部分を自動抽出します(必要なら引用してもOK)。\n"
"・括弧・矢印・論理記号は正しく入力してください。\n"
'例: ((A -> B) & A) -> B / []p -> [][]p / dim Sym(n,R)'
)
def ui_rule_text_en() -> str:
return (
"Rule: We auto-extract formula-like parts (quoting is optional).\n"
"Use correct operators and parentheses.\n"
"Examples: ((A -> B) & A) -> B, []p -> [][]p, dim Sym(n,R)"
)
def detect_modal_assumptions(text: str) -> List[str]:
t = (text or "").lower()
out: List[str] = []
for tag, keys in _ASSUME_JA.items():
for k in keys:
if k.lower() in t:
out.append(tag)
break
return sorted(set(out))
|