| import re |
|
|
| _ZERO_WIDTH = re.compile(r"[\u200b\u200c\u200d\ufeff]") |
| _WS = re.compile(r"[ \t]+") |
| _MULTI_NL = re.compile(r"\n{3,}") |
|
|
|
|
| def _normalize_symbols(s: str) -> str: |
| s = s.replace("β", "->").replace("β", "->").replace("βΆ", "->") |
| s = s.replace("β", "<->").replace("β", "<->") |
| s = s.replace("β§", "&").replace("β¨", "|") |
| s = s.replace("Β¬", "~") |
| s = s.replace("οΌ", "(").replace("οΌ", ")") |
| s = s.replace("γ", "[").replace("γ", "]") |
| return s |
|
|
|
|
| def _normalize_modal_tokens(s: str) -> str: |
| |
| s = s.replace("[]", "β‘").replace("<>", "β") |
| s = re.sub(r"\bbox\b", "β‘", s, flags=re.IGNORECASE) |
| s = re.sub(r"\bdiamond\b", "β", s, flags=re.IGNORECASE) |
| |
| s = re.sub(r"β‘\s+(?=[A-Za-z(~\[])", "β‘", s) |
| s = re.sub(r"β\s+(?=[A-Za-z(~\[])", "β", s) |
| s = re.sub(r"β‘\s+β‘\s*", "β‘β‘", s) |
| s = re.sub(r"β\s+β\s*", "ββ", s) |
| return s |
|
|
|
|
def normalize_input(text: str) -> str:
    """Normalize symbol and spacing variants without destroying line structure.

    Steps, in order:

    - Treat a falsy *text* (``None`` or ``""``) as the empty string and
      convert ``\\r\\n`` / ``\\r`` line endings to ``\\n``.
    - Remove zero-width characters.
    - Normalize arrows/connectives to their ASCII spellings.
    - Normalize modal tokens (box/diamond/[]/<>) to the box and diamond
      symbols.
    - Collapse runs of spaces/tabs to one space and strip every line.
    - Collapse three or more consecutive newlines to exactly two.

    Returns the normalized text with leading and trailing whitespace removed.
    """
    s = (text or "").replace("\r\n", "\n").replace("\r", "\n")
    s = _ZERO_WIDTH.sub("", s)
    s = _normalize_symbols(s)
    s = _normalize_modal_tokens(s)

    # Tidy each line *before* collapsing blank-line runs: a line holding only
    # spaces or tabs becomes empty here, so it is counted as a blank line by
    # the newline collapse below instead of slipping past it.
    s = "\n".join(_WS.sub(" ", ln).strip() for ln in s.split("\n"))

    s = _MULTI_NL.sub("\n\n", s)
    return s.strip()
|
|