| import json |
| import random |
| import re |
| from pathlib import Path |
|
|
| |
# Number of new, previously unseen entries to generate per run.
COUNT = 2000

# JSONL outputs: the raw seed texts and their decomposed knowledge-base records.
SEED_FILE = Path("avh_math/db/text_cross_seed.jsonl")
KB_FILE = Path("avh_math/db/text_cross_kb.jsonl")

# Make sure the output directories exist before anything tries to append.
for _output_file in (SEED_FILE, KB_FILE):
    _output_file.parent.mkdir(parents=True, exist_ok=True)

# De-duplication state shared by load_existing() and main():
# raw texts already stored, and structure signatures (tuples of shape names).
existing_texts = set()
existing_sigs = set()
|
|
def _iter_jsonl_records(path):
    """Yield every JSON object stored one-per-line in *path*.

    A missing file yields nothing. Lines that are blank, are not valid JSON,
    or do not decode to a JSON object are skipped.
    """
    if not path.exists():
        return
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort load: a corrupt line must not abort the run.
                continue
            if isinstance(record, dict):
                yield record


def load_existing():
    """Fill ``existing_texts`` and ``existing_sigs`` from the files on disk.

    Reads ``raw_text`` from each record of ``SEED_FILE`` and
    ``structure_signature`` (stored as a tuple) from each record of
    ``KB_FILE``. Records missing the key contribute ``""`` / ``()``.
    Malformed records are skipped, but only the expected failure modes are
    caught, so ``KeyboardInterrupt`` and genuine bugs are no longer hidden.
    Prints a summary of how many texts and signatures are known afterwards.
    """
    for record in _iter_jsonl_records(SEED_FILE):
        try:
            existing_texts.add(record.get("raw_text", ""))
        except TypeError:
            # Unhashable value (e.g. a list) stored under "raw_text".
            continue

    for record in _iter_jsonl_records(KB_FILE):
        try:
            existing_sigs.add(tuple(record.get("structure_signature", [])))
        except TypeError:
            # Signature is not iterable, or contains unhashable items.
            continue

    print(f"Loaded {len(existing_texts)} existing texts and {len(existing_sigs)} signatures.")
|
|
| |
|
|
| |
# Vocabulary the generator draws from, grouped by token category.
JP_PARTS_2 = ["かつ", "または", "すべての", "存在する", "写像", "空間", "演算", "同型", "正規", "可換", "一意", "満たす", "要素"]
EN_PARTS_2 = ["forall", "exists", "map", "space", "op", "iso", "normal", "commute", "unique", "s.t.", "element", "implies", "iff"]
SYMBOLS_2 = ["X", "Y", "Z", "r1", "r2", "theta", "phi", "sum", "prod", "lim", "int", "del", "nabla"]
ARROWS_2 = ["=>", "<=", "<=>", "-->", "<--"]
MODALS_2 = ["Box", "Diamond", "[a]", "<a>", "K", "T", "S4"]
BRACKETS_2 = ["{", "}", "<", ">", "|", "||"]
OTHERS_2 = ["#", "$", "%", "&", "@", "^", "_", "`", "~", ";;", "::"]

# Candidate text lengths (in tokens): either very short or fairly long.
_TEXT_LENGTHS = [2, 3, 4, 15, 20, 25]

# Generation mode -> token groups it samples from. Both the mode order and the
# group order matter: they fix which item a given random draw selects.
_MODE_TOKEN_GROUPS = {
    "dense_symbol": (SYMBOLS_2, ARROWS_2, MODALS_2, OTHERS_2),
    "verbose_text": (JP_PARTS_2, EN_PARTS_2),
    "bracket_mess": (BRACKETS_2, SYMBOLS_2),
    "arrow_chain": (ARROWS_2, SYMBOLS_2),
    "mixed_chaos": (
        JP_PARTS_2, EN_PARTS_2, SYMBOLS_2, ARROWS_2,
        MODALS_2, BRACKETS_2, OTHERS_2,
    ),
}


def generate_novel_text(rng=None):
    """Return one random pseudo-formula string.

    A length and a generation mode are drawn first, then ``length`` tokens are
    sampled (with replacement) from the mode's pool. Each token is glued to
    the text either directly or after a single space, with equal probability,
    and surrounding whitespace is stripped from the result.

    Args:
        rng: Optional object providing ``choice(seq)`` and ``random()``, such
            as a ``random.Random`` instance, for reproducible output. Defaults
            to the module-level ``random`` generator.

    Returns:
        A non-empty string built from 2 to 25 vocabulary tokens.
    """
    rng = random if rng is None else rng

    length = rng.choice(_TEXT_LENGTHS)
    mode = rng.choice(list(_MODE_TOKEN_GROUPS))

    # Build the candidate pool once per call instead of once per token.
    pool = [token for group in _MODE_TOKEN_GROUPS[mode] for token in group]
    parts = [rng.choice(pool) for _ in range(length)]

    # Randomly decide, token by token, whether a space precedes it.
    glued = "".join(
        part if rng.random() < 0.5 else " " + part
        for part in parts
    )
    return glued.strip()
|
|
| |
|
|
# Fixed vocabularies for the three punctuation-like shapes.
_ARROW_TOKENS = frozenset(
    {"->", "→", "=>", "<-", "<=", "<=>", "<->", "-->", "<--"}
)
_MODAL_TOKENS = frozenset({"[]", "□", "<>", "Diamond", "Box", "[a]", "<a>"})
_BRACKET_TOKENS = frozenset(
    {"(", ")", "[", "]", "{", "}", "\"", "'", "<", ">", "|", "||"}
)

_SINGLE_ALNUM_RE = re.compile(r"[A-Za-z0-9]")
_ASCII_WORD_RE = re.compile(r"[A-Za-z0-9_]{2,}")


def classify_shape(token):
    """Map a single token to its coarse shape category.

    Checks are applied in order and the first match wins:

    * ``"arrow"``   - an arrow operator such as ``=>``, ``<->`` or ``-->``
    * ``"modal"``   - a modal operator such as ``Box``, ``[]`` or ``<a>``
    * ``"bracket"`` - a bracket, quote or bar character
    * ``"symbol"``  - exactly one ASCII letter or digit
    * ``"word"``    - two or more ASCII letters/digits/underscores, or any
      token containing a character in the range U+3000..U+9FAF
    * ``"other"``   - everything else (including the empty string)

    ``"<->"`` is treated as an arrow, and the ASCII word test must match the
    whole token, so strings such as ``"a!"`` are no longer reported as words.

    Args:
        token: The token string to classify.

    Returns:
        One of the category names listed above.
    """
    if token in _ARROW_TOKENS:
        return "arrow"
    if token in _MODAL_TOKENS:
        return "modal"
    if token in _BRACKET_TOKENS:
        return "bracket"

    if _SINGLE_ALNUM_RE.fullmatch(token):
        return "symbol"
    if _ASCII_WORD_RE.fullmatch(token):
        return "word"
    if any("\u3000" <= ch <= "\u9faf" for ch in token):
        return "word"

    return "other"
|
|
# Tokenizer. Multi-character operators come first and longest-first so that
# "<=>" is not split into "<" + "=>" and "<--" is not split into "<-" + "-".
# Then ASCII identifiers, runs of characters in U+3000..U+9FAF, and finally
# any single non-space character.
_TOKEN_RE = re.compile(
    r"<=>|<->|-->|<--|->|=>|<=|<-"
    r"|\[\]|<>|Box|Diamond|\[a\]|<a>|\|\|"
    r"|[A-Za-z0-9_]+|[\u3000-\u9faf]+|\S"
)

# Closing bracket -> the opening bracket it must match.
_BRACKET_PAIRS = {")": "(", "]": "[", "}": "{", ">": "<"}
_OPENING_BRACKETS = frozenset(_BRACKET_PAIRS.values())


def _has_unbalanced_brackets(tokens):
    """Return True if bracket tokens in *tokens* are not properly nested."""
    stack = []
    for tok in tokens:
        if tok in _OPENING_BRACKETS:
            stack.append(tok)
        elif tok in _BRACKET_PAIRS:
            if not stack or stack[-1] != _BRACKET_PAIRS[tok]:
                return True
            stack.pop()
    # Anything still open at the end is unbalanced as well.
    return bool(stack)


def decompose_text(text):
    """Tokenize *text* and describe its structure.

    The tokenizer recognises every multi-character operator that
    ``classify_shape`` knows about (``<=>``, ``<=``, ``<-``, ``||`` and so
    on). Previously ``<=>`` was split into ``<`` and ``=>``, which produced a
    bogus bracket token and a false ``unbalanced_bracket`` note.

    Args:
        text: The raw string to analyse.

    Returns:
        A dict with the keys:

        * ``raw_text`` - the input string, unchanged
        * ``tokens`` - the list of token strings
        * ``shapes`` - one ``{"token", "shape", "position"}`` dict per token
        * ``structure_signature`` - the list of shape names, in token order
        * ``notes`` - tags from ``arrow_detected``, ``modal_detected``,
          ``quoted_segment``, ``mixed_language``, ``unbalanced_bracket`` and
          ``formula_like_sequence`` that apply to the text
    """
    # The CJK run can match an ideographic space; drop whitespace-only tokens.
    tokens = [tok for tok in _TOKEN_RE.findall(text) if tok.strip()]

    shapes = [
        {"token": tok, "shape": classify_shape(tok), "position": pos}
        for pos, tok in enumerate(tokens)
    ]
    shape_types = [entry["shape"] for entry in shapes]

    has_arrow = "arrow" in shape_types
    has_modal = "modal" in shape_types

    notes = []
    if has_arrow:
        notes.append("arrow_detected")
    if has_modal:
        notes.append("modal_detected")
    if "\"" in text or "'" in text:
        notes.append("quoted_segment")

    has_jp = any("\u3000" <= ch <= "\u9faf" for tok in tokens for ch in tok)
    has_en = any(re.search(r"[a-zA-Z]", tok) for tok in tokens)
    if has_jp and has_en:
        notes.append("mixed_language")

    if _has_unbalanced_brackets(tokens):
        notes.append("unbalanced_bracket")

    # Formula-like: many single-character symbols, or any arrow/modal operator.
    symbol_ratio = shape_types.count("symbol") / len(shape_types) if shape_types else 0
    if symbol_ratio > 0.3 or has_arrow or has_modal:
        notes.append("formula_like_sequence")

    return {
        "raw_text": text,
        "tokens": tokens,
        "shapes": shapes,
        "structure_signature": shape_types,
        "notes": notes,
    }
|
|
def main():
    """Generate up to ``COUNT`` new entries and append them to the JSONL files.

    Loads the texts and signatures already on disk, then repeatedly generates
    candidate texts. A candidate is kept only if neither its raw text nor its
    structure signature has been seen before. Accepted entries are buffered
    and appended to ``SEED_FILE`` (raw text) and ``KB_FILE`` (decomposition)
    at the end.

    The retry loop is capped so that the script terminates even if new
    signatures can no longer be produced. In that case the entries collected
    so far are still written and a message reports the shortfall.
    """
    load_existing()
    print(f"Generating {COUNT} NEW unique entries...")

    # Generous upper bound on retries; normal runs finish far below it.
    max_attempts = max(COUNT, 1) * 1000

    generated_count = 0
    attempts = 0
    buffer_seed = []
    buffer_kb = []

    while generated_count < COUNT and attempts < max_attempts:
        attempts += 1
        text = generate_novel_text()

        # Reject exact duplicates before paying for the decomposition.
        if text in existing_texts:
            continue

        kb_data = decompose_text(text)
        sig = tuple(kb_data["structure_signature"])

        # Reject texts whose shape sequence is already represented.
        if sig in existing_sigs:
            continue

        existing_texts.add(text)
        existing_sigs.add(sig)
        buffer_seed.append({"raw_text": text})
        buffer_kb.append(kb_data)
        generated_count += 1

        if generated_count % 500 == 0:
            print(f"Generated {generated_count} items (Attempts: {attempts})")

    if generated_count < COUNT:
        print(
            f"Stopped after {attempts} attempts with {generated_count} of "
            f"{COUNT} entries; no further unique signatures were found."
        )

    print("Writing to files...")

    with SEED_FILE.open("a", encoding="utf-8") as fs:
        fs.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in buffer_seed
        )

    with KB_FILE.open("a", encoding="utf-8") as fk:
        fk.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in buffer_kb
        )

    print("Done.")
|
|
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|