verantyx-logic-math / tools /append_unique_cross_data.py

Initial upload of Verantyx Logic Engine (v1.0)

29b87da verified 4 months ago

6.98 kB

	import json
	import random
	import re
	from pathlib import Path

	# Config: number of new entries per run and the two JSONL stores that
	# this script appends to (raw seed texts and their decomposed KB records).
	COUNT = 2000
	SEED_FILE = Path("avh_math/db/text_cross_seed.jsonl")
	KB_FILE = Path("avh_math/db/text_cross_kb.jsonl")

	# Ensure directories: create each store's parent folder up front so the
	# append-mode opens later on cannot fail on a missing directory.
	for _store in (SEED_FILE, KB_FILE):
	    _store.parent.mkdir(parents=True, exist_ok=True)

	# --- 1. Load Existing Data (for Uniqueness Check) ---
	# Raw texts and structure signatures (as tuples) already present on disk;
	# both sets are also extended while new entries are generated.
	existing_texts = set()
	existing_sigs = set()

	def load_existing(seed_file=None, kb_file=None, texts=None, sigs=None):
	    """Populate the uniqueness caches from the JSONL stores on disk.

	    Each line of the seed store contributes its ``raw_text`` value and each
	    line of the KB store contributes its ``structure_signature`` (converted
	    to a tuple so it is hashable). Missing files are ignored, and records
	    that cannot be parsed are skipped so one damaged line does not abort
	    the load.

	    Args:
	        seed_file: Path of the seed store; defaults to ``SEED_FILE``.
	        kb_file: Path of the KB store; defaults to ``KB_FILE``.
	        texts: Set receiving raw texts; defaults to ``existing_texts``.
	        sigs: Set receiving signature tuples; defaults to ``existing_sigs``.

	    Returns:
	        None. The target sets are updated in place and a summary is printed.
	    """
	    seed_file = SEED_FILE if seed_file is None else seed_file
	    kb_file = KB_FILE if kb_file is None else kb_file
	    texts = existing_texts if texts is None else texts
	    sigs = existing_sigs if sigs is None else sigs

	    # What a damaged record can raise: invalid JSON (ValueError, which
	    # includes json.JSONDecodeError), a top-level value that is not an
	    # object (AttributeError on .get), or a non-iterable / unhashable
	    # field (TypeError). Anything else, e.g. KeyboardInterrupt, propagates.
	    bad_record = (ValueError, AttributeError, TypeError)

	    if seed_file.exists():
	        with seed_file.open("r", encoding="utf-8") as f:
	            for line in f:
	                try:
	                    texts.add(json.loads(line).get("raw_text", ""))
	                except bad_record:
	                    continue

	    if kb_file.exists():
	        with kb_file.open("r", encoding="utf-8") as f:
	            for line in f:
	                try:
	                    record = json.loads(line)
	                    sigs.add(tuple(record.get("structure_signature", [])))
	                except bad_record:
	                    continue

	    print(f"Loaded {len(texts)} existing texts and {len(sigs)} signatures.")

	# --- 2. Advanced Generator (Focus on Novelty) ---

	# Expanded Vocabulary: the pools that generate_novel_text samples from.
	JP_PARTS_2 = ["かつ", "または", "すべての", "存在する", "写像", "空間", "演算", "同型", "正規", "可換", "一意", "満たす", "要素"]
	EN_PARTS_2 = ["forall", "exists", "map", "space", "op", "iso", "normal", "commute", "unique", "s.t.", "element", "implies", "iff"]
	SYMBOLS_2 = ["X", "Y", "Z", "r1", "r2", "theta", "phi", "sum", "prod", "lim", "int", "del", "nabla"]
	ARROWS_2 = ["=>", "<=", "<=>", "-->", "<--"]
	MODALS_2 = ["Box", "Diamond", "[a]", "<a>", "K", "T", "S4"]
	# The pipe entries are plain "|" and "||". Writing them as "\|" is an
	# unrecognized escape sequence that leaves a literal backslash in the
	# generated text.
	BRACKETS_2 = ["{", "}", "<", ">", "|", "||"]
	OTHERS_2 = ["#", "$", "%", "&", "@", "^", "_", "`", "~", ";;", "::"]

	def generate_novel_text():
	    """Assemble one random pseudo-formula string from the vocabulary pools.

	    A piece count is drawn from a deliberately bimodal set (2-4 or 15-25)
	    and a single mode is drawn for the whole string; the mode decides which
	    vocabulary lists the pieces are sampled from. Every piece is then
	    appended either directly or after one space with equal probability, so
	    neighbouring pieces may fuse into unusual tokens.

	    Returns:
	        str: The assembled text with surrounding whitespace stripped.
	    """
	    piece_count = random.choice([2, 3, 4, 15, 20, 25])  # very short or very long
	    mode = random.choice(["dense_symbol", "verbose_text", "bracket_mess", "arrow_chain", "mixed_chaos"])

	    # One sampling pool per mode; the remaining mode ("mixed_chaos") draws
	    # from every vocabulary list at once.
	    pools = {
	        "dense_symbol": SYMBOLS_2 + ARROWS_2 + MODALS_2 + OTHERS_2,
	        "verbose_text": JP_PARTS_2 + EN_PARTS_2,
	        "bracket_mess": BRACKETS_2 + SYMBOLS_2,
	        "arrow_chain": ARROWS_2 + SYMBOLS_2,
	    }
	    everything = JP_PARTS_2 + EN_PARTS_2 + SYMBOLS_2 + ARROWS_2 + MODALS_2 + BRACKETS_2 + OTHERS_2
	    pool = pools.get(mode, everything)

	    pieces = [random.choice(pool) for _ in range(piece_count)]

	    # Coin flip per piece: glue it on directly, or separate it with a space.
	    glued = "".join(
	        piece if random.random() < 0.5 else " " + piece
	        for piece in pieces
	    )
	    return glued.strip()

	# --- 3. Decomposition Logic (Consistent with previous) ---

	def classify_shape(token):
	    """Map a single token to a coarse shape category.

	    Args:
	        token: One token string.

	    Returns:
	        str: ``"arrow"``, ``"modal"`` or ``"bracket"`` for the fixed
	        operator tokens listed below; ``"symbol"`` for a single ASCII
	        letter or digit; ``"word"`` for a longer token that starts with
	        ASCII letters, digits or underscores, or for any token containing
	        a character in the U+3000-U+9FAF range; ``"other"`` otherwise.
	    """
	    if token in ("->", "→", "=>", "<-", "<=", "<=>", "-->", "<--"):
	        return "arrow"
	    if token in ("[]", "□", "<>", "Diamond", "Box", "[a]", "<a>"):
	        return "modal"
	    # Pipes are listed as plain "|" and "||"; the "\|" spelling contains a
	    # literal backslash and would never equal a real pipe token.
	    if token in ("(", ")", "[", "]", "{", "}", "\"", "'", "<", ">", "|", "||"):
	        return "bracket"

	    # Symbol vs Word vs Other
	    if re.match(r"^[A-Za-z0-9]$", token):
	        return "symbol"
	    # Only the start of the token is checked here, so a token that merely
	    # begins with a word character also counts as a word.
	    if re.match(r"^[A-Za-z0-9_]+", token) and len(token) > 1:
	        return "word"
	    if any("\u3000" <= c <= "\u9faf" for c in token):
	        return "word"

	    return "other"

	# Tokenizer. Multi-character operators come before the generic word / CJK /
	# single-character fallbacks so they win the alternation. The separators
	# are real regex alternation bars; written as "\|" they would be literal
	# pipe characters and the pattern would match almost nothing.
	# NOTE(review): "<=" and "<=>" are not listed, so they split into "<" plus
	# "=" / "=>" even though classify_shape recognises them -- confirm intent.
	_TOKEN_PATTERN = re.compile(
	    r"(->|=>|<->|\[\]|<>|Box|Diamond|-->|<--|\[a\]|<a>|[A-Za-z0-9_]+|[\u3000-\u9faf]+|[^\s])"
	)


	def _brackets_unbalanced(tokens):
	    """Return True when (), [], {} or <> tokens are not properly nested."""
	    closer_to_opener = {")": "(", "]": "[", "}": "{", ">": "<"}
	    openers = set(closer_to_opener.values())
	    stack = []
	    for tok in tokens:
	        if tok in openers:
	            stack.append(tok)
	        elif tok in closer_to_opener:
	            if not stack or stack[-1] != closer_to_opener[tok]:
	                return True
	            stack.pop()
	    # Anything still open at the end is unbalanced as well.
	    return bool(stack)


	def decompose_text(text):
	    """Tokenize ``text`` and describe its structure.

	    Args:
	        text: The raw string to decompose.

	    Returns:
	        dict: ``raw_text`` (the input), ``tokens`` (list of token strings),
	        ``shapes`` (one dict per token with ``token``, ``shape`` and
	        ``position``), ``structure_signature`` (the list of shapes in token
	        order) and ``notes`` (list of flag strings such as
	        ``"arrow_detected"`` or ``"unbalanced_bracket"``).
	    """
	    tokens = [t for t in _TOKEN_PATTERN.findall(text) if t.strip()]

	    shapes = [
	        {"token": t, "shape": classify_shape(t), "position": i}
	        for i, t in enumerate(tokens)
	    ]
	    shape_types = [s["shape"] for s in shapes]

	    # Notes
	    notes = []
	    if "arrow" in shape_types:
	        notes.append("arrow_detected")
	    if "modal" in shape_types:
	        notes.append("modal_detected")
	    if "\"" in text or "'" in text:
	        notes.append("quoted_segment")

	    has_jp = any(any("\u3000" <= c <= "\u9faf" for c in t) for t in tokens)
	    has_en = any(re.search(r"[a-zA-Z]", t) for t in tokens)
	    if has_jp and has_en:
	        notes.append("mixed_language")

	    # Bracket check (extended)
	    if _brackets_unbalanced(tokens):
	        notes.append("unbalanced_bracket")

	    # Formula-like check: many single-character symbols, or any operator.
	    symbol_ratio = shape_types.count("symbol") / len(shape_types) if shape_types else 0
	    if symbol_ratio > 0.3 or "arrow" in shape_types or "modal" in shape_types:
	        notes.append("formula_like_sequence")

	    return {
	        "raw_text": text,
	        "tokens": tokens,
	        "shapes": shapes,
	        "structure_signature": shape_types,
	        "notes": notes,
	    }

	def main():
	    """Generate up to ``COUNT`` new unique entries and append them to disk.

	    Loads the existing texts and signatures, then repeatedly generates a
	    candidate text, decomposes it, and keeps it only when both its raw
	    text and its structure signature are unseen. Accepted entries are
	    buffered and appended to ``SEED_FILE`` and ``KB_FILE`` at the end.

	    Returns:
	        None.
	    """
	    load_existing()
	    print(f"Generating {COUNT} NEW unique entries...")

	    # Strict signature uniqueness means candidates are rejected more and
	    # more often as the stores grow. Cap the retries so the run ends, and
	    # still writes what it collected, instead of spinning forever once no
	    # unseen signature can be produced.
	    max_attempts = max(COUNT, 1) * 1000

	    generated_count = 0
	    attempts = 0

	    buffer_seed = []
	    buffer_kb = []

	    while generated_count < COUNT and attempts < max_attempts:
	        attempts += 1
	        text = generate_novel_text()

	        # Text uniqueness check
	        if text in existing_texts:
	            continue

	        # Decompose
	        kb_data = decompose_text(text)
	        sig = tuple(kb_data["structure_signature"])

	        # Structure uniqueness check (strict): an already-seen structure is
	        # skipped to force diversity.
	        if sig in existing_sigs:
	            continue

	        # Unique on both counts: remember it so later candidates in this
	        # run are checked against it too.
	        existing_texts.add(text)
	        existing_sigs.add(sig)

	        buffer_seed.append({"raw_text": text})
	        buffer_kb.append(kb_data)

	        generated_count += 1

	        if generated_count % 500 == 0:
	            print(f"Generated {generated_count} items (Attempts: {attempts})")

	    if generated_count < COUNT:
	        print(f"Stopped after {attempts} attempts with only {generated_count} of {COUNT} unique entries.")

	    print("Writing to files...")

	    with SEED_FILE.open("a", encoding="utf-8") as fs:
	        fs.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in buffer_seed)

	    with KB_FILE.open("a", encoding="utf-8") as fk:
	        fk.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in buffer_kb)

	    print("Done.")

	# Run the generator only when this file is executed as a script, not
	# when it is imported as a module.
	if __name__ == "__main__":
	main()