# kadubon's picture
# initial commit
# 14e9249 verified
# app.py - Intrinsic Intelligence Foundations: Search & Viewer (Gradio)
import os, re, json
import gradio as gr
# Dataset repository id handed to `load_dataset`; the HF_DATASET_REPO
# environment variable overrides the default.
HF_REPO = os.environ.get("HF_DATASET_REPO", "kadubon/intrinsic-intelligence-foundations")
def _load_dataset():
    """Load the dataset records, preferring the hosted copy over a local file.

    First tries ``datasets.load_dataset(HF_REPO, split="train")``.  If that
    fails for any reason, falls back to the JSON Lines file named by the
    ``LOCAL_JSONL`` environment variable (default
    ``huggingface_dataset_takahashi.jsonl``).

    Returns:
        A ``(records, mode)`` pair.  ``mode`` is ``"remote"`` when ``records``
        is the object returned by ``load_dataset`` and ``"local"`` when it is a
        list holding one parsed JSON value per non-blank line of the file.

    Raises:
        RuntimeError: If the remote load failed and the local file does not
            exist.  The remote failure is attached as ``__cause__``.
    """
    try:
        # Imported lazily so the local fallback still works without `datasets`.
        from datasets import load_dataset

        return load_dataset(HF_REPO, split="train"), "remote"
    except Exception as remote_error:
        # Deliberate best-effort: any remote failure triggers the local fallback.
        local_path = os.environ.get("LOCAL_JSONL", "huggingface_dataset_takahashi.jsonl")
        if not os.path.exists(local_path):
            raise RuntimeError(f"Failed to load dataset: {remote_error}") from remote_error
        with open(local_path, "r", encoding="utf-8") as f:
            rows = [json.loads(line) for line in f if line.strip()]
        return rows, "local"
# Loaded once at import time: DS holds the records, MODE is "remote" or "local".
DS, MODE = _load_dataset()
def _iter_records():
    """Yield every record of the loaded dataset ``DS`` in its stored order.

    ``DS`` is iterated the same way whether it came from the remote dataset or
    from the local JSON Lines file, so no dispatch on ``MODE`` is needed.
    """
    yield from DS
# Matches equation placeholders of the form [[EQ:<id>]]; group 1 is the id.
EQ_PATTERN = re.compile(r"\[\[EQ:([^\]]+)\]\]")


def expand_placeholders(text, equations, to="tex"):
    """Replace ``[[EQ:<id>]]`` placeholders in *text* with equation markup.

    Args:
        text: String containing placeholders; ``None`` is treated as ``""``.
        equations: Iterable of dicts with an ``"id"`` key and optional
            ``"tex"`` / ``"mathml"`` keys; ``None`` is treated as empty.
            Entries that are not dicts are ignored.
        to: ``"tex"`` wraps the equation's TeX source in ``$$...$$``; any
            other value inserts the equation's MathML string unchanged.

    Returns:
        *text* with every placeholder substituted.  A placeholder whose id is
        unknown, or whose requested representation is missing, empty or
        ``None``, is replaced by the empty string in both modes.
    """
    field = "tex" if to == "tex" else "mathml"
    # `or ""` normalises values that are present but None, so TeX mode never
    # renders the literal text "None".
    markup_by_id = {
        eq.get("id"): eq.get(field) or ""
        for eq in equations or []
        if isinstance(eq, dict)
    }

    def _substitute(match):
        markup = markup_by_id.get(match.group(1), "")
        if not markup:
            # Avoid emitting an empty "$$$$" block for unresolved equations.
            return ""
        return f"$${markup}$$" if field == "tex" else markup

    return EQ_PATTERN.sub(_substitute, text or "")
def record_to_md(rec, show="tex", preview_chars=1200):
    """Render one dataset record as a Markdown snippet.

    Args:
        rec: Mapping with the optional keys ``title``, ``doi``, ``urls``
            (sub-key ``landing``), ``authors``, ``keywords``, ``equations``
            and ``fulltext`` (sub-key ``plain``).  Missing keys and ``None``
            values are treated as empty.
        show: ``"tex"`` renders equations as ``$$...$$``; any other value
            inserts MathML and wraps the body in a ``<div>``.
        preview_chars: Maximum number of body characters kept; a trailing
            ``…`` marks truncation.

    Returns:
        Markdown consisting of a title heading, the author line, an optional
        DOI link and keyword line, and the (possibly truncated) body.
    """

    def _author_name(author):
        # Authors may be dicts with given/family parts or plain values.
        if not isinstance(author, dict):
            return str(author)
        given = (author.get("given") or "").strip()
        family = (author.get("family") or "").strip()
        return f"{given} {family}".strip()

    title = rec.get("title") or "(no title)"
    doi = rec.get("doi")
    # `or {}` guards against keys that are present but hold None.
    landing = (rec.get("urls") or {}).get("landing")
    url = landing or (f"https://doi.org/{doi}" if doi else None)

    authors = rec.get("authors") or []
    if authors and isinstance(authors, list):
        auth = ", ".join(_author_name(a) for a in authors)
    else:
        auth = "K. Takahashi"

    kws = rec.get("keywords") or []
    eqs = rec.get("equations") or []
    text = (rec.get("fulltext") or {}).get("plain") or ""

    as_tex = show == "tex"
    body = expand_placeholders(text, eqs, to="tex" if as_tex else "mathml")
    # NOTE(review): the cut is by character count, so it can land inside a
    # $$...$$ span or a MathML tag.
    snippet = body[:preview_chars] + ("…" if len(body) > preview_chars else "")
    md_body = snippet if as_tex else f"<div>{snippet}</div>"

    meta = []
    if url:
        # A DOI always yields a URL above, so the link is the only DOI output.
        meta.append(f"[DOI]({url})")
    if kws:
        meta.append("keywords: " + ", ".join(str(k) for k in kws[:10]))

    # Two trailing spaces before the newline make a Markdown hard line break;
    # with a single space the metadata lines run together in one paragraph.
    header = f"### 📄 {title}\n\n**Authors:** {auth}  \n" + "  \n".join(meta)
    return header + "\n\n" + md_body
def search_dataset(query, show="tex", top_k=5):
    """Return Markdown for the first records that contain *query*.

    The match is a case-insensitive substring test against each record's
    title, keywords and plain full text.  Results keep dataset order; there is
    no relevance ranking.

    Args:
        query: Search string; ``None`` or whitespace-only yields a usage hint.
        show: Equation rendering mode forwarded to ``record_to_md``.
        top_k: Maximum number of records to render.  Coerced with ``int``
            because it is used as a slice bound (the slider feeding it may
            deliver a float — TODO confirm).

    Returns:
        The rendered hits joined by horizontal rules, or a short message when
        the query is empty or nothing matches.
    """
    needle = (query or "").strip().lower()
    if not needle:
        return "Type keywords to search titles/keywords/text."

    limit = int(top_k)
    hits = []
    for rec in _iter_records():
        # `or` fallbacks guard against keys that are present but hold None.
        title = rec.get("title") or ""
        keywords = " ".join(str(k) for k in rec.get("keywords") or [])
        text = (rec.get("fulltext") or {}).get("plain") or ""
        if needle in " ".join([title, keywords, text]).lower():
            hits.append(rec)
            # Only the first `limit` matches are ever shown, so stop scanning.
            if len(hits) >= limit:
                break

    if not hits:
        return "No results found."
    return "\n\n---\n\n".join(record_to_md(rec, show=show) for rec in hits[:limit])
def view_by_index(idx, show="tex"):
    """Render the record at 0-based position *idx* as Markdown.

    Args:
        idx: Record position; any value ``int()`` accepts.
        show: Equation rendering mode forwarded to ``record_to_md``.

    Returns:
        The record rendered with a 10000-character preview limit, or an error
        message when *idx* is not an integer, the dataset is empty, or *idx*
        is out of range.
    """
    try:
        position = int(idx)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or unsupported type; ValueError: non-numeric text or
        # NaN; OverflowError: infinity.
        return "Index must be an integer (0-based)."

    total = len(DS)
    if total == 0:
        # Avoids reporting the nonsensical range "0..-1".
        return "Dataset is empty."
    if not 0 <= position < total:
        return f"Out of range. Available: 0..{total - 1}"
    return record_to_md(DS[position], show=show, preview_chars=10000)
# Gradio UI: a keyword search panel and a view-by-index panel.
# NOTE(review): the source had no indentation, so which widgets sit inside each
# `gr.Row()` is reconstructed here — confirm against the intended layout.
with gr.Blocks(title="Intrinsic Intelligence Foundations") as demo:
    # The source link follows HF_REPO so that an overridden dataset repository
    # is linked instead of the hard-coded default.
    gr.Markdown(
        "\n"
        "# Intrinsic Intelligence Foundations — Search & Viewer\n"
        "Explore the math-aware dataset (TeX/MathML) for autonomous, self-organizing intelligence research.\n"
        f"**Source:** [Hugging Face dataset](https://huggingface.co/datasets/{HF_REPO})\n"
    )

    # Keyword search panel.
    with gr.Row():
        query = gr.Textbox(label="Search keywords", placeholder="e.g., teleogenesis, fractal category theory, UGV")
        show = gr.Radio(choices=["tex", "mathml"], value="tex", label="Render equations as")
        topk = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Top-K")
    btn = gr.Button("Search")
    out = gr.Markdown()
    btn.click(fn=search_dataset, inputs=[query, show, topk], outputs=out)

    # Direct record lookup panel.
    gr.Markdown("### Or view by 0-based index")
    with gr.Row():
        idx = gr.Number(value=0, precision=0, label="Index")
        show2 = gr.Radio(choices=["tex", "mathml"], value="tex", label="Render equations as", interactive=True)
    btn2 = gr.Button("View record")
    out2 = gr.Markdown()
    btn2.click(fn=view_by_index, inputs=[idx, show2], outputs=out2)

if __name__ == "__main__":
    demo.launch()