|
|
| |
| import os, re, json |
| import gradio as gr |
|
|
# Hugging Face dataset repository id; the HF_DATASET_REPO environment
# variable overrides the built-in default.
HF_REPO = os.getenv("HF_DATASET_REPO", "kadubon/intrinsic-intelligence-foundations")
|
|
| def _load_dataset(): |
| try: |
| from datasets import load_dataset |
| ds = load_dataset(HF_REPO, split="train") |
| return ds, "remote" |
| except Exception as e: |
| local_path = os.environ.get("LOCAL_JSONL", "huggingface_dataset_takahashi.jsonl") |
| rows = [] |
| if os.path.exists(local_path): |
| with open(local_path, "r", encoding="utf-8") as f: |
| for line in f: |
| if line.strip(): |
| rows.append(json.loads(line)) |
| return rows, "local" |
| raise RuntimeError(f"Failed to load dataset: {e}") |
|
|
# Load the dataset once, at import time. MODE is the source tag returned by
# _load_dataset() alongside the records ("remote" or "local").
DS, MODE = _load_dataset()
|
|
def _iter_records():
    """Yield every record of the module-level dataset ``DS`` in order.

    ``DS`` is iterated the same way whether it was loaded remotely or from
    the local file, so no distinction on ``MODE`` is needed here.
    """
    yield from DS
|
|
# Matches equation placeholders of the form [[EQ:<id>]]; group 1 captures the id.
EQ_PATTERN = re.compile(r"\[\[EQ:([^\]]+)\]\]")


def expand_placeholders(text, equations, to="tex"):
    """Replace ``[[EQ:<id>]]`` placeholders in *text* with equation markup.

    Args:
        text: Source text; ``None`` is treated as an empty string.
        equations: Iterable of dicts with an ``"id"`` key and optional
            ``"tex"`` / ``"mathml"`` keys; ``None`` means no equations.
            Entries that are not dicts are ignored. If an id occurs more
            than once, the last entry wins.
        to: ``"tex"`` wraps the TeX source in ``$$...$$``; any other value
            inserts the MathML string as-is.

    Returns:
        The text with every placeholder substituted. A placeholder whose id
        is unknown, or whose requested representation is missing, ``None`` or
        empty, is replaced by an empty string in both modes.
    """
    want_tex = to == "tex"
    markup_by_id = {}
    for eq in equations or []:
        if isinstance(eq, dict):
            markup_by_id[eq.get("id")] = (eq.get("tex"), eq.get("mathml"))

    def _render(match):
        tex, mathml = markup_by_id.get(match.group(1), (None, None))
        markup = tex if want_tex else mathml
        if not markup:
            # Avoid emitting "$$None$$" or bare "$$$$" delimiters.
            return ""
        return f"$${markup}$$" if want_tex else str(markup)

    # A callable replacement is inserted literally, so backslashes in TeX
    # are not interpreted as regex escapes.
    return EQ_PATTERN.sub(_render, text or "")
|
|
def _author_name(author):
    """Return a display name for one author entry (dict or any other value)."""
    if not isinstance(author, dict):
        return str(author)
    given = (author.get("given") or "").strip()
    family = (author.get("family") or "").strip()
    return f"{given} {family}".strip()


def record_to_md(rec, show="tex", preview_chars=1200):
    """Render one dataset record as a Markdown snippet.

    Args:
        rec: Mapping read via ``.get`` for ``title``, ``doi``, ``urls``
            (sub-key ``landing``), ``authors``, ``keywords``, ``equations``
            and ``fulltext`` (sub-key ``plain``). Fields that are missing or
            ``None`` are tolerated.
        show: ``"tex"`` expands equation placeholders to ``$$...$$``; any
            other value expands them to MathML and wraps the body in a
            ``<div>``.
        preview_chars: Maximum number of characters of the expanded body to
            include; ``…`` is appended when the body is longer.

    Returns:
        A Markdown string: a header with title and authors, optional link and
        keyword lines (at most 10 keywords), then the body preview.
    """
    title = rec.get("title") or "(no title)"
    doi = rec.get("doi")
    # `or {}` also covers a key that is present with a None value.
    urls = rec.get("urls") or {}
    url = urls.get("landing") or (f"https://doi.org/{doi}" if doi else None)

    authors = rec.get("authors")
    if authors and isinstance(authors, list):
        auth = ", ".join(_author_name(a) for a in authors)
    else:
        auth = "K. Takahashi"

    kws = rec.get("keywords") or []
    eqs = rec.get("equations") or []
    text = (rec.get("fulltext") or {}).get("plain") or ""

    as_tex = show == "tex"
    body = expand_placeholders(text, eqs, to="tex" if as_tex else "mathml")
    snippet = body[:preview_chars] + ("…" if len(body) > preview_chars else "")
    md_body = snippet if as_tex else f"<div>{snippet}</div>"

    meta = []
    # A DOI always yields a URL above, so there is no "DOI without link" case.
    if url:
        meta.append(f"[DOI]({url})")
    if kws:
        meta.append("keywords: " + ", ".join(str(k) for k in kws[:10]))

    header = f"### 📄 {title}\n\n**Authors:** {auth} \n" + (" \n".join(meta) if meta else "")
    return header + "\n\n" + md_body
|
|
def search_dataset(query, show="tex", top_k=5):
    """Search records by case-insensitive substring and render the matches.

    The query is matched against each record's title, keywords and plain
    full text, joined by single spaces and lower-cased.

    Args:
        query: Search string; ``None``, empty or whitespace-only returns a
            usage hint.
        show: Equation rendering mode forwarded to ``record_to_md``.
        top_k: Maximum number of records to render. Converted with ``int``,
            so a float such as ``5.0`` is accepted.

    Returns:
        Markdown for up to ``top_k`` matching records separated by horizontal
        rules, or a short message when the query is empty or nothing matches.
    """
    q = (query or "").strip().lower()
    if not q:
        return "Type keywords to search titles/keywords/text."

    # UI sliders may deliver the value as a float; slicing needs an int.
    limit = int(top_k)

    hits = []
    for rec in _iter_records():
        title = rec.get("title") or ""
        kw = " ".join(str(k) for k in (rec.get("keywords") or []))
        text = (rec.get("fulltext") or {}).get("plain") or ""
        if q in " ".join([title, kw, text]).lower():
            hits.append(rec)
            # Only the first `limit` hits are rendered, so stop scanning there.
            if len(hits) >= limit:
                break

    if not hits:
        return "No results found."
    return "\n\n---\n\n".join(record_to_md(rec, show=show) for rec in hits[:limit])
|
|
def view_by_index(idx, show="tex"):
    """Render the record at a 0-based index of the module-level dataset ``DS``.

    Args:
        idx: Index of the record; anything ``int()`` accepts (e.g. ``3``,
            ``3.0`` or ``"3"``).
        show: Equation rendering mode forwarded to ``record_to_md``.

    Returns:
        Markdown for the record (body preview capped at 10000 characters), or
        a short message when ``idx`` is not an integer, the dataset is empty,
        or ``idx`` is out of range.
    """
    try:
        idx = int(idx)
    except (TypeError, ValueError, OverflowError):
        # None / non-numeric text / NaN / infinity are all "not an integer".
        return "Index must be an integer (0-based)."

    total = len(DS)
    if total == 0:
        # Avoids the nonsensical "Available: 0..-1" range.
        return "Out of range. The dataset is empty."
    if not 0 <= idx < total:
        return f"Out of range. Available: 0..{total - 1}"
    return record_to_md(DS[idx], show=show, preview_chars=10000)
|
|
with gr.Blocks(title="Intrinsic Intelligence Foundations") as demo:
    # Page header with a link back to the source dataset.
    gr.Markdown(
        """
# Intrinsic Intelligence Foundations — Search & Viewer
Explore the math-aware dataset (TeX/MathML) for autonomous, self-organizing intelligence research.
**Source:** [Hugging Face dataset](https://huggingface.co/datasets/kadubon/intrinsic-intelligence-foundations)
"""
    )

    # Keyword search: query text, equation format and result count are the
    # inputs to search_dataset; its Markdown result is shown in `out`.
    with gr.Row():
        query = gr.Textbox(
            label="Search keywords",
            placeholder="e.g., teleogenesis, fractal category theory, UGV",
        )
        show = gr.Radio(
            choices=["tex", "mathml"],
            value="tex",
            label="Render equations as",
        )
        topk = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Top-K")
    btn = gr.Button("Search")
    out = gr.Markdown()
    btn.click(fn=search_dataset, inputs=[query, show, topk], outputs=out)

    # Direct lookup: a 0-based index and equation format are the inputs to
    # view_by_index; its Markdown result is shown in `out2`.
    gr.Markdown("### Or view by 0-based index")
    with gr.Row():
        idx = gr.Number(value=0, precision=0, label="Index")
        show2 = gr.Radio(
            choices=["tex", "mathml"],
            value="tex",
            label="Render equations as",
            interactive=True,
        )
    btn2 = gr.Button("View record")
    out2 = gr.Markdown()
    btn2.click(fn=view_by_index, inputs=[idx, show2], outputs=out2)
|
|
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|