Spaces:
Running
Running
deploy Gita Advisor as Gradio Space
Browse files
- add app.py: Gradio ChatInterface wrapping the advisor pipeline
- add .gitignore: exclude .env, __pycache__, raw data, logs
- update README.md: add HF Spaces frontmatter
- update requirements.txt: add gradio>=4.0
- include artifacts/chroma/ (via LFS), optimized_advisor.json, corpus_enriched.jsonl
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- .gitignore +14 -0
- README.md +118 -7
- advisor.py +157 -0
- app.py +72 -0
- artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/data_level0.bin +3 -0
- artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/header.bin +3 -0
- artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/index_metadata.pickle +3 -0
- artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/length.bin +3 -0
- artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/link_lists.bin +3 -0
- artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/data_level0.bin +3 -0
- artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/header.bin +3 -0
- artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/length.bin +3 -0
- artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/link_lists.bin +0 -0
- artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/data_level0.bin +3 -0
- artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/header.bin +3 -0
- artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/length.bin +3 -0
- artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/link_lists.bin +0 -0
- artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/data_level0.bin +3 -0
- artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/header.bin +3 -0
- artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/length.bin +3 -0
- artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/link_lists.bin +0 -0
- artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/data_level0.bin +3 -0
- artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/header.bin +3 -0
- artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/index_metadata.pickle +3 -0
- artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/length.bin +3 -0
- artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/link_lists.bin +3 -0
- artifacts/chroma/chroma.sqlite3 +3 -0
- artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/data_level0.bin +3 -0
- artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/header.bin +3 -0
- artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/length.bin +3 -0
- artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/link_lists.bin +0 -0
- artifacts/optimized_advisor.json +157 -0
- chat.py +392 -0
- config.py +201 -0
- corpus.py +224 -0
- data/corpus_enriched.jsonl +0 -0
- dataset_generator.py +332 -0
- download_sources.py +195 -0
- enrich_corpus.py +174 -0
- enrichment.py +266 -0
- ingest_corpus.py +203 -0
- knowledge_base.py +416 -0
- metrics.py +435 -0
- optimize_gepa.py +200 -0
- parsers/__init__.py +0 -0
- parsers/gita_json.py +236 -0
- parsers/sastry_archive.py +249 -0
- requirements.txt +11 -0
- run_overnight.py +230 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
data/raw/
|
| 6 |
+
data/enrichment_cache.jsonl
|
| 7 |
+
data/corpus.jsonl
|
| 8 |
+
artifacts/gepa_logs/
|
| 9 |
+
artifacts/gepa_state.bin
|
| 10 |
+
artifacts/*.log
|
| 11 |
+
Gita-advisor/
|
| 12 |
+
sources_local/
|
| 13 |
+
sources/
|
| 14 |
+
.DS_Store
|
README.md
CHANGED
|
@@ -1,14 +1,125 @@
|
|
| 1 |
---
|
| 2 |
title: Gita Advisor
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
-
short_description: 'Bhagavad Gita Spiritual Advisor'
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Gita Advisor
|
| 3 |
+
emoji: 🕉️
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: orange
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.33.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Gītā Advisor
|
| 13 |
+
|
| 14 |
+
A spiritual advisor grounded in Advaita Vedānta as taught by Śaṅkarācārya,
|
| 15 |
+
optimized via DSPy + GEPA against a local LM Studio model. The advisor takes
|
| 16 |
+
real-life questions or vents and produces responses that are empathetic to
|
| 17 |
+
the felt experience, faithful to the non-dual lineage, and grounded in
|
| 18 |
+
exact-cited verses from the Gītā with Śaṅkara's bhāṣya, the principal
|
| 19 |
+
Upaniṣads, the Brahma Sūtras, and the prakaraṇa-granthas.
|
| 20 |
+
|
| 21 |
+
## What makes this design unusual
|
| 22 |
+
|
| 23 |
+
The first unusual choice is that the unit of retrieval is the verse, not
|
| 24 |
+
the chunk. Scripture is not arbitrary prose: each Gītā śloka, each
|
| 25 |
+
Upaniṣadic mantra, each sūtra is a sealed teaching unit with a stable
|
| 26 |
+
citation reference. We index by `verse_id` (e.g. `bhagavad_gita_02_47`,
|
| 27 |
+
which renders as `BG 2.47` in citations) so the advisor's references can be
|
| 28 |
+
exact-match-verified against the retrieved set.
|
| 29 |
+
|
| 30 |
+
The second unusual choice is that we use the local LLM, in a one-time
|
| 31 |
+
offline pass, to enrich each verse with structured fields a real person's
|
| 32 |
+
question can match against. A user does not write "I am experiencing
|
| 33 |
+
rāga toward kāmya-karma"; they write "I worked on this for three years and
|
| 34 |
+
it just failed." So we ask the local model, for each verse, to produce a
|
| 35 |
+
plain-English paraphrase, the Vedāntic themes engaged, the life situations
|
| 36 |
+
addressed, the emotions met, the practical teaching offered, and five
|
| 37 |
+
hypothetical first-person questions the verse would speak to. We then
|
| 38 |
+
embed three views of each verse — the literal translation, Śaṅkara's
|
| 39 |
+
bhāṣya, and the LLM-enriched advisor view — and at retrieval time query
|
| 40 |
+
all three and merge by verse ID.
|
| 41 |
+
|
| 42 |
+
The advisor view dominates retrieval because that is where the language
|
| 43 |
+
gap closes. The literal and bhāṣya views act as insurance against the
|
| 44 |
+
enrichment pipeline missing a topic.
|
| 45 |
+
|
| 46 |
+
## Where the texts come from
|
| 47 |
+
|
| 48 |
+
Every source is unambiguously open. The verse-indexed JSON at
|
| 49 |
+
`github.com/gita/gita`, released under the Unlicense, gives us Sanskrit
|
| 50 |
+
plus IAST transliteration plus word-by-word glosses for the Gītā. Alladi
|
| 51 |
+
Mahadeva Sastry's 1897 translation of Śaṅkara's Gītā Bhāṣya, in the public
|
| 52 |
+
domain and full-text on archive.org, gives us Śaṅkara's commentary
|
| 53 |
+
attached to each verse. The wisdomlib mirror of the *Sacred Books of the
|
| 54 |
+
East* is staged for the Upaniṣad-with-Śaṅkara texts and the Brahma Sūtra
|
| 55 |
+
bhāṣya; those parsers are registered but not yet implemented. See
|
| 56 |
+
`sources_registry.py` for the complete catalog and `CLAUDE.md` for the
|
| 57 |
+
licensing rationale.
|
| 58 |
+
|
| 59 |
+
We deliberately exclude the modern Advaita Ashrama translations (active
|
| 60 |
+
copyright), modern Ramaṇa and Nisargadatta editions, and Prabhupada's
|
| 61 |
+
commentary. If you have your own license-cleared copies, drop them in
|
| 62 |
+
`sources_local/` and the `plain_text` parser will fold them in.
|
| 63 |
+
|
| 64 |
+
## Pipeline of commands
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
pip install -r requirements.txt
|
| 68 |
+
|
| 69 |
+
# 1. Download the registered open sources to data/raw/<source_key>/
|
| 70 |
+
python download_sources.py
|
| 71 |
+
|
| 72 |
+
# 2. Parse + merge into data/corpus.jsonl (one verse per line)
|
| 73 |
+
python ingest_corpus.py
|
| 74 |
+
|
| 75 |
+
# 3. Enrich every verse via the local LLM. SLOW — overnight.
|
| 76 |
+
# Resumable; kill -9 is safe (append-mode cache).
|
| 77 |
+
python enrich_corpus.py --limit 50 # smoke-test the prompt first
|
| 78 |
+
python enrich_corpus.py # then the real run
|
| 79 |
+
|
| 80 |
+
# 4. Build the three-view Chroma index
|
| 81 |
+
python knowledge_base.py --build
|
| 82 |
+
|
| 83 |
+
# 5. Try a query against the index
|
| 84 |
+
python knowledge_base.py --query "I just got laid off and feel hollow"
|
| 85 |
+
|
| 86 |
+
# 6. Smoke-test the full advisor pipeline
|
| 87 |
+
python smoke_test.py "I just got laid off and feel hollow"
|
| 88 |
+
|
| 89 |
+
# 7. Generate the synthetic question dataset and run GEPA
|
| 90 |
+
python dataset_generator.py --n 500
|
| 91 |
+
python optimize_gepa.py --auto medium
|
| 92 |
+
|
| 93 |
+
# 8. Open the chat CLI
|
| 94 |
+
python chat.py
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## Project structure
|
| 98 |
+
|
| 99 |
+
The project is laid out so the data flow is left-to-right through the
|
| 100 |
+
pipeline: each script reads what the previous one wrote, with all
|
| 101 |
+
intermediate state on disk so any stage can be re-run independently. The
|
| 102 |
+
data model lives in `corpus.py` (`Verse` and `EnrichedVerse` dataclasses)
|
| 103 |
+
and is the contract between modules. The advisor itself is a `dspy.Module`
|
| 104 |
+
that GEPA optimizes; the metric in `metrics.py` is the specification GEPA
|
| 105 |
+
optimizes against, combining rule-based hygiene checks with an LLM-judge
|
| 106 |
+
rubric and producing structured feedback for GEPA's reflection step. See
|
| 107 |
+
`CLAUDE.md` for the full file map and the design commitments that should
|
| 108 |
+
not be silently broken.
|
| 109 |
+
|
| 110 |
+
## Configuration
|
| 111 |
+
|
| 112 |
+
`config.py` reads a small number of environment variables. The two that
|
| 113 |
+
matter most are `LM_STUDIO_BASE` (defaults to `http://localhost:1234/v1`)
|
| 114 |
+
and `LOCAL_MODEL` (defaults to `google/gemma-4-26b-a4b`, but copy whatever
|
| 115 |
+
LM Studio reports verbatim). The embedding model defaults to BGE-small on
|
| 116 |
+
Apple Silicon's MPS device; switch `EMBED_DEVICE` to `cpu` if you are not
|
| 117 |
+
on Apple Silicon.
|
| 118 |
+
|
| 119 |
+
## License
|
| 120 |
+
|
| 121 |
+
The code in this repository is yours to use. The texts in `data/raw/` come
|
| 122 |
+
with their own licenses, all unambiguously open and tracked in
|
| 123 |
+
`sources_registry.py`. Attributions for translators are preserved through
|
| 124 |
+
the pipeline and surfaced in citation footers.
|
| 125 |
+
# gita_advisor
|
advisor.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
advisor.py — the composed DSPy module.
|
| 3 |
+
|
| 4 |
+
This is what GEPA optimizes. It chains four predictors:
|
| 5 |
+
|
| 6 |
+
UnderstandQuery → PlanRetrieval → [retrieve] → SelectPassages → SynthesizeAdvice
|
| 7 |
+
|
| 8 |
+
Each predictor uses ChainOfThought so GEPA has a `reasoning` field to inspect
|
| 9 |
+
in its reflection step. The retriever itself is not optimized (it's vector
|
| 10 |
+
search), but the *queries given to it* are — that's where PlanRetrieval lives.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
import json
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from typing import Any
|
| 17 |
+
import dspy
|
| 18 |
+
|
| 19 |
+
from signatures import (
|
| 20 |
+
UnderstandQuery,
|
| 21 |
+
PlanRetrieval,
|
| 22 |
+
SelectPassages,
|
| 23 |
+
SynthesizeAdvice,
|
| 24 |
+
)
|
| 25 |
+
from knowledge_base import AdvaitaRetriever, format_passages_for_llm
|
| 26 |
+
import config
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class AdviceTrace:
    """Complete record of one advisor pipeline run.

    Carries every intermediate product — the understanding of the user's
    question, the planned retrieval queries, the raw hits, the selection,
    and the final response — so the metric can reason over the whole
    trace rather than only the final answer.
    """

    user_question: str
    felt_emotion: str
    surface_concern: str
    deeper_concern: str
    vedantic_themes: list[str]
    queries: list[str]
    # Raw retrieval hits, already serialized to plain dicts with metadata.
    retrieved_passages: list[dict]
    selected_indices: list[int]
    selection_rationale: str
    response: str
    sources_cited: list[str]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class GitaAdvisor(dspy.Module):
    """The composed advisor pipeline that GEPA optimizes.

    Chains four ChainOfThought predictors around a (non-optimized)
    vector retriever:

        UnderstandQuery → PlanRetrieval → [retrieve] → SelectPassages → SynthesizeAdvice
    """

    def __init__(self, retriever: AdvaitaRetriever | None = None):
        super().__init__()
        self.understand = dspy.ChainOfThought(UnderstandQuery)
        self.plan = dspy.ChainOfThought(PlanRetrieval)
        self.select = dspy.ChainOfThought(SelectPassages)
        self.synthesize = dspy.ChainOfThought(SynthesizeAdvice)
        # Held as a plain attribute rather than a Predictor so DSPy
        # introspection ignores it during optimization.
        self._retriever = retriever or AdvaitaRetriever()

    def forward(
        self,
        user_question: str,
        history: dspy.History | None = None,
        _stage_cb=None,
    ) -> dspy.Prediction:
        if history is None:
            history = dspy.History(messages=[])
        # Normalize the optional progress callback to a no-op so each
        # stage can announce itself unconditionally.
        notify = _stage_cb if _stage_cb else (lambda _msg: None)

        # Stage 1 — understand. History lets it interpret follow-ups correctly.
        notify("understanding your question...")
        understanding = self.understand(
            history=history,
            user_question=user_question,
        )

        # Stage 2 — plan the retrieval queries from the understood concerns.
        notify("planning search queries...")
        plan_out = self.plan(
            surface_concern=understanding.surface_concern,
            deeper_concern=understanding.deeper_concern,
            vedantic_themes=understanding.vedantic_themes,
        )
        if plan_out.queries:
            queries = plan_out.queries[: config.N_RETRIEVAL_QUERIES]
        else:
            queries = [understanding.deeper_concern]

        # Stage 3 — retrieve.
        notify("searching scriptures...")
        hits = self._retriever.search_many(queries, k_per=config.TOP_K_RETRIEVE)
        # Bound the candidate set so the selector prompt stays focused.
        candidates = hits[: max(8, config.TOP_K_RETRIEVE)]
        candidates_text = format_passages_for_llm(candidates)
        # Serialize hits to dicts up front: the dspy.Prediction returned
        # below must be picklable for GEPA's bookkeeping, and the metric
        # then reads these dicts without importing knowledge_base.
        candidates_as_dicts = [hit.to_dict() for hit in candidates]

        # Stage 4 — select. Telling the selector what was already cited in
        # prior turns nudges it toward fresh sources.
        notify("selecting passages...")
        previously_cited = []
        for msg in history.messages:
            previously_cited.extend(msg.get("sources_cited", []))
        selection = self.select(
            deeper_concern=understanding.deeper_concern,
            candidate_passages=candidates_text,
            previously_cited=previously_cited,
        )
        # Defensive: keep only in-range 1-based integer indices.
        valid_idx = [
            idx
            for idx in (selection.selected_indices or [])
            if isinstance(idx, int) and 1 <= idx <= len(candidates)
        ]
        if not valid_idx:
            # Fallback: take the top-3 candidates so synthesis isn't starved.
            valid_idx = list(range(1, min(4, len(candidates) + 1)))

        selected = [candidates[idx - 1] for idx in valid_idx]
        selected_text = format_passages_for_llm(selected)

        # Stage 5 — synthesize. History lets it build across turns and
        # avoid repeating itself.
        notify("composing response...")
        synthesis = self.synthesize(
            history=history,
            user_question=user_question,
            felt_emotion=understanding.felt_emotion,
            deeper_concern=understanding.deeper_concern,
            selected_passages=selected_text,
        )

        return dspy.Prediction(
            response=synthesis.response,
            sources_cited=synthesis.sources_cited or [],
            synthesis_reasoning=getattr(synthesis, "reasoning", ""),
            # Intermediate state carried along for the metric / debugging:
            felt_emotion=understanding.felt_emotion,
            surface_concern=understanding.surface_concern,
            deeper_concern=understanding.deeper_concern,
            vedantic_themes=understanding.vedantic_themes,
            queries=queries,
            retrieved_passages=candidates_as_dicts,
            selected_indices=valid_idx,
            selection_rationale=selection.selection_rationale,
        )
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def load_optimized(path: str | None = None) -> GitaAdvisor:
    """Return an advisor, loading GEPA-optimized prompts when available.

    Falls back to the base (unoptimized) prompts when no saved program
    exists at *path* (default: ``config.OPTIMIZED_PROGRAM_PATH``).
    """
    advisor = GitaAdvisor()
    target = path or str(config.OPTIMIZED_PROGRAM_PATH)
    try:
        advisor.load(target)
    except FileNotFoundError:
        print(f"No optimized program at {target} — using base prompts.")
    else:
        print(f"Loaded optimized advisor from {target}")
    return advisor
|
app.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — Gradio web interface for the Gītā Advisor.
|
| 3 |
+
|
| 4 |
+
Wraps the same advisor pipeline as chat.py but exposes it as a Gradio
|
| 5 |
+
ChatInterface suitable for Hugging Face Spaces (free CPU tier).
|
| 6 |
+
|
| 7 |
+
Deploy:
|
| 8 |
+
- Set GEMINI_API_KEY as a Space Secret (Space Settings → Secrets)
|
| 9 |
+
- Push this file + all project files + artifacts/chroma/ + data/corpus_enriched.jsonl
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import gradio as gr
|
| 13 |
+
import dspy
|
| 14 |
+
|
| 15 |
+
import config
|
| 16 |
+
from advisor import load_optimized
|
| 17 |
+
from knowledge_base import AdvaitaRetriever
|
| 18 |
+
|
| 19 |
+
# ── startup — runs once when the Space boots ───────────────────────────────────
# Configure the DSPy LM once at import time, then load the advisor program
# (GEPA-optimized prompts if present, base prompts otherwise).
config.configure_dspy()
_advisor = load_optimized()

# Pre-warm retriever so the first user request isn't slow
_retriever = AdvaitaRetriever()
_retriever._ensure()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── chat handler ───────────────────────────────────────────────────────────────
def _pair_history(history: list) -> list[dict]:
    """Fold Gradio role-tagged messages into user/assistant turn dicts.

    Gradio (type="messages") passes history as a list of
    {"role": ..., "content": ...} dicts. On a mismatched pair (e.g. a
    system message, or two consecutive user messages after an error) we
    advance by ONE message to re-sync, instead of blindly skipping two —
    the previous unconditional `i += 2` let a single stray message
    desynchronize and silently drop every later turn.
    """
    msgs = []
    i = 0
    while i + 1 < len(history):
        user_msg = history[i]
        bot_msg = history[i + 1]
        if user_msg.get("role") == "user" and bot_msg.get("role") == "assistant":
            msgs.append({
                "user_question": user_msg["content"],
                "response": bot_msg["content"],
                "sources_cited": [],
            })
            i += 2
        else:
            # Mismatch: step forward one message and try to pair again.
            i += 1
    return msgs


def chat(message: str, history: list) -> str:
    """Handle one Gradio chat turn.

    Converts the Gradio history into a dspy.History, runs the advisor
    pipeline, and appends a sources footer when citations were produced.
    """
    dspy_history = dspy.History(messages=_pair_history(history))

    pred = _advisor(user_question=message, history=dspy_history)

    reply = pred.response
    if pred.sources_cited:
        reply += "\n\n---\n**Sources:** " + " · ".join(pred.sources_cited)
    return reply
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ── Gradio app ─────────────────────────────────────────────────────────────────
_DESCRIPTION = (
    "A spiritual advisor grounded in Advaita Vedānta as taught by Śaṅkarācārya. "
    "Speak from where you actually are. The advisor cites exact verses from the "
    "Gītā with Śaṅkara's commentary."
)

_EXAMPLES = [
    "I just got laid off and feel like nothing makes sense.",
    "I'm terrified of dying. Is that irrational?",
    "I keep hurting the people I love without meaning to.",
    "I've been meditating for years but still feel empty. What am I missing?",
]

demo = gr.ChatInterface(
    fn=chat,
    title="Gītā Advisor",
    description=_DESCRIPTION,
    type="messages",
    examples=_EXAMPLES,
)

if __name__ == "__main__":
    demo.launch()
|
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e83b9e9a3e97ce6dc11715a48ebb2e9bc6f5b24f91c82d3fbc8d6b46085afb44
|
| 3 |
+
size 1174876
|
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90261dc9d8649c19b95dcba5fa4ef3bb2f5a64b26da0aefc9e58ba7a3f2fcbe0
|
| 3 |
+
size 100
|
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b626b118ac6ec87f4d2ecdb13193aa4eaadc0514fe351c284c77edcc0b491fea
|
| 3 |
+
size 40786
|
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b57dc524397d289a5ec9f2dfd69bf88c097d397d354c600fff6cbd958cadda89
|
| 3 |
+
size 2804
|
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bdb3c70e56609aecebbde493e89cc3f2eff23d6375c762bca900dd5edc46738b
|
| 3 |
+
size 6204
|
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66cd81ca458bd620e24b378f6ce96e6d77ba4c8789d4ece914775c339be10e26
|
| 3 |
+
size 167600
|
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
|
| 3 |
+
size 100
|
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
|
| 3 |
+
size 400
|
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/link_lists.bin
ADDED
|
File without changes
|
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:201b998f2a013f78cea5960b05174ceffedbd046c4dfc10a8d2492ff8a1398a7
|
| 3 |
+
size 167600
|
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
|
| 3 |
+
size 100
|
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
|
| 3 |
+
size 400
|
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/link_lists.bin
ADDED
|
File without changes
|
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:121a03535f783813fa1ec964dd84095d2da0d83c0e5dad98955bdf0e252b33d8
|
| 3 |
+
size 167600
|
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
|
| 3 |
+
size 100
|
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
|
| 3 |
+
size 400
|
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/link_lists.bin
ADDED
|
File without changes
|
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68a485ecde933131e39b14c078dc82c3e0e27454cdb837d76f4be92f00bef026
|
| 3 |
+
size 1136328
|
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ce56f617adc30c61cd845ab6c84720756b445d622b4bc3160ce70dc5ce91e7e
|
| 3 |
+
size 100
|
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bde285e65f8c89d9d82ad8cd02cd29c9bfcbe7009e37a2ef50f2fc58dd630af1
|
| 3 |
+
size 39452
|
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:360e5acdc3007feb7ef856ed67807c4a8995beb723091091828f729ccba9dbcb
|
| 3 |
+
size 2712
|
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fef3ffac7707ea201b174e47e15c161162d3f38b6fdbeccc43aab1af3c35e617
|
| 3 |
+
size 6044
|
artifacts/chroma/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acd63ba75ad75f6234302822709e44e8e560d80e120f31519c20f73d41821d22
|
| 3 |
+
size 20459520
|
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88b9416504c97719067232f4ef2a042eb4015b033f8c4a3137cd90e9f2faa468
|
| 3 |
+
size 167600
|
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
|
| 3 |
+
size 100
|
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
|
| 3 |
+
size 400
|
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/link_lists.bin
ADDED
|
File without changes
|
artifacts/optimized_advisor.json
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"understand.predict": {
|
| 3 |
+
"traces": [],
|
| 4 |
+
"train": [],
|
| 5 |
+
"demos": [],
|
| 6 |
+
"signature": {
|
| 7 |
+
"instructions": "Read the user's life situation carefully, taking into account the full\nconversation so far. If there is prior exchange, use it to understand\nfollow-up messages, references like 'what you said earlier', or shifts in\nthe user's emotional state across turns. Identify the felt emotion, the\nunderlying spiritual concern (not just the surface complaint), and the\nVedāntic themes that are most relevant — drawing only from concepts native\nto Advaita Vedānta.",
|
| 8 |
+
"fields": [
|
| 9 |
+
{
|
| 10 |
+
"prefix": "History:",
|
| 11 |
+
"description": "Prior turns as a list of message dicts with 'user_question' and 'response' keys. Empty history means this is the first message."
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"prefix": "User Question:",
|
| 15 |
+
"description": "The user's current message; may be a question, a vent, a follow-up, or a description of a situation."
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"prefix": "Reasoning:",
|
| 19 |
+
"description": "${reasoning}"
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"prefix": "Felt Emotion:",
|
| 23 |
+
"description": "The dominant emotion the user is experiencing, named precisely (e.g. 'anticipatory grief', not just 'sad')."
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"prefix": "Surface Concern:",
|
| 27 |
+
"description": "What the user is literally asking about, in one sentence."
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"prefix": "Deeper Concern:",
|
| 31 |
+
"description": "The underlying existential/spiritual concern — usually about identity, attachment, fear, dharma, or meaning — that the surface concern is a symptom of. One sentence."
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"prefix": "Vedantic Themes:",
|
| 35 |
+
"description": "2-4 Advaita-Vedānta concepts most relevant to this situation. Use Sanskrit terms with brief gloss, e.g. 'adhyāsa (superimposition of self onto roles)', 'vairāgya (dispassion)', 'sākṣī (witness consciousness)'."
|
| 36 |
+
}
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
"lm": null
|
| 40 |
+
},
|
| 41 |
+
"plan.predict": {
|
| 42 |
+
"traces": [],
|
| 43 |
+
"train": [],
|
| 44 |
+
"demos": [],
|
| 45 |
+
"signature": {
|
| 46 |
+
"instructions": "Given the user's situation and identified themes, generate diverse search\nqueries to find relevant passages from the Advaita corpus (Bhagavad Gītā with\nŚaṅkara bhāṣya, Upaniṣads, Brahma Sūtras, prakaraṇa texts). Each query should\ntarget a different angle — one query about the philosophical principle,\none about a parallel situation in the texts, one about the practical\nteaching offered by the lineage.",
|
| 47 |
+
"fields": [
|
| 48 |
+
{
|
| 49 |
+
"prefix": "Surface Concern:",
|
| 50 |
+
"description": "${surface_concern}"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"prefix": "Deeper Concern:",
|
| 54 |
+
"description": "${deeper_concern}"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"prefix": "Vedantic Themes:",
|
| 58 |
+
"description": "${vedantic_themes}"
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"prefix": "Reasoning:",
|
| 62 |
+
"description": "${reasoning}"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"prefix": "Queries:",
|
| 66 |
+
"description": "3 distinct search queries (each 5-15 words). Vary in angle: principle, parallel, practice."
|
| 67 |
+
}
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
"lm": null
|
| 71 |
+
},
|
| 72 |
+
"select.predict": {
|
| 73 |
+
"traces": [],
|
| 74 |
+
"train": [],
|
| 75 |
+
"demos": [],
|
| 76 |
+
"signature": {
|
| 77 |
+
"instructions": "From the retrieved candidate passages, select the ones that genuinely\nspeak to this user's situation. Prefer primary sources (Gītā verses,\nUpaniṣadic mantras, Śaṅkara's bhāṣya) over secondary or modern commentary\nwhen both are available. Reject passages that are merely topically adjacent\nbut don't address the actual spiritual concern. Avoid re-selecting passages\nwhose source was already cited in a prior turn — prefer fresh ground.",
|
| 78 |
+
"fields": [
|
| 79 |
+
{
|
| 80 |
+
"prefix": "Deeper Concern:",
|
| 81 |
+
"description": "${deeper_concern}"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"prefix": "Candidate Passages:",
|
| 85 |
+
"description": "Numbered candidate passages with source attribution."
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"prefix": "Previously Cited:",
|
| 89 |
+
"description": "Source references already cited in earlier turns of this conversation (e.g. ['BG 2.47', 'BG 18.66']). Prefer passages not on this list so the conversation covers new ground. Empty list on the first turn."
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"prefix": "Reasoning:",
|
| 93 |
+
"description": "${reasoning}"
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"prefix": "Selected Indices:",
|
| 97 |
+
"description": "Indices (1-based) of the 2-4 most relevant passages."
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"prefix": "Selection Rationale:",
|
| 101 |
+
"description": "One sentence per selection explaining why that passage speaks to this concern."
|
| 102 |
+
}
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
"lm": null
|
| 106 |
+
},
|
| 107 |
+
"synthesize.predict": {
|
| 108 |
+
"traces": [],
|
| 109 |
+
"train": [],
|
| 110 |
+
"demos": [],
|
| 111 |
+
"signature": {
|
| 112 |
+
"instructions": "Compose a response that is grounded in Advaita Vedānta as taught by\nŚaṅkarācārya, empathetic to the user's felt experience, and practically\nuseful for their situation. Honor the two-truths distinction: meet the user\nin vyāvahārika (transactional reality) without ever denying the\npāramārthika (absolute) view. Cite specific verses/passages by reference,\nintegrate them into prose rather than dumping quotes, and keep wit gentle —\nlight around the cosmic predicament, never light about the user's pain.\n\nIf history has prior turns: do not repeat citations or teachings already\ngiven; build on or deepen what was said; acknowledge any shift the user has\nexpressed since the last turn. If the user is following up, open by briefly\nacknowledging the continuity before moving forward.",
|
| 113 |
+
"fields": [
|
| 114 |
+
{
|
| 115 |
+
"prefix": "History:",
|
| 116 |
+
"description": "Prior turns as a list of message dicts with 'user_question' and 'response' keys. Use this to avoid repetition and to build across turns."
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"prefix": "User Question:",
|
| 120 |
+
"description": "${user_question}"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"prefix": "Felt Emotion:",
|
| 124 |
+
"description": "${felt_emotion}"
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"prefix": "Deeper Concern:",
|
| 128 |
+
"description": "${deeper_concern}"
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"prefix": "Selected Passages:",
|
| 132 |
+
"description": "The selected passages with full source attribution."
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"prefix": "Reasoning:",
|
| 136 |
+
"description": "${reasoning}"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"prefix": "Response:",
|
| 140 |
+
"description": "The advisor's reply to the user. 250-450 words. Open by acknowledging the felt experience. Move into the Vedāntic perspective. Cite at least one primary source (Gītā chapter:verse, Upaniṣad name + section, etc.). Close with a concrete practice or shift in perspective they can try this week. Address the user as 'you' throughout. Avoid Western therapy clichés."
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"prefix": "Sources Cited:",
|
| 144 |
+
"description": "Source references actually cited in the response, e.g. 'BG 2.47', 'Bṛhadāraṇyaka Up. 4.4.5', 'Vivekacūḍāmaṇi 11'."
|
| 145 |
+
}
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
"lm": null
|
| 149 |
+
},
|
| 150 |
+
"metadata": {
|
| 151 |
+
"dependency_versions": {
|
| 152 |
+
"python": "3.11",
|
| 153 |
+
"dspy": "3.2.0",
|
| 154 |
+
"cloudpickle": "3.1"
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
}
|
chat.py
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
chat.py — interactive conversation with the advisor.
|
| 3 |
+
|
| 4 |
+
By default it loads the GEPA-optimized program from artifacts/. If that file
|
| 5 |
+
doesn't exist yet, it falls back to the un-optimized base prompts so you can
|
| 6 |
+
sanity-check the pipeline before running optimization.
|
| 7 |
+
|
| 8 |
+
Flags:
|
| 9 |
+
--debug Show intermediate pipeline state (felt emotion, queries, etc.)
|
| 10 |
+
--thinking Show the full synthesis reasoning trace (default: first 6 lines)
|
| 11 |
+
--no-thinking Hide the reasoning trace entirely
|
| 12 |
+
|
| 13 |
+
After each response, source references are printed with numbers.
|
| 14 |
+
show <N|ref> Display the verse text, translation, and Śaṅkara's bhāṣya.
|
| 15 |
+
explain <N|ref> Show the verse then stream a contextual explanation of how
|
| 16 |
+
it applies to the current conversation.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
import argparse
|
| 21 |
+
import time
|
| 22 |
+
import threading
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
import dspy
|
| 26 |
+
from rich.console import Console
|
| 27 |
+
from rich.live import Live
|
| 28 |
+
from rich.markdown import Markdown
|
| 29 |
+
from rich.panel import Panel
|
| 30 |
+
from rich.rule import Rule
|
| 31 |
+
from rich.text import Text
|
| 32 |
+
|
| 33 |
+
import config
|
| 34 |
+
from advisor import load_optimized
|
| 35 |
+
from corpus import EnrichedVerse, Verse, read_jsonl_enriched, read_jsonl_verses
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── speed constants ────────────────────────────────────────────────────────────
# Typing-effect pacing for the simulated streaming UI (the LM call itself is
# synchronous; these only control how fast finished text is revealed).
_THINKING_CPS = 800  # chars/sec for reasoning stream (secondary content, fast)
_RESPONSE_CPS = 300  # chars/sec for advisor response (primary content)
_THINKING_PREVIEW = 6  # lines shown in collapsed thinking mode
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ── verse corpus lookup ────────────────────────────────────────────────────────
|
| 45 |
+
def _load_verse_lookup() -> dict[str, Verse]:
    """Build a case-insensitive verse_ref → Verse dict from the corpus.

    Prefers the enriched corpus when present, falls back to the plain one,
    and returns an empty dict when neither file exists.
    """
    candidates = (
        (config.DATA_DIR / "corpus_enriched.jsonl", read_jsonl_enriched),
        (config.DATA_DIR / "corpus.jsonl", read_jsonl_verses),
    )
    for path, loader in candidates:
        if path.exists():
            return {v.verse_ref.lower().strip(): v for v in loader(path)}
    return {}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _find_verse(lookup: dict, ref: str) -> Optional[Verse]:
|
| 64 |
+
return lookup.get(ref.lower().strip())
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _resolve_ref(arg: str, sources_cited: list[str]) -> str:
|
| 68 |
+
"""Turn '1' → sources_cited[0], or return arg unchanged for direct ref lookup."""
|
| 69 |
+
try:
|
| 70 |
+
n = int(arg.strip())
|
| 71 |
+
if 1 <= n <= len(sources_cited):
|
| 72 |
+
return sources_cited[n - 1]
|
| 73 |
+
except ValueError:
|
| 74 |
+
pass
|
| 75 |
+
return arg.strip()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ── DSPy signature for contextual explanation ─────────────────────────────────
|
| 79 |
+
class _ExplainInContext(dspy.Signature):
    # NOTE: in DSPy, a Signature's docstring becomes the LM's task
    # instruction — the text below is runtime behavior, not documentation.
    """You are the Gītā Advisor continuing a conversation. The user has asked
    you to unpack a specific verse or passage you cited. Explain what it means
    and why it speaks precisely to their situation — go deeper than the initial
    response did. Reference the user's words. Close with one concrete way to
    hold or work with this text this week."""

    # Inputs rendered into the prompt by dspy.ChainOfThought: the cited
    # reference, its assembled text/commentary, and the turn it was cited in.
    verse_ref: str = dspy.InputField()
    verse_content: str = dspy.InputField(
        desc="Translation, original text (if available), and Śaṅkara's commentary."
    )
    conversation_context: str = dspy.InputField(
        desc="The user's question and the advisor's response where this verse was cited."
    )

    # Output: the explanation that gets streamed back to the terminal.
    explanation: str = dspy.OutputField(
        desc="150-250 words. Grounded in Advaita. Do not merely restate the translation. "
        "End with a practical suggestion for this week."
    )
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ── streaming helpers ─────────────────────────────────────────────────────────
|
| 101 |
+
def _stream_chars(console: Console, text: str, cps: int):
|
| 102 |
+
"""Write text to the terminal character by character."""
|
| 103 |
+
if not text:
|
| 104 |
+
return
|
| 105 |
+
delay = 1.0 / cps
|
| 106 |
+
for ch in text:
|
| 107 |
+
console.file.write(ch)
|
| 108 |
+
console.file.flush()
|
| 109 |
+
time.sleep(delay)
|
| 110 |
+
console.file.write("\n")
|
| 111 |
+
console.file.flush()
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _stream_response(console: Console, text: str, cps: int = _RESPONSE_CPS):
    """Stream the advisor response into a growing Markdown Panel via Rich Live.

    Re-renders the panel after every character so the reply appears to be
    typed live; no-op on empty text.
    """
    if not text:
        return
    pause = 1.0 / cps
    shown: list[str] = []
    with Live(console=console, refresh_per_second=min(cps, 30)) as live:
        for char in text:
            shown.append(char)
            panel = Panel(
                Markdown("".join(shown)),
                title="[bold]advisor[/bold]",
                border_style="yellow",
                padding=(1, 2),
            )
            live.update(panel)
            time.sleep(pause)
|
| 131 |
+
|
| 132 |
+
def _show_thinking(console: Console, reasoning: str, full: bool):
    """Stream the synthesis reasoning under a dim rule.

    Unless *full* is set, the trace is collapsed to the first
    _THINKING_PREVIEW lines with a hint about how many were hidden.
    No-op when there is no reasoning text.
    """
    if not reasoning:
        return

    lines = reasoning.strip().splitlines()
    hidden = 0 if full else max(0, len(lines) - _THINKING_PREVIEW)
    visible = lines if hidden == 0 else lines[:_THINKING_PREVIEW]

    console.print(Rule("[dim]thinking[/dim]", style="dim blue"))
    # Write dim italic via ANSI since we're streaming to file directly
    # (Rich markup can't be applied char-by-char; dim is cosmetic here)
    _stream_chars(console, "\n".join(visible), cps=_THINKING_CPS)

    if hidden:
        console.print(f"[dim] ↳ {hidden} more lines — use --thinking to expand[/dim]")
    console.print()
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# ── verse display helpers ─────────────────────────────────────────────────────
|
| 156 |
+
def _show_verse(console: Console, verse: Verse):
    """Render a verse with its translation, original text, and commentary.

    Builds one rich Text body in a fixed order — Sanskrit, transliteration,
    translation, bhāṣya preview, then enrichment extras — and prints it in a
    cyan Panel titled with the verse reference.
    """
    body = Text()

    # Original-language text first, when the corpus provides it.
    if verse.sanskrit:
        body.append(verse.sanskrit + "\n", style="bold")
    if verse.transliteration:
        body.append(verse.transliteration + "\n", style="italic dim")

    if verse.translation:
        # Credit the translator when known.
        label = f"Translation ({verse.translator})" if verse.translator else "Translation"
        body.append(f"\n{label}:\n", style="dim")
        body.append(verse.translation + "\n")

    if verse.bhashya:
        translator_note = f" ({verse.bhashya_translator})" if verse.bhashya_translator else ""
        body.append(f"\nŚaṅkara's Bhāṣya{translator_note}:\n", style="dim")
        # Commentary can run long — cap the inline preview at 800 chars.
        preview = verse.bhashya[:800] + ("…" if len(verse.bhashya) > 800 else "")
        body.append(preview + "\n", style="dim")

    # Enrichment fields are only present on EnrichedVerse instances.
    ev = verse if isinstance(verse, EnrichedVerse) else None
    if ev and ev.paraphrase:
        body.append("\nTeaching: ", style="bold dim")
        body.append(ev.paraphrase + "\n", style="dim")
    if ev and ev.themes:
        body.append("Themes: ", style="bold dim")
        body.append(", ".join(ev.themes) + "\n", style="dim")
    if ev and ev.practical_teaching:
        body.append("Practical shift: ", style="bold dim")
        body.append(ev.practical_teaching + "\n", style="dim")

    # Panel subtitle: work name, plus the section when one is available.
    section = verse.section_display or verse.section
    subtitle = verse.work_display + (f" — {section}" if section else "")
    console.print(Panel(
        body,
        title=f"[bold]{verse.verse_ref}[/bold]",
        subtitle=f"[dim]{subtitle}[/dim]",
        border_style="cyan",
        padding=(1, 2),
    ))
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _explain_in_context(
    console: Console,
    verse: Verse,
    history_messages: list[dict],
    cps: int = _RESPONSE_CPS,
):
    """Call the LM to explain the verse in context of the last conversation turn.

    Assembles the most recent user/advisor exchange plus the verse's text and
    commentary, runs the _ExplainInContext signature, and streams the result.
    Errors from the LM call are reported, not raised.
    """
    if history_messages:
        last = history_messages[-1]
        context = (
            f"User: {last.get('user_question', '')}\n\n"
            f"Advisor: {last.get('response', '')}"
        )
    else:
        context = "No prior conversation."

    # Gather whatever text the corpus has for this verse, skipping empties.
    sections: list[str] = []
    if verse.translation:
        sections.append(f"Translation: {verse.translation}")
    if verse.sanskrit:
        sections.append(f"Sanskrit: {verse.sanskrit}")
    if verse.bhashya:
        sections.append(f"Śaṅkara's commentary: {verse.bhashya[:600]}")
    ev = verse if isinstance(verse, EnrichedVerse) else None
    if ev and ev.paraphrase:
        sections.append(f"Teaching: {ev.paraphrase}")

    explainer = dspy.ChainOfThought(_ExplainInContext)
    with console.status("[dim]expanding...[/dim]", spinner="dots"):
        try:
            explanation = explainer(
                verse_ref=verse.verse_ref,
                verse_content="\n\n".join(sections),
                conversation_context=context,
            ).explanation
        except Exception as exc:
            console.print(f"[red]Could not generate explanation: {exc}[/red]")
            return

    console.print()
    _stream_response(console, explanation, cps=cps)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
# ── main loop ─────────────────────────────────────────────────────────────────
|
| 244 |
+
def main():
    """Run the interactive chat REPL: parse flags, load the advisor, loop."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--program", default=str(config.OPTIMIZED_PROGRAM_PATH))
    ap.add_argument("--debug", action="store_true",
                    help="Show intermediate pipeline state for each turn.")
    ap.add_argument("--thinking", action="store_true",
                    help="Show full synthesis reasoning trace (default: first 6 lines).")
    ap.add_argument("--no-thinking", action="store_true", dest="no_thinking",
                    help="Hide the reasoning trace entirely.")
    args = ap.parse_args()

    config.configure_dspy()
    advisor = load_optimized(args.program)
    console = Console()

    with console.status("[dim]loading corpus...[/dim]", spinner="dots"):
        verse_lookup = _load_verse_lookup()

    console.print(Panel.fit(
        "[bold]Gītā Advisor[/bold]\n\n"
        "Speak from where you actually are.\n"
        "After a response: [italic]show <N>[/italic] to read a cited verse · "
        "[italic]explain <N>[/italic] for contextual breakdown.\n"
        "Type [italic]exit[/italic] or Ctrl-D to leave.",
        border_style="cyan",
    ))

    # Conversation state: history feeds prior turns back into the pipeline;
    # last_pred keeps the latest prediction around for show/explain commands.
    history = dspy.History(messages=[])
    last_pred = None

    while True:
        try:
            console.print()
            console.print("[bold cyan]you:[/bold cyan] ", end="")
            line = input().strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C exits cleanly with a farewell.
            console.print("\n[dim]नमस्ते।[/dim]")
            return

        if not line:
            continue
        if line.lower() in {"exit", "quit", ":q"}:
            console.print("[dim]नमस्ते।[/dim]")
            return

        # ── source exploration commands ───────────────────────────────────────
        cmd_lower = line.lower()
        if cmd_lower.startswith(("show ", "explain ")):
            if last_pred is None:
                console.print("[dim]No sources yet — ask a question first.[/dim]")
                continue
            cmd, _, arg = line.partition(" ")
            # Accept either a footer number ('show 2') or a direct ref ('show BG 2.47').
            ref = _resolve_ref(arg, last_pred.sources_cited)
            verse = _find_verse(verse_lookup, ref)
            if verse is None:
                console.print(f"[dim]'{ref}' not found in corpus.[/dim]")
                if last_pred.sources_cited:
                    hint = " ".join(
                        f"[{i+1}] {r}" for i, r in enumerate(last_pred.sources_cited)
                    )
                    console.print(f"[dim]Available: {hint}[/dim]")
                continue
            _show_verse(console, verse)
            if cmd.lower() == "explain":
                _explain_in_context(console, verse, history.messages)
            continue

        # ── normal question — run pipeline in background with live stage progress ──
        pred = None
        error = None
        stage = ["initializing..."]  # one-slot list so the stage callback can mutate it
        done = threading.Event()

        def run_advisor():
            # Runs in a worker thread; results land in the enclosing pred/error.
            nonlocal pred, error
            try:
                pred = advisor(
                    user_question=line,
                    history=history,
                    _stage_cb=lambda msg: stage.__setitem__(0, msg),
                )
            except Exception as exc:
                error = exc
            finally:
                done.set()  # always release the progress loop below

        threading.Thread(target=run_advisor, daemon=True).start()

        # Poll the stage label while the pipeline runs, then blank the line.
        with Live(console=console, refresh_per_second=8) as live:
            while not done.wait(timeout=0.12):
                live.update(Text(f" ◌ {stage[0]}", style="dim"))
            live.update(Text(""))

        if error:
            console.print(f"[red]Error: {error}[/red]")
            continue

        last_pred = pred
        history.messages.append({
            "user_question": line,
            "response": pred.response,
            "sources_cited": pred.sources_cited,
        })

        # debug trace
        if args.debug:
            console.print(Rule("[dim]debug[/dim]", style="dim"))
            console.print(f"[dim]felt:[/dim] {pred.felt_emotion}")
            console.print(f"[dim]surface:[/dim] {pred.surface_concern}")
            console.print(f"[dim]deeper:[/dim] {pred.deeper_concern}")
            console.print(f"[dim]themes:[/dim] {', '.join(pred.vedantic_themes)}")
            console.print(f"[dim]queries:[/dim] {pred.queries}")
            console.print(f"[dim]selected:[/dim] {pred.selected_indices}")
            # Echo the selected passages' metadata (indices are 1-based).
            for i in pred.selected_indices:
                if 1 <= i <= len(pred.retrieved_passages):
                    h = pred.retrieved_passages[i - 1]
                    m = h["meta"]
                    console.print(
                        f" [dim]→ [{m['tier']}] {m['work']}"
                        f"{' — ' + m['section'] if m.get('section') else ''}"
                        f" (score {h['score']:.3f})[/dim]"
                    )
            console.print(Rule(style="dim"))

        # thinking section
        if not args.no_thinking:
            _show_thinking(
                console,
                getattr(pred, "synthesis_reasoning", ""),
                full=args.thinking,
            )

        # stream the response
        console.print()
        _stream_response(console, pred.response)

        # source footer with hints
        if pred.sources_cited:
            numbered = " ".join(
                f"[{i+1}] {r}" for i, r in enumerate(pred.sources_cited)
            )
            console.print(f"\n[dim]sources: {numbered}[/dim]")
            console.print(
                "[dim] → show <N> to read the verse · explain <N> for contextual breakdown[/dim]"
            )
| 390 |
+
|
| 391 |
+
# Script entry point — run the interactive chat loop.
if __name__ == "__main__":
    main()
|
config.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
config.py — central configuration for the Gītā Advisor.
|
| 3 |
+
|
| 4 |
+
Three LMs are configured:
|
| 5 |
+
|
| 6 |
+
- TASK_LM: the local model running in LM Studio. Used at inference
|
| 7 |
+
time (understanding, retrieval planning, advice synthesis).
|
| 8 |
+
|
| 9 |
+
- ENRICH_LM: gpt-4o-mini (OpenAI API) for the offline enrichment pass.
             The local 26B model truncates structured output at 1500
             tokens and drops fields. gpt-4o-mini handles all six fields
             cleanly in one call and costs under $1 for the full 701-
             verse corpus (one-time). Set OPENAI_API_KEY in env.
|
| 14 |
+
|
| 15 |
+
- REFLECTION_LM: gpt-4o (OpenAI) for GEPA's reflection step.
|
| 16 |
+
GEPA asks the reflection LM to read metric feedback and
|
| 17 |
+
propose rewritten prompts — this scales strongly with
|
| 18 |
+
model quality. gpt-4o reasons well enough to handle
|
| 19 |
+
nuanced Advaita feedback without breaking the budget.
|
| 20 |
+
Same OPENAI_API_KEY as enrichment.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
import os
|
| 25 |
+
import re
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
import dspy
|
| 28 |
+
import dspy.adapters.chat_adapter as _chat_adapter_module
|
| 29 |
+
from dotenv import load_dotenv
|
| 30 |
+
|
| 31 |
+
# Gemma (and some other local models) output `[[ ## field ]]` without the closing `##`
|
| 32 |
+
# that DSPy's ChatAdapter expects (`[[ ## field ## ]]`). Patch the module-level regex
|
| 33 |
+
# to accept both forms before any adapter is instantiated.
|
| 34 |
+
_chat_adapter_module.field_header_pattern = re.compile(r"\[\[ ## (\w+)(?:\s*##)? \]\]")
|
| 35 |
+
|
| 36 |
+
load_dotenv(Path(__file__).parent / ".env") # explicit path; works from any cwd
|
| 37 |
+
|
| 38 |
+
# ──────────────────────────── Paths ────────────────────────────
|
| 39 |
+
ROOT = Path(__file__).parent.resolve()
|
| 40 |
+
SOURCES_DIR = ROOT / "sources"
|
| 41 |
+
DATA_DIR = ROOT / "data"
|
| 42 |
+
ARTIFACTS_DIR = ROOT / "artifacts"
|
| 43 |
+
CHROMA_DIR = ARTIFACTS_DIR / "chroma"
|
| 44 |
+
|
| 45 |
+
for d in (SOURCES_DIR, DATA_DIR, ARTIFACTS_DIR, CHROMA_DIR):
|
| 46 |
+
d.mkdir(parents=True, exist_ok=True)
|
| 47 |
+
|
| 48 |
+
DATASET_PATH = DATA_DIR / "synthetic_questions.jsonl"
|
| 49 |
+
OPTIMIZED_PROGRAM_PATH = ARTIFACTS_DIR / "optimized_advisor.json"
|
| 50 |
+
|
| 51 |
+
# ──────────────────────────── Task LM — Gemini API (preferred) ───────────────────────────
|
| 52 |
+
# When GEMINI_API_KEY is set, route the task LM through Google AI Studio.
|
| 53 |
+
# Same Gemma 4 26B weights, but no local GPU required and the free tier is
|
| 54 |
+
# sufficient for inference + GEPA optimization runs.
|
| 55 |
+
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
|
| 56 |
+
GEMINI_TASK_MODEL = os.getenv("GEMINI_TASK_MODEL", "gemini/gemma-4-26b-a4b-it")
|
| 57 |
+
|
| 58 |
+
GEMINI_TASK_LM_KWARGS = dict(
|
| 59 |
+
api_key=GEMINI_API_KEY,
|
| 60 |
+
temperature=0.6,
|
| 61 |
+
# Gemma 4 thinking tokens count against max_tokens in the Gemini API.
|
| 62 |
+
# Each pipeline call burns ~3-4k reasoning tokens before writing output,
|
| 63 |
+
# so 4096 gets truncated. 16384 gives comfortable headroom for both.
|
| 64 |
+
max_tokens=16384,
|
| 65 |
+
cache=True,
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
# ──────────────────────────── Task LM — LM Studio fallback ───────────────────────────────
|
| 69 |
+
LM_STUDIO_BASE = os.getenv("LM_STUDIO_BASE", "http://localhost:1234/v1")
|
| 70 |
+
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/gemma-4-26b-a4b")
|
| 71 |
+
|
| 72 |
+
# DSPy uses LiteLLM-style model strings. "openai/" prefix routes through the
|
| 73 |
+
# OpenAI-compatible client, which LM Studio speaks.
|
| 74 |
+
TASK_MODEL_STRING = f"openai/{LOCAL_MODEL}"
|
| 75 |
+
|
| 76 |
+
TASK_LM_KWARGS = dict(
|
| 77 |
+
api_base=LM_STUDIO_BASE,
|
| 78 |
+
api_key=os.getenv("LM_STUDIO_KEY", "lm-studio"), # any non-empty string
|
| 79 |
+
temperature=0.6,
|
| 80 |
+
max_tokens=4096, # ChainOfThought reasoning + all output fields easily exceeds 2k
|
| 81 |
+
cache=True,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# Which backend to use: "gemini" if the API key is present, else "lm_studio".
|
| 85 |
+
# Override with TASK_LM_BACKEND=lm_studio to force local even when the key is set.
|
| 86 |
+
TASK_LM_BACKEND: str = os.getenv("TASK_LM_BACKEND", "gemini" if GEMINI_API_KEY else "lm_studio")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ──────────────────────────── Enrichment LM (OpenAI gpt-4o-mini, offline batch) ─────────
|
| 90 |
+
# gpt-4o-mini is reliable at structured JSON output and cheap enough that the
|
| 91 |
+
# full 701-verse corpus costs under $1 (one-time).
|
| 92 |
+
#
|
| 93 |
+
# Cost estimate (full 701-verse corpus):
|
| 94 |
+
# ~1800 input tokens/verse × 701 × $0.15/M ≈ $0.19 input
|
| 95 |
+
# ~900 output tokens/verse × 701 × $0.60/M ≈ $0.38 output
|
| 96 |
+
# Total ≈ $0.57 — effectively free at this scale.
|
| 97 |
+
#
|
| 98 |
+
# Key is read from .env (OPENAI_API_KEY). Override ENRICH_MODEL env var to
|
| 99 |
+
# swap in a different OpenAI model (e.g. "openai/gpt-4o" for harder cases).
|
| 100 |
+
ENRICH_MODEL = os.getenv("ENRICH_MODEL", "openai/gpt-4o-mini")
|
| 101 |
+
|
| 102 |
+
ENRICH_LM_KWARGS = dict(
|
| 103 |
+
api_key=os.getenv("OPENAI_API_KEY", ""),
|
| 104 |
+
temperature=0.3, # lower than task LM — we want consistent structured output
|
| 105 |
+
max_tokens=3000, # enough headroom for all six fields + CoT reasoning
|
| 106 |
+
cache=True, # DSPy disk cache deduplicates identical calls on re-runs
|
| 107 |
+
response_format={"type": "text"}, # DSPy 3.x sends json_object by default;
|
| 108 |
+
# OpenAI now requires json_schema or text
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ──────────────────────────── Proxy Task LM (gpt-4o-mini, GEPA optimization only) ────────
|
| 113 |
+
# When running GEPA with --proxy-task-lm, this model replaces Gemma 4 as the task LM
|
| 114 |
+
# during optimization. Prompts are model-agnostic text; they transfer back to Gemma 4
|
| 115 |
+
# at inference time. gpt-4o-mini runs ~20x faster than Gemma 4 thinking mode, bringing
|
| 116 |
+
# --auto light from ~260 hours to ~2-3 hours.
|
| 117 |
+
PROXY_TASK_MODEL = os.getenv("PROXY_TASK_MODEL", "openai/gpt-4o-mini")
|
| 118 |
+
|
| 119 |
+
PROXY_TASK_LM_KWARGS = dict(
|
| 120 |
+
api_key=os.getenv("OPENAI_API_KEY", ""),
|
| 121 |
+
temperature=0.6,
|
| 122 |
+
max_tokens=4096,
|
| 123 |
+
cache=True,
|
| 124 |
+
response_format={"type": "text"},
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# ──────────────────────────── Reflection LM (gpt-4o, GEPA) ──────────────────────────────
|
| 128 |
+
# GEPA's reflection step reads metric feedback and proposes rewritten prompts.
|
| 129 |
+
# This scales strongly with model quality. gpt-4o is the right balance here:
|
| 130 |
+
# it reasons well enough to write meaningful prompt mutations from nuanced
|
| 131 |
+
# Advaita feedback, and is affordable on a small OpenAI credit balance.
|
| 132 |
+
#
|
| 133 |
+
# Cost estimate per GEPA run (reflection calls only):
|
| 134 |
+
# --auto light: ~50 calls × 6k tokens ≈ $1.50
|
| 135 |
+
# --auto medium: ~250 calls × 6k tokens ≈ $7.50
|
| 136 |
+
#
|
| 137 |
+
# gpt-4o-mini is too shallow for this task — it produces generic rewrites
|
| 138 |
+
# that ignore the tradition-specific feedback the metric provides.
|
| 139 |
+
# Same OPENAI_API_KEY as the enrichment LM.
|
| 140 |
+
REFLECTION_MODEL = os.getenv("REFLECTION_MODEL", "openai/gpt-4o")
|
| 141 |
+
|
| 142 |
+
REFLECTION_LM_KWARGS = dict(
|
| 143 |
+
api_key=os.getenv("OPENAI_API_KEY", ""),
|
| 144 |
+
temperature=1.0, # GEPA wants diversity across reflection proposals
|
| 145 |
+
max_tokens=6000, # headroom for detailed critique + full rewritten prompt text
|
| 146 |
+
response_format={"type": "text"}, # same fix as enrichment LM — avoid json_object
|
| 147 |
+
cache=False, # reflection calls are intentionally diverse; caching defeats that
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# ──────────────────────────── Configure helpers ───────────────────────────────────────
|
| 152 |
+
def configure_dspy() -> tuple[dspy.LM, dspy.LM]:
    """Wire up DSPy for inference and hand back ``(task_lm, reflection_lm)``.

    Backend selection: the Gemini API path is taken when TASK_LM_BACKEND is
    "gemini" (same Gemma weights, Google-hosted, free tier); anything else
    falls back to the local LM Studio server. Set TASK_LM_BACKEND=lm_studio
    to force the local path.

    ChatAdapter's JSONAdapter fallback is disabled in both paths because:
      - LM Studio rejects json_object.
      - Gemma emits `[[ ## field ]]` (no closing ##); the field_header_pattern
        patch applied at module load time lets ChatAdapter parse that form.
    """
    if TASK_LM_BACKEND == "gemini":
        chosen = dspy.LM(model=GEMINI_TASK_MODEL, **GEMINI_TASK_LM_KWARGS)
        print(f"Task LM backend: Gemini API ({GEMINI_TASK_MODEL})")
    else:
        chosen = dspy.LM(model=TASK_MODEL_STRING, **TASK_LM_KWARGS)
        print(f"Task LM backend: LM Studio ({TASK_MODEL_STRING} @ {LM_STUDIO_BASE})")

    reflector = dspy.LM(model=REFLECTION_MODEL, **REFLECTION_LM_KWARGS)
    # Never fall back to JSONAdapter — LM Studio rejects json_object outright.
    dspy.configure(lm=chosen, adapter=dspy.ChatAdapter(use_json_adapter_fallback=False))
    return chosen, reflector
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def configure_enrich_lm() -> dspy.LM:
    """Configure DSPy globally with the ENRICH_MODEL enrichment LM and return it.

    Call this instead of configure_dspy() when running enrich_corpus.py.
    Raises SystemExit if OPENAI_API_KEY is not set.

    NOTE(review): this docstring previously said "Claude Sonnet" and
    "ANTHROPIC_API_KEY", but the code below authenticates with OPENAI_API_KEY
    and uses ENRICH_MODEL — presumably enrichment moved to an OpenAI model;
    confirm ENRICH_MODEL's value.
    """
    key = os.getenv("OPENAI_API_KEY", "")
    if not key:
        # Fail fast with setup instructions rather than a cryptic auth error later.
        raise SystemExit(
            "OPENAI_API_KEY is not set. Add it to your .env file:\n"
            "  OPENAI_API_KEY=sk-proj-..."
        )
    lm = dspy.LM(model=ENRICH_MODEL, **ENRICH_LM_KWARGS)
    dspy.configure(lm=lm)  # make it the process-wide default LM
    return lm
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# ──────────────────────────── Embeddings ─────────────────────────────────────────────
# Local sentence-transformer for retrieval. BGE-small is a sweet spot for
# semantic philosophy text on a Mac without burning RAM.
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-small-en-v1.5")
EMBED_DEVICE = os.getenv("EMBED_DEVICE", "mps")  # "mps" on Apple Silicon, "cpu" otherwise

TOP_K_RETRIEVE = 8  # passages to fetch per query
N_RETRIEVAL_QUERIES = 3  # the planner generates this many per user question
|
corpus.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
corpus.py — the data model and on-disk storage for the verse corpus.
|
| 3 |
+
|
| 4 |
+
A note on dataclasses vs. plain dicts
|
| 5 |
+
-------------------------------------
|
| 6 |
+
We could have used dicts everywhere and saved keystrokes. We don't, because
|
| 7 |
+
the Verse type is the contract between five different modules — parsers,
|
| 8 |
+
enrichment, indexing, retrieval, and the metric — and a typed contract
|
| 9 |
+
catches mistakes that "I thought 'sources_cited' was a list" wouldn't.
|
| 10 |
+
|
| 11 |
+
The pipeline lifecycle of a verse
|
| 12 |
+
---------------------------------
|
| 13 |
+
parsers.* → Verse (no LLM-derived fields)
|
| 14 |
+
enrichment.py → EnrichedVerse (with LLM-derived fields)
|
| 15 |
+
knowledge_base → reads EnrichedVerse, writes 3 embeddings per verse
|
| 16 |
+
advisor.py → receives EnrichedVerse via retriever hits
|
| 17 |
+
metrics.py → uses verse_id for exact citation grounding
|
| 18 |
+
|
| 19 |
+
Storage choice
|
| 20 |
+
--------------
|
| 21 |
+
JSONL on disk. Each line is a verse. Why not Parquet, sqlite, etc.?
|
| 22 |
+
- Easy to grep
|
| 23 |
+
- Easy to diff in PRs
|
| 24 |
+
- Easy for a human to spot-check enrichment quality (the whole point)
|
| 25 |
+
- We never need to scan more than a few thousand lines, so format doesn't matter
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
import json
|
| 30 |
+
from dataclasses import dataclass, field, asdict, fields
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
from typing import Iterable, Iterator
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ──────────────────────────── Verse: the raw record ────────────────────────────
|
| 36 |
+
@dataclass
class Verse:
    """One natural unit of scripture — a verse, a mantra, or a sūtra.

    Only the identity fields are mandatory; every parser must supply them.
    The remaining fields (sanskrit, transliteration, bhashya, ...) are filled
    in only when the source actually provides them.

    `verse_id` is the globally unique key, by convention
    '<work_slug>_<section_slug>_<verse_number>'
    e.g. 'bhagavad_gita_02_47', 'mundaka_upanishad_2_1_3'.

    `verse_ref` is the human-readable citation form,
    e.g. 'BG 2.47', 'Muṇḍaka Up. 2.1.3', 'Vivekacūḍāmaṇi 11'.
    The advisor's response quotes this exact string in citations.
    """
    # Identity — required for every record
    verse_id: str
    work: str
    work_display: str
    verse_ref: str
    tier: str  # primary | shankara | supporting

    # Section/chapter info — required when the work has chapters
    section: str = ""  # 'chapter_02'
    section_display: str = ""  # 'Chapter 2: Sāṅkhya Yoga'

    # Content — at least one of {translation, bhashya} must be non-empty
    translation: str = ""  # English translation of the verse itself
    translator: str = ""  # who translated it (for attribution)

    sanskrit: str = ""  # original Devanāgarī
    transliteration: str = ""  # IAST roman transliteration
    word_meanings: str = ""  # word-by-word gloss when present

    bhashya: str = ""  # Śaṅkara's commentary on this verse, if any
    bhashya_translator: str = ""  # who translated the bhāṣya

    # Provenance for accountability and license display
    source_key: str = ""  # the registry key this came from
    license: str = ""  # license tag from registry

    def has_content(self) -> bool:
        """True when the record carries real text — an id-only 'verse' is junk
        and gets dropped by parsers/loaders before it can pollute the index."""
        return any(text.strip() for text in (self.translation, self.bhashya))
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ──────────────────────────── EnrichedVerse: with LLM extractions ────────────────
|
| 85 |
+
@dataclass
class EnrichedVerse(Verse):
    """A Verse plus the structured fields added by the offline LLM pass.

    Every list defaults to empty so a verse whose enrichment failed can still
    be stored and indexed on nothing but its literal text / bhāṣya.
    """
    # Plain-English statement of what the verse teaches (ideally 1–2
    # sentences). The synthesizer reads this downstream.
    paraphrase: str = ""

    # Vedānta concepts the verse engages, in tradition-native vocabulary.
    # Examples: 'karma_yoga', 'vairagya', 'sakshi', 'two_truths', 'adhyasa'.
    themes: list[str] = field(default_factory=list)

    # Mundane life situations the verse speaks to, phrased in user language.
    # Examples: 'facing failure after sustained effort', 'watching a parent decline'.
    life_situations: list[str] = field(default_factory=list)

    # Emotions addressed, drawn from the closed set in enrichment.py EMOTION_VOCAB.
    emotions_addressed: list[str] = field(default_factory=list)

    # The shift or action the verse asks of the seeker.
    practical_teaching: str = ""

    # Hypothetical questions a real person might bring to this verse —
    # gold for retrieval, since they bridge the language gap.
    hypothetical_questions: list[str] = field(default_factory=list)

    # Quality / debugging
    enrichment_model: str = ""  # which LM produced these fields
    enrichment_version: int = 1  # bump when the prompt changes substantively

    # ---- Derived "views" used at indexing time ----
    def literal_view(self) -> str:
        """English translation, with the IAST transliteration appended when
        present. Best for queries that share vocabulary with the text."""
        segments: list[str] = []
        if self.translation:
            segments.append(self.translation.strip())
        if self.transliteration:
            segments.append(f"({self.transliteration.strip()})")
        return "\n".join(segments)

    def bhashya_view(self) -> str:
        """Śaṅkara's commentary alone — for queries about the Vedāntic
        explanation rather than the verse text."""
        return self.bhashya.strip()

    def advisor_view(self) -> str:
        """The composed view that bridges the language gap.

        A user who types 'I feel hollow even though I got everything I wanted'
        will find nothing in the Sanskrit — but they will find a near-neighbor
        here, provided the enrichment did its job.
        """
        lines: list[str] = []
        if self.paraphrase:
            lines.append(f"Teaching: {self.paraphrase}")
        if self.life_situations:
            lines.append("Speaks to: " + "; ".join(self.life_situations))
        if self.emotions_addressed:
            lines.append("Addresses: " + ", ".join(self.emotions_addressed))
        if self.themes:
            lines.append("Themes: " + ", ".join(self.themes))
        if self.hypothetical_questions:
            lines.append(
                "Questions this answers:\n - "
                + "\n - ".join(self.hypothetical_questions)
            )
        if self.practical_teaching:
            lines.append(f"Practical shift: {self.practical_teaching}")
        return "\n".join(lines)

    def is_enriched(self) -> bool:
        """True when enrichment populated the minimum-viable fields."""
        return all(
            (bool(self.paraphrase),
             bool(self.life_situations),
             bool(self.hypothetical_questions))
        )
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ──────────────────────────── On-disk JSONL ────────────────────────────
|
| 173 |
+
def write_jsonl(records: Iterable[Verse], path: Path) -> int:
    """Serialize *records* to *path* as JSON Lines; return how many were written.

    Creates parent directories on demand and writes one UTF-8 JSON object
    per line (non-ASCII preserved so the files stay human-greppable).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    with path.open("w", encoding="utf-8") as fh:
        for record in records:
            fh.write(json.dumps(asdict(record), ensure_ascii=False) + "\n")
            written += 1
    return written
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def read_jsonl_verses(path: Path) -> Iterator[Verse]:
    """Yield Verse records from a JSONL file, skipping lines that fail to parse.

    A missing file yields nothing (generator returns immediately); malformed
    lines are reported to stdout rather than aborting the whole load.
    """
    if not path.exists():
        return
    with path.open(encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, start=1):
            raw = raw.strip()
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                yield _verse_from_dict(payload, Verse)
            except Exception as exc:
                print(f"[corpus] skipping malformed line {lineno} in {path}: {exc}")
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def read_jsonl_enriched(path: Path) -> Iterator[EnrichedVerse]:
    """Yield EnrichedVerse records from a JSONL file.

    Mirrors read_jsonl_verses: nonexistent file yields nothing, blank lines
    are skipped, and malformed lines are logged instead of raising.
    """
    if not path.exists():
        return
    with path.open(encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, start=1):
            raw = raw.strip()
            if not raw:
                continue
            try:
                payload = json.loads(raw)
                yield _verse_from_dict(payload, EnrichedVerse)
            except Exception as exc:
                print(f"[corpus] skipping malformed line {lineno} in {path}: {exc}")
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _verse_from_dict(d: dict, cls):
|
| 217 |
+
"""Construct a Verse/EnrichedVerse, ignoring keys the dataclass doesn't know.
|
| 218 |
+
|
| 219 |
+
This forward-compatibility matters: if a future version adds a field, old
|
| 220 |
+
JSONL files should still load. And if enrichment adds extra debug fields,
|
| 221 |
+
we don't want the dataclass to choke on them.
|
| 222 |
+
"""
|
| 223 |
+
valid = {f.name for f in fields(cls)}
|
| 224 |
+
return cls(**{k: v for k, v in d.items() if k in valid})
|
data/corpus_enriched.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
dataset_generator.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
dataset_generator.py — produce ~500 unique, life-grounded questions.
|
| 3 |
+
|
| 4 |
+
The dataset is the GEPA training/validation pool. We want:
|
| 5 |
+
- Coverage across life domains (career, grief, identity, dharma, practice, ...)
|
| 6 |
+
- Variety in voice (anguished / intellectual / sarcastic / exhausted / hopeful)
|
| 7 |
+
- Variety in form (direct question / vent / philosophical doubt / dilemma)
|
| 8 |
+
- Variety in age & life-stage cues
|
| 9 |
+
- Some cleanly Advaita-relevant, some that *force* the advisor to find the
|
| 10 |
+
Advaita angle in something mundane (this is where over-fitting to "spiritual"
|
| 11 |
+
questions usually shows up)
|
| 12 |
+
|
| 13 |
+
Strategy: structured combinatorics × LM rewriting × similarity dedupe.
|
| 14 |
+
|
| 15 |
+
We construct (domain, scenario, voice, form) tuples, send them to the local LM
|
| 16 |
+
to write each as a real human message, then dedupe by embedding similarity.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
import argparse
|
| 21 |
+
import json
|
| 22 |
+
import random
|
| 23 |
+
import re
|
| 24 |
+
from dataclasses import dataclass, asdict
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
|
| 27 |
+
import numpy as np
|
| 28 |
+
from sentence_transformers import SentenceTransformer
|
| 29 |
+
from tqdm import tqdm
|
| 30 |
+
import dspy
|
| 31 |
+
|
| 32 |
+
import config
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ──────────────────────────── Taxonomy ────────────────────────────
# The cartesian raw material for question generation: each (domain, scenario)
# pair is crossed with voices, forms, and age cues in generate_questions().
# Scenario strings are LM prompt inputs — edit wording deliberately.
DOMAINS: dict[str, list[str]] = {
    "career_and_purpose": [
        "got laid off after years of dedication",
        "achieved the big career goal and feels empty",
        "stuck in a job that pays well but feels meaningless",
        "wants to leave stable career to pursue art / spiritual path",
        "watching peers succeed while their own work plateaus",
        "facing retirement and loss of identity tied to work",
        "imposter syndrome after a major promotion",
        "publicly failed in front of colleagues",
    ],
    "romantic_relationships": [
        "going through a painful breakup after long relationship",
        "marriage has gone cold and considering divorce",
        "in love with someone who doesn't love them back",
        "obsessive jealousy about a partner's past",
        "tempted to have an affair",
        "partner died and grief is overwhelming",
        "afraid of commitment despite loving partner",
        "single in their 40s and despairing about it",
    ],
    "family": [
        "parent is dying and they have unresolved conflict",
        "estranged from a sibling for years",
        "parents pressuring them about marriage / career",
        "child making destructive life choices",
        "caring for an aging parent and exhausted",
        "had a falling out with adult child",
        "mother-in-law conflict ruining marriage",
        "feels they failed as a parent",
    ],
    "friendship_and_social": [
        "best friend betrayed their trust",
        "feels invisible and lonely in their 30s",
        "friend group has drifted apart with age",
        "social anxiety preventing them from connecting",
        "outgrown their old friends spiritually",
        "discovered close friend was talking behind their back",
    ],
    "mortality_and_loss": [
        "received a serious medical diagnosis",
        "watching a loved one die slowly",
        "afraid of death after a near-miss",
        "grieving a sudden, unexpected loss",
        "watching parents age and decline",
        "lost a child",
        "lost a pet who was their closest companion",
        "approaching old age with regret about unlived life",
    ],
    "identity_and_ego": [
        "tying self-worth entirely to external validation",
        "endlessly comparing themselves to others on social media",
        "going through midlife crisis questioning everything",
        "famous and feels everyone wants something from them",
        "lost sense of who they are after big life change",
        "racial / cultural identity feels splintered between worlds",
        "transitioning gender and family rejecting them",
    ],
    "material_life": [
        "drowning in debt and shame about it",
        "wealthy and feels guilty / disconnected because of it",
        "consumed by FOMO scrolling through richer friends' lives",
        "lost their home / financial security",
        "struggling to give up consumerist habits despite knowing better",
        "tempted by a get-rich-quick scheme",
    ],
    "existential": [
        "feels life has no meaning at all",
        "deeply depressed and going through the motions",
        "constant existential dread about the world's state",
        "doubting whether God / Brahman exists",
        "sees through everything and now nothing feels real",
        "feels they were 'born wrong' for this world",
    ],
    "spiritual_practice": [
        "meditation has gone dry after years of practice",
        "got addicted to spiritual highs and now they've stopped",
        "spiritual ego — feels superior to non-practitioners",
        "had a powerful experience and can't get back to it",
        "doubts whether their guru / lineage is right for them",
        "intellectually understands non-duality but doesn't feel it",
        "afraid that liberation means losing love for family",
        "can't reconcile traditional teachings with modern life",
    ],
    "ethics_and_dharma": [
        "told a serious lie and considering whether to confess",
        "harmed someone in the past and can't forgive themselves",
        "facing a moral dilemma at work involving dishonesty",
        "tempted to retaliate against someone who wronged them",
        "torn between duty to family and personal calling",
        "did something they're deeply ashamed of",
    ],
    "health_and_body": [
        "chronic illness reshaping their entire life",
        "struggling with addiction and relapse",
        "eating disorder they can't seem to escape",
        "chronic pain making spiritual practice feel impossible",
        "hates their aging body",
        "cancer diagnosis reframing everything",
    ],
    "modernity_specific": [
        "doomscrolling and feeling worse every day",
        "AI / automation making them feel obsolete",
        "climate dread paralyzing their life decisions",
        "political division has destroyed family relationships",
        "addicted to phone / can't focus / can't read books anymore",
        "online persona feels disconnected from real self",
    ],
}

# Emotional registers the rewriting LM can adopt.
VOICES = [
    "anguished",
    "exhausted",
    "intellectual and analytical",
    "darkly sarcastic",
    "quietly hopeful",
    "numb and dissociated",
    "frustrated and angry",
    "softly resigned",
]

# Message shapes — how the question is framed.
FORMS = [
    "direct question",
    "venting paragraph",
    "philosophical doubt",
    "practical dilemma asking what to do",
    "stream-of-consciousness",
]

# Life-stage hints; "(no age cue)" leaves the age unstated.
AGE_CUES = [
    "early 20s",
    "late 20s",
    "early 30s",
    "late 30s",
    "40s",
    "50s",
    "60s",
    "70s",
    "(no age cue)",
]
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
@dataclass
class QuestionRecord:
    """One generated user question plus the taxonomy coordinates that produced it."""
    id: str        # stable identifier: f"q_{index:04d}_{domain slug}"
    question: str  # the LM-written first-person message
    domain: str    # key into DOMAINS
    scenario: str  # the scenario string the LM was given
    voice: str     # entry from VOICES
    form: str      # entry from FORMS
    age_cue: str   # entry from AGE_CUES
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ──────────────────────────── LM-driven phrasing ────────────────────────────
|
| 190 |
+
class WriteUserMessage(dspy.Signature):
    # NOTE: in a dspy.Signature the docstring below IS the instruction sent to
    # the LM — editing it changes generation behavior, so it is left verbatim.
    """Write a single, realistic message that a person might send to a spiritual
    advisor. The message must reflect the given scenario, voice, form, and age
    cue. Do NOT include scripture references, do NOT name Vedānta concepts —
    write as a real person speaking from their actual life. Avoid generic phrases
    like 'help me find peace' or 'I want to grow spiritually'. Be specific, lived,
    grounded in detail. 2-6 sentences."""

    # Inputs come straight from the (domain, scenario, voice, form, age) plan.
    scenario: str = dspy.InputField()
    voice: str = dspy.InputField()
    form: str = dspy.InputField()
    age_cue: str = dspy.InputField()

    message: str = dspy.OutputField(desc="The user's message, in first person.")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _slug(s: str) -> str:
|
| 207 |
+
return re.sub(r"[^a-z0-9]+", "_", s.lower()).strip("_")[:60]
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def generate_questions(target_n: int = 500, seed: int = 7, use_local: bool = False) -> list[QuestionRecord]:
    """Generate ~target_n unique questions via combinatorics + LM rewriting.

    Args:
        target_n: how many questions to keep after similarity dedupe.
        seed: seeds the plan RNG so the (domain, scenario, voice, form, age)
            plan is reproducible. NOTE: reproducibility depends on the exact
            sequence of rng.sample / rng.choice / rng.shuffle calls below —
            do not reorder them.
        use_local: True routes phrasing through the LM Studio task LM;
            False (default) uses the OpenAI enrichment LM.

    Returns:
        Deduplicated QuestionRecord list (see _dedupe_by_similarity).
    """
    rng = random.Random(seed)
    if use_local:
        config.configure_dspy()
    else:
        config.configure_enrich_lm()  # gpt-4o-mini: faster and more stylistically diverse
    writer = dspy.Predict(WriteUserMessage)

    # Build the (domain, scenario, voice, form, age) plan first
    combos: list[tuple[str, str, str, str, str]] = []
    for domain, scenarios in DOMAINS.items():
        for scenario in scenarios:
            # 5 variants per scenario varying voice/form/age
            voices = rng.sample(VOICES, k=5)
            forms = [rng.choice(FORMS) for _ in range(5)]
            ages = rng.sample(AGE_CUES, k=5)
            for v, f, a in zip(voices, forms, ages):
                combos.append((domain, scenario, v, f, a))

    rng.shuffle(combos)

    # Cap to a generous over-target; we'll dedupe down to target_n
    over_target = int(target_n * 1.25)
    combos = combos[:over_target]

    records: list[QuestionRecord] = []
    for i, (domain, scenario, voice, form, age) in enumerate(tqdm(combos, desc="Generating")):
        try:
            out = writer(scenario=scenario, voice=voice, form=form, age_cue=age)
            msg = (out.message or "").strip()
            if len(msg) < 30:
                # Too short to be a believable user message; drop it.
                continue
            records.append(QuestionRecord(
                id=f"q_{i:04d}_{_slug(domain)}",
                question=msg,
                domain=domain,
                scenario=scenario,
                voice=voice,
                form=form,
                age_cue=age,
            ))
        except Exception as e:
            # Local LMs occasionally hiccup. Log and continue.
            print(f"[warn] generation failure on combo {i}: {e}")
            continue

    return _dedupe_by_similarity(records, target_n=target_n)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def _dedupe_by_similarity(records: list[QuestionRecord], target_n: int, threshold: float = 0.92) -> list[QuestionRecord]:
    """Greedily drop near-duplicate questions by embedding cosine similarity.

    Embeds every question (normalized, so dot product == cosine), then walks
    the list in order keeping a record only when its best similarity to all
    previously kept records is below *threshold*. Stops once target_n are kept.
    """
    if not records:
        return records
    print(f"Deduping {len(records)} candidates ...")
    model = SentenceTransformer(config.EMBED_MODEL, device=config.EMBED_DEVICE)
    vectors = model.encode(
        [rec.question for rec in records],
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=32,
    )
    selected: list[int] = []
    selected_vecs = []
    for idx, vec in enumerate(vectors):
        if not selected_vecs:
            # First record is always kept.
            selected.append(idx)
            selected_vecs.append(vec)
            continue
        best = float(np.dot(np.stack(selected_vecs), vec).max())
        if best < threshold:
            selected.append(idx)
            selected_vecs.append(vec)
        if len(selected) >= target_n:
            break
    print(f"Kept {len(selected)} after dedupe (target {target_n}).")
    return [records[i] for i in selected]
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def save_jsonl(records: list[QuestionRecord], path: Path):
    """Write question records to *path* as JSON Lines and report the count."""
    with path.open("w", encoding="utf-8") as fh:
        fh.writelines(
            json.dumps(asdict(rec), ensure_ascii=False) + "\n" for rec in records
        )
    print(f"Wrote {len(records)} questions to {path}")
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def load_jsonl(path: Path = config.DATASET_PATH) -> list[dict]:
    """Read the question dataset back as a list of dicts, ignoring blank lines."""
    with path.open(encoding="utf-8") as fh:
        return [json.loads(row) for row in fh if row.strip()]
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def to_dspy_examples(records: list[dict]) -> list[dspy.Example]:
    """Convert raw question dicts into dspy.Examples.

    There are no gold labels by design: GEPA's metric scores answers with
    LLM judgment plus retrieval grounding rather than reference outputs.
    Only user_question and an empty history are marked as inputs; domain and
    scenario ride along as metadata the metric can consult.
    """
    examples: list[dspy.Example] = []
    for rec in records:
        example = dspy.Example(
            user_question=rec["question"],
            history=dspy.History(messages=[]),
            domain=rec["domain"],
            scenario=rec["scenario"],
        ).with_inputs("user_question", "history")
        examples.append(example)
    return examples
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# ──────────────────────────── CLI ────────────────────────────
|
| 318 |
+
def main():
    """CLI entry point: generate the question dataset and write it to JSONL."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--n", type=int, default=500)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--out", type=str, default=str(config.DATASET_PATH))
    parser.add_argument(
        "--lm",
        choices=["openai", "local"],
        default="openai",
        help="openai = gpt-4o-mini (default, faster); local = LM Studio task LM",
    )
    opts = parser.parse_args()

    questions = generate_questions(
        target_n=opts.n,
        seed=opts.seed,
        use_local=(opts.lm == "local"),
    )
    save_jsonl(questions, Path(opts.out))
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
if __name__ == "__main__":  # allow running `python dataset_generator.py` directly
    main()
|
download_sources.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
download_sources.py — fetch every enabled source from the registry.
|
| 3 |
+
|
| 4 |
+
What this does
|
| 5 |
+
--------------
|
| 6 |
+
Reads sources_registry.SOURCES, walks each enabled entry, and downloads its
|
| 7 |
+
files into data/raw/<source_key>/. The downloader is deliberately dumb: it
|
| 8 |
+
just gets the bytes onto disk. Parsing happens in a separate step (parsers/)
|
| 9 |
+
so a download failure on one source doesn't block ingest of the others, and
|
| 10 |
+
so re-parsing during prompt iteration doesn't re-hit the network.
|
| 11 |
+
|
| 12 |
+
Why HTTPS over `requests` rather than git for everything
|
| 13 |
+
--------------------------------------------------------
|
| 14 |
+
Most of our sources are individual JSON or HTML files. Cloning a whole repo
|
| 15 |
+
to get two files wastes bandwidth and brittle-ifies the script. For sources
|
| 16 |
+
that *are* whole repos (rare in our registry), prefix the URL with `git+`.
|
| 17 |
+
|
| 18 |
+
Idempotency
|
| 19 |
+
-----------
|
| 20 |
+
If a file is already present and not corrupt, we skip it. Pass --force to
|
| 21 |
+
re-download. This makes it safe to run repeatedly while debugging parsers.
|
| 22 |
+
|
| 23 |
+
Politeness
|
| 24 |
+
----------
|
| 25 |
+
We send a real User-Agent and rate-limit to one request per second per host.
|
| 26 |
+
Internet Archive and similar mirrors are gracious to projects that play nice;
|
| 27 |
+
they can also throttle aggressively when they aren't.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
import argparse
|
| 32 |
+
import shutil
|
| 33 |
+
import subprocess
|
| 34 |
+
import sys
|
| 35 |
+
import time
|
| 36 |
+
from collections import defaultdict
|
| 37 |
+
from pathlib import Path
|
| 38 |
+
from urllib.parse import urlparse
|
| 39 |
+
|
| 40 |
+
import requests
|
| 41 |
+
from tqdm import tqdm
|
| 42 |
+
|
| 43 |
+
import config
|
| 44 |
+
from sources_registry import SOURCES, Source
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
RAW_DIR = config.DATA_DIR / "raw"
|
| 48 |
+
USER_AGENT = (
|
| 49 |
+
"GitaAdvisor/0.2 (Advaita-Vedanta research project; "
|
| 50 |
+
"contact: <add your email here>)"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# Per-host minimum interval in seconds
|
| 54 |
+
MIN_INTERVAL = 1.0
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _filename_for_url(url: str) -> str:
|
| 58 |
+
"""Derive a sensible local filename from a URL."""
|
| 59 |
+
parsed = urlparse(url)
|
| 60 |
+
name = Path(parsed.path).name or "index.html"
|
| 61 |
+
# archive.org sometimes serves djvu.txt with no extension on the URL;
|
| 62 |
+
# keep what's there.
|
| 63 |
+
return name
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _is_git_url(url: str) -> bool:
|
| 67 |
+
return url.startswith("git+")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
_last_request_time: dict = defaultdict(float)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _polite_get(url: str) -> requests.Response:
    """GET with per-host rate limiting: at most one request per MIN_INTERVAL
    seconds to the same host, identified by a real User-Agent."""
    host = urlparse(url).netloc
    wait = MIN_INTERVAL - (time.time() - _last_request_time[host])
    if wait > 0:
        time.sleep(wait)
    _last_request_time[host] = time.time()
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60, stream=True)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _download_file(url: str, dest: Path, force: bool = False) -> bool:
    """Stream *url* into *dest* atomically (written to a .tmp sibling first,
    then renamed, so a crash never leaves a half-written dest).

    Returns True when a download happened, False when a non-empty dest was
    already present and force was not set.
    """
    cached = dest.exists() and dest.stat().st_size > 0
    if cached and not force:
        return False

    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_suffix(dest.suffix + ".tmp")

    with _polite_get(url) as resp:
        resp.raise_for_status()
        size = int(resp.headers.get("content-length", 0)) or None
        with partial.open("wb") as sink, tqdm(
            total=size, unit="B", unit_scale=True, leave=False, desc=dest.name
        ) as bar:
            for piece in resp.iter_content(chunk_size=8192):
                if piece:  # skip keep-alive empty chunks
                    sink.write(piece)
                    bar.update(len(piece))

    partial.replace(dest)
    return True
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _clone_git(url: str, dest_dir: Path, force: bool = False) -> bool:
|
| 109 |
+
"""Clone a git repo (URL prefixed with 'git+') into dest_dir. Returns
|
| 110 |
+
True if a clone happened."""
|
| 111 |
+
real_url = url[len("git+"):]
|
| 112 |
+
if dest_dir.exists() and any(dest_dir.iterdir()) and not force:
|
| 113 |
+
return False
|
| 114 |
+
if dest_dir.exists():
|
| 115 |
+
shutil.rmtree(dest_dir)
|
| 116 |
+
dest_dir.parent.mkdir(parents=True, exist_ok=True)
|
| 117 |
+
subprocess.run(
|
| 118 |
+
["git", "clone", "--depth=1", real_url, str(dest_dir)],
|
| 119 |
+
check=True,
|
| 120 |
+
)
|
| 121 |
+
return True
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def download_source(src: Source, force: bool = False) -> dict:
    """Download all URLs for one source. Returns a small report dict with
    keys: "key", "ok" (downloads), "skipped" (cache hits), "failed" (msgs)."""
    dest_root = RAW_DIR / src.key
    report = {"key": src.key, "ok": 0, "skipped": 0, "failed": []}

    if not src.urls:
        report["failed"].append("no URLs in registry entry")
        return report

    for url in src.urls:
        if not url:
            continue
        try:
            if _is_git_url(url):
                did_work = _clone_git(url, dest_root, force=force)
            else:
                local_name = _filename_for_url(url)
                did_work = _download_file(url, dest_root / local_name, force=force)
        except Exception as exc:
            # One bad URL must not abort the rest of the source's files.
            report["failed"].append(f"{url}: {exc}")
        else:
            report["ok" if did_work else "skipped"] += 1
    return report
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def main():
    """CLI: download every enabled registry source; exit 1 when nothing
    matches, exit 2 when any source failed."""
    parser = argparse.ArgumentParser(description="Download all enabled sources from the registry.")
    parser.add_argument("--force", action="store_true",
                        help="Re-download even if files exist.")
    parser.add_argument("--only", nargs="*", default=None,
                        help="Only download these source keys.")
    args = parser.parse_args()

    enabled = [s for s in SOURCES if s.enabled]
    if args.only:
        wanted = set(args.only)
        enabled = [s for s in enabled if s.key in wanted]
    if not enabled:
        print("No enabled sources match. Edit sources_registry.py to enable some.")
        sys.exit(1)

    print(f"Downloading {len(enabled)} sources to {RAW_DIR}")
    print(f"User-Agent: {USER_AGENT}")
    print()

    any_failed = False
    for source in enabled:
        print(f"━━━ {source.key} — {source.name}")
        print(f"  license={source.license} tier={source.tier} parser={source.parser}")
        if source.translator:
            year = f", {source.year}" if source.year else ""
            print(f"  translator: {source.translator}{year}")

        report = download_source(source, force=args.force)
        if report["failed"]:
            any_failed = True
            for failure in report["failed"]:
                print(f"  [FAIL] {failure}")
        print(f"  downloaded={report['ok']} cached={report['skipped']}")
        print()

    if any_failed:
        print("Some sources failed. Re-run with the network available, or "
              "edit the URL in sources_registry.py if a mirror has moved.")
        sys.exit(2)
    print("All enabled sources are now on disk under data/raw/.")
    print("Next: python ingest_corpus.py")
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
|
| 195 |
+
main()
|
enrich_corpus.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
enrich_corpus.py — run the local LLM over every verse, once, with caching.
|
| 3 |
+
|
| 4 |
+
The cost calculus
|
| 5 |
+
-----------------
|
| 6 |
+
For ~3,000 verses at ~30s per call on a 26B-class local model, a full pass
|
| 7 |
+
takes a bit over a full day — call it 25 hours. That's tolerable as a one-time cost,
|
| 8 |
+
intolerable as a recurring one. So caching is non-negotiable. We cache by
|
| 9 |
+
verse_id and the enrichment_version stamp; if you change the prompt
|
| 10 |
+
substantively, bump the version in enrichment.py and the next run re-enriches.
|
| 11 |
+
|
| 12 |
+
What we write
|
| 13 |
+
-------------
|
| 14 |
+
data/corpus_enriched.jsonl — one EnrichedVerse per line, in the same order
|
| 15 |
+
as data/corpus.jsonl. Failed enrichments are still written (with empty
|
| 16 |
+
enrichment fields and an error stamp in enrichment_model) so the index can
|
| 17 |
+
still cover them on their literal text.
|
| 18 |
+
|
| 19 |
+
Concurrency
|
| 20 |
+
-----------
|
| 21 |
+
LM Studio's OpenAI-compatible server processes requests serially by default.
|
| 22 |
+
We don't try to parallelize at the client; if you've configured your server
|
| 23 |
+
for parallel decode, set --concurrency > 1 and DSPy will hold multiple
|
| 24 |
+
in-flight calls. For modest hardware, 1 is correct.
|
| 25 |
+
|
| 26 |
+
Resumability
|
| 27 |
+
------------
|
| 28 |
+
If the run dies halfway, just re-run. The cache at data/enrichment_cache.jsonl
|
| 29 |
+
remembers per-verse what we already did, so we pick up exactly where we left
|
| 30 |
+
off. No flag is needed for resume; it's the default behavior.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from __future__ import annotations
|
| 34 |
+
import argparse
|
| 35 |
+
import json
|
| 36 |
+
import os
|
| 37 |
+
from dataclasses import asdict
|
| 38 |
+
from pathlib import Path
|
| 39 |
+
from typing import Iterable
|
| 40 |
+
|
| 41 |
+
from tqdm import tqdm
|
| 42 |
+
import dspy
|
| 43 |
+
|
| 44 |
+
import config
|
| 45 |
+
from corpus import Verse, EnrichedVerse, read_jsonl_verses, write_jsonl
|
| 46 |
+
from enrichment import Enricher
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
CACHE_PATH = config.DATA_DIR / "enrichment_cache.jsonl"
|
| 50 |
+
ENRICHED_PATH = config.DATA_DIR / "corpus_enriched.jsonl"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ──────────────────────────── Cache I/O ────────────────────────────
|
| 54 |
+
def _load_cache(path: Path) -> dict[str, EnrichedVerse]:
    """Load cache as {verse_id: EnrichedVerse}. Tolerates partial writes:
    any line that is blank or fails to parse is silently skipped."""
    if not path.exists():
        return {}
    cache: dict[str, EnrichedVerse] = {}
    known_fields = EnrichedVerse.__dataclass_fields__
    with path.open(encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                record = json.loads(raw)
                # Drop unknown keys so old cache lines survive schema growth.
                entry = EnrichedVerse(**{k: v for k, v in record.items() if k in known_fields})
                cache[entry.verse_id] = entry
            except Exception:
                continue
    return cache
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _append_cache(path: Path, ev: EnrichedVerse) -> None:
|
| 74 |
+
"""Append a single record. We use append-mode rather than rewriting so
|
| 75 |
+
a kill -9 mid-run loses at most one line."""
|
| 76 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 77 |
+
with path.open("a", encoding="utf-8") as f:
|
| 78 |
+
f.write(json.dumps(asdict(ev), ensure_ascii=False) + "\n")
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ──────────────────────────── Main loop ────────────────────────────
|
| 82 |
+
def enrich_all(
    in_path: Path,
    out_path: Path,
    cache_path: Path,
    limit: int | None = None,
    re_enrich: bool = False,
    only_failed: bool = False,
    use_claude: bool = True,
) -> None:
    """Enrich every verse in in_path, reusing the cache, and write out_path.

    Resumable by design: each enrichment is appended to the cache as it
    completes, so re-running after a crash picks up where it left off.
    """
    if use_claude:
        lm = config.configure_enrich_lm()
        print(f"[enrich] LM: {lm.model} (Claude API)")
    else:
        config.configure_dspy()
        print(f"[enrich] LM: {config.LOCAL_MODEL} (local LM Studio)")
    enricher = Enricher()

    cache = {} if re_enrich else _load_cache(cache_path)
    print(f"[enrich] cache contains {len(cache)} previously-enriched verses")

    verses = list(read_jsonl_verses(in_path))
    if limit:
        verses = verses[:limit]
    print(f"[enrich] enriching {len(verses)} verses from {in_path}")

    # Split into cache hits we can reuse and verses that still need the LM.
    enriched: list[EnrichedVerse] = []
    pending: list = []
    for verse in verses:
        hit = cache.get(verse.verse_id)
        if hit is None or re_enrich:
            pending.append(verse)
        elif only_failed and hit.enrichment_model.startswith("FAILED"):
            pending.append(verse)
        else:
            enriched.append(hit)

    print(f"[enrich] {len(enriched)} from cache, {len(pending)} to call LM for")

    n_failed = 0
    for verse in tqdm(pending, desc="enriching"):
        result = enricher(verse=verse)
        _append_cache(cache_path, result)  # persist immediately for resumability
        enriched.append(result)
        if not result.is_enriched():
            n_failed += 1

    # Restore original verse order from in_path
    by_id = {r.verse_id: r for r in enriched}
    ordered = [by_id[v.verse_id] for v in verses if v.verse_id in by_id]

    n_written = write_jsonl(ordered, out_path)
    print(f"[enrich] wrote {n_written} enriched verses to {out_path}")
    if n_failed:
        print(f"[enrich] WARNING: {n_failed} verses failed enrichment "
              f"(empty fields, indexed only on literal text). "
              f"Re-run with --only-failed to retry just those.")
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ──────────────────────────── CLI ────────────────────────────
|
| 143 |
+
def main():
    """CLI wrapper around enrich_all()."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="in_path",
                        default=str(config.DATA_DIR / "corpus.jsonl"))
    parser.add_argument("--out", default=str(ENRICHED_PATH))
    parser.add_argument("--cache", default=str(CACHE_PATH))
    parser.add_argument("--limit", type=int, default=None,
                        help="Enrich only the first N verses (smoke-test).")
    parser.add_argument("--re-enrich", action="store_true",
                        help="Ignore cache and re-enrich everything. Use this "
                             "when you change the enrichment prompt.")
    parser.add_argument("--only-failed", action="store_true",
                        help="Re-run only the verses whose previous enrichment "
                             "failed (FAILED stamp in enrichment_model).")
    parser.add_argument("--lm", choices=["claude", "local"], default="claude",
                        help="Which LM to use: 'claude' (default, Sonnet 4.6 via API) "
                             "or 'local' (LM Studio). Claude requires ANTHROPIC_API_KEY.")
    args = parser.parse_args()

    enrich_all(
        in_path=Path(args.in_path),
        out_path=Path(args.out),
        cache_path=Path(args.cache),
        limit=args.limit,
        re_enrich=args.re_enrich,
        only_failed=args.only_failed,
        use_claude=(args.lm == "claude"),
    )
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
if __name__ == "__main__":
|
| 174 |
+
main()
|
enrichment.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
enrichment.py — turn a Verse into an EnrichedVerse using the local LLM.
|
| 3 |
+
|
| 4 |
+
This module is the heart of the redesign. Instead of hoping that vector
|
| 5 |
+
similarity between a user's English question and a Sanskrit verse will find
|
| 6 |
+
the right teaching, we run a one-time offline pass that asks the local LLM
|
| 7 |
+
to translate each verse into the language a real person would use to seek
|
| 8 |
+
help. The output gets stored alongside the verse and embedded for retrieval.
|
| 9 |
+
|
| 10 |
+
What the prompt asks for, and why each field
|
| 11 |
+
--------------------------------------------
|
| 12 |
+
We extract six fields. Each one earns its place by closing a different gap
|
| 13 |
+
between scripture and a user's question:
|
| 14 |
+
|
| 15 |
+
paraphrase — what the verse teaches, in plain modern English.
|
| 16 |
+
This is what the synthesizer reads when writing
|
| 17 |
+
the advisor's reply, so paraphrase quality matters
|
| 18 |
+
more than embedding quality.
|
| 19 |
+
|
| 20 |
+
themes — Vedānta concepts engaged. Tradition-native names
|
| 21 |
+
(karma_yoga, vairagya, sakshi, two_truths). Used
|
| 22 |
+
for filtering and for ensuring the metric can
|
| 23 |
+
verify Advaita-coherence.
|
| 24 |
+
|
| 25 |
+
life_situations — the predicaments where this verse helps. User-
|
| 26 |
+
language. This is the field that does the actual
|
| 27 |
+
bridging: a query about "facing failure" finds
|
| 28 |
+
BG 2.47 even though those words aren't in the verse.
|
| 29 |
+
|
| 30 |
+
emotions_addressed — drawn from a fixed vocabulary so we get faceted
|
| 31 |
+
filtering rather than free-text drift. The metric
|
| 32 |
+
uses this to verify that retrieved verses actually
|
| 33 |
+
address the user's felt emotion.
|
| 34 |
+
|
| 35 |
+
practical_teaching — what the verse asks the seeker to do or shift.
|
| 36 |
+
The synthesizer uses this as the seed for its
|
| 37 |
+
"concrete practice you can try this week" close.
|
| 38 |
+
|
| 39 |
+
hypothetical_questions — five questions a real person might bring to the
|
| 40 |
+
verse. Highest-leverage field for retrieval recall.
|
| 41 |
+
|
| 42 |
+
A closed vocabulary for emotions
|
| 43 |
+
--------------------------------
|
| 44 |
+
We constrain `emotions_addressed` to the EMOTION_VOCAB list below. If we let
|
| 45 |
+
the LLM generate freely, we get drift: "sadness" / "sorrow" / "melancholy" /
|
| 46 |
+
"grief-tinged blue" all become separate buckets, and faceted filtering
|
| 47 |
+
becomes useless. Closed vocab keeps the index sharp.
|
| 48 |
+
|
| 49 |
+
We don't constrain themes the same way because the Sanskrit conceptual
|
| 50 |
+
vocabulary is open-ended and forcing the LLM into a small list would lose
|
| 51 |
+
information. We just normalize for casing/spacing in post-processing.
|
| 52 |
+
|
| 53 |
+
Working with a flaky local LLM
|
| 54 |
+
------------------------------
|
| 55 |
+
Local 26B-class models occasionally produce malformed structured output.
|
| 56 |
+
This module assumes that. The enrich() function:
|
| 57 |
+
- validates output against minimum-quality checks
|
| 58 |
+
- retries up to 2 times with temperature=0
|
| 59 |
+
- on persistent failure, returns an EnrichedVerse with empty enrichment
|
| 60 |
+
fields rather than raising — so the corpus can still index on the
|
| 61 |
+
literal text + bhāṣya and the verse isn't lost
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
from __future__ import annotations
|
| 65 |
+
import re
|
| 66 |
+
from dataclasses import asdict
|
| 67 |
+
import dspy
|
| 68 |
+
|
| 69 |
+
from corpus import Verse, EnrichedVerse
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ──────────────────────────── Closed emotion vocabulary ────────────────────────────
|
| 73 |
+
# Twenty buckets, ordered roughly from acute to diffuse. Adding entries is
|
| 74 |
+
# easy; removing them risks orphaning previously-enriched records.
|
| 75 |
+
EMOTION_VOCAB: tuple[str, ...] = (
|
| 76 |
+
"grief", # acute loss
|
| 77 |
+
"anticipatory_grief", # loss in advance
|
| 78 |
+
"fear", # discrete fear
|
| 79 |
+
"anxiety", # chronic, diffuse
|
| 80 |
+
"despair", # loss of hope
|
| 81 |
+
"shame", # self-as-bad
|
| 82 |
+
"guilt", # action-as-bad
|
| 83 |
+
"anger",
|
| 84 |
+
"resentment",
|
| 85 |
+
"envy",
|
| 86 |
+
"jealousy",
|
| 87 |
+
"longing",
|
| 88 |
+
"loneliness",
|
| 89 |
+
"doubt", # epistemic; not knowing
|
| 90 |
+
"disillusionment", # the hollowness of attained goals
|
| 91 |
+
"boredom", # the inertness of repetition
|
| 92 |
+
"restlessness", # the inability to settle
|
| 93 |
+
"frustration",
|
| 94 |
+
"confusion",
|
| 95 |
+
"numbness", # affect-blunted
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ──────────────────────────── DSPy signature ────────────────────────────
|
| 100 |
+
class EnrichVerse(dspy.Signature):
    """You are an Advaita-Vedānta-trained reader producing structured metadata
    for a verse from the Bhagavad Gītā or a related scripture, so that a
    spiritual advisor can later find this verse when a real person describes
    a life situation in everyday language. Stay strictly within the framework
    of Śaṅkarācārya's non-dual interpretation. Do not import dualistic notions
    (separate creator/creature, soul-merging-into-God-as-other, etc.) and do
    not bypass the verse's plain meaning by always retreating to the absolute.

    The verse may include the Sanskrit, the English translation, and (when
    available) Śaṅkara's commentary. Read all three. Your output is structured
    fields, not prose. Be specific, lived, concrete. Avoid generic spiritual
    language ('find peace', 'be in the moment'). Avoid tradition-foreign
    therapy language ('honor your feelings'). When in doubt about a field,
    leave it shorter rather than padded."""

    # NOTE: the docstring above and every desc= below are sent to the LM as
    # the task prompt — editing them changes enrichment behavior, not just
    # documentation. Per the caching scheme described in enrich_corpus.py,
    # substantive edits here should be paired with a version bump so cached
    # records are re-enriched.

    # Inputs — the verse in its richest available form
    verse_ref: str = dspy.InputField(desc="Citation form, e.g. 'BG 2.47'.")
    sanskrit: str = dspy.InputField(desc="Devanāgarī text, may be empty.")
    translation: str = dspy.InputField(desc="English translation of the verse.")
    bhashya: str = dspy.InputField(desc="Śaṅkara's commentary on this verse, may be empty.")

    # Outputs
    paraphrase: str = dspy.OutputField(
        desc="One or two sentences in plain modern English stating what the "
             "verse teaches. Not a translation; a teaching summary. No jargon."
    )
    themes: list[str] = dspy.OutputField(
        desc="2–5 Vedānta concepts the verse engages, in tradition-native "
             "vocabulary with snake_case_keys, e.g. ['karma_yoga', 'non_attachment', "
             "'two_truths']. Use Sanskrit terms where they're the right name."
    )
    life_situations: list[str] = dspy.OutputField(
        desc="3–6 specific human predicaments this verse would help with, "
             "in everyday English. e.g. 'facing public failure after years of "
             "effort'. NOT 'finding peace' or 'spiritual growth'."
    )
    # The closed list is spliced into the prompt so the LM sees the exact
    # allowed emotion keys; _validate() rejects anything outside it.
    emotions_addressed: list[str] = dspy.OutputField(
        desc="The emotions this verse meets, drawn ONLY from this fixed list: "
             + ", ".join(EMOTION_VOCAB) + ". 1–4 entries."
    )
    practical_teaching: str = dspy.OutputField(
        desc="One sentence: what the verse asks the seeker to actually do or "
             "shift. If the verse is purely ontological, write 'pure ontology — "
             "no direct prescription' and the field will be ignored downstream."
    )
    hypothetical_questions: list[str] = dspy.OutputField(
        desc="EXACTLY 5 first-person questions a real person might write to a "
             "spiritual advisor that this verse would speak to. Specific, "
             "ungeneric, in the user's voice. NOT in scripture's voice. e.g. "
             "'I worked on this for three years and it just failed publicly — "
             "how do I keep going?'"
    )
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# ──────────────────────────── Validators ────────────────────────────
|
| 156 |
+
THEME_KEY_RX = re.compile(r"^[a-z][a-z0-9_]{2,40}$")
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def _normalize_theme(t: str) -> str:
|
| 160 |
+
t = t.strip().lower()
|
| 161 |
+
t = re.sub(r"[\s\-]+", "_", t)
|
| 162 |
+
t = re.sub(r"[^a-z0-9_]", "", t)
|
| 163 |
+
return t
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _validate(pred) -> tuple[bool, str]:
    """Light schema check. Returns (ok, reason_if_not_ok). Used to decide
    whether to retry the LM call with a stricter prompt."""
    if len((pred.paraphrase or "").strip()) < 20:
        return False, "paraphrase too short"

    questions = pred.hypothetical_questions or []
    if not isinstance(questions, list) or len(questions) < 3:
        return False, f"need ≥3 hypothetical_questions, got {len(questions)}"

    situations = pred.life_situations or []
    if not isinstance(situations, list) or len(situations) < 2:
        return False, f"need ≥2 life_situations, got {len(situations)}"

    emotions = pred.emotions_addressed or []
    if not isinstance(emotions, list) or not emotions:
        return False, "emotions_addressed empty"
    # Enforce the closed vocabulary after normalization.
    unknown = [e for e in emotions if _normalize_theme(e) not in EMOTION_VOCAB]
    if unknown:
        return False, f"emotions outside vocabulary: {unknown}"

    themes = pred.themes or []
    if not isinstance(themes, list) or not themes:
        return False, "themes empty"

    return True, ""
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# ──────────────────────────── Module ────────────────────────────
|
| 196 |
+
class Enricher(dspy.Module):
    """Wraps the EnrichVerse signature with retries and post-processing.

    Why ChainOfThought over Predict
    -------------------------------
    GEPA may eventually optimize this prompt too, and ChainOfThought gives it
    a `reasoning` trace to inspect during reflection. The cost is one extra
    paragraph of LM output per call, which is negligible at our scale.
    """

    def __init__(self, max_retries: int = 2):
        super().__init__()
        self.predict = dspy.ChainOfThought(EnrichVerse)
        self.max_retries = max_retries

    def forward(self, verse: Verse) -> EnrichedVerse:
        """Enrich one verse. Never raises: on persistent failure, returns an
        EnrichedVerse with empty enrichment fields and a 'FAILED: …' stamp in
        enrichment_model so the corpus can still index the literal text."""
        attempt = 0
        last_err = ""
        pred = None

        while attempt <= self.max_retries:
            try:
                pred = self.predict(
                    verse_ref=verse.verse_ref,
                    sanskrit=verse.sanskrit or "",
                    translation=verse.translation or "",
                    bhashya=verse.bhashya or "",
                )
                ok, reason = _validate(pred)
                if ok:
                    # BUG FIX: clear the error left over from any earlier
                    # failed attempt. Previously a successful retry still
                    # carried the stale last_err, so the `not last_err` gate
                    # below marked the verse FAILED and discarded good output.
                    last_err = ""
                    break
                last_err = reason
            except Exception as e:
                last_err = f"LM error: {e}"
            attempt += 1

        # Build the EnrichedVerse from the Verse + whatever we got
        base = asdict(verse)
        ev = EnrichedVerse(**base)

        if pred and not last_err:
            ev.paraphrase = (pred.paraphrase or "").strip()
            ev.practical_teaching = (pred.practical_teaching or "").strip()
            ev.themes = [
                _normalize_theme(t) for t in (pred.themes or [])
                if THEME_KEY_RX.match(_normalize_theme(t))
            ]
            ev.life_situations = [
                s.strip() for s in (pred.life_situations or [])
                if s and len(s.strip()) >= 5
            ]
            ev.emotions_addressed = [
                _normalize_theme(e) for e in (pred.emotions_addressed or [])
                if _normalize_theme(e) in EMOTION_VOCAB
            ]
            ev.hypothetical_questions = [
                q.strip() for q in (pred.hypothetical_questions or [])
                if q and len(q.strip()) >= 10
            ][:5]  # cap at 5

            # Stamp the model so re-runs after a model swap can be detected
            try:
                lm = dspy.settings.lm
                ev.enrichment_model = getattr(lm, "model", "") or ""
            except Exception:
                pass
        else:
            # Enrichment failed; keep the verse but mark it
            ev.enrichment_model = f"FAILED: {last_err}"

        return ev
|
ingest_corpus.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ingest_corpus.py — run the parsers and produce data/corpus.jsonl.
|
| 3 |
+
|
| 4 |
+
This script lives between download_sources.py (which gets bytes onto disk)
|
| 5 |
+
and enrich_corpus.py (which adds LLM-derived fields). Its specific job:
|
| 6 |
+
|
| 7 |
+
1. Walk each enabled source in the registry.
|
| 8 |
+
2. Dispatch to its parser, which yields Verse records.
|
| 9 |
+
3. Merge records across sources by verse_ref.
|
| 10 |
+
- The Gītā parser yields verses with translation but no bhāṣya.
|
| 11 |
+
- The Sastry parser yields verses with bhāṣya but spotty translation.
|
| 12 |
+
- We want one record per verse, with both populated when possible.
|
| 13 |
+
4. Write the merged stream as JSONL to data/corpus.jsonl.
|
| 14 |
+
|
| 15 |
+
Why merge by verse_ref rather than verse_id
|
| 16 |
+
-------------------------------------------
|
| 17 |
+
The Gītā parser uses work='bhagavad_gita' and the Sastry parser uses
|
| 18 |
+
work='bhagavad_gita_bhashya'. Their verse_ids therefore differ (different
|
| 19 |
+
work prefix), but their verse_refs match — both render as 'BG 2.47'. We
|
| 20 |
+
key the merge on verse_ref since that's the reader-facing canonical citation.
|
| 21 |
+
|
| 22 |
+
Conflict policy when merging
|
| 23 |
+
----------------------------
|
| 24 |
+
- Translation: keep whichever record has it; if both, prefer the one whose
|
| 25 |
+
source_key is in the GITA_TEXT_PRIORITY list. (We want the modern, clean
|
| 26 |
+
Sivananda over Sastry's archaic English-of-Śaṅkara-paraphrasing-the-verse.)
|
| 27 |
+
- Bhāṣya: only one source produces this; conflicts shouldn't happen.
|
| 28 |
+
- Sanskrit / transliteration / word_meanings: prefer gita_json; richer.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from __future__ import annotations
|
| 32 |
+
import argparse
|
| 33 |
+
from collections import defaultdict
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
from typing import Iterable
|
| 36 |
+
|
| 37 |
+
from tqdm import tqdm
|
| 38 |
+
|
| 39 |
+
import config
|
| 40 |
+
from corpus import Verse, write_jsonl
|
| 41 |
+
from sources_registry import enabled_sources, by_key, Source
|
| 42 |
+
|
| 43 |
+
# Parsers
|
| 44 |
+
from parsers import gita_json as parser_gita_json
|
| 45 |
+
from parsers import sastry_archive as parser_sastry
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# When two sources both have a translation, this list decides which wins
|
| 49 |
+
GITA_TEXT_PRIORITY = ("gita_json_core", "sastry_gita_bhashya")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _parse_source(src: Source, raw_dir: Path) -> Iterable[Verse]:
    """Dispatch a registry entry to its parser and return the Verse stream.

    Every parser takes a directory and yields Verse records; this function
    is purely a routing table keyed on ``src.parser``.
    """
    kind = src.parser

    if kind == "gita_json":
        # Only the core gita_json source is parsed directly; the companion
        # translations source is "consumed" alongside core, not parsed alone.
        if src.key != "gita_json_core":
            return iter(())
        translations_dir = raw_dir.parent / "gita_json_translations"
        extra = translations_dir if translations_dir.exists() else None
        return parser_gita_json.parse(raw_dir, extra)

    if kind == "sastry_archive":
        return parser_sastry.parse(raw_dir)

    if kind in ("wisdomlib_html", "thibaut_sbe"):
        # Stubs for now — ingest continues rather than failing outright
        # because one parser is unimplemented.
        print(f"[ingest] {kind} parser not implemented yet — skipping {src.key}")
        return iter(())

    if kind == "plain_text":
        # Reserved for user-dropped texts; future work
        return iter(())

    raise ValueError(f"Unknown parser type: {kind}")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _merge(records: list[Verse]) -> list[Verse]:
    """Collapse multiple parser outputs into a single record per verse_ref.

    First-appearance order is preserved, so the resulting corpus.jsonl is
    naturally ordered chapter-then-verse.
    """
    merged_by_ref: dict[str, Verse] = {}
    first_seen: list[str] = []

    for rec in records:
        ref = rec.verse_ref
        prev = merged_by_ref.get(ref)
        if prev is None:
            merged_by_ref[ref] = rec
            first_seen.append(ref)
            continue

        # Translation: the incoming record wins only when it actually has a
        # translation AND the stored one is missing or lower-priority.
        incoming_wins = bool(rec.translation) and (
            not prev.translation
            or _priority(rec.source_key) < _priority(prev.source_key)
        )
        translation = rec.translation if incoming_wins else prev.translation
        translator = rec.translator if incoming_wins else prev.translator

        # Everything else: keep the first record's value when present,
        # otherwise backfill from the newcomer. Bhāṣya typically comes from
        # a single source, so the or-chain is effectively "take whichever".
        merged_by_ref[ref] = Verse(
            verse_id=prev.verse_id,
            work=prev.work,  # keep the work/work_display of whichever came first
            work_display=prev.work_display,
            verse_ref=prev.verse_ref,
            tier=_choose_tier(prev.tier, rec.tier),
            section=prev.section or rec.section,
            section_display=prev.section_display or rec.section_display,
            translation=translation,
            translator=translator,
            sanskrit=prev.sanskrit or rec.sanskrit,
            transliteration=prev.transliteration or rec.transliteration,
            word_meanings=prev.word_meanings or rec.word_meanings,
            bhashya=prev.bhashya or rec.bhashya,
            bhashya_translator=prev.bhashya_translator or rec.bhashya_translator,
            source_key=prev.source_key + "+" + rec.source_key,
            license=prev.license or rec.license,
        )

    return [merged_by_ref[ref] for ref in first_seen]
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _priority(source_key: str) -> int:
    """Return the rank of *source_key* in GITA_TEXT_PRIORITY (lower wins).

    Merged records carry compound keys joined with '+' (see _merge), and a
    record can be merged more than once — so a priority key may appear at
    the start, middle, or end of the compound. The original prefix/suffix
    test missed the middle position; splitting on '+' covers all three.
    Sources absent from the priority list rank last (99).
    """
    components = set(source_key.split("+"))
    for rank, key in enumerate(GITA_TEXT_PRIORITY):
        if key in components:
            return rank
    return 99
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _choose_tier(a: str, b: str) -> str:
|
| 156 |
+
"""When two records merge, the tier of the merged verse is the most
|
| 157 |
+
'authoritative' of the two: primary > shankara > supporting.
|
| 158 |
+
|
| 159 |
+
Why primary > shankara: when we have both the verse text (primary) and
|
| 160 |
+
Śaṅkara's bhāṣya on it (shankara) folded into one record, the verse
|
| 161 |
+
itself is what the citation refers to — so primary wins."""
|
| 162 |
+
rank = {"primary": 0, "shankara": 1, "supporting": 2}
|
| 163 |
+
return a if rank.get(a, 9) <= rank.get(b, 9) else b
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
# ──────────────────────────── CLI ────────────────────────────
|
| 167 |
+
def main():
    """CLI entry point: parse every enabled source, merge, write corpus.jsonl."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--out", default=str(config.DATA_DIR / "corpus.jsonl"))
    args = arg_parser.parse_args()

    raw_root = config.DATA_DIR / "raw"
    if not raw_root.exists():
        raise SystemExit("data/raw/ doesn't exist. Run download_sources.py first.")

    all_records: list[Verse] = []
    for src in enabled_sources():
        raw_dir = raw_root / src.key
        if not raw_dir.exists():
            print(f"[ingest] {src.key}: no files at {raw_dir}; skipping")
            continue
        print(f"[ingest] parsing {src.key} via {src.parser}")
        try:
            n_before = len(all_records)
            # Drop records with no usable content before they reach the merge.
            all_records.extend(
                v for v in _parse_source(src, raw_dir) if v.has_content()
            )
            print(f"[ingest] yielded {len(all_records) - n_before} records")
        except Exception as e:
            # One broken source shouldn't abort the whole ingest run.
            print(f"[ingest] {src.key} failed: {e}")

    print(f"[ingest] merging {len(all_records)} records by verse_ref ...")
    merged = _merge(all_records)
    print(f"[ingest] {len(merged)} unique verses after merge")

    out_path = Path(args.out)
    n = write_jsonl(merged, out_path)
    print(f"[ingest] wrote {n} verses to {out_path}")
    print(f"[ingest] next: python enrich_corpus.py")


if __name__ == "__main__":
    main()
|
knowledge_base.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
knowledge_base.py — verse-indexed, multi-view RAG over the enriched corpus.
|
| 3 |
+
|
| 4 |
+
The shift from the old design
|
| 5 |
+
-----------------------------
|
| 6 |
+
The old knowledge_base.py chunked source text into 380-token windows with
|
| 7 |
+
overlap. The new one indexes each verse as a single record but with three
|
| 8 |
+
*views* — three separate embeddings of three different framings of the same
|
| 9 |
+
verse — so that queries phrased in different registers can all find it.
|
| 10 |
+
|
| 11 |
+
The three views per verse, and what each one is good for:
|
| 12 |
+
|
| 13 |
+
literal_view — the English translation (and Sanskrit fragment if
|
| 14 |
+
available). Best for queries that share lexical features
|
| 15 |
+
with the text itself: "what does it mean to act without
|
| 16 |
+
attachment?" maps cleanly to BG 2.47's literal text.
|
| 17 |
+
|
| 18 |
+
bhashya_view — Śaṅkara's commentary on the verse. Best for queries that
|
| 19 |
+
ask about the Vedāntic explanation rather than the verse
|
| 20 |
+
itself: "how does adhyāsa relate to suffering?" finds
|
| 21 |
+
the bhāṣya passages where Śaṅkara unfolds adhyāsa.
|
| 22 |
+
|
| 23 |
+
advisor_view — the LLM-enriched composite (paraphrase + life situations
|
| 24 |
+
+ emotions addressed + hypothetical questions). Best for
|
| 25 |
+
real-world questions in real-world language. This is
|
| 26 |
+
where the language gap closes.
|
| 27 |
+
|
| 28 |
+
At retrieval time we query all three indices, merge by verse_id (so each
|
| 29 |
+
verse appears at most once), and combine scores with a weighted sum that
|
| 30 |
+
gives the advisor_view the lion's share of credit while letting the
|
| 31 |
+
literal and bhāṣya views catch cases the LLM enrichment missed.
|
| 32 |
+
|
| 33 |
+
Why three indices and not one with concatenated views
|
| 34 |
+
-----------------------------------------------------
|
| 35 |
+
Concatenating literal + bhāṣya + advisor into one big text and embedding
|
| 36 |
+
that gives you the average direction across the three. Real semantic search
|
| 37 |
+
benefits from being able to match any one of the three angles strongly. The
|
| 38 |
+
extra storage (three vectors per verse instead of one) is trivial; the
|
| 39 |
+
retrieval-quality difference is large.
|
| 40 |
+
|
| 41 |
+
Storage layout
|
| 42 |
+
--------------
|
| 43 |
+
We keep three Chroma collections in artifacts/chroma/:
|
| 44 |
+
advaita_literal
|
| 45 |
+
advaita_bhashya
|
| 46 |
+
advaita_advisor
|
| 47 |
+
|
| 48 |
+
Each holds the same set of verse_ids. We resolve a hit's full record by
|
| 49 |
+
reading data/corpus_enriched.jsonl (kept small enough to live in memory).
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
from __future__ import annotations
|
| 53 |
+
import argparse
|
| 54 |
+
from dataclasses import dataclass, field
|
| 55 |
+
from pathlib import Path
|
| 56 |
+
from typing import Iterable
|
| 57 |
+
|
| 58 |
+
import chromadb
|
| 59 |
+
from chromadb.config import Settings
|
| 60 |
+
from sentence_transformers import SentenceTransformer
|
| 61 |
+
from tqdm import tqdm
|
| 62 |
+
|
| 63 |
+
import config
|
| 64 |
+
from corpus import EnrichedVerse, read_jsonl_enriched
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ──────────────────────────── Constants ────────────────────────────
|
| 68 |
+
COLLECTION_LITERAL = "advaita_literal"
|
| 69 |
+
COLLECTION_BHASHYA = "advaita_bhashya"
|
| 70 |
+
COLLECTION_ADVISOR = "advaita_advisor"
|
| 71 |
+
|
| 72 |
+
# Tier weights — multiplied into the cosine similarity at retrieval time.
|
| 73 |
+
# Same logic as before: primary scripture and Śaṅkara's pen outrank later
|
| 74 |
+
# voices when the cosine score is otherwise comparable.
|
| 75 |
+
TIER_WEIGHTS = {"primary": 1.10, "shankara": 1.10, "supporting": 1.00}
|
| 76 |
+
|
| 77 |
+
# View weights — how much each view's score contributes to the combined
|
| 78 |
+
# score per verse. The advisor view dominates because it is the one
|
| 79 |
+
# designed to bridge the language gap; literal and bhāṣya are insurance
|
| 80 |
+
# against the enrichment pipeline missing a topic.
|
| 81 |
+
VIEW_WEIGHTS = {"advisor": 0.55, "literal": 0.25, "bhashya": 0.20}
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ──────────────────────────── Hit dataclass ────────────────────────────
|
| 85 |
+
@dataclass
class Hit:
    """A single retrieval result after merging scores across the three views."""
    verse: EnrichedVerse
    combined_score: float  # ranking key
    view_scores: dict[str, float] = field(default_factory=dict)  # per-view diagnostics

    def __repr__(self) -> str:
        return "Hit(%s, tier=%s, score=%.3f, views=%s)" % (
            self.verse.verse_ref,
            self.verse.tier,
            self.combined_score,
            self.view_scores,
        )

    def to_dict(self) -> dict:
        """Flatten this Hit into a JSON-serializable dict.

        The advisor carries hits inside dspy.Prediction (pickled during GEPA
        optimization), and the metric reads these fields without importing
        this module — hence plain dict/list values only.
        """
        vv = self.verse
        payload = {
            "verse_id": vv.verse_id,
            "verse_ref": vv.verse_ref,
            "work": vv.work,
            "work_display": vv.work_display,
            "section": vv.section,
            "tier": vv.tier,
            "translation": vv.translation,
            "translator": vv.translator,
            "bhashya": vv.bhashya,
            "bhashya_translator": vv.bhashya_translator,
            "paraphrase": vv.paraphrase,
            "themes": list(vv.themes),
            "life_situations": list(vv.life_situations),
            "emotions_addressed": list(vv.emotions_addressed),
            "hypothetical_questions": list(vv.hypothetical_questions),
            "score": self.combined_score,
            "view_scores": dict(self.view_scores),
        }
        # Legacy alias the old metric used:
        payload["meta"] = {
            "verse_ref": vv.verse_ref,
            "work": vv.work,
            "section": vv.section,
            "tier": vv.tier,
        }
        return payload
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ──────────────────────────── Internals ────────────────────────────
|
| 131 |
+
def _client() -> chromadb.api.ClientAPI:
    """Open the persistent Chroma store under config.CHROMA_DIR, telemetry off."""
    settings = Settings(anonymized_telemetry=False)
    return chromadb.PersistentClient(path=str(config.CHROMA_DIR), settings=settings)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _embedder() -> SentenceTransformer:
    """Load the configured sentence-embedding model onto the configured device."""
    model_name, device = config.EMBED_MODEL, config.EMBED_DEVICE
    return SentenceTransformer(model_name, device=device)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _record_metadata(v: EnrichedVerse) -> dict:
|
| 143 |
+
"""Metadata stored alongside each chroma record so the retriever can
|
| 144 |
+
filter and report without re-loading the JSONL on every call.
|
| 145 |
+
|
| 146 |
+
chromadb requires scalar metadata values, so list-valued fields (themes,
|
| 147 |
+
emotions) are joined with semicolons. The choice of ';' is safe because
|
| 148 |
+
neither chroma nor our snake_case theme keys contain that character.
|
| 149 |
+
"""
|
| 150 |
+
return {
|
| 151 |
+
"verse_id": v.verse_id,
|
| 152 |
+
"verse_ref": v.verse_ref,
|
| 153 |
+
"work": v.work,
|
| 154 |
+
"tier": v.tier,
|
| 155 |
+
"section": v.section,
|
| 156 |
+
"themes_csv": ";".join(v.themes),
|
| 157 |
+
"emotions_csv": ";".join(v.emotions_addressed),
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ──────────────────────────── Index build ────────────────────────────
|
| 162 |
+
def build_index(corpus_path: Path | None = None) -> dict[str, int]:
    """(Re)build all three view-indices from the enriched corpus.

    Returns a dict {view_name: n_records} for confirmation. The function is
    safe to re-run; it deletes existing collections first so partial state
    from a prior crash doesn't pollute results.
    """
    corpus_path = corpus_path or (config.DATA_DIR / "corpus_enriched.jsonl")
    if not corpus_path.exists():
        raise SystemExit(
            f"No enriched corpus at {corpus_path}.\n"
            f"Pipeline: download_sources.py → ingest_corpus.py → "
            f"enrich_corpus.py → knowledge_base.py --build"
        )

    print(f"Loading embedding model: {config.EMBED_MODEL} on {config.EMBED_DEVICE}")
    embedder = _embedder()

    client = _client()
    # Drop existing collections; build_index is "rebuild from scratch"
    for name in (COLLECTION_LITERAL, COLLECTION_BHASHYA, COLLECTION_ADVISOR):
        try:
            client.delete_collection(name)
        except Exception:
            # The collection may simply not exist yet (first build) — fine.
            pass

    # All three collections use cosine space so that, with normalized
    # embeddings below, distance = 1 - cosine similarity.
    coll_literal = client.create_collection(
        COLLECTION_LITERAL, metadata={"hnsw:space": "cosine"})
    coll_bhashya = client.create_collection(
        COLLECTION_BHASHYA, metadata={"hnsw:space": "cosine"})
    coll_advisor = client.create_collection(
        COLLECTION_ADVISOR, metadata={"hnsw:space": "cosine"})

    verses = list(read_jsonl_enriched(corpus_path))
    print(f"Indexing {len(verses)} verses across 3 views ...")

    counts = {"literal": 0, "bhashya": 0, "advisor": 0}

    # We batch by view so each call to encode() is efficient. For 3000 verses
    # at small-batch BGE this is a few seconds total per view, much faster
    # than one-at-a-time embedding.
    BATCH = 64
    for view_name, view_fn, coll in (
        ("literal", lambda v: v.literal_view(), coll_literal),
        ("bhashya", lambda v: v.bhashya_view(), coll_bhashya),
        ("advisor", lambda v: v.advisor_view(), coll_advisor),
    ):
        # Skip verses whose view is empty. A verse without a bhāṣya simply
        # doesn't appear in the bhāṣya index — the merger handles partial
        # coverage cleanly.
        records = [(v, view_fn(v)) for v in verses]
        records = [(v, t) for v, t in records if t.strip()]

        for i in tqdm(range(0, len(records), BATCH), desc=f" view: {view_name}"):
            chunk = records[i:i + BATCH]
            ids = [v.verse_id for v, _ in chunk]
            texts = [t for _, t in chunk]
            metas = [_record_metadata(v) for v, _ in chunk]

            # normalize_embeddings=True pairs with the cosine-space
            # collections created above.
            vectors = embedder.encode(
                texts,
                normalize_embeddings=True,
                show_progress_bar=False,
                batch_size=BATCH,
            ).tolist()
            coll.add(ids=ids, embeddings=vectors,
                     documents=texts, metadatas=metas)

        counts[view_name] = len(records)

    print(f"Index built: {counts}")
    return counts
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# ──────────────────────────── Retriever ────────────────────────────
|
| 237 |
+
class AdvaitaRetriever:
    """Multi-view retriever returning Hit objects backed by EnrichedVerse.

    Construction loads the enriched corpus into memory (≈3000 records,
    ≈10 MB) so we can resolve hits to full records without per-call disk
    reads. This matters during GEPA optimization, which calls retrieve()
    hundreds of times per evaluation pass.

    The retriever is intentionally light: it doesn't filter by metadata,
    by tier, or by emotion at query time. Filtering happens at scoring
    (TIER_WEIGHTS) and at the SelectPassages stage downstream. Keeping
    retrieval permissive and selection picky is more robust than the
    reverse — when retrieval over-filters, you can never recover the
    missed verse later in the pipeline.
    """

    def __init__(self, top_k: int = config.TOP_K_RETRIEVE,
                 corpus_path: Path | None = None):
        # top_k: default number of hits search() returns when k isn't given.
        self.top_k = top_k
        # Lazy handles — populated by _ensure() on first search, not here.
        self._embedder: SentenceTransformer | None = None
        self._coll_literal = None
        self._coll_bhashya = None
        self._coll_advisor = None

        # Full enriched records by verse_id, so hits resolve in memory.
        cp = corpus_path or (config.DATA_DIR / "corpus_enriched.jsonl")
        self._verses_by_id: dict[str, EnrichedVerse] = {
            v.verse_id: v for v in read_jsonl_enriched(cp)
        }

    def _ensure(self):
        """Lazy-load embedder and collections. We avoid loading at __init__
        so a process that only needs the corpus mapping (e.g. the metric)
        doesn't pay the SentenceTransformer load time."""
        if self._embedder is None:
            self._embedder = _embedder()
        # _coll_advisor doubles as the "collections loaded" sentinel; all
        # three are fetched together.
        if self._coll_advisor is None:
            client = _client()
            self._coll_literal = client.get_collection(COLLECTION_LITERAL)
            self._coll_bhashya = client.get_collection(COLLECTION_BHASHYA)
            self._coll_advisor = client.get_collection(COLLECTION_ADVISOR)

    def search(self, query: str, k: int | None = None) -> list[Hit]:
        """Run the query against all three views, merge by verse_id, and
        return the top-k Hits sorted by combined score."""
        self._ensure()
        k = k or self.top_k

        # One normalized query embedding, shared by all three collections.
        q_emb = self._embedder.encode(
            [query], normalize_embeddings=True, show_progress_bar=False
        ).tolist()

        # Over-fetch from each view; we want enough overlap that the merge
        # has something to work with. 3*k per view is a reasonable upper
        # bound: large enough to catch verses one view ranked low and another
        # ranked high, small enough that Chroma's HNSW stays fast.
        per_view_k = max(8, k * 3)

        view_results: dict[str, list[tuple[str, float, dict]]] = {}
        for name, coll in (("literal", self._coll_literal),
                           ("bhashya", self._coll_bhashya),
                           ("advisor", self._coll_advisor)):
            r = coll.query(query_embeddings=q_emb, n_results=per_view_k)
            # Chroma nests results per query; [0] unwraps our single query.
            ids = r["ids"][0]
            dists = r["distances"][0]  # cosine distance, in [0, 2]
            metas = r["metadatas"][0]
            view_results[name] = list(zip(ids, dists, metas))

        # Merge: for each verse_id seen in any view, compute its combined
        # score as Σ_v VIEW_WEIGHTS[v] * cos_sim(v) * tier_weight, where any
        # view that didn't return that verse contributes 0. This is a soft
        # voting scheme: a verse that appears strongly in one view but not
        # others can still rank highly if that one view's weight is enough.
        per_verse: dict[str, dict[str, float]] = {}
        per_verse_meta: dict[str, dict] = {}
        for view_name, results in view_results.items():
            for vid, dist, meta in results:
                cos_sim = 1.0 - dist  # convert distance back to similarity
                per_verse.setdefault(vid, {})[view_name] = cos_sim
                # Metadata is identical across views for a verse; last wins.
                per_verse_meta[vid] = meta

        hits: list[Hit] = []
        for vid, view_scores in per_verse.items():
            tier = per_verse_meta[vid].get("tier", "supporting")
            tw = TIER_WEIGHTS.get(tier, 1.0)
            combined = sum(
                VIEW_WEIGHTS[v] * view_scores.get(v, 0.0)
                for v in VIEW_WEIGHTS
            ) * tw
            verse = self._verses_by_id.get(vid)
            if verse is None:
                # Index has it but corpus file doesn't — corpus and index
                # have drifted. Skip rather than fabricate a record.
                continue
            hits.append(Hit(verse=verse,
                            combined_score=combined,
                            view_scores=view_scores))

        hits.sort(key=lambda h: h.combined_score, reverse=True)
        return hits[:k]

    def search_many(self, queries: Iterable[str],
                    k_per: int | None = None) -> list[Hit]:
        """Run multiple queries (e.g. from PlanRetrieval) and dedupe by
        verse_id, keeping the highest combined score across queries."""
        seen: dict[str, Hit] = {}
        for q in queries:
            for h in self.search(q, k=k_per):
                cur = seen.get(h.verse.verse_id)
                if cur is None or h.combined_score > cur.combined_score:
                    seen[h.verse.verse_id] = h
        out = list(seen.values())
        out.sort(key=lambda h: h.combined_score, reverse=True)
        return out
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# ──────────────────────────── Formatter for the LLM ────────────────────────────
|
| 353 |
+
def format_hits_for_llm(hits: list[Hit]) -> str:
    """Render hits for the SelectPassages and SynthesizeAdvice prompts.

    We expose the verse_ref (so the synthesizer can cite it), the literal
    translation (so the synthesizer can quote it lightly), the bhāṣya
    snippet (so the synthesizer can ground its claims), and the advisor-view
    fields (so the synthesizer knows *why* this verse is being suggested for
    this user).

    Each hit is bounded in length so the prompt stays tractable on a 26B
    local model with an 8k context window.
    """
    blocks = []
    for i, h in enumerate(hits, start=1):
        v = h.verse
        # Numbered header carries the citation the LLM can refer back to.
        block = [f"[{i}] {v.verse_ref} — {v.work_display}, {v.section_display}"]
        block.append(f" tier: {v.tier} score: {h.combined_score:.3f}")
        # NOTE(review): leading whitespace inside these literals is
        # presentation-only; exact widths unverifiable from this view.
        if v.translation:
            # Truncation caps keep any single hit from dominating the prompt.
            block.append(f" Translation: {v.translation.strip()[:600]}")
        if v.bhashya:
            block.append(f" Bhāṣya (Śaṅkara): {v.bhashya.strip()[:800]}")
        if v.paraphrase:
            block.append(f" Teaching: {v.paraphrase}")
        if v.life_situations:
            block.append(f" Speaks to: {'; '.join(v.life_situations)}")
        if v.emotions_addressed:
            block.append(f" Addresses: {', '.join(v.emotions_addressed)}")
        if v.themes:
            block.append(f" Themes: {', '.join(v.themes)}")
        blocks.append("\n".join(block))
    return "\n\n".join(blocks)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# Alias kept so that advisor.py — and any prior code that imported the old
|
| 387 |
+
# name — works without modification. Both names refer to the same function
|
| 388 |
+
# because the new "passages" the advisor sees ARE Hit objects backed by
|
| 389 |
+
# EnrichedVerse records; the rendering is identical.
|
| 390 |
+
format_passages_for_llm = format_hits_for_llm
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
# ──────────────────────────── CLI ────────────────────────────
|
| 394 |
+
def main():
    """CLI entry point: rebuild the multi-view index or run a test query."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--build",
        action="store_true",
        help="(Re)build the multi-view index from corpus_enriched.jsonl",
    )
    parser.add_argument(
        "--query",
        type=str,
        default=None,
        help="Run a test query against the index",
    )
    args = parser.parse_args()

    if args.build:
        # --build wins over --query: rebuild and exit.
        build_index()
    elif args.query:
        retriever = AdvaitaRetriever()
        print(format_hits_for_llm(retriever.search(args.query)))
    else:
        # Neither flag given — show usage instead of doing nothing silently.
        parser.print_help()
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
# Script entry point: `python <module>.py --build` or `--query "..."`.
if __name__ == "__main__":
    main()
|
metrics.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
metrics.py — the metric is the specification.
|
| 3 |
+
|
| 4 |
+
GEPA optimizes whatever the metric rewards. So the metric here is not a single
|
| 5 |
+
number; it's a *contract* on what an Advaita-grounded, empathetic, practically
|
| 6 |
+
useful response looks like — combined with rich textual feedback the reflection
|
| 7 |
+
LM uses to rewrite prompts.
|
| 8 |
+
|
| 9 |
+
We combine three signals:
|
| 10 |
+
1. Rule-based checks (fast, deterministic)
|
| 11 |
+
- citation grounding (cites real retrieved sources, not hallucinated)
|
| 12 |
+
- tier preference (primary + Śaṅkara > supporting)
|
| 13 |
+
- structural hygiene (length, has actionable element, no therapy clichés)
|
| 14 |
+
2. LLM-as-judge rubric scoring
|
| 15 |
+
- Advaita coherence (non-dual, not crypto-dualist)
|
| 16 |
+
- two-truths discipline (vyāvahārika ↔ pāramārthika)
|
| 17 |
+
- empathy without dissolving into the user's frame
|
| 18 |
+
- wit calibration (light around the predicament, never the pain)
|
| 19 |
+
3. Composite score + structured feedback string
|
| 20 |
+
|
| 21 |
+
The function signature matches GEPA's metric contract:
|
| 22 |
+
metric(gold, pred, trace=None, pred_name=None, pred_trace=None) -> dspy.Prediction
|
| 23 |
+
|
| 24 |
+
Returning dspy.Prediction(score=float, feedback=str) is the GEPA happy path.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
import re
|
| 29 |
+
import json
|
| 30 |
+
from typing import Any
|
| 31 |
+
import dspy
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ──────────────────────────── Rule-based checks ────────────────────────────
# Generic therapy-speak phrases; each occurrence in a response is penalized
# by rule_based_score (0.25 per phrase, capped at 1.0). Matched lowercase.
THERAPY_CLICHES = [
    "you got this",
    "be kind to yourself",
    "self-care",
    "just remember",
    "trust the process",
    "everything happens for a reason",
    "you are enough",
    "love and light",
    "manifesting",
    "send positive vibes",
    "good vibes",
]

# Loose pattern catching citations like "BG 2.47", "Gītā 18.66", "Bṛhadāraṇyaka 4.4.5",
# "Vivekacūḍāmaṇi 11", "Kaṭha Up. 1.3.14", etc.
# Used only to detect that *some* specific citation appears in the response
# body; exact grounding against retrieved refs happens in _citation_grounding.
CITATION_PATTERN = re.compile(
    r"\b("
    r"BG\s*\d+[\.:]\d+"  # BG 2.47
    r"|G[īi]t[āa]\s*\d+[\.:]\d+"  # Gita 2.47
    r"|[A-ZĀĪŪṚḌṬṆṢŚḤṂa-zāīūṛḍṭṇṣśḥṃ]{3,}\s*Up\.?\s*\d+(?:[\.:]\d+){0,2}"  # Kaṭha Up. 1.2.3
    r"|Vivekac[ūu]ḍāmaṇi\s*\d+"
    r"|Ātmabodha\s*\d+"
    r"|Tattvabodha\s*\d+"
    r"|Brahma\s*S[ūu]tra\s*\d+[\.:]\d+(?:[\.:]\d+)?"
    r"|Aṣṭāvakra\s*G[īi]t[āa]\s*\d+[\.:]\d+"
    r")\b"
)

# Phrases signalling acknowledgement of the user's feeling; only the first
# 300 characters of a response are scanned for these (lowercase match).
EMPATHY_OPENERS = [
    "what you", "you're carrying", "you are carrying", "i hear",
    "this hurts", "this is painful", "the weight", "sitting with",
    "what you describe", "the ache",
]

# Phrases that mark a concrete, try-this practice or time-bounded suggestion.
# Note "spend " keeps its trailing space deliberately, to avoid matching
# words like "spending" without a following object.
ACTIONABLE_MARKERS = [
    "this week", "today", "try this", "begin by", "for the next",
    "each morning", "each evening", "when you notice", "the next time",
    "as a practice", "sit for", "spend ", "over the next",
]

# Advaita-register vocabulary, listed in both diacritic and ASCII spellings
# so either transliteration style in a response is credited.
NON_DUAL_MARKERS = [
    "witness", "sākṣī", "sakshi", "non-dual", "advaita",
    "pāramārthika", "paramarthika", "vyāvahārika", "vyavaharika",
    "ātman", "atman", "brahman", "adhyāsa", "adhyasa", "māyā", "maya",
    "neti neti", "tat tvam asi", "ahaṁ brahmāsmi", "aham brahmasmi",
    "self with a capital", "the seer", "awareness itself",
]
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _word_count(s: str) -> int:
|
| 86 |
+
return len(s.split())
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _has_any(text: str, needles: list[str]) -> list[str]:
|
| 90 |
+
low = text.lower()
|
| 91 |
+
return [n for n in needles if n in low]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _normalize_for_match(s: str) -> str:
|
| 95 |
+
return re.sub(r"\s+", " ", s.lower()).strip()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _citation_grounding(
    sources_cited: list[str],
    retrieved_passages: list[dict],
) -> tuple[float, list[str], list[str]]:
    """Check each cited source against the retrieved passage set.

    Returns (grounding_score, grounded_citations, ungrounded_citations),
    where the score is the fraction of citations that match a retrieved
    verse_ref.

    With the verse-indexed corpus every retrieved passage carries an exact
    verse_ref ('BG 2.47', 'Muṇḍaka Up. 2.1.3', ...), so grounding becomes a
    set-membership test after canonicalization — much sharper feedback for
    GEPA's reflection step than fuzzy matching: 'BG 2.47' is grounded iff
    'BG 2.47' was retrieved. Light formatting noise ('Bhagavad Gītā 2.47',
    'Gita 2:47') is absorbed by _canonicalize_ref, and a mutual-substring
    fallback tolerates paraphrased citations.
    """
    if not sources_cited:
        return 0.0, [], []

    retrieved_refs = set()
    for passage in retrieved_passages:
        # verse_ref may live at the top level or inside the meta dict.
        raw_ref = passage.get("verse_ref") or passage.get("meta", {}).get("verse_ref", "")
        retrieved_refs.add(_canonicalize_ref(raw_ref))
    retrieved_refs.discard("")

    grounded: list[str] = []
    ungrounded: list[str] = []
    for cited in sources_cited:
        canon = _canonicalize_ref(cited)
        matched = canon in retrieved_refs
        if not matched and canon:
            # Fallback for paraphrased citations ('chapter 2 verse 47' vs
            # 'BG 2.47'): accept when either string contains the other.
            matched = any(canon in ref or ref in canon for ref in retrieved_refs)
        (grounded if matched else ungrounded).append(cited)

    return len(grounded) / max(len(sources_cited), 1), grounded, ungrounded
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _canonicalize_ref(s: str) -> str:
|
| 140 |
+
"""Normalize a citation string so 'BG 2.47', 'Bhagavad Gītā 2.47',
|
| 141 |
+
'Gītā 2:47' all reduce to the same canonical form 'BG 2.47'."""
|
| 142 |
+
s = re.sub(r"\s+", " ", s.strip())
|
| 143 |
+
# Gītā variants
|
| 144 |
+
m = re.match(r"^(?:BG|Bhagavad\s*G[īi]t[āa]|G[īi]t[āa])\s*(\d+)[\.:](\d+)", s, re.I)
|
| 145 |
+
if m:
|
| 146 |
+
return f"BG {int(m.group(1))}.{int(m.group(2))}"
|
| 147 |
+
# Default: lowercased, colons → dots
|
| 148 |
+
return s.lower().replace(":", ".")
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _tier_preference(
|
| 152 |
+
sources_cited: list[str],
|
| 153 |
+
retrieved_passages: list[dict],
|
| 154 |
+
selected_indices: list[int],
|
| 155 |
+
) -> tuple[float, dict]:
|
| 156 |
+
"""Reward responses whose *cited* passages came from primary/Śaṅkara tiers."""
|
| 157 |
+
if not selected_indices:
|
| 158 |
+
return 0.0, {"primary": 0, "shankara": 0, "supporting": 0}
|
| 159 |
+
|
| 160 |
+
counts = {"primary": 0, "shankara": 0, "supporting": 0}
|
| 161 |
+
for idx in selected_indices:
|
| 162 |
+
if 1 <= idx <= len(retrieved_passages):
|
| 163 |
+
tier = retrieved_passages[idx - 1].get("meta", {}).get("tier", "supporting")
|
| 164 |
+
counts[tier] = counts.get(tier, 0) + 1
|
| 165 |
+
|
| 166 |
+
total = sum(counts.values()) or 1
|
| 167 |
+
preferred = counts["primary"] + counts["shankara"]
|
| 168 |
+
return preferred / total, counts
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def rule_based_score(pred: dspy.Prediction) -> tuple[float, dict]:
    """Returns (score in [0,1], breakdown dict).

    Deterministic half of the GEPA metric: structural checks on the
    prediction (length, citation presence + grounding, tier preference,
    cliché avoidance, empathy opening, actionable offering, non-dual
    register), combined as a fixed weighted sum. The breakdown dict is
    consumed by _format_feedback to build reflection-LM feedback.
    """
    # Read every field defensively: a partially-failed pipeline may leave
    # attributes missing or None.
    response = getattr(pred, "response", "") or ""
    sources_cited = getattr(pred, "sources_cited", []) or []
    retrieved = getattr(pred, "retrieved_passages", []) or []
    selected_idx = getattr(pred, "selected_indices", []) or []
    felt = getattr(pred, "felt_emotion", "") or ""

    # Length: full credit inside the 200–600 word band, then a linear
    # fall-off centered on 350 words.
    wc = _word_count(response)
    length_ok = 200 <= wc <= 600
    length_score = 1.0 if length_ok else max(0.0, 1.0 - abs(wc - 350) / 350)

    # Binary: at least one specific citation, either inline in the prose
    # or in the structured sources_cited field.
    citations_in_text = CITATION_PATTERN.findall(response)
    has_citation = bool(citations_in_text) or bool(sources_cited)
    citation_score = 1.0 if has_citation else 0.0

    # Fraction of sources_cited that match a retrieved verse_ref.
    grounding_score, grounded, ungrounded = _citation_grounding(sources_cited, retrieved)

    # Fraction of selected passages from the primary/Śaṅkara tiers.
    tier_score, tier_counts = _tier_preference(sources_cited, retrieved, selected_idx)

    # Each therapy cliché costs 0.25, capped at a full 1.0 penalty.
    cliches = _has_any(response, THERAPY_CLICHES)
    cliche_penalty = min(1.0, 0.25 * len(cliches))
    cliche_score = 1.0 - cliche_penalty

    # Empathy: opening should signal acknowledgement of feeling
    head = response[:300].lower()
    empathy_hits = [m for m in EMPATHY_OPENERS if m in head]
    # Bonus if the felt_emotion content is referenced (loosely)
    if felt:
        # Only tokens longer than 4 chars, and only the first echo counts.
        for tok in felt.lower().split():
            if len(tok) > 4 and tok in head:
                empathy_hits.append(f"echoes:{tok}")
                break
    # Baseline 0.4 plus 0.3 per hit, saturating at 1.0.
    empathy_score = min(1.0, 0.4 + 0.3 * len(empathy_hits))

    # Any actionable marker at all earns full credit; none earns a floor
    # of 0.4 rather than zero (the response may still be useful).
    actionable_hits = _has_any(response, ACTIONABLE_MARKERS)
    actionable_score = 1.0 if actionable_hits else 0.4

    # Baseline 0.4 plus 0.2 per non-dual vocabulary hit, saturating at 1.0.
    nondual_hits = _has_any(response, NON_DUAL_MARKERS)
    nondual_score = min(1.0, 0.4 + 0.2 * len(nondual_hits))

    # Weighted aggregate
    # (component_score, weight) pairs; weights sum to 1.0 so the aggregate
    # stays in [0, 1].
    components = {
        "length": (length_score, 0.05),
        "citation_present": (citation_score, 0.08),
        "citation_grounding": (grounding_score, 0.18),
        "tier_preference": (tier_score, 0.12),
        "no_cliches": (cliche_score, 0.10),
        "empathy_opening": (empathy_score, 0.15),
        "actionable": (actionable_score, 0.10),
        "nondual_register": (nondual_score, 0.22),
    }
    score = sum(s * w for s, w in components.values())

    # Everything the feedback formatter (and a human debugging a run) needs
    # to see *why* the score came out as it did.
    breakdown = {
        "score": score,
        "word_count": wc,
        "components": {k: round(v[0], 3) for k, v in components.items()},
        "citations_in_text": citations_in_text,
        "sources_cited": sources_cited,
        "grounded_citations": grounded,
        "ungrounded_citations": ungrounded,
        "tier_counts": tier_counts,
        "therapy_cliches_found": cliches,
        "empathy_hits": empathy_hits,
        "actionable_hits": actionable_hits,
        "nondual_markers_found": nondual_hits,
    }
    return score, breakdown
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# ──────────────────────────── LLM-judge rubric ────────────────────────────
|
| 243 |
+
# NOTE: this docstring IS the judge's prompt — dspy.Signature sends it to the
# LM verbatim, so any edit here changes judging behavior.
class JudgeAdvice(dspy.Signature):
    """You are an examiner of Advaita-Vedānta spiritual counsel in the lineage
    of Ādi Śaṅkarācārya. Score the advisor's response against the user's
    question on each rubric (0.0 to 1.0) and write a short critique that an
    optimizer can use to *improve the prompts that produced this response*.

    Rubrics:

    - advaita_coherence: Does the response reflect genuine non-dualism
      (jīva-ātman-brahman identity), or does it accidentally smuggle in dualism
      ('the soul reaches God', 'becoming one with the universe' as if they were
      separate, etc.)? Does it avoid collapsing into nihilism ('nothing is
      real')?

    - two_truths_discipline: Does it honor the distinction between
      vyāvahārika (transactional, where the user's pain and choices are real
      and matter) and pāramārthika (absolute, where the witness is untouched)?
      Failure modes: spiritual bypass (denying the pain by pointing to the
      absolute), or pure-therapy register (forgetting the absolute exists).

    - empathy_without_dissolving: Does it meet the user in their felt
      experience without either flattening into therapy-speak OR dismissing
      the feeling with premature transcendence?

    - wit_calibration: Is there a light, dry touch around the cosmic
      predicament (Śaṅkara himself is dry; this is consistent with the
      tradition) WITHOUT being flippant about the user's actual pain? Both
      'too solemn throughout' and 'making jokes about their situation' lose
      points.

    - source_integration: Are scriptural citations woven into the prose
      (illuminating the point) rather than dumped as block quotes or used
      as decoration? Are the references specific (Gītā 2.47, not just
      "the Gita says")?

    - practical_offering: Does the response close with something the user
      can actually try — a question to sit with, a practice, a perspective
      shift — rather than abstract platitudes?

    - draw_from_personal_experiences: Does the response use parables and day to day life
      stories as examples to encourage the user to relate better to the advice?

    The critique should be specific and prescriptive: what to keep, what to
    cut, what's missing. Phrase it as you would to a writer revising a draft."""

    user_question: str = dspy.InputField()
    response: str = dspy.InputField()
    sources_cited: list[str] = dspy.InputField()

    # One float in [0, 1] per rubric item described in the docstring above.
    advaita_coherence: float = dspy.OutputField(desc="0.0 to 1.0")
    two_truths_discipline: float = dspy.OutputField(desc="0.0 to 1.0")
    empathy_without_dissolving: float = dspy.OutputField(desc="0.0 to 1.0")
    wit_calibration: float = dspy.OutputField(desc="0.0 to 1.0")
    source_integration: float = dspy.OutputField(desc="0.0 to 1.0")
    practical_offering: float = dspy.OutputField(desc="0.0 to 1.0")
    draw_from_personal_experiences: float = dspy.OutputField(desc="0.0 to 1.0")
    critique: str = dspy.OutputField(
        desc="3-6 sentences of prescriptive feedback for revising the response."
    )
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# Lazily-instantiated judge. Call configure_judge() to use a stronger LM (e.g. gpt-4o)
# during GEPA optimization so the reflection LM gets high-quality signal to work from.
# _judge caches the dspy.ChainOfThought(JudgeAdvice) module; it is rebuilt by
# _get_judge() after configure_judge() resets it to None.
_judge = None
_judge_lm = None  # None means use the globally-configured LM (task LM)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def configure_judge(lm) -> None:
    """Install *lm* as the judge LM.

    Call before GEPA so judging runs on a stronger model (e.g. gpt-4o)
    instead of the task LM.
    """
    global _judge_lm, _judge
    _judge_lm = lm
    # Drop the cached judge so the next judge_score() call rebuilds it
    # under the new LM context.
    _judge = None
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def _get_judge():
    """Return the cached ChainOfThought judge, building it on first use."""
    global _judge
    if _judge is None:
        # Built lazily so configure_judge() can be called before any LM work.
        _judge = dspy.ChainOfThought(JudgeAdvice)
    return _judge
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def judge_score(user_question: str, pred: dspy.Prediction) -> tuple[float, dict, str]:
    """Score *pred* with the LLM judge; returns (score, rubric dict, critique).

    The score is a fixed weighted blend of the seven rubric fields, clamped
    to [0, 1]. If the judge call fails for any reason (parse error, LM
    hiccup) we fall back to a neutral 0.5 so one bad call never sinks a run.
    """
    judge = _get_judge()
    try:
        kwargs = dict(
            user_question=user_question,
            response=getattr(pred, "response", "") or "",
            sources_cited=getattr(pred, "sources_cited", []) or [],
        )
        if _judge_lm is None:
            judged = judge(**kwargs)
        else:
            # Route the call through the dedicated judge LM rather than the
            # globally-configured task LM.
            with dspy.context(lm=_judge_lm):
                judged = judge(**kwargs)
    except Exception as exc:
        return 0.5, {"judge_error": str(exc)}, f"Judge failed: {exc}"

    weights = {
        "advaita_coherence": 0.25,
        "two_truths_discipline": 0.20,
        "empathy_without_dissolving": 0.20,
        "wit_calibration": 0.10,
        "source_integration": 0.10,
        "practical_offering": 0.10,
        "draw_from_personal_experiences": 0.05,
    }
    # `or 0.0` guards against the LM returning None/empty for a field.
    rubric = {field: float(getattr(judged, field) or 0.0) for field in weights}
    raw = sum(rubric[field] * weights[field] for field in weights)
    return max(0.0, min(1.0, raw)), rubric, judged.critique or ""
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
# ──────────────────────────── Composite GEPA metric ────────────────────────────
# Relative weight of the deterministic rule checks vs. the LLM judge in the
# composite score; they sum to 1.0 so the composite stays in [0, 1].
RULE_WEIGHT = 0.45
JUDGE_WEIGHT = 0.55
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def _format_feedback(rule_breakdown: dict, judge_rubric: dict, critique: str) -> str:
|
| 370 |
+
"""Concatenate rule-based facts and judge critique into one feedback string
|
| 371 |
+
that the GEPA reflection LM can read and use to rewrite prompts."""
|
| 372 |
+
lines = ["FEEDBACK FOR PROMPT IMPROVEMENT", ""]
|
| 373 |
+
|
| 374 |
+
lines.append("Rule-based observations:")
|
| 375 |
+
comps = rule_breakdown.get("components", {})
|
| 376 |
+
for k, v in comps.items():
|
| 377 |
+
lines.append(f" - {k}: {v}")
|
| 378 |
+
if rule_breakdown.get("therapy_cliches_found"):
|
| 379 |
+
lines.append(f" - Therapy clichés to remove: {rule_breakdown['therapy_cliches_found']}")
|
| 380 |
+
if rule_breakdown.get("ungrounded_citations"):
|
| 381 |
+
lines.append(
|
| 382 |
+
f" - Citations that weren't in retrieved passages (likely hallucinated): "
|
| 383 |
+
f"{rule_breakdown['ungrounded_citations']}"
|
| 384 |
+
)
|
| 385 |
+
if not rule_breakdown.get("nondual_markers_found"):
|
| 386 |
+
lines.append(" - Response lacks explicit Advaita register; consider invoking "
|
| 387 |
+
"concepts like sākṣī, adhyāsa, the two truths, etc.")
|
| 388 |
+
if not rule_breakdown.get("actionable_hits"):
|
| 389 |
+
lines.append(" - No concrete practice or this-week shift was offered.")
|
| 390 |
+
tier_counts = rule_breakdown.get("tier_counts", {})
|
| 391 |
+
if tier_counts:
|
| 392 |
+
lines.append(f" - Selected passage tiers: {tier_counts} "
|
| 393 |
+
f"(prefer primary + śaṅkara when both options exist).")
|
| 394 |
+
|
| 395 |
+
lines.append("")
|
| 396 |
+
lines.append("Rubric scores from Advaita-tradition examiner:")
|
| 397 |
+
for k, v in judge_rubric.items():
|
| 398 |
+
if isinstance(v, float):
|
| 399 |
+
lines.append(f" - {k}: {v:.2f}")
|
| 400 |
+
lines.append("")
|
| 401 |
+
lines.append("Examiner critique:")
|
| 402 |
+
lines.append(critique.strip() or "(no critique returned)")
|
| 403 |
+
return "\n".join(lines)
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def gita_metric(
    gold: dspy.Example,
    pred: dspy.Prediction,
    trace: Any = None,
    pred_name: str | None = None,
    pred_trace: Any = None,
) -> dspy.Prediction:
    """The GEPA-compatible metric.

    Blends the deterministic rule score with the LLM-judge score and
    packages the combined textual feedback; GEPA's reflection LM ingests
    that feedback when rewriting prompts.
    """
    question = getattr(gold, "user_question", "") if gold else ""

    r_score, r_breakdown = rule_based_score(pred)
    judge_val, rubric, critique = judge_score(question, pred)

    return dspy.Prediction(
        score=RULE_WEIGHT * r_score + JUDGE_WEIGHT * judge_val,
        feedback=_format_feedback(r_breakdown, rubric, critique),
    )
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def quick_eval_score(
    gold: dspy.Example,
    pred: dspy.Prediction,
    trace: Any = None,
) -> float:
    """Plain-float wrapper for `dspy.Evaluate` — same composite, no feedback."""
    return float(gita_metric(gold, pred, trace=trace).score)
|
optimize_gepa.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
optimize_gepa.py — run GEPA reflective prompt evolution.
|
| 3 |
+
|
| 4 |
+
GEPA (Genetic-Pareto) treats the program's prompts as an evolving population.
|
| 5 |
+
At each step it:
|
| 6 |
+
1. Runs the current candidate(s) on a minibatch of training examples
|
| 7 |
+
2. Collects the (score, feedback) pairs from our metric
|
| 8 |
+
3. Asks a *reflection LM* to read the failures + feedback and propose a
|
| 9 |
+
mutated prompt
|
| 10 |
+
4. Evaluates the mutant; keeps it if it Pareto-dominates the parent on the
|
| 11 |
+
validation set
|
| 12 |
+
5. Repeats
|
| 13 |
+
|
| 14 |
+
Because we wrote `gita_metric` to return rich textual feedback, the reflection
|
| 15 |
+
LM has something substantive to chew on instead of just gradient signal.
|
| 16 |
+
|
| 17 |
+
The dataset has no gold labels — that's deliberate. Our metric judges the
|
| 18 |
+
prediction directly. This is the regime GEPA is designed for.
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
python optimize_gepa.py --auto medium
|
| 22 |
+
python optimize_gepa.py --max-metric-calls 300 --proxy-task-lm
|
| 23 |
+
python optimize_gepa.py --auto light --proxy-task-lm # ~2-3 hrs vs 260 hrs
|
| 24 |
+
|
| 25 |
+
Proxy task LM (--proxy-task-lm):
|
| 26 |
+
Runs GEPA with gpt-4o-mini as the task LM instead of Gemma 4. GEPA only
|
| 27 |
+
needs to evaluate prompt quality — it doesn't need the final inference model.
|
| 28 |
+
Optimized prompts are model-agnostic text and transfer back to Gemma 4 when
|
| 29 |
+
the saved program is loaded at inference time. ~20x speedup over Gemma thinking.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
import argparse
|
| 34 |
+
import json
|
| 35 |
+
import random
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
|
| 38 |
+
import dspy
|
| 39 |
+
from dspy import GEPA
|
| 40 |
+
|
| 41 |
+
import config
|
| 42 |
+
from advisor import GitaAdvisor
|
| 43 |
+
from dataset_generator import load_jsonl, to_dspy_examples
|
| 44 |
+
import metrics as metrics_module
|
| 45 |
+
from metrics import gita_metric, quick_eval_score
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def split(examples, val_frac: float, seed: int = 42):
    """Deterministically shuffle *examples* and return (train, val).

    The validation slice is always at least 20 examples; with fewer than
    ~20/val_frac examples the train split can therefore end up empty.
    """
    pool = examples[:]
    random.Random(seed).shuffle(pool)
    n_val = max(20, int(len(pool) * val_frac))
    train, val = pool[n_val:], pool[:n_val]
    return train, val
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def main():
    """Run GEPA prompt optimization end-to-end: load dataset, evaluate a
    baseline, evolve prompts, save the optimized program, and dump the
    resulting prompts for human inspection."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", default=str(config.DATASET_PATH))
    ap.add_argument("--out", default=str(config.OPTIMIZED_PROGRAM_PATH))
    ap.add_argument("--val-frac", type=float, default=0.2)
    ap.add_argument(
        "--auto",
        choices=["light", "medium", "heavy"],
        default="medium",
        help="GEPA's auto-budget mode. 'light' for smoke-tests, 'medium' for "
             "a real run, 'heavy' for an overnight run on a meaty box.",
    )
    ap.add_argument(
        "--max-metric-calls",
        type=int,
        default=None,
        help="Override --auto with an explicit metric-call budget.",
    )
    # NOTE(review): store_true with default=True means the flag can never be
    # turned off from the CLI — presumably intentional, but confirm.
    ap.add_argument("--track-stats", action="store_true", default=True)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument(
        "--proxy-task-lm",
        action="store_true",
        default=False,
        help="Use gpt-4o-mini as the task LM during GEPA instead of Gemma 4. "
             "~20x faster; optimized prompts transfer back to Gemma 4 at inference. "
             "Requires OPENAI_API_KEY.",
    )
    args = ap.parse_args()

    # Configure DSPy globally and grab the reflection LM
    task_lm, reflection_lm = config.configure_dspy()

    if args.proxy_task_lm:
        # Override the task LM with gpt-4o-mini for the duration of this process.
        # DSPy saves only prompt text (instructions + field descriptions), not the
        # LM choice — so the optimized JSON loads cleanly onto Gemma 4 at inference.
        task_lm = dspy.LM(model=config.PROXY_TASK_MODEL, **config.PROXY_TASK_LM_KWARGS)
        dspy.configure(lm=task_lm, adapter=dspy.ChatAdapter(use_json_adapter_fallback=False))
        print(f"Task LM (proxy): {task_lm.model} [GEPA optimization only]")
    else:
        print(f"Task LM: {task_lm.model}")
    print(f"Reflection LM: {reflection_lm.model}")

    # Use the reflection LM (gpt-4o) for judging instead of the task LM (Gemma).
    # Gemma judging its own responses produces noisy, self-congratulatory scores;
    # gpt-4o gives the reflection step the crisp, tradition-aware feedback it needs.
    metrics_module.configure_judge(reflection_lm)
    print(f"Judge LM: {reflection_lm.model} (overriding task LM for judging)")

    # Dataset
    raw = load_jsonl(Path(args.dataset))
    examples = to_dspy_examples(raw)
    if len(examples) < 40:
        # Warn but proceed: GEPA still runs, just with a weaker signal.
        print(f"[warn] Only {len(examples)} examples — generate more with "
              f"`python dataset_generator.py --n 500`.")
    train, val = split(examples, args.val_frac, seed=args.seed)
    print(f"Train: {len(train)} Val: {len(val)}")

    # Student program
    student = GitaAdvisor()

    # More threads when hitting an API (no local GPU bottleneck).
    num_threads = 16 if args.proxy_task_lm or config.TASK_LM_BACKEND == "gemini" else 4

    # Optional: get a baseline number for context
    print("\nEvaluating baseline (un-optimized) on validation set ...")
    evaluator = dspy.Evaluate(
        devset=val,
        metric=quick_eval_score,
        num_threads=num_threads,
        display_progress=True,
        display_table=0,
    )
    try:
        baseline_result = evaluator(student)
        # dspy.Evaluate's return type varies by version; coerce when possible.
        baseline_score = float(baseline_result) if hasattr(baseline_result, "__float__") else baseline_result
        print(f"Baseline score: {baseline_score}")
    except Exception as e:
        # A broken baseline eval shouldn't block the optimization itself.
        print(f"Baseline eval failed (continuing to optimization): {e}")

    # GEPA
    log_dir = str(config.ARTIFACTS_DIR / "gepa_logs")
    gepa_kwargs = dict(
        metric=gita_metric,
        reflection_lm=reflection_lm,
        track_stats=args.track_stats,
        seed=args.seed,
        # Show 6 training examples to the reflection LM per proposal step instead of
        # the default 3 — our 12 domains need diversity to avoid domain-specific over-fit.
        reflection_minibatch_size=6,
        # API-backed runs (proxy or Gemini) can saturate many threads; local GPU is
        # limited to 4 to avoid OOM / serialization on a single device.
        num_threads=num_threads,
        # When the task LM mangles a list field the reflection LM should know the format
        # broke, not just see a low score with no explanation.
        add_format_failure_as_feedback=True,
        # Persist per-step scores and prompts for post-run inspection.
        log_dir=log_dir,
    )
    # An explicit metric-call budget takes precedence over the auto mode;
    # GEPA accepts exactly one of the two.
    if args.max_metric_calls is not None:
        gepa_kwargs["max_metric_calls"] = args.max_metric_calls
    else:
        gepa_kwargs["auto"] = args.auto

    print(f"\nStarting GEPA with {gepa_kwargs} ...")
    optimizer = GEPA(**gepa_kwargs)

    optimized = optimizer.compile(
        student=student,
        trainset=train,
        valset=val,
    )

    # Save
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    optimized.save(str(out_path))
    print(f"\nSaved optimized program to {out_path}")

    # Side-by-side eval
    print("\nFinal eval on validation set ...")
    final_result = evaluator(optimized)
    final_score = float(final_result) if hasattr(final_result, "__float__") else final_result
    print(f"Optimized score: {final_score}")

    # Dump the optimized prompts for human inspection
    inspect_path = out_path.with_suffix(".prompts.txt")
    with inspect_path.open("w", encoding="utf-8") as f:
        f.write("# Optimized prompts after GEPA\n\n")
        for name, predictor in optimized.named_predictors():
            sig = predictor.signature
            f.write(f"## {name}\n")
            f.write(f"### instructions\n{sig.instructions}\n\n")
            f.write("### fields\n")
            for fname, field in sig.fields.items():
                # json_schema_extra may be a dict (with .get) or absent,
                # depending on the pydantic/dspy version; default to "".
                desc = getattr(field.json_schema_extra, "get", lambda *_: "")("desc", "") \
                    if hasattr(field, "json_schema_extra") else ""
                f.write(f"- {fname}: {desc}\n")
            f.write("\n---\n\n")
    print(f"Wrote prompt inspection file to {inspect_path}")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# Script entry point: run GEPA optimization from the command line.
if __name__ == "__main__":
    main()
|
parsers/__init__.py
ADDED
|
File without changes
|
parsers/gita_json.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
parsers/gita_json.py — turn the gita/gita verse-indexed JSON into Verse records.
|
| 3 |
+
|
| 4 |
+
The gita/gita repo (Unlicense, public-domain dedication) gives us four files
|
| 5 |
+
on the static mirror:
|
| 6 |
+
|
| 7 |
+
chapters.json — chapter metadata (number, name, summary)
|
| 8 |
+
verse.json — per-verse Sanskrit + transliteration + word_meanings
|
| 9 |
+
translation.json — per-verse English translations keyed by author_id
|
| 10 |
+
authors.json — author metadata for the translations
|
| 11 |
+
|
| 12 |
+
Why split parsing across multiple sources_registry entries
|
| 13 |
+
----------------------------------------------------------
|
| 14 |
+
We register `gita_json_core` (the verse text) and `gita_json_translations`
|
| 15 |
+
(the English translations) as separate sources. Both happen to feed this one
|
| 16 |
+
parser. The reason for the split is that translations come and go from the
|
| 17 |
+
upstream repo whereas the core verse data is essentially fixed; isolating
|
| 18 |
+
them lets us pin only what we need.
|
| 19 |
+
|
| 20 |
+
Translator allowlist
|
| 21 |
+
--------------------
|
| 22 |
+
Not every translator in the gita/gita translations.json is public-domain.
|
| 23 |
+
We hard-allowlist the ones we know are safe to redistribute. Anyone not on
|
| 24 |
+
the list is silently skipped — adding more is a one-line change.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
import json
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
from typing import Iterable
|
| 31 |
+
|
| 32 |
+
from corpus import Verse
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ──────────────────────────── Translator allowlist ────────────────────────────
|
| 36 |
+
# The keys are the author_id values used inside translation.json. The values
|
| 37 |
+
# are display strings + the year we want to use for attribution.
|
| 38 |
+
#
|
| 39 |
+
# Why this list and not just "all translations":
|
| 40 |
+
# - Some translators in the upstream repo (e.g. ISKCON Prabhupada) have
|
| 41 |
+
# active publisher rights that we shouldn't rely on regardless of how the
|
| 42 |
+
# upstream chose to license its compilation.
|
| 43 |
+
# - Reducing translation count keeps the index lean. Three voices are plenty.
|
| 44 |
+
#
|
| 45 |
+
# If you want to add a translator, verify their public-domain status (death
|
| 46 |
+
# year + 70 in most jurisdictions, or pre-1929 publication for US PD), then
|
| 47 |
+
# add a row.
|
| 48 |
+
ALLOWED_TRANSLATORS: dict[str, tuple[str, int | None]] = {
|
| 49 |
+
# Swami Sivananda — d. 1963 — works are widely shared by The Divine Life
|
| 50 |
+
# Society in keeping with their founder's non-commercial stance.
|
| 51 |
+
"sivananda": ("Swami Sivananda", 1969),
|
| 52 |
+
|
| 53 |
+
# Swami Tejomayananda — modern; included only because some mirrors
|
| 54 |
+
# release these under permissive terms; double-check before relying on it.
|
| 55 |
+
# Disabled by default to be conservative.
|
| 56 |
+
# "tejomayananda": ("Swami Tejomayananda", 1995),
|
| 57 |
+
|
| 58 |
+
# Dr. S. Sankaranarayan — translation of Śaṅkara's Gītā Bhāṣya included
|
| 59 |
+
# in some forks of gita/gita; verify the specific edition. Off by default.
|
| 60 |
+
# "shankara": ("Śaṅkara (tr. Sankaranarayan)", 1990),
|
| 61 |
+
|
| 62 |
+
# The verse text itself is not a "translation" per se but a copy of the
|
| 63 |
+
# critical text plus transliteration. We include it under the synthetic
|
| 64 |
+
# author key 'sanskrit'.
|
| 65 |
+
"sanskrit": ("Sanskrit text + IAST", None),
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ──────────────────────────── Helpers ────────────────────────────
|
| 70 |
+
def _verse_id(chapter: int, verse_no: int) -> str:
|
| 71 |
+
"""Stable global key. Format: bhagavad_gita_<chap>_<verse>, zero-padded
|
| 72 |
+
to two digits so 1.10 sorts after 1.9 and lexical ordering matches numeric."""
|
| 73 |
+
return f"bhagavad_gita_{chapter:02d}_{verse_no:02d}"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _verse_ref(chapter: int, verse_no: int) -> str:
|
| 77 |
+
"""Citation form used by the advisor in its replies."""
|
| 78 |
+
return f"BG {chapter}.{verse_no}"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _section_display(chapter_meta: dict) -> str:
|
| 82 |
+
name = chapter_meta.get("name_translation") or chapter_meta.get("name", "")
|
| 83 |
+
return f"Chapter {chapter_meta.get('chapter_number', '?')}: {name}"
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ──────────────────────────── Parser entry point ────────────────────────────
|
| 87 |
+
def parse(raw_dir_for_core: Path, raw_dir_for_translations: Path | None = None) -> Iterable[Verse]:
    """Walk the gita/gita JSON files and yield Verse records.

    Layout expected (after download_sources.py has run):
        raw_dir_for_core/chapters.json
        raw_dir_for_core/verse.json
    [optionally]
        raw_dir_for_translations/translation.json
        raw_dir_for_translations/authors.json

    If translations are not present, we still emit Verses with sanskrit +
    transliteration + word_meanings; the `translation` field falls back to
    the word_meanings, then the transliteration, so the verse isn't
    content-empty. (Better: enable the gita_json_translations source.)
    """
    chapters = _load(raw_dir_for_core / "chapters.json")
    verses_raw = _load(raw_dir_for_core / "verse.json")

    chapters_by_id = {c["chapter_number"]: c for c in chapters}

    translations_by_verse: dict[int, dict[str, str]] = {}
    authors_by_id: dict[str, str] = {}
    if raw_dir_for_translations is not None:
        translations_by_verse = _load_translations(raw_dir_for_translations / "translation.json")
        # NOTE(review): authors_by_id is loaded but never read below — author
        # matching already happens inside _load_translations via authorName.
        authors_by_id = _load_authors(raw_dir_for_translations / "authors.json")

    # Pick the best available translator from the allowlist, in priority order.
    # First match wins. This keeps the index from carrying redundant English
    # translations of the same verse.
    translator_priority = ["sivananda", "sanskrit"]

    for v in verses_raw:
        chap_no = v["chapter_number"]
        verse_no = v["verse_number"]
        chap_meta = chapters_by_id.get(chap_no, {})
        verse_id = _verse_id(chap_no, verse_no)

        # Sanskrit text comes from the core file. The 'text' field has it
        # in Devanāgarī, often with a trailing newline and verse number.
        sanskrit = (v.get("text") or "").strip()
        translit = (v.get("transliteration") or "").strip()
        word_mean = (v.get("word_meanings") or "").strip()

        # Try to attach an English translation. The translations map is keyed
        # by the core file's numeric verse id; -1 is a safe "never present"
        # sentinel when neither id field exists on the row.
        english = ""
        translator_label = ""
        v_translations = translations_by_verse.get(v.get("id") or v.get("externalId") or -1, {})
        for key in translator_priority:
            # Exact-key lookup first, then the case-insensitive fallback.
            text = v_translations.get(key) or _translation_for(v_translations, key)
            if text:
                english = text.strip()
                meta = ALLOWED_TRANSLATORS.get(key)
                if meta:
                    translator_label = meta[0]
                break

        # Fallback: if no English translation, use word-meanings as a substitute
        # so the verse isn't content-empty. Better than nothing for retrieval,
        # though enrichment will be poorer.
        if not english:
            english = word_mean or translit

        yield Verse(
            verse_id=verse_id,
            work="bhagavad_gita",
            work_display="Bhagavad Gītā",
            verse_ref=_verse_ref(chap_no, verse_no),
            tier="primary",
            section=f"chapter_{chap_no:02d}",
            section_display=_section_display(chap_meta),
            translation=english,
            translator=translator_label,
            sanskrit=sanskrit,
            transliteration=translit,
            word_meanings=word_mean,
            bhashya="",  # Gītā Bhāṣya is brought in by the Sastry parser
            bhashya_translator="",
            source_key="gita_json_core",
            license="unlicense",
        )
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
# ──────────────────────────── Internals ────────────────────────────
|
| 170 |
+
def _load(path: Path):
|
| 171 |
+
with path.open(encoding="utf-8") as f:
|
| 172 |
+
return json.load(f)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _load_translations(path: Path) -> dict[int, dict[str, str]]:
|
| 176 |
+
"""The translations file has one entry per (verse, author). Group them
|
| 177 |
+
by verse_id into a {verse_id: {author_id: text}} map.
|
| 178 |
+
|
| 179 |
+
Schema seen in the wild varies slightly between forks of gita/gita; we
|
| 180 |
+
cope by trying a few key names. If parsing fails entirely we return {}
|
| 181 |
+
and proceed without translations rather than blowing up the whole ingest.
|
| 182 |
+
"""
|
| 183 |
+
if not path.exists():
|
| 184 |
+
return {}
|
| 185 |
+
try:
|
| 186 |
+
raw = _load(path)
|
| 187 |
+
except Exception as e:
|
| 188 |
+
print(f"[gita_json] failed to load translations: {e}")
|
| 189 |
+
return {}
|
| 190 |
+
|
| 191 |
+
out: dict[int, dict[str, str]] = {}
|
| 192 |
+
for row in raw:
|
| 193 |
+
vid = row.get("verse_id") or row.get("verseNumber") or row.get("verse_number_id") or row.get("id")
|
| 194 |
+
text = row.get("description") or row.get("text") or row.get("translation")
|
| 195 |
+
if vid is None or not text:
|
| 196 |
+
continue
|
| 197 |
+
|
| 198 |
+
# Skip non-English rows (Ramsukhdas Hindi etc.)
|
| 199 |
+
lang = (row.get("lang") or "").lower()
|
| 200 |
+
if lang and lang not in ("english", "en"):
|
| 201 |
+
continue
|
| 202 |
+
|
| 203 |
+
# Map the authorName (e.g. "Swami Sivananda") to an allowlist key
|
| 204 |
+
# ("sivananda") via case-insensitive substring matching. The numeric
|
| 205 |
+
# author_id field alone can't match the allowlist, which is why we
|
| 206 |
+
# prefer authorName here.
|
| 207 |
+
name_str = str(row.get("authorName") or row.get("author_id") or row.get("author") or "").strip()
|
| 208 |
+
matched_key = next(
|
| 209 |
+
(k for k in ALLOWED_TRANSLATORS if k.lower() in name_str.lower()),
|
| 210 |
+
None,
|
| 211 |
+
)
|
| 212 |
+
if matched_key is None:
|
| 213 |
+
continue
|
| 214 |
+
out.setdefault(int(vid), {})[matched_key] = text
|
| 215 |
+
return out
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def _load_authors(path: Path) -> dict[str, str]:
|
| 219 |
+
if not path.exists():
|
| 220 |
+
return {}
|
| 221 |
+
try:
|
| 222 |
+
raw = _load(path)
|
| 223 |
+
except Exception:
|
| 224 |
+
return {}
|
| 225 |
+
return {row.get("id"): row.get("name", "") for row in raw if row.get("id")}
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def _translation_for(v_translations: dict, author_key: str) -> str | None:
|
| 229 |
+
"""Tolerant lookup: some files use 'sivananda', some 'Sivananda', etc."""
|
| 230 |
+
if author_key in v_translations:
|
| 231 |
+
return v_translations[author_key]
|
| 232 |
+
lk = author_key.lower()
|
| 233 |
+
for k, val in v_translations.items():
|
| 234 |
+
if str(k).lower() == lk:
|
| 235 |
+
return val
|
| 236 |
+
return None
|
parsers/sastry_archive.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
parsers/sastry_archive.py — extract verse-attached Śaṅkara bhāṣya from
|
| 3 |
+
Alladi Mahadeva Sastry's 1897 archive.org OCR text.
|
| 4 |
+
|
| 5 |
+
What makes this harder than the gita_json parser
|
| 6 |
+
-------------------------------------------------
|
| 7 |
+
The gita/gita JSON gave us each verse already keyed by chapter and verse
|
| 8 |
+
number. The Sastry archive.org file is OCR'd plain text — about 20 MB of
|
| 9 |
+
running prose where the only structural cues are:
|
| 10 |
+
|
| 11 |
+
1. Chapter headings, formatted in caps like "SANKHYA YOGA." or
|
| 12 |
+
"CHAPTER II — SANKHYA YOGA"
|
| 13 |
+
2. Verse markers, which appear in two forms in the OCR:
|
| 14 |
+
- inline as "(II. 47.)" or "II. 47." after a translated verse
|
| 15 |
+
- as section headings like "47." or "Verse 47." preceding the bhāṣya
|
| 16 |
+
3. The rule that when a translated verse appears, Śaṅkara's commentary
|
| 17 |
+
follows immediately until the next verse marker.
|
| 18 |
+
|
| 19 |
+
Add to that: OCR noise. "II" can become "11", "47" can become "4 7", periods
|
| 20 |
+
become commas, glyphs get dropped. So the parser is forgiving — it tries
|
| 21 |
+
several patterns and falls back gracefully.
|
| 22 |
+
|
| 23 |
+
What we extract
|
| 24 |
+
---------------
|
| 25 |
+
For each verse we find, we yield a Verse with:
|
| 26 |
+
- tier='shankara'
|
| 27 |
+
- work='bhagavad_gita_bhashya' (kept distinct from 'bhagavad_gita' so
|
| 28 |
+
the joiner in ingest_corpus.py knows to merge bhashya into the gita
|
| 29 |
+
verses by verse_ref)
|
| 30 |
+
- translation = the verse text as Sastry rendered it (handy as a second
|
| 31 |
+
English voice alongside Sivananda)
|
| 32 |
+
- bhashya = Śaṅkara's commentary, as Sastry translated it
|
| 33 |
+
- bhashya_translator = 'Alladi Mahadeva Sastry, 1897'
|
| 34 |
+
|
| 35 |
+
Robustness strategy
|
| 36 |
+
-------------------
|
| 37 |
+
We don't try to be perfect. If a verse's bhāṣya is mis-attributed by ±1, the
|
| 38 |
+
downstream enrichment step will produce paraphrases that don't quite fit, and
|
| 39 |
+
we'll catch those during the spot-check pass on enriched output. The metric
|
| 40 |
+
will also penalize ungrounded citations. The key invariant is: never silently
|
| 41 |
+
emit a wrong (verse_id, bhashya) pair if we're uncertain — better to skip.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
from __future__ import annotations
|
| 45 |
+
import re
|
| 46 |
+
from dataclasses import replace
|
| 47 |
+
from pathlib import Path
|
| 48 |
+
from typing import Iterable
|
| 49 |
+
|
| 50 |
+
from corpus import Verse
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ──────────────────────────── Patterns ────────────────────────────
# Roman numerals (allowing OCR substitutions: I↔1, V↔V, etc.)
# The uppercase branch deliberately admits '1' and 'l' because OCR commonly
# confuses them with 'I'; _to_arabic() normalizes the match back.
ROMAN = r"(?:[IVX1l]+|[ivx]+)"

# A "verse marker" looks like "II. 47" or "(II. 47.)" or "47" alone in a section
# heading. We try several shapes and let the most specific win.
# Commas are accepted where periods belong — another common OCR substitution.
VERSE_INLINE = re.compile(
    r"\(?\s*(?P<chap>" + ROMAN + r")\s*[\.\,]\s*(?P<verse>\d{1,3})\s*[\.\,]?\s*\)?",
    re.IGNORECASE,
)

# Chapter heading: "CHAPTER II" or "II. SANKHYA YOGA" — uppercase-heavy lines.
# The {4,} run of capitals/spaces/dashes is what distinguishes a heading from
# an inline roman numeral followed by ordinary mixed-case prose.
CHAPTER_HEADING = re.compile(
    r"^\s*(?:CHAPTER\s+)?(?P<roman>" + ROMAN + r")\.?\s+[A-Z][A-Z \-—]{4,}",
    re.MULTILINE,
)

# Roman → arabic lookup for the Gītā's 18 chapters.
ROMAN_MAP = {
    "I": 1, "II": 2, "III": 3, "IV": 4, "V": 5, "VI": 6, "VII": 7, "VIII": 8,
    "IX": 9, "X": 10, "XI": 11, "XII": 12, "XIII": 13, "XIV": 14, "XV": 15,
    "XVI": 16, "XVII": 17, "XVIII": 18,
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _to_arabic(token: str) -> int | None:
    """Convert a possibly OCR-noisy roman numeral to a chapter number.

    Accepts lowercase forms and the common OCR confusion of 'l' for 'I'.
    A pure digit string (OCR sometimes renders 'II' as '11') is accepted
    as-is when it falls in the 1–18 chapter range. Returns None for
    anything unrecognized.
    """
    normalized = token.upper().replace("L", "I").replace("0", "O")  # OCR substitutions
    value = ROMAN_MAP.get(normalized)
    if value is not None:
        return value
    # Pure-arabic fallback (e.g. OCR rendered 'II' as '11')
    if normalized.isdigit():
        number = int(normalized)
        return number if 1 <= number <= 18 else None
    return None
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ──────────────────────────── Main parse ────────────────────────────
|
| 93 |
+
def parse(raw_dir: Path) -> Iterable[Verse]:
    """Walk the Sastry archive.org OCR text in raw_dir and yield Verse records.

    Expected layout (after download_sources.py):
        raw_dir/Bhagavad-Gita.with.the.Commentary.of.Sri.Shankaracharya_djvu.txt

    The whole file (~20 MB) is read into memory, lightly de-noised, then
    scanned for chapter headings and verse markers. Each verse marker that
    sits at the start of a line opens a new section; the text between two
    consecutive such markers is yielded as the bhāṣya of the earlier one.

    Yields nothing (after printing a hint) when no .txt file is found.
    """
    txts = list(raw_dir.glob("*_djvu.txt")) + list(raw_dir.glob("*.txt"))
    if not txts:
        print(f"[sastry] no .txt under {raw_dir}; did you download_sources.py?")
        return

    text = txts[0].read_text(encoding="utf-8", errors="replace")
    text = _denoise(text)

    # First pass: collect chapter-heading and verse-marker events, then sort
    # them into document order so the walk below sees them interleaved.
    current_chapter = 1

    events = []
    for m in CHAPTER_HEADING.finditer(text):
        c = _to_arabic(m.group("roman"))
        if c is not None:
            events.append(("chapter", m.start(), c))

    for m in VERSE_INLINE.finditer(text):
        c = _to_arabic(m.group("chap"))
        try:
            v = int(m.group("verse"))
        except (ValueError, TypeError):
            continue
        # Chapter 1 has 47 verses; 80 is a generous upper bound that still
        # rejects OCR junk like page numbers.
        if c is None or not (1 <= v <= 80):
            continue
        events.append(("verse", m.start(), c, v, m.end(), m.start("verse")))

    events.sort(key=lambda e: e[1])

    # Second pass: walk the events in order, yielding the span between
    # consecutive verse markers as the bhāṣya of the earlier marker. The
    # chapter comes from the most recent chapter heading / marker numeral.
    last_marker_pos: int | None = None
    last_chap: int | None = None
    last_verse: int | None = None

    for ev in events:
        if ev[0] == "chapter":
            current_chapter = ev[2]
            continue
        # ev: ("verse", start, chap, verse, end, verse_pos)
        _, start, chap, verse, end, verse_pos = ev

        # Only treat markers where the verse NUMBER appears near the start of
        # its line — those are actual section headings. Inline cross-references
        # like "(II. 47.)" mid-paragraph have the verse number well into the
        # line and must not be treated as section boundaries.
        verse_line_start = text.rfind("\n", 0, verse_pos) + 1
        on_own_line = (verse_pos - verse_line_start) <= 8
        if not on_own_line:
            continue
        current_chapter = chap

        if last_marker_pos is not None and last_chap is not None and last_verse is not None:
            bhashya_text = text[last_marker_pos:start].strip()
            if bhashya_text:
                yield _build_verse(
                    chap=last_chap, verse=last_verse, body=bhashya_text,
                )

        last_marker_pos = end
        last_chap = current_chapter
        last_verse = verse

    # Flush the trailing section after the final marker.
    if last_marker_pos is not None and last_chap and last_verse:
        tail = text[last_marker_pos:].strip()
        if tail:
            yield _build_verse(chap=last_chap, verse=last_verse, body=tail)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ──────────────────────────── Builders ────────────────────────────
|
| 180 |
+
def _build_verse(chap: int, verse: int, body: str) -> Verse:
    """Split one marker-to-marker lump into (translation, bhashya) and wrap it.

    The lump usually starts with Sastry's English rendering of the verse,
    with Śaṅkara's commentary following. We attempt a *light* split: when
    the first paragraph is short (30–400 chars) and doesn't open like
    commentary, it is taken as the verse translation and the remainder as
    bhashya. When no confident split exists, everything goes into bhashya
    and translation stays empty — the gita_json parser already supplies a
    translation by another translator.
    """
    body = body.strip()
    translation = ""
    bhashya = body

    # Look for the first paragraph break within a reasonable distance.
    first_break = re.search(r"\n\s*\n", body[:600])
    if first_break and first_break.end() < 500:
        head = body[:first_break.start()].strip()
        rest = body[first_break.end():].strip()
        # Accept the split only if the head plausibly IS the verse: short-ish
        # and not starting with a typical bhashya opener.
        if 30 < len(head) < 400 and not _looks_like_bhashya_opener(head):
            translation = head
            bhashya = rest

    return Verse(
        verse_id=f"bhagavad_gita_{chap:02d}_{verse:02d}",
        work="bhagavad_gita_bhashya",
        work_display="Bhagavad Gītā with Śaṅkara's Bhāṣya",
        verse_ref=f"BG {chap}.{verse}",
        tier="shankara",
        section=f"chapter_{chap:02d}",
        section_display=f"Chapter {chap}",
        translation=translation,
        translator="Alladi Mahadeva Sastry" if translation else "",
        bhashya=bhashya,
        bhashya_translator="Alladi Mahadeva Sastry, 1897",
        source_key="sastry_gita_bhashya",
        license="public_domain",
    )
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _looks_like_bhashya_opener(s: str) -> bool:
|
| 221 |
+
s = s.strip().lower()
|
| 222 |
+
openers = (
|
| 223 |
+
"this means", "the meaning is", "the sense is", "here the lord",
|
| 224 |
+
"here it is said", "the lord says", "the question may", "objection",
|
| 225 |
+
"the commentator",
|
| 226 |
+
)
|
| 227 |
+
return any(s.startswith(o) for o in openers)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
# ──────────────────────────── OCR de-noise ────────────────────────────
|
| 231 |
+
def _denoise(text: str) -> str:
|
| 232 |
+
"""Light cleanup. Aggressive normalization risks losing real signal —
|
| 233 |
+
we only fix patterns we're confident about."""
|
| 234 |
+
# Common OCR substitutions for Sanskrit diacritics losses won't matter
|
| 235 |
+
# for English-language retrieval; we leave Sanskrit fragments alone.
|
| 236 |
+
|
| 237 |
+
# Collapse runs of repeated punctuation that OCR hallucinated
|
| 238 |
+
text = re.sub(r"\.{3,}", ".", text)
|
| 239 |
+
text = re.sub(r" +\.", ".", text)
|
| 240 |
+
|
| 241 |
+
# Glue cross-line hyphens: "lib-\nerty" → "liberty"
|
| 242 |
+
text = re.sub(r"-\n([a-z])", r"\1", text)
|
| 243 |
+
|
| 244 |
+
# Normalize whitespace
|
| 245 |
+
text = re.sub(r"[ \t]+", " ", text)
|
| 246 |
+
text = re.sub(r"\n[ \t]+", "\n", text)
|
| 247 |
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
| 248 |
+
|
| 249 |
+
return text
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dspy-ai>=2.6.0
|
| 2 |
+
chromadb>=0.5.0
|
| 3 |
+
sentence-transformers>=3.0.0
|
| 4 |
+
openai>=1.40.0
|
| 5 |
+
pydantic>=2.0
|
| 6 |
+
tqdm>=4.66
|
| 7 |
+
numpy>=1.26
|
| 8 |
+
rich>=13.7
|
| 9 |
+
unidecode>=1.3
|
| 10 |
+
requests>=2.31
|
| 11 |
+
gradio>=4.0
|
run_overnight.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
run_overnight.py — orchestrates full GEPA optimization through light → medium,
|
| 3 |
+
then saves prompts and runs a multi-question test suite.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python run_overnight.py [--skip-light] [--skip-medium]
|
| 7 |
+
|
| 8 |
+
Writes a timestamped log to artifacts/overnight_run.log.
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
import argparse
|
| 12 |
+
import subprocess
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
import json
|
| 18 |
+
|
| 19 |
+
ROOT = Path(__file__).parent.resolve()
|
| 20 |
+
LOG_PATH = ROOT / "artifacts" / "overnight_run.log"
|
| 21 |
+
OPTIMIZED_PATH = ROOT / "artifacts" / "optimized_advisor.json"
|
| 22 |
+
PROMPTS_PATH = ROOT / "artifacts" / "optimized_advisor.prompts.txt"
|
| 23 |
+
RESULTS_PATH = ROOT / "artifacts" / "test_results.json"
|
| 24 |
+
|
| 25 |
+
TEST_QUESTIONS = [
|
| 26 |
+
"I just got laid off and feel like nothing matters anymore.",
|
| 27 |
+
"I keep procrastinating on important work and feel guilty about it. How do I stop?",
|
| 28 |
+
"My relationship ended and I feel like I've lost my identity. Who am I without this person?",
|
| 29 |
+
"I'm terrified of death and can't stop thinking about it at night.",
|
| 30 |
+
"I have achieved everything I wanted — career, family, money — and still feel empty.",
|
| 31 |
+
"I feel angry at everyone around me but don't know why. How should I deal with this?",
|
| 32 |
+
"I can't stop comparing myself to others and feeling like I'm always falling short.",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def ts() -> str:
    """Current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def log(msg: str, f=None):
    """Print a timestamped line and mirror it into the log file, if given.

    The file handle is flushed after each line so a crash mid-run loses
    nothing from the overnight log.
    """
    stamped = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}"
    print(stamped, flush=True)
    if f:
        f.write(stamped + "\n")
        f.flush()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def run_phase(cmd: list[str], phase: str, logfile) -> bool:
    """Run one optimization phase as a subprocess, teeing its output.

    Every line of the child's combined stdout/stderr goes to `logfile`;
    lines containing progress keywords are also echoed to the terminal so
    an overnight run can be eyeballed without tailing the log file.

    Returns True when the child exits 0; False on a non-zero exit or on any
    exception while spawning/reading it (both outcomes are logged, never
    raised).
    """
    log(f"=== STARTING {phase} ===", logfile)
    log(f"Command: {' '.join(cmd)}", logfile)
    start = time.time()
    try:
        # stderr merged into stdout so the log preserves interleaving.
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=str(ROOT),
        )
        for line in proc.stdout:
            logfile.write(line)
            logfile.flush()
            # Echo key lines to terminal
            if any(k in line for k in ["score", "GEPA", "Step", "ERROR", "Saved", "Train:", "Val:", "Baseline"]):
                print(line, end="", flush=True)
        proc.wait()
        elapsed = time.time() - start
        if proc.returncode == 0:
            log(f"=== {phase} COMPLETED in {elapsed/60:.1f} min ===", logfile)
            return True
        else:
            log(f"=== {phase} FAILED (exit {proc.returncode}) after {elapsed/60:.1f} min ===", logfile)
            return False
    except Exception as e:
        log(f"=== {phase} ERROR: {e} ===", logfile)
        return False
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def run_test_suite(logfile) -> dict:
    """Run the advisor over TEST_QUESTIONS in parallel and score each reply.

    Loads the optimized program from disk, fans the questions out over a
    thread pool (one worker per question — the work is LLM-API-bound), and
    scores each prediction with gita_metric. A per-question failure becomes
    a {"question", "error", "score": 0.0} row instead of aborting the suite.

    Returns {"questions": [...], "avg_score": float, "timestamp": str},
    with results in the original TEST_QUESTIONS order.
    """
    log("=== STARTING TEST SUITE ===", logfile)
    sys.path.insert(0, str(ROOT))

    # Heavy imports are deferred so importing this module stays cheap.
    import config
    from advisor import load_optimized
    from metrics import gita_metric
    import dspy
    from concurrent.futures import ThreadPoolExecutor, as_completed

    config.configure_dspy()

    advisor = load_optimized()
    n = len(TEST_QUESTIONS)

    def run_one(i_q):
        # Worker: answer and score one question. Never raises — errors are
        # folded into the result row so as_completed always gets a value.
        i, q = i_q
        try:
            pred = advisor(user_question=q, history=dspy.History(messages=[]))
            gold = dspy.Example(user_question=q).with_inputs("user_question")
            m = gita_metric(gold, pred)
            return i, q, {
                "question": q,
                "score": round(float(m.score), 3),
                "word_count": len(pred.response.split()),
                "sources_cited": pred.sources_cited,
                "response_excerpt": pred.response[:200],
                "feedback_excerpt": m.feedback[:500],
            }
        except Exception as e:
            return i, q, {"question": q, "error": str(e), "score": 0.0}

    indexed = list(enumerate(TEST_QUESTIONS, 1))
    results_map = {}
    with ThreadPoolExecutor(max_workers=n) as pool:
        futures = {pool.submit(run_one, iq): iq for iq in indexed}
        for fut in as_completed(futures):
            i, q, result = fut.result()
            results_map[i] = result
            if "error" in result:
                log(f" [{i}/{n}] ERROR: {result['error']}", logfile)
            else:
                log(f" [{i}/{n}] score={result['score']:.3f} wc={result['word_count']} sources={result['sources_cited']}", logfile)

    # Reassemble in question order — as_completed yields in finish order.
    results = [results_map[i] for i in range(1, n + 1)]
    avg = sum(r.get("score", 0) for r in results) / n
    log(f"=== TEST SUITE DONE — avg score: {avg:.3f} ===", logfile)
    return {"questions": results, "avg_score": round(avg, 3), "timestamp": ts()}
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def dump_prompts(logfile):
    """Re-extract and log optimized prompts to a human-readable file."""
    if not OPTIMIZED_PATH.exists():
        log(" No optimized program found — skipping prompt dump.", logfile)
        return

    sys.path.insert(0, str(ROOT))
    import config
    from advisor import GitaAdvisor
    config.configure_dspy()

    advisor = GitaAdvisor()
    try:
        advisor.load(str(OPTIMIZED_PATH))
    except Exception as e:
        log(f" Could not load optimized program: {e}", logfile)
        return

    out = ["# Optimized Prompts after GEPA overnight run", f"# Extracted at {ts()}", ""]
    for name, pred in advisor.named_predictors():
        sig = pred.signature
        # One markdown section per predictor: instructions, then field docs.
        out.extend([f"## {name}", "### Instructions", sig.instructions or "(none)", ""])
        out.append("### Field descriptions")
        for field_name, field in sig.fields.items():
            extras = field.json_schema_extra or {}
            desc = extras.get("desc", "") if isinstance(extras, dict) else ""
            out.append(f" {field_name}: {desc}")
        out.extend(["", "---", ""])

    PROMPTS_PATH.write_text("\n".join(out), encoding="utf-8")
    log(f" Prompts written to {PROMPTS_PATH}", logfile)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def main():
    """Orchestrate the overnight run: GEPA light, GEPA medium, prompt dump, test suite."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip-light", action="store_true")
    parser.add_argument("--skip-medium", action="store_true")
    parser.add_argument("--skip-tests", action="store_true")
    args = parser.parse_args()

    LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

    with LOG_PATH.open("w", encoding="utf-8") as logfile:
        log("=== OVERNIGHT GEPA RUN STARTED ===", logfile)
        log(f"Dataset: {ROOT / 'data' / 'synthetic_questions.jsonl'}", logfile)
        log(f"Output: {OPTIMIZED_PATH}", logfile)

        python = sys.executable

        # ── Phase 1: Light ──
        if args.skip_light:
            log("Skipping light phase (--skip-light).", logfile)
        else:
            if not run_phase([python, "optimize_gepa.py", "--auto", "light"], "GEPA LIGHT", logfile):
                # A failed light phase means nothing downstream is worth running.
                log("Light phase failed — stopping overnight run.", logfile)
                sys.exit(1)
            # Back up light result
            if OPTIMIZED_PATH.exists():
                import shutil
                backup = OPTIMIZED_PATH.with_suffix(".light.json")
                shutil.copy(OPTIMIZED_PATH, backup)
                log(f" Backed up light result to {backup}", logfile)

        # ── Phase 2: Medium ──
        if args.skip_medium:
            log("Skipping medium phase (--skip-medium).", logfile)
        else:
            if not run_phase([python, "optimize_gepa.py", "--auto", "medium"], "GEPA MEDIUM", logfile):
                log("Medium phase failed.", logfile)
                # Don't exit — still dump whatever we have

        # ── Dump prompts ──
        log("Extracting optimized prompts ...", logfile)
        dump_prompts(logfile)

        # ── Test suite ──
        if args.skip_tests:
            log("Skipping test suite (--skip-tests).", logfile)
        else:
            test_results = run_test_suite(logfile)
            RESULTS_PATH.write_text(json.dumps(test_results, indent=2, ensure_ascii=False), encoding="utf-8")
            log(f"Test results written to {RESULTS_PATH}", logfile)

        log("=== OVERNIGHT RUN COMPLETE ===", logfile)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# Script entry point: run the full overnight pipeline.
if __name__ == "__main__":
    main()
|