arvjay and Claude Sonnet 4.6 committed
Commit a9352d6 · 1 Parent(s): 3df194c

deploy Gita Advisor as Gradio Space


- add app.py: Gradio ChatInterface wrapping the advisor pipeline
- add .gitignore: exclude .env, __pycache__, raw data, logs
- update README.md: add HF Spaces frontmatter
- update requirements.txt: add gradio>=4.0
- include artifacts/chroma/ (via LFS), optimized_advisor.json, corpus_enriched.jsonl

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes. See the raw diff for the rest.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +14 -0
  3. README.md +118 -7
  4. advisor.py +157 -0
  5. app.py +72 -0
  6. artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/data_level0.bin +3 -0
  7. artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/header.bin +3 -0
  8. artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/index_metadata.pickle +3 -0
  9. artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/length.bin +3 -0
  10. artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/link_lists.bin +3 -0
  11. artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/data_level0.bin +3 -0
  12. artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/header.bin +3 -0
  13. artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/length.bin +3 -0
  14. artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/link_lists.bin +0 -0
  15. artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/data_level0.bin +3 -0
  16. artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/header.bin +3 -0
  17. artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/length.bin +3 -0
  18. artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/link_lists.bin +0 -0
  19. artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/data_level0.bin +3 -0
  20. artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/header.bin +3 -0
  21. artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/length.bin +3 -0
  22. artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/link_lists.bin +0 -0
  23. artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/data_level0.bin +3 -0
  24. artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/header.bin +3 -0
  25. artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/index_metadata.pickle +3 -0
  26. artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/length.bin +3 -0
  27. artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/link_lists.bin +3 -0
  28. artifacts/chroma/chroma.sqlite3 +3 -0
  29. artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/data_level0.bin +3 -0
  30. artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/header.bin +3 -0
  31. artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/length.bin +3 -0
  32. artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/link_lists.bin +0 -0
  33. artifacts/optimized_advisor.json +157 -0
  34. chat.py +392 -0
  35. config.py +201 -0
  36. corpus.py +224 -0
  37. data/corpus_enriched.jsonl +0 -0
  38. dataset_generator.py +332 -0
  39. download_sources.py +195 -0
  40. enrich_corpus.py +174 -0
  41. enrichment.py +266 -0
  42. ingest_corpus.py +203 -0
  43. knowledge_base.py +416 -0
  44. metrics.py +435 -0
  45. optimize_gepa.py +200 -0
  46. parsers/__init__.py +0 -0
  47. parsers/gita_json.py +236 -0
  48. parsers/sastry_archive.py +249 -0
  49. requirements.txt +11 -0
  50. run_overnight.py +230 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
.env
__pycache__/
*.pyc
*.pyo
data/raw/
data/enrichment_cache.jsonl
data/corpus.jsonl
artifacts/gepa_logs/
artifacts/gepa_state.bin
artifacts/*.log
Gita-advisor/
sources_local/
sources/
.DS_Store
README.md CHANGED
@@ -1,14 +1,125 @@
  ---
  title: Gita Advisor
- emoji: 📊
- colorFrom: indigo
- colorTo: green
+ emoji: 🕉️
+ colorFrom: yellow
+ colorTo: orange
  sdk: gradio
- sdk_version: 6.13.0
+ sdk_version: 5.33.0
  app_file: app.py
  pinned: false
- license: mit
- short_description: 'Bhagvad Gita Spiritual Advisor '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

The rest of the README is new content:

# Gītā Advisor

A spiritual advisor grounded in Advaita Vedānta as taught by Śaṅkarācārya,
optimized via DSPy + GEPA against a local LM Studio model. The advisor takes
real-life questions or vents and produces responses that are empathetic to
the felt experience, faithful to the non-dual lineage, and grounded in
exact-cited verses from the Gītā with Śaṅkara's bhāṣya, the principal
Upaniṣads, the Brahma Sūtras, and the prakaraṇa-granthas.

## What makes this design unusual

The first unusual choice is that the unit of retrieval is the verse, not
the chunk. Scripture is not arbitrary prose: each Gītā śloka, each
Upaniṣadic mantra, each sūtra is a sealed teaching unit with a stable
citation reference. We index by `verse_id` (e.g. `bhagavad_gita_02_47`,
which renders as `BG 2.47` in citations) so the advisor's references can be
exact-match-verified against the retrieved set.
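
To make the convention concrete, here is a minimal sketch of the
id-to-citation rendering and the exact-match check (illustrative only; the
names and the abbreviation table are assumptions, and the canonical logic
lives in `corpus.py` and `metrics.py`):

```python
# Illustrative sketch, not the project's actual implementation.
def render_citation(verse_id: str) -> str:
    """'bhagavad_gita_02_47' -> 'BG 2.47' (the convention described above)."""
    work, chapter, verse = verse_id.rsplit("_", 2)
    abbrev = {"bhagavad_gita": "BG"}.get(work, work)
    return f"{abbrev} {int(chapter)}.{int(verse)}"

def citations_grounded(cited: list[str], retrieved_ids: list[str]) -> bool:
    """Every reference the advisor cites must come from the retrieved set."""
    retrieved_refs = {render_citation(v) for v in retrieved_ids}
    return all(ref in retrieved_refs for ref in cited)
```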

The second unusual choice is that we use an LLM, in a one-time offline
pass, to enrich each verse with structured fields a real person's
question can match against. A user does not write "I am experiencing
rāga toward kāmya-karma"; they write "I worked on this for three years and
it just failed." So we ask the model, for each verse, to produce a
plain-English paraphrase, the Vedāntic themes engaged, the life situations
addressed, the emotions met, the practical teaching offered, and five
hypothetical first-person questions the verse would speak to. We then
embed three views of each verse — the literal translation, Śaṅkara's
bhāṣya, and the LLM-enriched advisor view — and at retrieval time query
all three and merge by verse ID, as sketched below.

The advisor view dominates retrieval because that is where the language
gap closes. The literal and bhāṣya views act as insurance against the
enrichment pipeline missing a topic.
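
A sketch of the merge step (the real implementation lives in
`knowledge_base.py`; the hit layout and the per-view search callable are
assumptions):

```python
from typing import Callable

def search_merged(search_view: Callable[[str, str, int], list[dict]],
                  query: str, k: int = 8) -> list[dict]:
    """Query the literal, bhāṣya, and advisor views; keep the best hit per verse."""
    best: dict[str, dict] = {}
    for view in ("literal", "bhashya", "advisor"):
        for hit in search_view(view, query, k):  # hit: {"verse_id", "score", ...}
            vid = hit["verse_id"]
            if vid not in best or hit["score"] > best[vid]["score"]:
                best[vid] = hit
    return sorted(best.values(), key=lambda h: h["score"], reverse=True)[:k]
```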

## Where the texts come from

Every source is unambiguously open. The verse-indexed JSON at
`github.com/gita/gita`, released under the Unlicense, gives us Sanskrit
plus IAST transliteration plus word-by-word glosses for the Gītā. Alladi
Mahadeva Sastry's 1897 translation of Śaṅkara's Gītā Bhāṣya, in the public
domain and full-text on archive.org, gives us Śaṅkara's commentary
attached to each verse. The wisdomlib mirror of the *Sacred Books of the
East* is staged for the Upaniṣad-with-Śaṅkara texts and the Brahma Sūtra
bhāṣya; those parsers are registered but not yet implemented. See
`sources_registry.py` for the complete catalog and `CLAUDE.md` for the
licensing rationale.

We deliberately exclude the modern Advaita Ashrama translations (active
copyright), modern Ramaṇa and Nisargadatta editions, and Prabhupada's
commentary. If you have your own license-cleared copies, drop them in
`sources_local/` and the `plain_text` parser will fold them in.

## Pipeline of commands

```bash
pip install -r requirements.txt

# 1. Download the registered open sources to data/raw/<source_key>/
python download_sources.py

# 2. Parse + merge into data/corpus.jsonl (one verse per line)
python ingest_corpus.py

# 3. Enrich every verse via the enrichment LM (see config.py). SLOW — overnight.
#    Resumable; kill -9 is safe (append-mode cache).
python enrich_corpus.py --limit 50   # smoke-test the prompt first
python enrich_corpus.py              # then the real run

# 4. Build the three-view Chroma index
python knowledge_base.py --build

# 5. Try a query against the index
python knowledge_base.py --query "I just got laid off and feel hollow"

# 6. Smoke-test the full advisor pipeline
python smoke_test.py "I just got laid off and feel hollow"

# 7. Generate the synthetic question dataset and run GEPA
python dataset_generator.py --n 500
python optimize_gepa.py --auto medium

# 8. Open the chat CLI
python chat.py
```
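
The resumability claim in step 3 rests on an append-only cache. A sketch of
the idea (the real loop is in `enrich_corpus.py`; `enrich_one` and the exact
record layout are assumptions, while the cache path matches `.gitignore`):

```python
import json
from pathlib import Path

CORPUS = Path("data/corpus.jsonl")
CACHE = Path("data/enrichment_cache.jsonl")

def enrich_one(verse: dict) -> dict:
    raise NotImplementedError  # one LLM call per verse in the real script

# Re-reading the cache on startup makes any previous partial run count.
done = set()
if CACHE.exists():
    done = {json.loads(line)["verse_id"] for line in CACHE.open()}

with CACHE.open("a") as out:
    for line in CORPUS.open():
        verse = json.loads(line)
        if verse["verse_id"] in done:
            continue  # finished on a previous run
        out.write(json.dumps(enrich_one(verse)) + "\n")
        out.flush()  # each verse is durable the moment it is written
```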

## Project structure

The project is laid out so the data flow is left-to-right through the
pipeline: each script reads what the previous one wrote, with all
intermediate state on disk so any stage can be re-run independently. The
data model lives in `corpus.py` (`Verse` and `EnrichedVerse` dataclasses)
and is the contract between modules. The advisor itself is a `dspy.Module`
that GEPA optimizes; the metric in `metrics.py` is the specification GEPA
optimizes against, combining rule-based hygiene checks with an LLM-judge
rubric and producing structured feedback for GEPA's reflection step. See
`CLAUDE.md` for the full file map and the design commitments that should
not be silently broken.

## Configuration

`config.py` reads a small number of environment variables. The two that
matter most are `LM_STUDIO_BASE` (defaults to `http://localhost:1234/v1`)
and `LOCAL_MODEL` (defaults to `google/gemma-4-26b-a4b`, but copy whatever
LM Studio reports verbatim). The embedding model defaults to BGE-small on
Apple Silicon's MPS device; switch `EMBED_DEVICE` to `cpu` if you are not
on Apple Silicon.
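
For example, the defaults can be overridden before `config.py` is imported
(an illustrative snippet; the values shown are the documented defaults):

```python
import os

os.environ["LM_STUDIO_BASE"] = "http://localhost:1234/v1"
os.environ["LOCAL_MODEL"] = "google/gemma-4-26b-a4b"  # copy LM Studio's id verbatim
os.environ["EMBED_DEVICE"] = "cpu"                    # "mps" on Apple Silicon

import config
task_lm, reflection_lm = config.configure_dspy()
```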

## License

The code in this repository is yours to use. The texts in `data/raw/` come
with their own licenses, all unambiguously open and tracked in
`sources_registry.py`. Attributions for translators are preserved through
the pipeline and surfaced in citation footers.
advisor.py ADDED
@@ -0,0 +1,157 @@
"""
advisor.py — the composed DSPy module.

This is what GEPA optimizes. It chains four predictors:

    UnderstandQuery → PlanRetrieval → [retrieve] → SelectPassages → SynthesizeAdvice

Each predictor uses ChainOfThought so GEPA has a `reasoning` field to inspect
in its reflection step. The retriever itself is not optimized (it's vector
search), but the *queries given to it* are — that's where PlanRetrieval lives.
"""

from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any
import dspy

from signatures import (
    UnderstandQuery,
    PlanRetrieval,
    SelectPassages,
    SynthesizeAdvice,
)
from knowledge_base import AdvaitaRetriever, format_passages_for_llm
import config


@dataclass
class AdviceTrace:
    """Everything the pipeline produced — useful for the metric to reason over."""
    user_question: str
    felt_emotion: str
    surface_concern: str
    deeper_concern: str
    vedantic_themes: list[str]
    queries: list[str]
    retrieved_passages: list[dict]  # raw hits with metadata
    selected_indices: list[int]
    selection_rationale: str
    response: str
    sources_cited: list[str]


class GitaAdvisor(dspy.Module):
    def __init__(self, retriever: AdvaitaRetriever | None = None):
        super().__init__()
        self.understand = dspy.ChainOfThought(UnderstandQuery)
        self.plan = dspy.ChainOfThought(PlanRetrieval)
        self.select = dspy.ChainOfThought(SelectPassages)
        self.synthesize = dspy.ChainOfThought(SynthesizeAdvice)
        # Retriever is not a Predictor; held as a plain attribute so DSPy
        # introspection ignores it during optimization.
        self._retriever = retriever or AdvaitaRetriever()

    def forward(
        self,
        user_question: str,
        history: dspy.History | None = None,
        _stage_cb=None,
    ) -> dspy.Prediction:
        if history is None:
            history = dspy.History(messages=[])

        # 1. Understand — history lets it interpret follow-ups correctly
        if _stage_cb:
            _stage_cb("understanding your question...")
        u = self.understand(
            history=history,
            user_question=user_question,
        )

        # 2. Plan retrieval queries
        if _stage_cb:
            _stage_cb("planning search queries...")
        p = self.plan(
            surface_concern=u.surface_concern,
            deeper_concern=u.deeper_concern,
            vedantic_themes=u.vedantic_themes,
        )
        queries = p.queries[: config.N_RETRIEVAL_QUERIES] if p.queries else [u.deeper_concern]

        # 3. Retrieve
        if _stage_cb:
            _stage_cb("searching scriptures...")
        hits = self._retriever.search_many(queries, k_per=config.TOP_K_RETRIEVE)
        # Cap candidate set so the selector prompt stays focused
        candidates = hits[: max(8, config.TOP_K_RETRIEVE)]
        candidates_text = format_passages_for_llm(candidates)
        # Pre-serialize hits to dicts so the dspy.Prediction we return below
        # can be pickled by GEPA's bookkeeping. The metric reads from these
        # dicts, not from Hit objects, so it doesn't need the knowledge_base
        # import either.
        candidates_as_dicts = [h.to_dict() for h in candidates]

        # 4. Select — tell the selector what's already been cited so it prefers fresh sources
        if _stage_cb:
            _stage_cb("selecting passages...")
        previously_cited = [
            src
            for msg in history.messages
            for src in msg.get("sources_cited", [])
        ]
        s = self.select(
            deeper_concern=u.deeper_concern,
            candidate_passages=candidates_text,
            previously_cited=previously_cited,
        )
        # Defensive: clamp indices to valid range
        valid_idx = [
            i for i in (s.selected_indices or [])
            if isinstance(i, int) and 1 <= i <= len(candidates)
        ]
        if not valid_idx:
            # Fallback: take the top-3 candidates so synthesis isn't starved
            valid_idx = list(range(1, min(4, len(candidates) + 1)))

        selected = [candidates[i - 1] for i in valid_idx]
        selected_text = format_passages_for_llm(selected)

        # 5. Synthesize — history lets it build across turns, avoid repetition
        if _stage_cb:
            _stage_cb("composing response...")
        a = self.synthesize(
            history=history,
            user_question=user_question,
            felt_emotion=u.felt_emotion,
            deeper_concern=u.deeper_concern,
            selected_passages=selected_text,
        )

        return dspy.Prediction(
            response=a.response,
            sources_cited=a.sources_cited or [],
            synthesis_reasoning=getattr(a, "reasoning", ""),
            # Carry intermediate state for the metric / debugging:
            felt_emotion=u.felt_emotion,
            surface_concern=u.surface_concern,
            deeper_concern=u.deeper_concern,
            vedantic_themes=u.vedantic_themes,
            queries=queries,
            retrieved_passages=candidates_as_dicts,
            selected_indices=valid_idx,
            selection_rationale=s.selection_rationale,
        )


def load_optimized(path: str | None = None) -> GitaAdvisor:
    """Load an advisor with GEPA-optimized prompts if available, else fresh."""
    advisor = GitaAdvisor()
    p = path or str(config.OPTIMIZED_PROGRAM_PATH)
    try:
        advisor.load(p)
        print(f"Loaded optimized advisor from {p}")
    except FileNotFoundError:
        print(f"No optimized program at {p} — using base prompts.")
    return advisor
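
Note: `signatures.py` is imported above but is not among the 50 files this
view shows. A plausible sketch of one of its signatures, inferred from the
field prefixes in `artifacts/optimized_advisor.json` (an assumption, not the
actual file):

```python
import dspy

class UnderstandQuery(dspy.Signature):
    """Read the user's life situation and name the felt emotion, the surface
    concern, the deeper concern, and the relevant Advaita themes."""

    history: dspy.History = dspy.InputField()
    user_question: str = dspy.InputField()

    felt_emotion: str = dspy.OutputField()
    surface_concern: str = dspy.OutputField()
    deeper_concern: str = dspy.OutputField()
    vedantic_themes: list[str] = dspy.OutputField()
```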
app.py ADDED
@@ -0,0 +1,72 @@
"""
app.py — Gradio web interface for the Gītā Advisor.

Wraps the same advisor pipeline as chat.py but exposes it as a Gradio
ChatInterface suitable for Hugging Face Spaces (free CPU tier).

Deploy:
  - Set GEMINI_API_KEY as a Space Secret (Space Settings → Secrets)
  - Push this file + all project files + artifacts/chroma/ + data/corpus_enriched.jsonl
"""

import gradio as gr
import dspy

import config
from advisor import load_optimized
from knowledge_base import AdvaitaRetriever

# ── startup — runs once when the Space boots ───────────────────────────────────
config.configure_dspy()
_advisor = load_optimized()

# Pre-warm retriever so the first user request isn't slow
_retriever = AdvaitaRetriever()
_retriever._ensure()


# ── chat handler ───────────────────────────────────────────────────────────────
def chat(message: str, history: list) -> str:
    # Gradio type="messages" passes history as a list of {"role": ..., "content": ...}
    dspy_msgs = []
    i = 0
    while i + 1 < len(history):
        user_msg = history[i]
        bot_msg = history[i + 1]
        if user_msg.get("role") == "user" and bot_msg.get("role") == "assistant":
            dspy_msgs.append({
                "user_question": user_msg["content"],
                "response": bot_msg["content"],
                "sources_cited": [],
            })
        i += 2
    dspy_history = dspy.History(messages=dspy_msgs)

    pred = _advisor(user_question=message, history=dspy_history)

    reply = pred.response
    if pred.sources_cited:
        reply += "\n\n---\n**Sources:** " + " · ".join(pred.sources_cited)
    return reply


# ── Gradio app ─────────────────────────────────────────────────────────────────
demo = gr.ChatInterface(
    fn=chat,
    title="Gītā Advisor",
    description=(
        "A spiritual advisor grounded in Advaita Vedānta as taught by Śaṅkarācārya. "
        "Speak from where you actually are. The advisor cites exact verses from the "
        "Gītā with Śaṅkara's commentary."
    ),
    type="messages",
    examples=[
        "I just got laid off and feel like nothing makes sense.",
        "I'm terrified of dying. Is that irrational?",
        "I keep hurting the people I love without meaning to.",
        "I've been meditating for years but still feel empty. What am I missing?",
    ],
)

if __name__ == "__main__":
    demo.launch()
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e83b9e9a3e97ce6dc11715a48ebb2e9bc6f5b24f91c82d3fbc8d6b46085afb44
size 1174876
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/header.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90261dc9d8649c19b95dcba5fa4ef3bb2f5a64b26da0aefc9e58ba7a3f2fcbe0
size 100
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b626b118ac6ec87f4d2ecdb13193aa4eaadc0514fe351c284c77edcc0b491fea
size 40786
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/length.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b57dc524397d289a5ec9f2dfd69bf88c097d397d354c600fff6cbd958cadda89
size 2804
artifacts/chroma/1cb22ce3-5b5d-4ea7-84da-8e74f131266a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bdb3c70e56609aecebbde493e89cc3f2eff23d6375c762bca900dd5edc46738b
size 6204
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/data_level0.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:66cd81ca458bd620e24b378f6ce96e6d77ba4c8789d4ece914775c339be10e26
size 167600
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/header.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
size 100
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/length.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
size 400
artifacts/chroma/1f1f3474-209f-41fb-91d3-d45fd026fb05/link_lists.bin ADDED
File without changes
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/data_level0.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:201b998f2a013f78cea5960b05174ceffedbd046c4dfc10a8d2492ff8a1398a7
size 167600
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/header.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
size 100
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/length.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
size 400
artifacts/chroma/52cdeb15-0631-44ed-8618-782f1d4d27bb/link_lists.bin ADDED
File without changes
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/data_level0.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:121a03535f783813fa1ec964dd84095d2da0d83c0e5dad98955bdf0e252b33d8
size 167600
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/header.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
size 100
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/length.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
size 400
artifacts/chroma/8707047c-50b3-41ba-ad04-329e93917e30/link_lists.bin ADDED
File without changes
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/data_level0.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:68a485ecde933131e39b14c078dc82c3e0e27454cdb837d76f4be92f00bef026
size 1136328
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/header.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ce56f617adc30c61cd845ab6c84720756b445d622b4bc3160ce70dc5ce91e7e
size 100
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bde285e65f8c89d9d82ad8cd02cd29c9bfcbe7009e37a2ef50f2fc58dd630af1
size 39452
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/length.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:360e5acdc3007feb7ef856ed67807c4a8995beb723091091828f729ccba9dbcb
size 2712
artifacts/chroma/9c71c1cd-5694-4e9f-abe6-12ba1e74225b/link_lists.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fef3ffac7707ea201b174e47e15c161162d3f38b6fdbeccc43aab1af3c35e617
size 6044
artifacts/chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:acd63ba75ad75f6234302822709e44e8e560d80e120f31519c20f73d41821d22
size 20459520
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/data_level0.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:88b9416504c97719067232f4ef2a042eb4015b033f8c4a3137cd90e9f2faa468
size 167600
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/header.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
size 100
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/length.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
size 400
artifacts/chroma/d091e62b-3e8b-4cd6-842e-ff3411b384f5/link_lists.bin ADDED
File without changes
artifacts/optimized_advisor.json ADDED
@@ -0,0 +1,157 @@
{
  "understand.predict": {
    "traces": [],
    "train": [],
    "demos": [],
    "signature": {
      "instructions": "Read the user's life situation carefully, taking into account the full\nconversation so far. If there is prior exchange, use it to understand\nfollow-up messages, references like 'what you said earlier', or shifts in\nthe user's emotional state across turns. Identify the felt emotion, the\nunderlying spiritual concern (not just the surface complaint), and the\nVedāntic themes that are most relevant — drawing only from concepts native\nto Advaita Vedānta.",
      "fields": [
        { "prefix": "History:", "description": "Prior turns as a list of message dicts with 'user_question' and 'response' keys. Empty history means this is the first message." },
        { "prefix": "User Question:", "description": "The user's current message; may be a question, a vent, a follow-up, or a description of a situation." },
        { "prefix": "Reasoning:", "description": "${reasoning}" },
        { "prefix": "Felt Emotion:", "description": "The dominant emotion the user is experiencing, named precisely (e.g. 'anticipatory grief', not just 'sad')." },
        { "prefix": "Surface Concern:", "description": "What the user is literally asking about, in one sentence." },
        { "prefix": "Deeper Concern:", "description": "The underlying existential/spiritual concern — usually about identity, attachment, fear, dharma, or meaning — that the surface concern is a symptom of. One sentence." },
        { "prefix": "Vedantic Themes:", "description": "2-4 Advaita-Vedānta concepts most relevant to this situation. Use Sanskrit terms with brief gloss, e.g. 'adhyāsa (superimposition of self onto roles)', 'vairāgya (dispassion)', 'sākṣī (witness consciousness)'." }
      ]
    },
    "lm": null
  },
  "plan.predict": {
    "traces": [],
    "train": [],
    "demos": [],
    "signature": {
      "instructions": "Given the user's situation and identified themes, generate diverse search\nqueries to find relevant passages from the Advaita corpus (Bhagavad Gītā with\nŚaṅkara bhāṣya, Upaniṣads, Brahma Sūtras, prakaraṇa texts). Each query should\ntarget a different angle — one query about the philosophical principle,\none about a parallel situation in the texts, one about the practical\nteaching offered by the lineage.",
      "fields": [
        { "prefix": "Surface Concern:", "description": "${surface_concern}" },
        { "prefix": "Deeper Concern:", "description": "${deeper_concern}" },
        { "prefix": "Vedantic Themes:", "description": "${vedantic_themes}" },
        { "prefix": "Reasoning:", "description": "${reasoning}" },
        { "prefix": "Queries:", "description": "3 distinct search queries (each 5-15 words). Vary in angle: principle, parallel, practice." }
      ]
    },
    "lm": null
  },
  "select.predict": {
    "traces": [],
    "train": [],
    "demos": [],
    "signature": {
      "instructions": "From the retrieved candidate passages, select the ones that genuinely\nspeak to this user's situation. Prefer primary sources (Gītā verses,\nUpaniṣadic mantras, Śaṅkara's bhāṣya) over secondary or modern commentary\nwhen both are available. Reject passages that are merely topically adjacent\nbut don't address the actual spiritual concern. Avoid re-selecting passages\nwhose source was already cited in a prior turn — prefer fresh ground.",
      "fields": [
        { "prefix": "Deeper Concern:", "description": "${deeper_concern}" },
        { "prefix": "Candidate Passages:", "description": "Numbered candidate passages with source attribution." },
        { "prefix": "Previously Cited:", "description": "Source references already cited in earlier turns of this conversation (e.g. ['BG 2.47', 'BG 18.66']). Prefer passages not on this list so the conversation covers new ground. Empty list on the first turn." },
        { "prefix": "Reasoning:", "description": "${reasoning}" },
        { "prefix": "Selected Indices:", "description": "Indices (1-based) of the 2-4 most relevant passages." },
        { "prefix": "Selection Rationale:", "description": "One sentence per selection explaining why that passage speaks to this concern." }
      ]
    },
    "lm": null
  },
  "synthesize.predict": {
    "traces": [],
    "train": [],
    "demos": [],
    "signature": {
      "instructions": "Compose a response that is grounded in Advaita Vedānta as taught by\nŚaṅkarācārya, empathetic to the user's felt experience, and practically\nuseful for their situation. Honor the two-truths distinction: meet the user\nin vyāvahārika (transactional reality) without ever denying the\npāramārthika (absolute) view. Cite specific verses/passages by reference,\nintegrate them into prose rather than dumping quotes, and keep wit gentle —\nlight around the cosmic predicament, never light about the user's pain.\n\nIf history has prior turns: do not repeat citations or teachings already\ngiven; build on or deepen what was said; acknowledge any shift the user has\nexpressed since the last turn. If the user is following up, open by briefly\nacknowledging the continuity before moving forward.",
      "fields": [
        { "prefix": "History:", "description": "Prior turns as a list of message dicts with 'user_question' and 'response' keys. Use this to avoid repetition and to build across turns." },
        { "prefix": "User Question:", "description": "${user_question}" },
        { "prefix": "Felt Emotion:", "description": "${felt_emotion}" },
        { "prefix": "Deeper Concern:", "description": "${deeper_concern}" },
        { "prefix": "Selected Passages:", "description": "The selected passages with full source attribution." },
        { "prefix": "Reasoning:", "description": "${reasoning}" },
        { "prefix": "Response:", "description": "The advisor's reply to the user. 250-450 words. Open by acknowledging the felt experience. Move into the Vedāntic perspective. Cite at least one primary source (Gītā chapter:verse, Upaniṣad name + section, etc.). Close with a concrete practice or shift in perspective they can try this week. Address the user as 'you' throughout. Avoid Western therapy clichés." },
        { "prefix": "Sources Cited:", "description": "Source references actually cited in the response, e.g. 'BG 2.47', 'Bṛhadāraṇyaka Up. 4.4.5', 'Vivekacūḍāmaṇi 11'." }
      ]
    },
    "lm": null
  },
  "metadata": {
    "dependency_versions": {
      "python": "3.11",
      "dspy": "3.2.0",
      "cloudpickle": "3.1"
    }
  }
}
chat.py ADDED
@@ -0,0 +1,392 @@
"""
chat.py — interactive conversation with the advisor.

By default it loads the GEPA-optimized program from artifacts/. If that file
doesn't exist yet, it falls back to the un-optimized base prompts so you can
sanity-check the pipeline before running optimization.

Flags:
  --debug        Show intermediate pipeline state (felt emotion, queries, etc.)
  --thinking     Show the full synthesis reasoning trace (default: first 6 lines)
  --no-thinking  Hide the reasoning trace entirely

After each response, source references are printed with numbers.
  show <N|ref>     Display the verse text, translation, and Śaṅkara's bhāṣya.
  explain <N|ref>  Show the verse then stream a contextual explanation of how
                   it applies to the current conversation.
"""

from __future__ import annotations
import argparse
import time
import threading
from typing import Optional

import dspy
from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown
from rich.panel import Panel
from rich.rule import Rule
from rich.text import Text

import config
from advisor import load_optimized
from corpus import EnrichedVerse, Verse, read_jsonl_enriched, read_jsonl_verses


# ── speed constants ────────────────────────────────────────────────────────────
_THINKING_CPS = 800     # chars/sec for reasoning stream (secondary content, fast)
_RESPONSE_CPS = 300     # chars/sec for advisor response (primary content)
_THINKING_PREVIEW = 6   # lines shown in collapsed thinking mode


# ── verse corpus lookup ────────────────────────────────────────────────────────
def _load_verse_lookup() -> dict[str, Verse]:
    """Build a case-insensitive verse_ref → Verse dict from the corpus."""
    lookup: dict[str, Verse] = {}
    enriched = config.DATA_DIR / "corpus_enriched.jsonl"
    plain = config.DATA_DIR / "corpus.jsonl"

    if enriched.exists():
        loader, path = read_jsonl_enriched, enriched
    elif plain.exists():
        loader, path = read_jsonl_verses, plain
    else:
        return lookup

    for verse in loader(path):
        lookup[verse.verse_ref.lower().strip()] = verse
    return lookup


def _find_verse(lookup: dict, ref: str) -> Optional[Verse]:
    return lookup.get(ref.lower().strip())


def _resolve_ref(arg: str, sources_cited: list[str]) -> str:
    """Turn '1' → sources_cited[0], or return arg unchanged for direct ref lookup."""
    try:
        n = int(arg.strip())
        if 1 <= n <= len(sources_cited):
            return sources_cited[n - 1]
    except ValueError:
        pass
    return arg.strip()


# ── DSPy signature for contextual explanation ─────────────────────────────────
class _ExplainInContext(dspy.Signature):
    """You are the Gītā Advisor continuing a conversation. The user has asked
    you to unpack a specific verse or passage you cited. Explain what it means
    and why it speaks precisely to their situation — go deeper than the initial
    response did. Reference the user's words. Close with one concrete way to
    hold or work with this text this week."""

    verse_ref: str = dspy.InputField()
    verse_content: str = dspy.InputField(
        desc="Translation, original text (if available), and Śaṅkara's commentary."
    )
    conversation_context: str = dspy.InputField(
        desc="The user's question and the advisor's response where this verse was cited."
    )

    explanation: str = dspy.OutputField(
        desc="150-250 words. Grounded in Advaita. Do not merely restate the translation. "
             "End with a practical suggestion for this week."
    )


# ── streaming helpers ─────────────────────────────────────────────────────────
def _stream_chars(console: Console, text: str, cps: int):
    """Write text to the terminal character by character."""
    if not text:
        return
    delay = 1.0 / cps
    for ch in text:
        console.file.write(ch)
        console.file.flush()
        time.sleep(delay)
    console.file.write("\n")
    console.file.flush()


def _stream_response(console: Console, text: str, cps: int = _RESPONSE_CPS):
    """Stream the advisor response into a growing Markdown Panel via Rich Live."""
    if not text:
        return
    displayed = ""
    delay = 1.0 / cps
    with Live(console=console, refresh_per_second=min(cps, 30)) as live:
        for ch in text:
            displayed += ch
            live.update(Panel(
                Markdown(displayed),
                title="[bold]advisor[/bold]",
                border_style="yellow",
                padding=(1, 2),
            ))
            time.sleep(delay)


def _show_thinking(console: Console, reasoning: str, full: bool):
    """Stream the synthesis reasoning below a dim rule, collapsed to _THINKING_PREVIEW lines."""
    if not reasoning:
        return

    lines = reasoning.strip().splitlines()
    if not full and len(lines) > _THINKING_PREVIEW:
        display = "\n".join(lines[:_THINKING_PREVIEW])
        n_hidden = len(lines) - _THINKING_PREVIEW
    else:
        display = "\n".join(lines)
        n_hidden = 0

    console.print(Rule("[dim]thinking[/dim]", style="dim blue"))
    # Streaming goes straight to console.file, so Rich markup can't be applied
    # char-by-char; the dim styling of this block is cosmetic only.
    _stream_chars(console, display, cps=_THINKING_CPS)

    if n_hidden:
        console.print(f"[dim]  ↳ {n_hidden} more lines — use --thinking to expand[/dim]")
    console.print()


# ── verse display helpers ─────────────────────────────────────────────────────
def _show_verse(console: Console, verse: Verse):
    """Render a verse with its translation, original text, and commentary."""
    body = Text()

    if verse.sanskrit:
        body.append(verse.sanskrit + "\n", style="bold")
    if verse.transliteration:
        body.append(verse.transliteration + "\n", style="italic dim")

    if verse.translation:
        label = f"Translation ({verse.translator})" if verse.translator else "Translation"
        body.append(f"\n{label}:\n", style="dim")
        body.append(verse.translation + "\n")

    if verse.bhashya:
        translator_note = f" ({verse.bhashya_translator})" if verse.bhashya_translator else ""
        body.append(f"\nŚaṅkara's Bhāṣya{translator_note}:\n", style="dim")
        preview = verse.bhashya[:800] + ("…" if len(verse.bhashya) > 800 else "")
        body.append(preview + "\n", style="dim")

    ev = verse if isinstance(verse, EnrichedVerse) else None
    if ev and ev.paraphrase:
        body.append("\nTeaching: ", style="bold dim")
        body.append(ev.paraphrase + "\n", style="dim")
    if ev and ev.themes:
        body.append("Themes: ", style="bold dim")
        body.append(", ".join(ev.themes) + "\n", style="dim")
    if ev and ev.practical_teaching:
        body.append("Practical shift: ", style="bold dim")
        body.append(ev.practical_teaching + "\n", style="dim")

    section = verse.section_display or verse.section
    subtitle = verse.work_display + (f" — {section}" if section else "")
    console.print(Panel(
        body,
        title=f"[bold]{verse.verse_ref}[/bold]",
        subtitle=f"[dim]{subtitle}[/dim]",
        border_style="cyan",
        padding=(1, 2),
    ))


def _explain_in_context(
    console: Console,
    verse: Verse,
    history_messages: list[dict],
    cps: int = _RESPONSE_CPS,
):
    """Call the LM to explain the verse in context of the last conversation turn."""
    if history_messages:
        last = history_messages[-1]
        context = (
            f"User: {last.get('user_question', '')}\n\n"
            f"Advisor: {last.get('response', '')}"
        )
    else:
        context = "No prior conversation."

    bits = []
    if verse.translation:
        bits.append(f"Translation: {verse.translation}")
    if verse.sanskrit:
        bits.append(f"Sanskrit: {verse.sanskrit}")
    if verse.bhashya:
        bits.append(f"Śaṅkara's commentary: {verse.bhashya[:600]}")
    ev = verse if isinstance(verse, EnrichedVerse) else None
    if ev and ev.paraphrase:
        bits.append(f"Teaching: {ev.paraphrase}")
    verse_content = "\n\n".join(bits)

    explainer = dspy.ChainOfThought(_ExplainInContext)
    with console.status("[dim]expanding...[/dim]", spinner="dots"):
        try:
            result = explainer(
                verse_ref=verse.verse_ref,
                verse_content=verse_content,
                conversation_context=context,
            )
            explanation = result.explanation
        except Exception as exc:
            console.print(f"[red]Could not generate explanation: {exc}[/red]")
            return

    console.print()
    _stream_response(console, explanation, cps=cps)


# ── main loop ─────────────────────────────────────────────────────────────────
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--program", default=str(config.OPTIMIZED_PROGRAM_PATH))
    ap.add_argument("--debug", action="store_true",
                    help="Show intermediate pipeline state for each turn.")
    ap.add_argument("--thinking", action="store_true",
                    help="Show full synthesis reasoning trace (default: first 6 lines).")
    ap.add_argument("--no-thinking", action="store_true", dest="no_thinking",
                    help="Hide the reasoning trace entirely.")
    args = ap.parse_args()

    config.configure_dspy()
    advisor = load_optimized(args.program)
    console = Console()

    with console.status("[dim]loading corpus...[/dim]", spinner="dots"):
        verse_lookup = _load_verse_lookup()

    console.print(Panel.fit(
        "[bold]Gītā Advisor[/bold]\n\n"
        "Speak from where you actually are.\n"
        "After a response: [italic]show <N>[/italic] to read a cited verse · "
        "[italic]explain <N>[/italic] for contextual breakdown.\n"
        "Type [italic]exit[/italic] or Ctrl-D to leave.",
        border_style="cyan",
    ))

    history = dspy.History(messages=[])
    last_pred = None

    while True:
        try:
            console.print()
            console.print("[bold cyan]you:[/bold cyan] ", end="")
            line = input().strip()
        except (EOFError, KeyboardInterrupt):
            console.print("\n[dim]नमस्ते।[/dim]")
            return

        if not line:
            continue
        if line.lower() in {"exit", "quit", ":q"}:
            console.print("[dim]नमस्ते।[/dim]")
            return

        # ── source exploration commands ───────────────────────────────────────
        cmd_lower = line.lower()
        if cmd_lower.startswith(("show ", "explain ")):
            if last_pred is None:
                console.print("[dim]No sources yet — ask a question first.[/dim]")
                continue
            cmd, _, arg = line.partition(" ")
            ref = _resolve_ref(arg, last_pred.sources_cited)
            verse = _find_verse(verse_lookup, ref)
            if verse is None:
                console.print(f"[dim]'{ref}' not found in corpus.[/dim]")
                if last_pred.sources_cited:
                    hint = " ".join(
                        f"[{i+1}] {r}" for i, r in enumerate(last_pred.sources_cited)
                    )
                    console.print(f"[dim]Available: {hint}[/dim]")
                continue
            _show_verse(console, verse)
            if cmd.lower() == "explain":
                _explain_in_context(console, verse, history.messages)
            continue

        # ── normal question — run pipeline in background with live stage progress ──
        pred = None
        error = None
        stage = ["initializing..."]
        done = threading.Event()

        def run_advisor():
            nonlocal pred, error
            try:
                pred = advisor(
                    user_question=line,
                    history=history,
                    _stage_cb=lambda msg: stage.__setitem__(0, msg),
                )
            except Exception as exc:
                error = exc
            finally:
                done.set()

        threading.Thread(target=run_advisor, daemon=True).start()

        with Live(console=console, refresh_per_second=8) as live:
            while not done.wait(timeout=0.12):
                live.update(Text(f"  ◌ {stage[0]}", style="dim"))
            live.update(Text(""))

        if error:
            console.print(f"[red]Error: {error}[/red]")
            continue

        last_pred = pred
        history.messages.append({
            "user_question": line,
            "response": pred.response,
            "sources_cited": pred.sources_cited,
        })

        # debug trace
        if args.debug:
            console.print(Rule("[dim]debug[/dim]", style="dim"))
            console.print(f"[dim]felt:[/dim] {pred.felt_emotion}")
            console.print(f"[dim]surface:[/dim] {pred.surface_concern}")
            console.print(f"[dim]deeper:[/dim] {pred.deeper_concern}")
            console.print(f"[dim]themes:[/dim] {', '.join(pred.vedantic_themes)}")
            console.print(f"[dim]queries:[/dim] {pred.queries}")
            console.print(f"[dim]selected:[/dim] {pred.selected_indices}")
            for i in pred.selected_indices:
                if 1 <= i <= len(pred.retrieved_passages):
                    h = pred.retrieved_passages[i - 1]
                    m = h["meta"]
                    console.print(
                        f"  [dim]→ [{m['tier']}] {m['work']}"
                        f"{' — ' + m['section'] if m.get('section') else ''}"
                        f" (score {h['score']:.3f})[/dim]"
                    )
            console.print(Rule(style="dim"))

        # thinking section
        if not args.no_thinking:
            _show_thinking(
                console,
                getattr(pred, "synthesis_reasoning", ""),
                full=args.thinking,
            )

        # stream the response
        console.print()
        _stream_response(console, pred.response)

        # source footer with hints
        if pred.sources_cited:
            numbered = " ".join(
                f"[{i+1}] {r}" for i, r in enumerate(pred.sources_cited)
            )
            console.print(f"\n[dim]sources: {numbered}[/dim]")
            console.print(
                "[dim]  → show <N> to read the verse · explain <N> for contextual breakdown[/dim]"
            )


if __name__ == "__main__":
    main()
config.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ config.py — central configuration for the Gītā Advisor.
3
+
4
+ Three LMs are configured:
5
+
6
+ - TASK_LM: the local model running in LM Studio. Used at inference
7
+ time (understanding, retrieval planning, advice synthesis).
8
+
9
+ - ENRICH_LM: Claude Sonnet (API) for the offline enrichment pass.
10
+ The local 26B model truncates structured output at 1500
11
+ tokens and drops fields. Claude handles all six fields
12
+ cleanly in one call and costs ~$12-15 for the full 701-
13
+ verse corpus (one-time). Set ANTHROPIC_API_KEY in env.
14
+
15
+ - REFLECTION_LM: gpt-4o (OpenAI) for GEPA's reflection step.
16
+ GEPA asks the reflection LM to read metric feedback and
17
+ propose rewritten prompts — this scales strongly with
18
+ model quality. gpt-4o reasons well enough to handle
19
+ nuanced Advaita feedback without breaking the budget.
20
+ Same OPENAI_API_KEY as enrichment.
21
+ """
22
+
23
+ from __future__ import annotations
24
+ import os
25
+ import re
26
+ from pathlib import Path
27
+ import dspy
28
+ import dspy.adapters.chat_adapter as _chat_adapter_module
29
+ from dotenv import load_dotenv
30
+
31
+ # Gemma (and some other local models) output `[[ ## field ]]` without the closing `##`
32
+ # that DSPy's ChatAdapter expects (`[[ ## field ## ]]`). Patch the module-level regex
33
+ # to accept both forms before any adapter is instantiated.
34
+ _chat_adapter_module.field_header_pattern = re.compile(r"\[\[ ## (\w+)(?:\s*##)? \]\]")
35
+
36
+ load_dotenv(Path(__file__).parent / ".env") # explicit path; works from any cwd
37
+
38
+ # ──────────────────────────── Paths ────────────────────────────
39
+ ROOT = Path(__file__).parent.resolve()
40
+ SOURCES_DIR = ROOT / "sources"
41
+ DATA_DIR = ROOT / "data"
42
+ ARTIFACTS_DIR = ROOT / "artifacts"
43
+ CHROMA_DIR = ARTIFACTS_DIR / "chroma"
44
+
45
+ for d in (SOURCES_DIR, DATA_DIR, ARTIFACTS_DIR, CHROMA_DIR):
46
+ d.mkdir(parents=True, exist_ok=True)
47
+
48
+ DATASET_PATH = DATA_DIR / "synthetic_questions.jsonl"
49
+ OPTIMIZED_PROGRAM_PATH = ARTIFACTS_DIR / "optimized_advisor.json"
50
+
51
+ # ──────────────────────────── Task LM — Gemini API (preferred) ───────────────────────────
52
+ # When GEMINI_API_KEY is set, route the task LM through Google AI Studio.
53
+ # Same Gemma 4 26B weights, but no local GPU required and the free tier is
54
+ # sufficient for inference + GEPA optimization runs.
55
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
56
+ GEMINI_TASK_MODEL = os.getenv("GEMINI_TASK_MODEL", "gemini/gemma-4-26b-a4b-it")
57
+
58
+ GEMINI_TASK_LM_KWARGS = dict(
59
+ api_key=GEMINI_API_KEY,
60
+ temperature=0.6,
61
+ # Gemma 4 thinking tokens count against max_tokens in the Gemini API.
62
+ # Each pipeline call burns ~3-4k reasoning tokens before writing output,
63
+ # so 4096 gets truncated. 16384 gives comfortable headroom for both.
64
+ max_tokens=16384,
65
+ cache=True,
66
+ )
67
+
68
+ # ──────────────────────────── Task LM — LM Studio fallback ───────────────────────────────
69
+ LM_STUDIO_BASE = os.getenv("LM_STUDIO_BASE", "http://localhost:1234/v1")
70
+ LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/gemma-4-26b-a4b")
71
+
72
+ # DSPy uses LiteLLM-style model strings. "openai/" prefix routes through the
73
+ # OpenAI-compatible client, which LM Studio speaks.
74
+ TASK_MODEL_STRING = f"openai/{LOCAL_MODEL}"
75
+
76
+ TASK_LM_KWARGS = dict(
77
+ api_base=LM_STUDIO_BASE,
78
+ api_key=os.getenv("LM_STUDIO_KEY", "lm-studio"), # any non-empty string
79
+ temperature=0.6,
80
+ max_tokens=4096, # ChainOfThought reasoning + all output fields easily exceeds 2k
81
+ cache=True,
82
+ )
83
+
84
+ # Which backend to use: "gemini" if the API key is present, else "lm_studio".
85
+ # Override with TASK_LM_BACKEND=lm_studio to force local even when the key is set.
86
+ TASK_LM_BACKEND: str = os.getenv("TASK_LM_BACKEND", "gemini" if GEMINI_API_KEY else "lm_studio")
87
+
88
+
89
+ # ──────────────────────────── Enrichment LM (OpenAI gpt-4o-mini, offline batch) ─────────
90
+ # gpt-4o-mini is reliable at structured JSON output and cheap enough that the
91
+ # full 701-verse corpus costs under $1 (one-time).
92
+ #
93
+ # Cost estimate (full 701-verse corpus):
94
+ # ~1800 input tokens/verse × 701 × $0.15/M ≈ $0.19 input
95
+ # ~900 output tokens/verse × 701 × $0.60/M ≈ $0.38 output
96
+ # Total ≈ $0.57 — effectively free at this scale.
97
+ #
98
+ # Key is read from .env (OPENAI_API_KEY). Override ENRICH_MODEL env var to
99
+ # swap in a different OpenAI model (e.g. "openai/gpt-4o" for harder cases).
100
+ ENRICH_MODEL = os.getenv("ENRICH_MODEL", "openai/gpt-4o-mini")
101
+
102
+ ENRICH_LM_KWARGS = dict(
103
+ api_key=os.getenv("OPENAI_API_KEY", ""),
104
+ temperature=0.3, # lower than task LM — we want consistent structured output
105
+ max_tokens=3000, # enough headroom for all six fields + CoT reasoning
106
+ cache=True, # DSPy disk cache deduplicates identical calls on re-runs
107
+ response_format={"type": "text"}, # DSPy 3.x sends json_object by default;
108
+ # OpenAI now requires json_schema or text
109
+ )
110
+
111
+
112
+ # ──────────────────────────── Proxy Task LM (gpt-4o-mini, GEPA optimization only) ────────
113
+ # When running GEPA with --proxy-task-lm, this model replaces Gemma 4 as the task LM
114
+ # during optimization. Prompts are model-agnostic text; they transfer back to Gemma 4
115
+ # at inference time. gpt-4o-mini runs ~20x faster than Gemma 4 thinking mode, bringing
116
+ # --auto light from ~260 hours to ~2-3 hours.
117
+ PROXY_TASK_MODEL = os.getenv("PROXY_TASK_MODEL", "openai/gpt-4o-mini")
118
+
119
+ PROXY_TASK_LM_KWARGS = dict(
120
+ api_key=os.getenv("OPENAI_API_KEY", ""),
121
+ temperature=0.6,
122
+ max_tokens=4096,
123
+ cache=True,
124
+ response_format={"type": "text"},
125
+ )
126
+
127
+ # ──────────────────────────── Reflection LM (gpt-4o, GEPA) ──────────────────────────────
128
+ # GEPA's reflection step reads metric feedback and proposes rewritten prompts.
129
+ # This scales strongly with model quality. gpt-4o is the right balance here:
130
+ # it reasons well enough to write meaningful prompt mutations from nuanced
131
+ # Advaita feedback, and is affordable on a small OpenAI credit balance.
132
+ #
133
+ # Cost estimate per GEPA run (reflection calls only):
134
+ # --auto light: ~50 calls × 6k tokens ≈ $1.50
135
+ # --auto medium: ~250 calls × 6k tokens ≈ $7.50
136
+ #
137
+ # gpt-4o-mini is too shallow for this task — it produces generic rewrites
138
+ # that ignore the tradition-specific feedback the metric provides.
139
+ # Same OPENAI_API_KEY as the enrichment LM.
140
+ REFLECTION_MODEL = os.getenv("REFLECTION_MODEL", "openai/gpt-4o")
141
+
142
+ REFLECTION_LM_KWARGS = dict(
143
+ api_key=os.getenv("OPENAI_API_KEY", ""),
144
+ temperature=1.0, # GEPA wants diversity across reflection proposals
145
+ max_tokens=6000, # headroom for detailed critique + full rewritten prompt text
146
+ response_format={"type": "text"}, # same fix as enrichment LM — avoid json_object
147
+ cache=False, # reflection calls are intentionally diverse; caching defeats that
148
+ )
149
+
150
+
151
+ # ──────────────────────────── Configure helpers ───────────────────────────────────────
152
+ def configure_dspy() -> tuple[dspy.LM, dspy.LM]:
153
+ """Configure DSPy for inference and return (task_lm, reflection_lm).
154
+
155
+ Prefers Gemini API when GEMINI_API_KEY is set (same Gemma 4 26B weights,
156
+ hosted by Google, free tier). Falls back to LM Studio otherwise.
157
+ Override with TASK_LM_BACKEND=lm_studio env var to force local.
158
+
159
+ ChatAdapter fallback to JSONAdapter is disabled in both paths because:
160
+ - LM Studio rejects json_object.
161
+ - Gemma outputs `[[ ## field ]]` (no closing ##); the field_header_pattern
162
+ patch at module load time makes ChatAdapter parse these correctly.
163
+ """
164
+ if TASK_LM_BACKEND == "gemini":
165
+ task_lm = dspy.LM(model=GEMINI_TASK_MODEL, **GEMINI_TASK_LM_KWARGS)
166
+ print(f"Task LM backend: Gemini API ({GEMINI_TASK_MODEL})")
167
+ else:
168
+ task_lm = dspy.LM(model=TASK_MODEL_STRING, **TASK_LM_KWARGS)
169
+ print(f"Task LM backend: LM Studio ({TASK_MODEL_STRING} @ {LM_STUDIO_BASE})")
170
+
171
+ reflection_lm = dspy.LM(model=REFLECTION_MODEL, **REFLECTION_LM_KWARGS)
172
+ # use_json_adapter_fallback=False: LM Studio rejects json_object, so we must never fall back
173
+ dspy.configure(lm=task_lm, adapter=dspy.ChatAdapter(use_json_adapter_fallback=False))
174
+ return task_lm, reflection_lm
175
+
176
+
177
+ def configure_enrich_lm() -> dspy.LM:
178
+ """Configure DSPy globally with the Claude Sonnet enrichment LM and return it.
179
+
180
+ Call this instead of configure_dspy() when running enrich_corpus.py.
181
+ Raises if ANTHROPIC_API_KEY is not set.
182
+ """
183
+ key = os.getenv("OPENAI_API_KEY", "")
184
+ if not key:
185
+ raise SystemExit(
186
+ "OPENAI_API_KEY is not set. Add it to your .env file:\n"
187
+ " OPENAI_API_KEY=sk-proj-..."
188
+ )
189
+ lm = dspy.LM(model=ENRICH_MODEL, **ENRICH_LM_KWARGS)
190
+ dspy.configure(lm=lm)
191
+ return lm
192
+
193
+
194
+ # ──────────────────────────── Embeddings ─────────────────────────────────────────────
195
+ # Local sentence-transformer for retrieval. BGE-small is a sweet spot for
196
+ # semantic philosophy text on a Mac without burning RAM.
197
+ EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-small-en-v1.5")
198
+ EMBED_DEVICE = os.getenv("EMBED_DEVICE", "mps") # "mps" on Apple Silicon, "cpu" otherwise
199
+
200
+ TOP_K_RETRIEVE = 8 # passages to fetch per query
201
+ N_RETRIEVAL_QUERIES = 3 # the planner generates this many per user question
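For orientation, a minimal sketch of how these settings are consumed at startup. `search` is a hypothetical stand-in for the retriever in knowledge_base.py; everything else comes from config.py as defined above.

import config

def search(query: str, k: int) -> list:
    """Hypothetical retriever stub; the real one lives in knowledge_base.py."""
    return []

# Gemini if GEMINI_API_KEY is set, LM Studio otherwise (see configure_dspy above).
task_lm, reflection_lm = config.configure_dspy()

planner_queries = ["how do I act without clinging to the result?"]  # stand-in for planner output
for q in planner_queries[: config.N_RETRIEVAL_QUERIES]:
    hits = search(q, k=config.TOP_K_RETRIEVE)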
corpus.py ADDED
@@ -0,0 +1,224 @@
1
+ """
2
+ corpus.py — the data model and on-disk storage for the verse corpus.
3
+
4
+ A note on dataclasses vs. plain dicts
5
+ -------------------------------------
6
+ We could have used dicts everywhere and saved keystrokes. We don't, because
7
+ the Verse type is the contract between five different modules — parsers,
8
+ enrichment, indexing, retrieval, and the metric — and a typed contract
9
+ catches mistakes that "I thought 'sources_cited' was a list" wouldn't.
10
+
11
+ The pipeline lifecycle of a verse
12
+ ---------------------------------
13
+ parsers.* → Verse (no LLM-derived fields)
14
+ enrichment.py → EnrichedVerse (with LLM-derived fields)
15
+ knowledge_base → reads EnrichedVerse, writes 3 embeddings per verse
16
+ advisor.py → receives EnrichedVerse via retriever hits
17
+ metrics.py → uses verse_id for exact citation grounding
18
+
19
+ Storage choice
20
+ --------------
21
+ JSONL on disk. Each line is a verse. Why not Parquet, sqlite, etc.?
22
+ - Easy to grep
23
+ - Easy to diff in PRs
24
+ - Easy for a human to spot-check enrichment quality (the whole point)
25
+ - We never need to scan more than a few thousand lines, so format doesn't matter
26
+ """
27
+
28
+ from __future__ import annotations
29
+ import json
30
+ from dataclasses import dataclass, field, asdict, fields
31
+ from pathlib import Path
32
+ from typing import Iterable, Iterator
33
+
34
+
35
+ # ──────────────────────────── Verse: the raw record ────────────────────────────
36
+ @dataclass
37
+ class Verse:
38
+ """A natural unit of scripture: one verse, one mantra, one sūtra.
39
+
40
+ The required fields are minimal — every parser must produce at least these.
41
+ Optional fields (sanskrit, transliteration, bhashya, ...) are filled when
42
+ the source provides them.
43
+
44
+ `verse_id` is the global unique key. Convention:
45
+ '<work_slug>_<section_slug>_<verse_number>'
46
+ e.g. 'bhagavad_gita_02_47', 'mundaka_upanishad_2_1_3'.
47
+
48
+ `verse_ref` is the human-readable citation form:
49
+ e.g. 'BG 2.47', 'Muṇḍaka Up. 2.1.3', 'Vivekacūḍāmaṇi 11'.
50
+ The advisor's response uses this exact string in citations.
51
+ """
52
+ # Identity — required for every record
53
+ verse_id: str
54
+ work: str
55
+ work_display: str
56
+ verse_ref: str
57
+ tier: str # primary | shankara | supporting
58
+
59
+ # Section/chapter info — required when the work has chapters
60
+ section: str = "" # 'chapter_02'
61
+ section_display: str = "" # 'Chapter 2: Sāṅkhya Yoga'
62
+
63
+ # Content — at least one of {translation, bhashya} must be non-empty
64
+ translation: str = "" # English translation of the verse itself
65
+ translator: str = "" # who translated it (for attribution)
66
+
67
+ sanskrit: str = "" # original Devanāgarī
68
+ transliteration: str = "" # IAST roman transliteration
69
+ word_meanings: str = "" # word-by-word gloss when present
70
+
71
+ bhashya: str = "" # Śaṅkara's commentary on this verse, if any
72
+ bhashya_translator: str = "" # who translated the bhāṣya
73
+
74
+ # Provenance for accountability and license display
75
+ source_key: str = "" # the registry key this came from
76
+ license: str = "" # license tag from registry
77
+
78
+ def has_content(self) -> bool:
79
+ """Used by parsers/loaders to drop empty records before they pollute
80
+ the index. A 'verse' with only a verse_id and no actual text is junk."""
81
+ return bool(self.translation.strip() or self.bhashya.strip())
82
+
83
+
84
+ # ──────────────────────────── EnrichedVerse: with LLM extractions ────────────────
85
+ @dataclass
86
+ class EnrichedVerse(Verse):
87
+ """A Verse + the structured fields produced by the offline LLM pass.
88
+
89
+ Every list defaults to empty so a verse that fails enrichment can still
90
+ be stored (without enrichment, indexed only on its literal text/bhāṣya).
91
+ """
92
+ # The plain-English statement of what the verse teaches. Ideally 1–2
93
+ # sentences. This is what the synthesizer reads downstream.
94
+ paraphrase: str = ""
95
+
96
+ # Vedānta concepts engaged by the verse. Tradition-native vocabulary.
97
+ # Examples: 'karma_yoga', 'vairagya', 'sakshi', 'two_truths', 'adhyasa'.
98
+ themes: list[str] = field(default_factory=list)
99
+
100
+ # Mundane life situations where this verse would help. User-language.
101
+ # Examples: 'facing failure after sustained effort', 'watching a parent decline'.
102
+ life_situations: list[str] = field(default_factory=list)
103
+
104
+ # Emotions addressed, from a small consistent vocabulary.
105
+ # See enrichment.py EMOTION_VOCAB for the closed set.
106
+ emotions_addressed: list[str] = field(default_factory=list)
107
+
108
+ # What does this verse ask the seeker to do or shift?
109
+ practical_teaching: str = ""
110
+
111
+ # Hypothetical questions a real person might bring to this verse.
112
+ # These are gold for retrieval; they bridge the language gap.
113
+ hypothetical_questions: list[str] = field(default_factory=list)
114
+
115
+ # Quality / debugging
116
+ enrichment_model: str = "" # which LM produced these fields
117
+ enrichment_version: int = 1 # bump when the prompt changes substantively
118
+
119
+ # ---- Derived "views" used at indexing time ----
120
+ def literal_view(self) -> str:
121
+ """The literal English translation, lightly enriched with the Sanskrit
122
+ if available. Best for queries that share lexical features with the text."""
123
+ parts = []
124
+ if self.translation:
125
+ parts.append(self.translation.strip())
126
+ if self.transliteration:
127
+ parts.append(f"({self.transliteration.strip()})")
128
+ return "\n".join(parts)
129
+
130
+ def bhashya_view(self) -> str:
131
+ """Śaṅkara's commentary on this verse. Best for queries about the
132
+ Vedāntic explanation rather than the verse text itself."""
133
+ return self.bhashya.strip()
134
+
135
+ def advisor_view(self) -> str:
136
+ """The composed view that bridges the language gap.
137
+
138
+ This is what makes the user-question-→-verse mapping work. A user who
139
+ types 'I feel hollow even though I got everything I wanted' will not
140
+ find anything in the Sanskrit. They will find a near-neighbor in this
141
+ view if the enrichment did its job.
142
+ """
143
+ bits = []
144
+ if self.paraphrase:
145
+ bits.append(f"Teaching: {self.paraphrase}")
146
+ if self.life_situations:
147
+ bits.append(
148
+ "Speaks to: " + "; ".join(self.life_situations)
149
+ )
150
+ if self.emotions_addressed:
151
+ bits.append(
152
+ "Addresses: " + ", ".join(self.emotions_addressed)
153
+ )
154
+ if self.themes:
155
+ bits.append(
156
+ "Themes: " + ", ".join(self.themes)
157
+ )
158
+ if self.hypothetical_questions:
159
+ bits.append(
160
+ "Questions this answers:\n - "
161
+ + "\n - ".join(self.hypothetical_questions)
162
+ )
163
+ if self.practical_teaching:
164
+ bits.append(f"Practical shift: {self.practical_teaching}")
165
+ return "\n".join(bits)
166
+
167
+ def is_enriched(self) -> bool:
168
+ """Did enrichment populate at least the minimum-viable fields?"""
169
+ return bool(self.paraphrase) and bool(self.life_situations) and bool(self.hypothetical_questions)
170
+
171
+
172
+ # ──────────────────────────── On-disk JSONL ────────────────────────────
173
+ def write_jsonl(records: Iterable[Verse], path: Path) -> int:
174
+ """Write a stream of records as JSONL. Returns count written."""
175
+ path.parent.mkdir(parents=True, exist_ok=True)
176
+ n = 0
177
+ with path.open("w", encoding="utf-8") as f:
178
+ for r in records:
179
+ f.write(json.dumps(asdict(r), ensure_ascii=False) + "\n")
180
+ n += 1
181
+ return n
182
+
183
+
184
+ def read_jsonl_verses(path: Path) -> Iterator[Verse]:
185
+ """Read a JSONL file as Verse records. Skips lines we can't parse."""
186
+ if not path.exists():
187
+ return
188
+ with path.open(encoding="utf-8") as f:
189
+ for line_no, line in enumerate(f, start=1):
190
+ line = line.strip()
191
+ if not line:
192
+ continue
193
+ try:
194
+ d = json.loads(line)
195
+ yield _verse_from_dict(d, Verse)
196
+ except Exception as e:
197
+ print(f"[corpus] skipping malformed line {line_no} in {path}: {e}")
198
+
199
+
200
+ def read_jsonl_enriched(path: Path) -> Iterator[EnrichedVerse]:
201
+ """Read a JSONL file as EnrichedVerse records."""
202
+ if not path.exists():
203
+ return
204
+ with path.open(encoding="utf-8") as f:
205
+ for line_no, line in enumerate(f, start=1):
206
+ line = line.strip()
207
+ if not line:
208
+ continue
209
+ try:
210
+ d = json.loads(line)
211
+ yield _verse_from_dict(d, EnrichedVerse)
212
+ except Exception as e:
213
+ print(f"[corpus] skipping malformed line {line_no} in {path}: {e}")
214
+
215
+
216
+ def _verse_from_dict(d: dict, cls):
217
+ """Construct a Verse/EnrichedVerse, ignoring keys the dataclass doesn't know.
218
+
219
+ This forward-compatibility matters: if a future version adds a field, old
220
+ JSONL files should still load. And if enrichment adds extra debug fields,
221
+ we don't want the dataclass to choke on them.
222
+ """
223
+ valid = {f.name for f in fields(cls)}
224
+ return cls(**{k: v for k, v in d.items() if k in valid})
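To make the contract concrete, a small round-trip under invented field values; the translation wording and the /tmp path are illustrative only.

from pathlib import Path
from corpus import EnrichedVerse, write_jsonl, read_jsonl_enriched

ev = EnrichedVerse(
    verse_id="bhagavad_gita_02_47",
    work="bhagavad_gita",
    work_display="Bhagavad Gītā",
    verse_ref="BG 2.47",
    tier="primary",
    translation="Your right is to action alone, never to its fruits.",  # illustrative wording
    paraphrase="Act with full commitment, but release your claim on the outcome.",
    life_situations=["facing failure after sustained effort"],
    emotions_addressed=["anxiety"],
)
print(ev.advisor_view())
# Teaching: Act with full commitment, but release your claim on the outcome.
# Speaks to: facing failure after sustained effort
# Addresses: anxiety

path = Path("/tmp/corpus_demo.jsonl")
write_jsonl([ev], path)
loaded = next(read_jsonl_enriched(path))
assert loaded.verse_ref == "BG 2.47" and loaded.has_content()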
data/corpus_enriched.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
dataset_generator.py ADDED
@@ -0,0 +1,332 @@
1
+ """
2
+ dataset_generator.py — produce ~500 unique, life-grounded questions.
3
+
4
+ The dataset is the GEPA training/validation pool. We want:
5
+ - Coverage across life domains (career, grief, identity, dharma, practice, ...)
6
+ - Variety in voice (anguished / intellectual / sarcastic / exhausted / hopeful)
7
+ - Variety in form (direct question / vent / philosophical doubt / dilemma)
8
+ - Variety in age & life-stage cues
9
+ - Some cleanly Advaita-relevant, some that *force* the advisor to find the
10
+ Advaita angle in something mundane (this is where over-fitting to "spiritual"
11
+ questions usually shows up)
12
+
13
+ Strategy: structured combinatorics × LM rewriting × similarity dedupe.
14
+
15
+ We construct (domain, scenario, voice, form, age) tuples, send them to an LM
16
+ to write each as a real human message, then dedupe by embedding similarity.
17
+ """
18
+
19
+ from __future__ import annotations
20
+ import argparse
21
+ import json
22
+ import random
23
+ import re
24
+ from dataclasses import dataclass, asdict
25
+ from pathlib import Path
26
+
27
+ import numpy as np
28
+ from sentence_transformers import SentenceTransformer
29
+ from tqdm import tqdm
30
+ import dspy
31
+
32
+ import config
33
+
34
+
35
+ # ──────────────────────────── Taxonomy ────────────────────────────
36
+ DOMAINS: dict[str, list[str]] = {
37
+ "career_and_purpose": [
38
+ "got laid off after years of dedication",
39
+ "achieved the big career goal and feels empty",
40
+ "stuck in a job that pays well but feels meaningless",
41
+ "wants to leave stable career to pursue art / spiritual path",
42
+ "watching peers succeed while their own work plateaus",
43
+ "facing retirement and loss of identity tied to work",
44
+ "imposter syndrome after a major promotion",
45
+ "publicly failed in front of colleagues",
46
+ ],
47
+ "romantic_relationships": [
48
+ "going through a painful breakup after long relationship",
49
+ "marriage has gone cold and considering divorce",
50
+ "in love with someone who doesn't love them back",
51
+ "obsessive jealousy about a partner's past",
52
+ "tempted to have an affair",
53
+ "partner died and grief is overwhelming",
54
+ "afraid of commitment despite loving partner",
55
+ "single in their 40s and despairing about it",
56
+ ],
57
+ "family": [
58
+ "parent is dying and they have unresolved conflict",
59
+ "estranged from a sibling for years",
60
+ "parents pressuring them about marriage / career",
61
+ "child making destructive life choices",
62
+ "caring for an aging parent and exhausted",
63
+ "had a falling out with adult child",
64
+ "mother-in-law conflict ruining marriage",
65
+ "feels they failed as a parent",
66
+ ],
67
+ "friendship_and_social": [
68
+ "best friend betrayed their trust",
69
+ "feels invisible and lonely in their 30s",
70
+ "friend group has drifted apart with age",
71
+ "social anxiety preventing them from connecting",
72
+ "outgrown their old friends spiritually",
73
+ "discovered close friend was talking behind their back",
74
+ ],
75
+ "mortality_and_loss": [
76
+ "received a serious medical diagnosis",
77
+ "watching a loved one die slowly",
78
+ "afraid of death after a near-miss",
79
+ "grieving a sudden, unexpected loss",
80
+ "watching parents age and decline",
81
+ "lost a child",
82
+ "lost a pet who was their closest companion",
83
+ "approaching old age with regret about unlived life",
84
+ ],
85
+ "identity_and_ego": [
86
+ "tying self-worth entirely to external validation",
87
+ "endlessly comparing themselves to others on social media",
88
+ "going through midlife crisis questioning everything",
89
+ "famous and feels everyone wants something from them",
90
+ "lost sense of who they are after big life change",
91
+ "racial / cultural identity feels splintered between worlds",
92
+ "transitioning gender and family rejecting them",
93
+ ],
94
+ "material_life": [
95
+ "drowning in debt and shame about it",
96
+ "wealthy and feels guilty / disconnected because of it",
97
+ "consumed by FOMO scrolling through richer friends' lives",
98
+ "lost their home / financial security",
99
+ "struggling to give up consumerist habits despite knowing better",
100
+ "tempted by a get-rich-quick scheme",
101
+ ],
102
+ "existential": [
103
+ "feels life has no meaning at all",
104
+ "deeply depressed and going through the motions",
105
+ "constant existential dread about the world's state",
106
+ "doubting whether God / Brahman exists",
107
+ "sees through everything and now nothing feels real",
108
+ "feels they were 'born wrong' for this world",
109
+ ],
110
+ "spiritual_practice": [
111
+ "meditation has gone dry after years of practice",
112
+ "got addicted to spiritual highs and now they've stopped",
113
+ "spiritual ego — feels superior to non-practitioners",
114
+ "had a powerful experience and can't get back to it",
115
+ "doubts whether their guru / lineage is right for them",
116
+ "intellectually understands non-duality but doesn't feel it",
117
+ "afraid that liberation means losing love for family",
118
+ "can't reconcile traditional teachings with modern life",
119
+ ],
120
+ "ethics_and_dharma": [
121
+ "told a serious lie and considering whether to confess",
122
+ "harmed someone in the past and can't forgive themselves",
123
+ "facing a moral dilemma at work involving dishonesty",
124
+ "tempted to retaliate against someone who wronged them",
125
+ "torn between duty to family and personal calling",
126
+ "did something they're deeply ashamed of",
127
+ ],
128
+ "health_and_body": [
129
+ "chronic illness reshaping their entire life",
130
+ "struggling with addiction and relapse",
131
+ "eating disorder they can't seem to escape",
132
+ "chronic pain making spiritual practice feel impossible",
133
+ "hates their aging body",
134
+ "cancer diagnosis reframing everything",
135
+ ],
136
+ "modernity_specific": [
137
+ "doomscrolling and feeling worse every day",
138
+ "AI / automation making them feel obsolete",
139
+ "climate dread paralyzing their life decisions",
140
+ "political division has destroyed family relationships",
141
+ "addicted to phone / can't focus / can't read books anymore",
142
+ "online persona feels disconnected from real self",
143
+ ],
144
+ }
145
+
146
+ VOICES = [
147
+ "anguished",
148
+ "exhausted",
149
+ "intellectual and analytical",
150
+ "darkly sarcastic",
151
+ "quietly hopeful",
152
+ "numb and dissociated",
153
+ "frustrated and angry",
154
+ "softly resigned",
155
+ ]
156
+
157
+ FORMS = [
158
+ "direct question",
159
+ "venting paragraph",
160
+ "philosophical doubt",
161
+ "practical dilemma asking what to do",
162
+ "stream-of-consciousness",
163
+ ]
164
+
165
+ AGE_CUES = [
166
+ "early 20s",
167
+ "late 20s",
168
+ "early 30s",
169
+ "late 30s",
170
+ "40s",
171
+ "50s",
172
+ "60s",
173
+ "70s",
174
+ "(no age cue)",
175
+ ]
176
+
177
+
178
+ @dataclass
179
+ class QuestionRecord:
180
+ id: str
181
+ question: str
182
+ domain: str
183
+ scenario: str
184
+ voice: str
185
+ form: str
186
+ age_cue: str
187
+
188
+
189
+ # ──────────────────────────── LM-driven phrasing ────────────────────────────
190
+ class WriteUserMessage(dspy.Signature):
191
+ """Write a single, realistic message that a person might send to a spiritual
192
+ advisor. The message must reflect the given scenario, voice, form, and age
193
+ cue. Do NOT include scripture references, do NOT name Vedānta concepts —
194
+ write as a real person speaking from their actual life. Avoid generic phrases
195
+ like 'help me find peace' or 'I want to grow spiritually'. Be specific, lived,
196
+ grounded in detail. 2-6 sentences."""
197
+
198
+ scenario: str = dspy.InputField()
199
+ voice: str = dspy.InputField()
200
+ form: str = dspy.InputField()
201
+ age_cue: str = dspy.InputField()
202
+
203
+ message: str = dspy.OutputField(desc="The user's message, in first person.")
204
+
205
+
206
+ def _slug(s: str) -> str:
207
+ return re.sub(r"[^a-z0-9]+", "_", s.lower()).strip("_")[:60]
208
+
209
+
210
+ def generate_questions(target_n: int = 500, seed: int = 7, use_local: bool = False) -> list[QuestionRecord]:
211
+ """Generate ~target_n unique questions via combinatorics + LM rewriting."""
212
+ rng = random.Random(seed)
213
+ if use_local:
214
+ config.configure_dspy()
215
+ else:
216
+ config.configure_enrich_lm() # gpt-4o-mini: faster and more stylistically diverse
217
+ writer = dspy.Predict(WriteUserMessage)
218
+
219
+ # Build the (domain, scenario, voice, form, age) plan first
220
+ combos: list[tuple[str, str, str, str, str]] = []
221
+ for domain, scenarios in DOMAINS.items():
222
+ for scenario in scenarios:
223
+ # 5 variants per scenario varying voice/form/age
224
+ voices = rng.sample(VOICES, k=5)
225
+ forms = [rng.choice(FORMS) for _ in range(5)]
226
+ ages = rng.sample(AGE_CUES, k=5)
227
+ for v, f, a in zip(voices, forms, ages):
228
+ combos.append((domain, scenario, v, f, a))
229
+
230
+ rng.shuffle(combos)
231
+
232
+ # Cap to a generous over-target; we'll dedupe down to target_n
233
+ over_target = int(target_n * 1.25)
234
+ combos = combos[:over_target]
235
+
236
+ records: list[QuestionRecord] = []
237
+ for i, (domain, scenario, voice, form, age) in enumerate(tqdm(combos, desc="Generating")):
238
+ try:
239
+ out = writer(scenario=scenario, voice=voice, form=form, age_cue=age)
240
+ msg = (out.message or "").strip()
241
+ if len(msg) < 30:
242
+ continue
243
+ records.append(QuestionRecord(
244
+ id=f"q_{i:04d}_{_slug(domain)}",
245
+ question=msg,
246
+ domain=domain,
247
+ scenario=scenario,
248
+ voice=voice,
249
+ form=form,
250
+ age_cue=age,
251
+ ))
252
+ except Exception as e:
253
+ # Local LMs occasionally hiccup. Log and continue.
254
+ print(f"[warn] generation failure on combo {i}: {e}")
255
+ continue
256
+
257
+ return _dedupe_by_similarity(records, target_n=target_n)
258
+
259
+
260
+ def _dedupe_by_similarity(records: list[QuestionRecord], target_n: int, threshold: float = 0.92) -> list[QuestionRecord]:
261
+ """Embed and remove near-duplicates greedily."""
262
+ if not records:
263
+ return records
264
+ print(f"Deduping {len(records)} candidates ...")
265
+ embedder = SentenceTransformer(config.EMBED_MODEL, device=config.EMBED_DEVICE)
266
+ embs = embedder.encode(
267
+ [r.question for r in records],
268
+ normalize_embeddings=True,
269
+ show_progress_bar=True,
270
+ batch_size=32,
271
+ )
272
+ keep_idx: list[int] = []
273
+ kept_embs = []
274
+ for i, e in enumerate(embs):
275
+ if not kept_embs:
276
+ keep_idx.append(i)
277
+ kept_embs.append(e)
278
+ continue
279
+ sims = np.dot(np.stack(kept_embs), e)
280
+ if float(sims.max()) < threshold:
281
+ keep_idx.append(i)
282
+ kept_embs.append(e)
283
+ if len(keep_idx) >= target_n:
284
+ break
285
+ print(f"Kept {len(keep_idx)} after dedupe (target {target_n}).")
286
+ return [records[i] for i in keep_idx]
287
+
288
+
289
+ def save_jsonl(records: list[QuestionRecord], path: Path):
290
+ with path.open("w", encoding="utf-8") as f:
291
+ for r in records:
292
+ f.write(json.dumps(asdict(r), ensure_ascii=False) + "\n")
293
+ print(f"Wrote {len(records)} questions to {path}")
294
+
295
+
296
+ def load_jsonl(path: Path = config.DATASET_PATH) -> list[dict]:
297
+ with path.open(encoding="utf-8") as f:
298
+ return [json.loads(line) for line in f if line.strip()]
299
+
300
+
301
+ def to_dspy_examples(records: list[dict]) -> list[dspy.Example]:
302
+ """The dataset has no gold labels — that's fine. GEPA's metric uses LLM
303
+ judgment + retrieval grounding rather than reference answers.
304
+ We carry the metadata as inputs-of-record so the metric can use them."""
305
+ out = []
306
+ for r in records:
307
+ ex = dspy.Example(
308
+ user_question=r["question"],
309
+ history=dspy.History(messages=[]),
310
+ domain=r["domain"],
311
+ scenario=r["scenario"],
312
+ ).with_inputs("user_question", "history")
313
+ out.append(ex)
314
+ return out
315
+
316
+
317
+ # ──────────────────────────── CLI ────────────────────────────
318
+ def main():
319
+ ap = argparse.ArgumentParser()
320
+ ap.add_argument("--n", type=int, default=500)
321
+ ap.add_argument("--seed", type=int, default=7)
322
+ ap.add_argument("--out", type=str, default=str(config.DATASET_PATH))
323
+ ap.add_argument("--lm", choices=["openai", "local"], default="openai",
324
+ help="openai = gpt-4o-mini (default, faster); local = LM Studio task LM")
325
+ args = ap.parse_args()
326
+
327
+ records = generate_questions(target_n=args.n, seed=args.seed, use_local=(args.lm == "local"))
328
+ save_jsonl(records, Path(args.out))
329
+
330
+
331
+ if __name__ == "__main__":
332
+ main()
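A sketch of downstream consumption, assuming the default dataset file already exists at config.DATASET_PATH:

import dataset_generator as dg

records = dg.load_jsonl()                # reads config.DATASET_PATH by default
examples = dg.to_dspy_examples(records)  # unlabeled dspy.Examples for GEPA
print(len(examples), examples[0].user_question[:60])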
download_sources.py ADDED
@@ -0,0 +1,195 @@
1
+ """
2
+ download_sources.py — fetch every enabled source from the registry.
3
+
4
+ What this does
5
+ --------------
6
+ Reads sources_registry.SOURCES, walks each enabled entry, and downloads its
7
+ files into data/raw/<source_key>/. The downloader is deliberately dumb: it
8
+ just gets the bytes onto disk. Parsing happens in a separate step (parsers/)
9
+ so a download failure on one source doesn't block ingest of the others, and
10
+ so re-parsing during prompt iteration doesn't re-hit the network.
11
+
12
+ Why plain HTTPS via `requests` rather than git for everything
13
+ --------------------------------------------------------------
14
+ Most of our sources are individual JSON or HTML files. Cloning a whole repo
15
+ to get two files wastes bandwidth and makes the script brittle. For sources
16
+ that *are* whole repos (rare in our registry), prefix the URL with `git+`.
17
+
18
+ Idempotency
19
+ -----------
20
+ If a file is already present and not corrupt, we skip it. Pass --force to
21
+ re-download. This makes it safe to run repeatedly while debugging parsers.
22
+
23
+ Politeness
24
+ ----------
25
+ We send a real User-Agent and rate-limit to one request per second per host.
26
+ Internet Archive and similar mirrors are gracious to projects that play nice;
27
+ they can also throttle aggressively when they aren't.
28
+ """
29
+
30
+ from __future__ import annotations
31
+ import argparse
32
+ import shutil
33
+ import subprocess
34
+ import sys
35
+ import time
36
+ from collections import defaultdict
37
+ from pathlib import Path
38
+ from urllib.parse import urlparse
39
+
40
+ import requests
41
+ from tqdm import tqdm
42
+
43
+ import config
44
+ from sources_registry import SOURCES, Source
45
+
46
+
47
+ RAW_DIR = config.DATA_DIR / "raw"
48
+ USER_AGENT = (
49
+ "GitaAdvisor/0.2 (Advaita-Vedanta research project; "
50
+ "contact: <add your email here>)"
51
+ )
52
+
53
+ # Per-host minimum interval in seconds
54
+ MIN_INTERVAL = 1.0
55
+
56
+
57
+ def _filename_for_url(url: str) -> str:
58
+ """Derive a sensible local filename from a URL."""
59
+ parsed = urlparse(url)
60
+ name = Path(parsed.path).name or "index.html"
61
+ # archive.org sometimes serves djvu.txt with no extension on the URL;
62
+ # keep what's there.
63
+ return name
64
+
65
+
66
+ def _is_git_url(url: str) -> bool:
67
+ return url.startswith("git+")
68
+
69
+
70
+ _last_request_time: dict = defaultdict(float)
71
+
72
+
73
+ def _polite_get(url: str) -> requests.Response:
74
+ """GET with rate limiting per host."""
75
+ host = urlparse(url).netloc
76
+ elapsed = time.time() - _last_request_time[host]
77
+ if elapsed < MIN_INTERVAL:
78
+ time.sleep(MIN_INTERVAL - elapsed)
79
+ _last_request_time[host] = time.time()
80
+ return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60, stream=True)
81
+
82
+
83
+ def _download_file(url: str, dest: Path, force: bool = False) -> bool:
84
+ """Download a single URL to dest. Returns True if a download happened
85
+ (vs being skipped because already present)."""
86
+ if dest.exists() and dest.stat().st_size > 0 and not force:
87
+ return False
88
+
89
+ dest.parent.mkdir(parents=True, exist_ok=True)
90
+ tmp = dest.with_suffix(dest.suffix + ".tmp")
91
+
92
+ with _polite_get(url) as r:
93
+ r.raise_for_status()
94
+ total = int(r.headers.get("content-length", 0)) or None
95
+ with tmp.open("wb") as out, tqdm(
96
+ total=total, unit="B", unit_scale=True, leave=False, desc=dest.name
97
+ ) as bar:
98
+ for chunk in r.iter_content(chunk_size=8192):
99
+ if not chunk:
100
+ continue
101
+ out.write(chunk)
102
+ bar.update(len(chunk))
103
+
104
+ tmp.replace(dest)
105
+ return True
106
+
107
+
108
+ def _clone_git(url: str, dest_dir: Path, force: bool = False) -> bool:
109
+ """Clone a git repo (URL prefixed with 'git+') into dest_dir. Returns
110
+ True if a clone happened."""
111
+ real_url = url[len("git+"):]
112
+ if dest_dir.exists() and any(dest_dir.iterdir()) and not force:
113
+ return False
114
+ if dest_dir.exists():
115
+ shutil.rmtree(dest_dir)
116
+ dest_dir.parent.mkdir(parents=True, exist_ok=True)
117
+ subprocess.run(
118
+ ["git", "clone", "--depth=1", real_url, str(dest_dir)],
119
+ check=True,
120
+ )
121
+ return True
122
+
123
+
124
+ def download_source(src: Source, force: bool = False) -> dict:
125
+ """Download all URLs for one source. Returns a small report dict."""
126
+ target = RAW_DIR / src.key
127
+ report = {"key": src.key, "ok": 0, "skipped": 0, "failed": []}
128
+
129
+ if not src.urls:
130
+ report["failed"].append("no URLs in registry entry")
131
+ return report
132
+
133
+ for url in src.urls:
134
+ if not url:
135
+ continue
136
+ try:
137
+ if _is_git_url(url):
138
+ changed = _clone_git(url, target, force=force)
139
+ else:
140
+ fname = _filename_for_url(url)
141
+ changed = _download_file(url, target / fname, force=force)
142
+ if changed:
143
+ report["ok"] += 1
144
+ else:
145
+ report["skipped"] += 1
146
+ except Exception as e:
147
+ report["failed"].append(f"{url}: {e}")
148
+ return report
149
+
150
+
151
+ def main():
152
+ ap = argparse.ArgumentParser(description="Download all enabled sources from the registry.")
153
+ ap.add_argument("--force", action="store_true",
154
+ help="Re-download even if files exist.")
155
+ ap.add_argument("--only", nargs="*", default=None,
156
+ help="Only download these source keys.")
157
+ args = ap.parse_args()
158
+
159
+ enabled = [s for s in SOURCES if s.enabled]
160
+ if args.only:
161
+ enabled = [s for s in enabled if s.key in set(args.only)]
162
+ if not enabled:
163
+ print("No enabled sources match. Edit sources_registry.py to enable some.")
164
+ sys.exit(1)
165
+
166
+ print(f"Downloading {len(enabled)} sources to {RAW_DIR}")
167
+ print(f"User-Agent: {USER_AGENT}")
168
+ print()
169
+
170
+ any_failed = False
171
+ for src in enabled:
172
+ print(f"━━━ {src.key} — {src.name}")
173
+ print(f" license={src.license} tier={src.tier} parser={src.parser}")
174
+ if src.translator:
175
+ year = f", {src.year}" if src.year else ""
176
+ print(f" translator: {src.translator}{year}")
177
+
178
+ report = download_source(src, force=args.force)
179
+ if report["failed"]:
180
+ any_failed = True
181
+ for f in report["failed"]:
182
+ print(f" [FAIL] {f}")
183
+ print(f" downloaded={report['ok']} cached={report['skipped']}")
184
+ print()
185
+
186
+ if any_failed:
187
+ print("Some sources failed. Re-run with the network available, or "
188
+ "edit the URL in sources_registry.py if a mirror has moved.")
189
+ sys.exit(2)
190
+ print("All enabled sources are now on disk under data/raw/.")
191
+ print("Next: python ingest_corpus.py")
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
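For reference, a hypothetical invocation against a single registry entry. The Source constructor isn't shown in this diff; the field names below are inferred from how this file uses them, and the URL is a placeholder.

from download_sources import download_source
from sources_registry import Source

src = Source(                     # field names inferred from usage above
    key="example_source",
    name="Example source",
    license="public-domain",
    tier="supporting",
    parser="plain_text",
    translator="",
    year="",
    urls=["https://example.org/text.html"],  # placeholder URL
    enabled=True,
)
report = download_source(src, force=False)
print(report)   # e.g. {'key': 'example_source', 'ok': 1, 'skipped': 0, 'failed': []}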
enrich_corpus.py ADDED
@@ -0,0 +1,174 @@
1
+ """
2
+ enrich_corpus.py — run the enrichment LM over every verse, once, with caching.
3
+
4
+ The cost calculus
5
+ -----------------
6
+ For ~3,000 verses at ~30s per call on a 26B-class local model, a full pass
7
+ takes roughly a full day — call it 25 hours. That's tolerable as a one-time cost,
8
+ intolerable as a recurring one. So caching is non-negotiable. We cache by
9
+ verse_id and the enrichment_version stamp; if you change the prompt
10
+ substantively, bump the version in enrichment.py and the next run re-enriches.
11
+
12
+ What we write
13
+ -------------
14
+ data/corpus_enriched.jsonl — one EnrichedVerse per line, in the same order
15
+ as data/corpus.jsonl. Failed enrichments are still written (with empty
16
+ enrichment fields and an error stamp in enrichment_model) so the index can
17
+ still cover them on their literal text.
18
+
19
+ Concurrency
20
+ -----------
21
+ LM Studio's OpenAI-compatible server processes requests serially by default.
22
+ We don't try to parallelize at the client; if you've configured your server
23
+ for parallel decode you could raise client concurrency, but this script
24
+ deliberately issues one call at a time. For modest hardware, serial is correct.
25
+
26
+ Resumability
27
+ ------------
28
+ If the run dies halfway, just re-run. The cache at data/enrichment_cache.jsonl
29
+ remembers per-verse what we already did, so we pick up exactly where we left
30
+ off. No flag is needed for resume; it's the default behavior.
31
+ """
32
+
33
+ from __future__ import annotations
34
+ import argparse
35
+ import json
36
+ import os
37
+ from dataclasses import asdict
38
+ from pathlib import Path
39
+ from typing import Iterable
40
+
41
+ from tqdm import tqdm
42
+ import dspy
43
+
44
+ import config
45
+ from corpus import Verse, EnrichedVerse, read_jsonl_verses, write_jsonl
46
+ from enrichment import Enricher
47
+
48
+
49
+ CACHE_PATH = config.DATA_DIR / "enrichment_cache.jsonl"
50
+ ENRICHED_PATH = config.DATA_DIR / "corpus_enriched.jsonl"
51
+
52
+
53
+ # ──────────────────────────── Cache I/O ────────────────────────────
54
+ def _load_cache(path: Path) -> dict[str, EnrichedVerse]:
55
+ """Load cache as {verse_id: EnrichedVerse}. Tolerates partial writes."""
56
+ if not path.exists():
57
+ return {}
58
+ out: dict[str, EnrichedVerse] = {}
59
+ with path.open(encoding="utf-8") as f:
60
+ for line in f:
61
+ line = line.strip()
62
+ if not line:
63
+ continue
64
+ try:
65
+ d = json.loads(line)
66
+ ev = EnrichedVerse(**{k: v for k, v in d.items() if k in EnrichedVerse.__dataclass_fields__})
67
+ out[ev.verse_id] = ev
68
+ except Exception:
69
+ continue
70
+ return out
71
+
72
+
73
+ def _append_cache(path: Path, ev: EnrichedVerse) -> None:
74
+ """Append a single record. We use append-mode rather than rewriting so
75
+ a kill -9 mid-run loses at most one line."""
76
+ path.parent.mkdir(parents=True, exist_ok=True)
77
+ with path.open("a", encoding="utf-8") as f:
78
+ f.write(json.dumps(asdict(ev), ensure_ascii=False) + "\n")
79
+
80
+
81
+ # ──────────────────────────── Main loop ────────────────────────────
82
+ def enrich_all(
83
+ in_path: Path,
84
+ out_path: Path,
85
+ cache_path: Path,
86
+ limit: int | None = None,
87
+ re_enrich: bool = False,
88
+ only_failed: bool = False,
89
+ use_claude: bool = True,
90
+ ) -> None:
91
+ if use_claude:
92
+ lm = config.configure_enrich_lm()
93
+ print(f"[enrich] LM: {lm.model} (Claude API)")
94
+ else:
95
+ config.configure_dspy()
96
+ print(f"[enrich] LM: {config.LOCAL_MODEL} (local LM Studio)")
97
+ enricher = Enricher()
98
+
99
+ cache = _load_cache(cache_path) if not re_enrich else {}
100
+ print(f"[enrich] cache contains {len(cache)} previously-enriched verses")
101
+
102
+ verses = list(read_jsonl_verses(in_path))
103
+ if limit:
104
+ verses = verses[:limit]
105
+ print(f"[enrich] enriching {len(verses)} verses from {in_path}")
106
+
107
+ enriched: list[EnrichedVerse] = []
108
+ pending = []
109
+ for v in verses:
110
+ cached = cache.get(v.verse_id)
111
+ if cached and not re_enrich:
112
+ if only_failed and cached.enrichment_model.startswith("FAILED"):
113
+ pending.append(v)
114
+ else:
115
+ enriched.append(cached)
116
+ continue
117
+ else:
118
+ pending.append(v)
119
+
120
+ print(f"[enrich] {len(enriched)} from cache, {len(pending)} to call LM for")
121
+
122
+ n_failed = 0
123
+ for v in tqdm(pending, desc="enriching"):
124
+ ev = enricher(verse=v)
125
+ _append_cache(cache_path, ev)
126
+ enriched.append(ev)
127
+ if not ev.is_enriched():
128
+ n_failed += 1
129
+
130
+ # Restore original verse order from in_path
131
+ by_id = {ev.verse_id: ev for ev in enriched}
132
+ ordered = [by_id[v.verse_id] for v in verses if v.verse_id in by_id]
133
+
134
+ n_written = write_jsonl(ordered, out_path)
135
+ print(f"[enrich] wrote {n_written} enriched verses to {out_path}")
136
+ if n_failed:
137
+ print(f"[enrich] WARNING: {n_failed} verses failed enrichment "
138
+ f"(empty fields, indexed only on literal text). "
139
+ f"Re-run with --only-failed to retry just those.")
140
+
141
+
142
+ # ──────────────────────────── CLI ────────────────────────────
143
+ def main():
144
+ ap = argparse.ArgumentParser()
145
+ ap.add_argument("--in", dest="in_path",
146
+ default=str(config.DATA_DIR / "corpus.jsonl"))
147
+ ap.add_argument("--out", default=str(ENRICHED_PATH))
148
+ ap.add_argument("--cache", default=str(CACHE_PATH))
149
+ ap.add_argument("--limit", type=int, default=None,
150
+ help="Enrich only the first N verses (smoke-test).")
151
+ ap.add_argument("--re-enrich", action="store_true",
152
+ help="Ignore cache and re-enrich everything. Use this "
153
+ "when you change the enrichment prompt.")
154
+ ap.add_argument("--only-failed", action="store_true",
155
+ help="Re-run only the verses whose previous enrichment "
156
+ "failed (FAILED stamp in enrichment_model).")
157
+ ap.add_argument("--lm", choices=["claude", "local"], default="claude",
158
+ help="Which LM to use: 'claude' (default, Sonnet 4.6 via API) "
159
+ "or 'local' (LM Studio). Claude requires ANTHROPIC_API_KEY.")
160
+ args = ap.parse_args()
161
+
162
+ enrich_all(
163
+ in_path=Path(args.in_path),
164
+ out_path=Path(args.out),
165
+ cache_path=Path(args.cache),
166
+ limit=args.limit,
167
+ re_enrich=args.re_enrich,
168
+ only_failed=args.only_failed,
169
+ use_claude=(args.lm == "claude"),
170
+ )
171
+
172
+
173
+ if __name__ == "__main__":
174
+ main()
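Equivalent to a `--limit 5` smoke test, with throwaway output paths (the /tmp paths are illustrative):

from pathlib import Path
from enrich_corpus import enrich_all

enrich_all(
    in_path=Path("data/corpus.jsonl"),
    out_path=Path("/tmp/corpus_enriched_demo.jsonl"),
    cache_path=Path("/tmp/enrichment_cache_demo.jsonl"),
    limit=5,          # only the first five verses
    use_claude=True,  # hosted enrichment LM; requires OPENAI_API_KEY
)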
enrichment.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ enrichment.py — turn a Verse into an EnrichedVerse using the local LLM.
3
+
4
+ This module is the heart of the redesign. Instead of hoping that vector
5
+ similarity between a user's English question and a Sanskrit verse will find
6
+ the right teaching, we run a one-time offline pass that asks the local LLM
7
+ to translate each verse into the language a real person would use to seek
8
+ help. The output gets stored alongside the verse and embedded for retrieval.
9
+
10
+ What the prompt asks for, and why each field
11
+ --------------------------------------------
12
+ We extract six fields. Each one earns its place by closing a different gap
13
+ between scripture and a user's question:
14
+
15
+ paraphrase — what the verse teaches, in plain modern English.
16
+ This is what the synthesizer reads when writing
17
+ the advisor's reply, so paraphrase quality matters
18
+ more than embedding quality.
19
+
20
+ themes — Vedānta concepts engaged. Tradition-native names
21
+ (karma_yoga, vairagya, sakshi, two_truths). Used
22
+ for filtering and for ensuring the metric can
23
+ verify Advaita-coherence.
24
+
25
+ life_situations — the predicaments where this verse helps. User-
26
+ language. This is the field that does the actual
27
+ bridging: a query about "facing failure" finds
28
+ BG 2.47 even though those words aren't in the verse.
29
+
30
+ emotions_addressed — drawn from a fixed vocabulary so we get faceted
31
+ filtering rather than free-text drift. The metric
32
+ uses this to verify that retrieved verses actually
33
+ address the user's felt emotion.
34
+
35
+ practical_teaching — what the verse asks the seeker to do or shift.
36
+ The synthesizer uses this as the seed for its
37
+ "concrete practice you can try this week" close.
38
+
39
+ hypothetical_questions — five questions a real person might bring to the
40
+ verse. Highest-leverage field for retrieval recall.
41
+
42
+ A closed vocabulary for emotions
43
+ --------------------------------
44
+ We constrain `emotions_addressed` to the EMOTION_VOCAB list below. If we let
45
+ the LLM generate freely, we get drift: "sadness" / "sorrow" / "melancholy" /
46
+ "grief-tinged blue" all become separate buckets, and faceted filtering
47
+ becomes useless. Closed vocab keeps the index sharp.
48
+
49
+ We don't constrain themes the same way because the Sanskrit conceptual
50
+ vocabulary is open-ended and forcing the LLM into a small list would lose
51
+ information. We just normalize for casing/spacing in post-processing.
52
+
53
+ Working with a flaky local LLM
54
+ ------------------------------
55
+ Local 26B-class models occasionally produce malformed structured output.
56
+ This module assumes that. The Enricher module:
57
+ - validates output against minimum-quality checks
58
+ - retries up to 2 times
59
+ - on persistent failure, returns an EnrichedVerse with empty enrichment
60
+ fields rather than raising — so the corpus can still index on the
61
+ literal text + bhāṣya and the verse isn't lost
62
+ """
63
+
64
+ from __future__ import annotations
65
+ import re
66
+ from dataclasses import asdict
67
+ import dspy
68
+
69
+ from corpus import Verse, EnrichedVerse
70
+
71
+
72
+ # ──────────────────────────── Closed emotion vocabulary ────────────────────────────
73
+ # Twenty buckets, ordered roughly from acute to diffuse. Adding entries is
74
+ # easy; removing them risks orphaning previously-enriched records.
75
+ EMOTION_VOCAB: tuple[str, ...] = (
76
+ "grief", # acute loss
77
+ "anticipatory_grief", # loss in advance
78
+ "fear", # discrete fear
79
+ "anxiety", # chronic, diffuse
80
+ "despair", # loss of hope
81
+ "shame", # self-as-bad
82
+ "guilt", # action-as-bad
83
+ "anger",
84
+ "resentment",
85
+ "envy",
86
+ "jealousy",
87
+ "longing",
88
+ "loneliness",
89
+ "doubt", # epistemic; not knowing
90
+ "disillusionment", # the hollowness of attained goals
91
+ "boredom", # the inertness of repetition
92
+ "restlessness", # the inability to settle
93
+ "frustration",
94
+ "confusion",
95
+ "numbness", # affect-blunted
96
+ )
97
+
98
+
99
+ # ──────────────────────────── DSPy signature ────────────────────────────
100
+ class EnrichVerse(dspy.Signature):
101
+ """You are an Advaita-Vedānta-trained reader producing structured metadata
102
+ for a verse from the Bhagavad Gītā or a related scripture, so that a
103
+ spiritual advisor can later find this verse when a real person describes
104
+ a life situation in everyday language. Stay strictly within the framework
105
+ of Śaṅkarācārya's non-dual interpretation. Do not import dualistic notions
106
+ (separate creator/creature, soul-merging-into-God-as-other, etc.) and do
107
+ not bypass the verse's plain meaning by always retreating to the absolute.
108
+
109
+ The verse may include the Sanskrit, the English translation, and (when
110
+ available) Śaṅkara's commentary. Read all three. Your output is structured
111
+ fields, not prose. Be specific, lived, concrete. Avoid generic spiritual
112
+ language ('find peace', 'be in the moment'). Avoid tradition-foreign
113
+ therapy language ('honor your feelings'). When in doubt about a field,
114
+ leave it shorter rather than padded."""
115
+
116
+ # Inputs — the verse in its richest available form
117
+ verse_ref: str = dspy.InputField(desc="Citation form, e.g. 'BG 2.47'.")
118
+ sanskrit: str = dspy.InputField(desc="Devanāgarī text, may be empty.")
119
+ translation: str = dspy.InputField(desc="English translation of the verse.")
120
+ bhashya: str = dspy.InputField(desc="Śaṅkara's commentary on this verse, may be empty.")
121
+
122
+ # Outputs
123
+ paraphrase: str = dspy.OutputField(
124
+ desc="One or two sentences in plain modern English stating what the "
125
+ "verse teaches. Not a translation; a teaching summary. No jargon."
126
+ )
127
+ themes: list[str] = dspy.OutputField(
128
+ desc="2–5 Vedānta concepts the verse engages, in tradition-native "
129
+ "vocabulary with snake_case_keys, e.g. ['karma_yoga', 'non_attachment', "
130
+ "'two_truths']. Use Sanskrit terms where they're the right name."
131
+ )
132
+ life_situations: list[str] = dspy.OutputField(
133
+ desc="3–6 specific human predicaments this verse would help with, "
134
+ "in everyday English. e.g. 'facing public failure after years of "
135
+ "effort'. NOT 'finding peace' or 'spiritual growth'."
136
+ )
137
+ emotions_addressed: list[str] = dspy.OutputField(
138
+ desc="The emotions this verse meets, drawn ONLY from this fixed list: "
139
+ + ", ".join(EMOTION_VOCAB) + ". 1–4 entries."
140
+ )
141
+ practical_teaching: str = dspy.OutputField(
142
+ desc="One sentence: what the verse asks the seeker to actually do or "
143
+ "shift. If the verse is purely ontological, write 'pure ontology — "
144
+ "no direct prescription' and the field will be ignored downstream."
145
+ )
146
+ hypothetical_questions: list[str] = dspy.OutputField(
147
+ desc="EXACTLY 5 first-person questions a real person might write to a "
148
+ "spiritual advisor that this verse would speak to. Specific, "
149
+ "ungeneric, in the user's voice. NOT in scripture's voice. e.g. "
150
+ "'I worked on this for three years and it just failed publicly — "
151
+ "how do I keep going?'"
152
+ )
153
+
154
+
155
+ # ──────────────────────────── Validators ────────────────────────────
156
+ THEME_KEY_RX = re.compile(r"^[a-z][a-z0-9_]{2,40}$")
157
+
158
+
159
+ def _normalize_theme(t: str) -> str:
160
+ t = t.strip().lower()
161
+ t = re.sub(r"[\s\-]+", "_", t)
162
+ t = re.sub(r"[^a-z0-9_]", "", t)
163
+ return t
164
+
165
+
166
+ def _validate(pred) -> tuple[bool, str]:
167
+ """Light schema check. Returns (ok, reason_if_not_ok). Used to decide
168
+ whether to retry the LM call with a stricter prompt."""
169
+ paraphrase = (pred.paraphrase or "").strip()
170
+ if len(paraphrase) < 20:
171
+ return False, "paraphrase too short"
172
+
173
+ qs = pred.hypothetical_questions or []
174
+ if not isinstance(qs, list) or len(qs) < 3:
175
+ return False, f"need ≥3 hypothetical_questions, got {len(qs)}"
176
+
177
+ sits = pred.life_situations or []
178
+ if not isinstance(sits, list) or len(sits) < 2:
179
+ return False, f"need ≥2 life_situations, got {len(sits)}"
180
+
181
+ emos = pred.emotions_addressed or []
182
+ if not isinstance(emos, list) or not emos:
183
+ return False, "emotions_addressed empty"
184
+ bad = [e for e in emos if _normalize_theme(e) not in EMOTION_VOCAB]
185
+ if bad:
186
+ return False, f"emotions outside vocabulary: {bad}"
187
+
188
+ themes = pred.themes or []
189
+ if not isinstance(themes, list) or not themes:
190
+ return False, "themes empty"
191
+
192
+ return True, ""
193
+
194
+
195
+ # ──────────────────────────── Module ────────────────────────────
196
+ class Enricher(dspy.Module):
197
+ """Wraps the EnrichVerse signature with retries and post-processing.
198
+
199
+ Why ChainOfThought over Predict
200
+ -------------------------------
201
+ GEPA may eventually optimize this prompt too, and ChainOfThought gives it
202
+ a `reasoning` trace to inspect during reflection. The cost is one extra
203
+ paragraph of LM output per call, which is negligible at our scale.
204
+ """
205
+
206
+ def __init__(self, max_retries: int = 2):
207
+ super().__init__()
208
+ self.predict = dspy.ChainOfThought(EnrichVerse)
209
+ self.max_retries = max_retries
210
+
211
+ def forward(self, verse: Verse) -> EnrichedVerse:
212
+ attempt = 0
213
+ last_err = ""
214
+ pred = None
215
+
216
+ while attempt <= self.max_retries:
217
+ try:
218
+ pred = self.predict(
219
+ verse_ref=verse.verse_ref,
220
+ sanskrit=verse.sanskrit or "",
221
+ translation=verse.translation or "",
222
+ bhashya=verse.bhashya or "",
223
+ )
224
+ ok, reason = _validate(pred)
225
+ if ok:
226
+ last_err = ""  # clear the reason left by an earlier failed attempt
+ break
227
+ last_err = reason
228
+ except Exception as e:
229
+ last_err = f"LM error: {e}"
230
+ attempt += 1
231
+
232
+ # Build the EnrichedVerse from the Verse + whatever we got
233
+ base = asdict(verse)
234
+ ev = EnrichedVerse(**base)
235
+
236
+ if pred and not last_err:
237
+ ev.paraphrase = (pred.paraphrase or "").strip()
238
+ ev.practical_teaching = (pred.practical_teaching or "").strip()
239
+ ev.themes = [
240
+ _normalize_theme(t) for t in (pred.themes or [])
241
+ if THEME_KEY_RX.match(_normalize_theme(t))
242
+ ]
243
+ ev.life_situations = [
244
+ s.strip() for s in (pred.life_situations or [])
245
+ if s and len(s.strip()) >= 5
246
+ ]
247
+ ev.emotions_addressed = [
248
+ _normalize_theme(e) for e in (pred.emotions_addressed or [])
249
+ if _normalize_theme(e) in EMOTION_VOCAB
250
+ ]
251
+ ev.hypothetical_questions = [
252
+ q.strip() for q in (pred.hypothetical_questions or [])
253
+ if q and len(q.strip()) >= 10
254
+ ][:5] # cap at 5
255
+
256
+ # Stamp the model so re-runs after a model swap can be detected
257
+ try:
258
+ lm = dspy.settings.lm
259
+ ev.enrichment_model = getattr(lm, "model", "") or ""
260
+ except Exception:
261
+ pass
262
+ else:
263
+ # Enrichment failed; keep the verse but mark it
264
+ ev.enrichment_model = f"FAILED: {last_err}"
265
+
266
+ return ev
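A minimal sketch of enriching one verse in isolation; the translation wording is invented, and an API key must be configured first.

import config
from corpus import Verse
from enrichment import Enricher

config.configure_enrich_lm()  # or config.configure_dspy() for the local task LM

verse = Verse(
    verse_id="bhagavad_gita_02_47",
    work="bhagavad_gita",
    work_display="Bhagavad Gītā",
    verse_ref="BG 2.47",
    tier="primary",
    translation="Your right is to action alone, never to its fruits.",  # illustrative
)
ev = Enricher()(verse=verse)
print(ev.is_enriched(), ev.emotions_addressed, ev.enrichment_model)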
ingest_corpus.py ADDED
@@ -0,0 +1,203 @@
1
+ """
2
+ ingest_corpus.py — run the parsers and produce data/corpus.jsonl.
3
+
4
+ This script lives between download_sources.py (which gets bytes onto disk)
5
+ and enrich_corpus.py (which adds LLM-derived fields). Its specific job:
6
+
7
+ 1. Walk each enabled source in the registry.
8
+ 2. Dispatch to its parser, which yields Verse records.
9
+ 3. Merge records across sources by verse_ref.
10
+ - The Gītā parser yields verses with translation but no bhāṣya.
11
+ - The Sastry parser yields verses with bhāṣya but spotty translation.
12
+ - We want one record per verse, with both populated when possible.
13
+ 4. Write the merged stream as JSONL to data/corpus.jsonl.
14
+
15
+ Why merge by verse_ref rather than verse_id
16
+ -------------------------------------------
17
+ The Gītā parser uses work='bhagavad_gita' and the Sastry parser uses
18
+ work='bhagavad_gita_bhashya'. Their verse_ids therefore differ (different
19
+ work prefix), but their verse_refs match — both render as 'BG 2.47'. We
20
+ key the merge on verse_ref since that's the reader-facing canonical citation.
21
+
22
+ Conflict policy when merging
23
+ ----------------------------
24
+ - Translation: keep whichever record has it; if both, prefer the one whose
25
+ source_key is in the GITA_TEXT_PRIORITY list. (We want the modern, clean
26
+ Sivananda over Sastry's archaic English-of-Śaṅkara-paraphrasing-the-verse.)
27
+ - Bhāṣya: only one source produces this; conflicts shouldn't happen.
28
+ - Sanskrit / transliteration / word_meanings: prefer gita_json; richer.
29
+ """
30
+
31
+ from __future__ import annotations
32
+ import argparse
33
+ from collections import defaultdict
34
+ from pathlib import Path
35
+ from typing import Iterable
36
+
37
+ from tqdm import tqdm
38
+
39
+ import config
40
+ from corpus import Verse, write_jsonl
41
+ from sources_registry import enabled_sources, by_key, Source
42
+
43
+ # Parsers
44
+ from parsers import gita_json as parser_gita_json
45
+ from parsers import sastry_archive as parser_sastry
46
+
47
+
48
+ # When two sources both have a translation, this list decides which wins
49
+ GITA_TEXT_PRIORITY = ("gita_json_core", "sastry_gita_bhashya")
50
+
51
+
52
+ def _parse_source(src: Source, raw_dir: Path) -> Iterable[Verse]:
53
+ """Dispatch to the right parser for a registry entry.
54
+
55
+ Each parser is documented to take a directory and return an iterable of
56
+ Verses; this function is just a switch table.
57
+ """
58
+ if src.parser == "gita_json":
59
+ # The gita_json parser can take both the core dir and (optionally) a
60
+ # translations dir. We pass the same dir for both since the downloader
61
+ # puts all gita_json* files into per-source folders.
62
+ if src.key == "gita_json_core":
63
+ translations_dir = raw_dir.parent / "gita_json_translations"
64
+ return parser_gita_json.parse(
65
+ raw_dir,
66
+ translations_dir if translations_dir.exists() else None,
67
+ )
68
+ # The translations source is "consumed" alongside core, not parsed alone
69
+ return iter(())
70
+
71
+ if src.parser == "sastry_archive":
72
+ return parser_sastry.parse(raw_dir)
73
+
74
+ if src.parser == "wisdomlib_html":
75
+ # Stub for now — see parsers/wisdomlib_html.py to implement.
76
+ # We don't fail the whole ingest just because one parser is unimplemented.
77
+ print(f"[ingest] wisdomlib_html parser not implemented yet — skipping {src.key}")
78
+ return iter(())
79
+
80
+ if src.parser == "thibaut_sbe":
81
+ print(f"[ingest] thibaut_sbe parser not implemented yet — skipping {src.key}")
82
+ return iter(())
83
+
84
+ if src.parser == "plain_text":
85
+ # Reserved for user-dropped texts; future work
86
+ return iter(())
87
+
88
+ raise ValueError(f"Unknown parser type: {src.parser}")
89
+
90
+
91
+ def _merge(records: list[Verse]) -> list[Verse]:
92
+ """Merge multiple parser outputs into one record per verse_ref.
93
+
94
+ The output preserves the order of first appearance, so the corpus.jsonl
95
+ file is naturally chapter-then-verse ordered.
96
+ """
97
+ by_ref: dict[str, Verse] = {}
98
+ order: list[str] = []
99
+
100
+ for r in records:
101
+ if r.verse_ref not in by_ref:
102
+ by_ref[r.verse_ref] = r
103
+ order.append(r.verse_ref)
104
+ continue
105
+
106
+ existing = by_ref[r.verse_ref]
107
+
108
+ # Translation: pick higher-priority source if both have one
109
+ new_translation = existing.translation
110
+ new_translator = existing.translator
111
+ if r.translation and (
112
+ not existing.translation
113
+ or _priority(r.source_key) < _priority(existing.source_key)
114
+ ):
115
+ new_translation = r.translation
116
+ new_translator = r.translator
117
+
118
+ # Bhashya: only one source typically has it, take whichever isn't blank
119
+ new_bhashya = existing.bhashya or r.bhashya
120
+ new_bhashya_tr = existing.bhashya_translator or r.bhashya_translator
121
+
122
+ # Sanskrit family of fields: prefer the existing record if it has them,
123
+ # else take from the new record
124
+ merged = Verse(
125
+ verse_id=existing.verse_id,
126
+ work=existing.work, # keep work/work_display from whichever record came first
127
+ work_display=existing.work_display,
128
+ verse_ref=existing.verse_ref,
129
+ tier=_choose_tier(existing.tier, r.tier),
130
+ section=existing.section or r.section,
131
+ section_display=existing.section_display or r.section_display,
132
+ translation=new_translation,
133
+ translator=new_translator,
134
+ sanskrit=existing.sanskrit or r.sanskrit,
135
+ transliteration=existing.transliteration or r.transliteration,
136
+ word_meanings=existing.word_meanings or r.word_meanings,
137
+ bhashya=new_bhashya,
138
+ bhashya_translator=new_bhashya_tr,
139
+ source_key=existing.source_key + "+" + r.source_key,
140
+ license=existing.license or r.license,
141
+ )
142
+ by_ref[r.verse_ref] = merged
143
+
144
+ return [by_ref[k] for k in order]
145
+
146
+
147
+ def _priority(source_key: str) -> int:
148
+ """Lower is higher-priority. Sources not in the priority list rank last."""
149
+ for i, key in enumerate(GITA_TEXT_PRIORITY):
150
+ if source_key == key or source_key.startswith(key + "+") or source_key.endswith("+" + key):
151
+ return i
152
+ return 99
153
+
154
+
155
+ def _choose_tier(a: str, b: str) -> str:
156
+ """When two records merge, the tier of the merged verse is the most
157
+ 'authoritative' of the two: primary > shankara > supporting.
158
+
159
+ Why primary > shankara: when we have both the verse text (primary) and
160
+ Śaṅkara's bhāṣya on it (shankara) folded into one record, the verse
161
+ itself is what the citation refers to — so primary wins."""
162
+ rank = {"primary": 0, "shankara": 1, "supporting": 2}
163
+ return a if rank.get(a, 9) <= rank.get(b, 9) else b
164
+
+
+
+ # ──────────────────────────── CLI ────────────────────────────
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--out", default=str(config.DATA_DIR / "corpus.jsonl"))
+     args = ap.parse_args()
+
+     raw_root = config.DATA_DIR / "raw"
+     if not raw_root.exists():
+         raise SystemExit("data/raw/ doesn't exist. Run download_sources.py first.")
+
+     all_records: list[Verse] = []
+     for src in enabled_sources():
+         raw_dir = raw_root / src.key
+         if not raw_dir.exists():
+             print(f"[ingest] {src.key}: no files at {raw_dir}; skipping")
+             continue
+         print(f"[ingest] parsing {src.key} via {src.parser}")
+         try:
+             n_before = len(all_records)
+             for v in _parse_source(src, raw_dir):
+                 if v.has_content():
+                     all_records.append(v)
+             print(f"[ingest] yielded {len(all_records) - n_before} records")
+         except Exception as e:
+             print(f"[ingest] {src.key} failed: {e}")
+
+     print(f"[ingest] merging {len(all_records)} records by verse_ref ...")
+     merged = _merge(all_records)
+     print(f"[ingest] {len(merged)} unique verses after merge")
+
+     out_path = Path(args.out)
+     n = write_jsonl(merged, out_path)
+     print(f"[ingest] wrote {n} verses to {out_path}")
+     print("[ingest] next: python enrich_corpus.py")
+
+
+ if __name__ == "__main__":
+     main()
knowledge_base.py ADDED
@@ -0,0 +1,416 @@
+ """
+ knowledge_base.py — verse-indexed, multi-view RAG over the enriched corpus.
+
+ The shift from the old design
+ -----------------------------
+ The old knowledge_base.py chunked source text into 380-token windows with
+ overlap. The new one indexes each verse as a single record but with three
+ *views* — three separate embeddings of three different framings of the same
+ verse — so that queries phrased in different registers can all find it.
+
+ The three views per verse, and what each one is good for:
+
+     literal_view — the English translation (and Sanskrit fragment if
+                    available). Best for queries that share lexical features
+                    with the text itself: "what does it mean to act without
+                    attachment?" maps cleanly to BG 2.47's literal text.
+
+     bhashya_view — Śaṅkara's commentary on the verse. Best for queries that
+                    ask about the Vedāntic explanation rather than the verse
+                    itself: "how does adhyāsa relate to suffering?" finds
+                    the bhāṣya passages where Śaṅkara unfolds adhyāsa.
+
+     advisor_view — the LLM-enriched composite (paraphrase + life situations
+                    + emotions addressed + hypothetical questions). Best for
+                    real-world questions in real-world language. This is
+                    where the language gap closes.
+
+ At retrieval time we query all three indices, merge by verse_id (so each
+ verse appears at most once), and combine scores with a weighted sum that
+ gives the advisor_view the lion's share of credit while letting the
+ literal and bhāṣya views catch cases the LLM enrichment missed.
+
+ Why three indices and not one with concatenated views
+ -----------------------------------------------------
+ Concatenating literal + bhāṣya + advisor into one big text and embedding
+ that gives you the average direction across the three. Real semantic search
+ benefits from being able to match any one of the three angles strongly. The
+ extra storage (three vectors per verse instead of one) is trivial; the
+ retrieval-quality difference is large.
+
+ Storage layout
+ --------------
+ We keep three Chroma collections in artifacts/chroma/:
+     advaita_literal
+     advaita_bhashya
+     advaita_advisor
+
+ Each holds the same set of verse_ids. We resolve a hit's full record by
+ reading data/corpus_enriched.jsonl (kept small enough to live in memory).
+ """
+
+ from __future__ import annotations
+ import argparse
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Iterable
+
+ import chromadb
+ from chromadb.config import Settings
+ from sentence_transformers import SentenceTransformer
+ from tqdm import tqdm
+
+ import config
+ from corpus import EnrichedVerse, read_jsonl_enriched
+
+
+ # ──────────────────────────── Constants ────────────────────────────
+ COLLECTION_LITERAL = "advaita_literal"
+ COLLECTION_BHASHYA = "advaita_bhashya"
+ COLLECTION_ADVISOR = "advaita_advisor"
+
+ # Tier weights — multiplied into the cosine similarity at retrieval time.
+ # Same logic as before: primary scripture and Śaṅkara's pen outrank later
+ # voices when the cosine score is otherwise comparable.
+ TIER_WEIGHTS = {"primary": 1.10, "shankara": 1.10, "supporting": 1.00}
+
+ # View weights — how much each view's score contributes to the combined
+ # score per verse. The advisor view dominates because it is the one
+ # designed to bridge the language gap; literal and bhāṣya are insurance
+ # against the enrichment pipeline missing a topic.
+ VIEW_WEIGHTS = {"advisor": 0.55, "literal": 0.25, "bhashya": 0.20}
+
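+ # Worked example (numbers made up, purely illustrative): suppose a verse's
+ # cosine similarities are advisor=0.62 and literal=0.48, it was absent from
+ # the bhāṣya results (contributes 0.0), and its tier is "primary":
+ #
+ #   combined = (0.55 * 0.62 + 0.25 * 0.48 + 0.20 * 0.0) * 1.10
+ #            = (0.341 + 0.120 + 0.0) * 1.10
+ #            ≈ 0.507
+ #
+ # This is exactly the formula AdvaitaRetriever.search applies below.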
+
+ # ──────────────────────────── Hit dataclass ────────────────────────────
+ @dataclass
+ class Hit:
+     """One retrieval result, post-merge across the three views."""
+     verse: EnrichedVerse
+     combined_score: float  # used for ranking
+     view_scores: dict[str, float] = field(default_factory=dict)  # diagnostics
+
+     def __repr__(self) -> str:
+         v = self.verse
+         return (f"Hit({v.verse_ref}, tier={v.tier}, "
+                 f"score={self.combined_score:.3f}, views={self.view_scores})")
+
+     def to_dict(self) -> dict:
+         """Flatten a Hit to a JSON-serializable dict so the advisor can carry
+         it in dspy.Prediction (which is pickled during GEPA optimization), and
+         so the metric can read its fields without importing this module."""
+         v = self.verse
+         return {
+             "verse_id": v.verse_id,
+             "verse_ref": v.verse_ref,
+             "work": v.work,
+             "work_display": v.work_display,
+             "section": v.section,
+             "tier": v.tier,
+             "translation": v.translation,
+             "translator": v.translator,
+             "bhashya": v.bhashya,
+             "bhashya_translator": v.bhashya_translator,
+             "paraphrase": v.paraphrase,
+             "themes": list(v.themes),
+             "life_situations": list(v.life_situations),
+             "emotions_addressed": list(v.emotions_addressed),
+             "hypothetical_questions": list(v.hypothetical_questions),
+             "score": self.combined_score,
+             "view_scores": dict(self.view_scores),
+             # Legacy alias the old metric used:
+             "meta": {
+                 "verse_ref": v.verse_ref,
+                 "work": v.work,
+                 "section": v.section,
+                 "tier": v.tier,
+             },
+         }
+
+
+ # ──────────────────────────── Internals ────────────────────────────
+ def _client() -> chromadb.api.ClientAPI:
+     return chromadb.PersistentClient(
+         path=str(config.CHROMA_DIR),
+         settings=Settings(anonymized_telemetry=False),
+     )
+
+
+ def _embedder() -> SentenceTransformer:
+     return SentenceTransformer(config.EMBED_MODEL, device=config.EMBED_DEVICE)
+
+
+ def _record_metadata(v: EnrichedVerse) -> dict:
+     """Metadata stored alongside each chroma record so the retriever can
+     filter and report without re-loading the JSONL on every call.
+
+     chromadb requires scalar metadata values, so list-valued fields (themes,
+     emotions) are joined with semicolons. The choice of ';' is safe because
+     neither chroma nor our snake_case theme keys contain that character.
+     """
+     return {
+         "verse_id": v.verse_id,
+         "verse_ref": v.verse_ref,
+         "work": v.work,
+         "tier": v.tier,
+         "section": v.section,
+         "themes_csv": ";".join(v.themes),
+         "emotions_csv": ";".join(v.emotions_addressed),
+     }
+
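+ # Illustrative round-trip (not part of the module): the list-valued fields
+ # come back out of chroma by splitting on the same separator, e.g.
+ #
+ #   meta = _record_metadata(verse)    # {"themes_csv": "detachment;duty", ...}
+ #   themes = [t for t in meta["themes_csv"].split(";") if t]
+ #
+ # The `if t` guard covers the empty case: ";".join([]) is "", and
+ # "".split(";") is [""], so filtering restores the original empty list.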
+
+ # ──────────────────────────── Index build ────────────────────────────
+ def build_index(corpus_path: Path | None = None) -> dict[str, int]:
+     """(Re)build all three view-indices from the enriched corpus.
+
+     Returns a dict {view_name: n_records} for confirmation. The function is
+     safe to re-run; it deletes existing collections first so partial state
+     from a prior crash doesn't pollute results.
+     """
+     corpus_path = corpus_path or (config.DATA_DIR / "corpus_enriched.jsonl")
+     if not corpus_path.exists():
+         raise SystemExit(
+             f"No enriched corpus at {corpus_path}.\n"
+             f"Pipeline: download_sources.py → ingest_corpus.py → "
+             f"enrich_corpus.py → knowledge_base.py --build"
+         )
+
+     print(f"Loading embedding model: {config.EMBED_MODEL} on {config.EMBED_DEVICE}")
+     embedder = _embedder()
+
+     client = _client()
+     # Drop existing collections; build_index is "rebuild from scratch"
+     for name in (COLLECTION_LITERAL, COLLECTION_BHASHYA, COLLECTION_ADVISOR):
+         try:
+             client.delete_collection(name)
+         except Exception:
+             pass
+
+     coll_literal = client.create_collection(
+         COLLECTION_LITERAL, metadata={"hnsw:space": "cosine"})
+     coll_bhashya = client.create_collection(
+         COLLECTION_BHASHYA, metadata={"hnsw:space": "cosine"})
+     coll_advisor = client.create_collection(
+         COLLECTION_ADVISOR, metadata={"hnsw:space": "cosine"})
+
+     verses = list(read_jsonl_enriched(corpus_path))
+     print(f"Indexing {len(verses)} verses across 3 views ...")
+
+     counts = {"literal": 0, "bhashya": 0, "advisor": 0}
+
+     # We batch by view so each call to encode() is efficient. For 3000 verses
+     # at small-batch BGE this is a few seconds total per view, much faster
+     # than one-at-a-time embedding.
+     BATCH = 64
+     for view_name, view_fn, coll in (
+         ("literal", lambda v: v.literal_view(), coll_literal),
+         ("bhashya", lambda v: v.bhashya_view(), coll_bhashya),
+         ("advisor", lambda v: v.advisor_view(), coll_advisor),
+     ):
+         # Skip verses whose view is empty. A verse without a bhāṣya simply
+         # doesn't appear in the bhāṣya index — the merger handles partial
+         # coverage cleanly.
+         records = [(v, view_fn(v)) for v in verses]
+         records = [(v, t) for v, t in records if t.strip()]
+
+         for i in tqdm(range(0, len(records), BATCH), desc=f"  view: {view_name}"):
+             chunk = records[i:i + BATCH]
+             ids = [v.verse_id for v, _ in chunk]
+             texts = [t for _, t in chunk]
+             metas = [_record_metadata(v) for v, _ in chunk]
+
+             vectors = embedder.encode(
+                 texts,
+                 normalize_embeddings=True,
+                 show_progress_bar=False,
+                 batch_size=BATCH,
+             ).tolist()
+             coll.add(ids=ids, embeddings=vectors,
+                      documents=texts, metadatas=metas)
+
+         counts[view_name] = len(records)
+
+     print(f"Index built: {counts}")
+     return counts
+
+
+ # ──────────────────────────── Retriever ────────────────────────────
+ class AdvaitaRetriever:
+     """Multi-view retriever returning Hit objects backed by EnrichedVerse.
+
+     Construction loads the enriched corpus into memory (≈3000 records,
+     ≈10 MB) so we can resolve hits to full records without per-call disk
+     reads. This matters during GEPA optimization, which calls retrieve()
+     hundreds of times per evaluation pass.
+
+     The retriever is intentionally light: it doesn't filter by metadata,
+     by tier, or by emotion at query time. Filtering happens at scoring
+     (TIER_WEIGHTS) and at the SelectPassages stage downstream. Keeping
+     retrieval permissive and selection picky is more robust than the
+     reverse — when retrieval over-filters, you can never recover the
+     missed verse later in the pipeline.
+     """
+
+     def __init__(self, top_k: int = config.TOP_K_RETRIEVE,
+                  corpus_path: Path | None = None):
+         self.top_k = top_k
+         self._embedder: SentenceTransformer | None = None
+         self._coll_literal = None
+         self._coll_bhashya = None
+         self._coll_advisor = None
+
+         cp = corpus_path or (config.DATA_DIR / "corpus_enriched.jsonl")
+         self._verses_by_id: dict[str, EnrichedVerse] = {
+             v.verse_id: v for v in read_jsonl_enriched(cp)
+         }
+
+     def _ensure(self):
+         """Lazy-load embedder and collections. We avoid loading at __init__
+         so a process that only needs the corpus mapping (e.g. the metric)
+         doesn't pay the SentenceTransformer load time."""
+         if self._embedder is None:
+             self._embedder = _embedder()
+         if self._coll_advisor is None:
+             client = _client()
+             self._coll_literal = client.get_collection(COLLECTION_LITERAL)
+             self._coll_bhashya = client.get_collection(COLLECTION_BHASHYA)
+             self._coll_advisor = client.get_collection(COLLECTION_ADVISOR)
+
+     def search(self, query: str, k: int | None = None) -> list[Hit]:
+         """Run the query against all three views, merge by verse_id, and
+         return the top-k Hits sorted by combined score."""
+         self._ensure()
+         k = k or self.top_k
+
+         q_emb = self._embedder.encode(
+             [query], normalize_embeddings=True, show_progress_bar=False
+         ).tolist()
+
+         # Over-fetch from each view; we want enough overlap that the merge
+         # has something to work with. 3*k per view is a reasonable upper
+         # bound: large enough to catch verses one view ranked low and another
+         # ranked high, small enough that Chroma's HNSW stays fast.
+         per_view_k = max(8, k * 3)
+
+         view_results: dict[str, list[tuple[str, float, dict]]] = {}
+         for name, coll in (("literal", self._coll_literal),
+                            ("bhashya", self._coll_bhashya),
+                            ("advisor", self._coll_advisor)):
+             r = coll.query(query_embeddings=q_emb, n_results=per_view_k)
+             ids = r["ids"][0]
+             dists = r["distances"][0]  # cosine distance, in [0, 2]
+             metas = r["metadatas"][0]
+             view_results[name] = list(zip(ids, dists, metas))
+
+         # Merge: for each verse_id seen in any view, compute its combined
+         # score as Σ_v VIEW_WEIGHTS[v] * cos_sim(v) * tier_weight, where any
+         # view that didn't return that verse contributes 0. This is a soft
+         # voting scheme: a verse that appears strongly in one view but not
+         # others can still rank highly if that one view's weight is enough.
+         per_verse: dict[str, dict[str, float]] = {}
+         per_verse_meta: dict[str, dict] = {}
+         for view_name, results in view_results.items():
+             for vid, dist, meta in results:
+                 cos_sim = 1.0 - dist
+                 per_verse.setdefault(vid, {})[view_name] = cos_sim
+                 per_verse_meta[vid] = meta
+
+         hits: list[Hit] = []
+         for vid, view_scores in per_verse.items():
+             tier = per_verse_meta[vid].get("tier", "supporting")
+             tw = TIER_WEIGHTS.get(tier, 1.0)
+             combined = sum(
+                 VIEW_WEIGHTS[v] * view_scores.get(v, 0.0)
+                 for v in VIEW_WEIGHTS
+             ) * tw
+             verse = self._verses_by_id.get(vid)
+             if verse is None:
+                 # Index has it but corpus file doesn't — corpus and index
+                 # have drifted. Skip rather than fabricate a record.
+                 continue
+             hits.append(Hit(verse=verse,
+                             combined_score=combined,
+                             view_scores=view_scores))
+
+         hits.sort(key=lambda h: h.combined_score, reverse=True)
+         return hits[:k]
+
+     def search_many(self, queries: Iterable[str],
+                     k_per: int | None = None) -> list[Hit]:
+         """Run multiple queries (e.g. from PlanRetrieval) and dedupe by
+         verse_id, keeping the highest combined score across queries."""
+         seen: dict[str, Hit] = {}
+         for q in queries:
+             for h in self.search(q, k=k_per):
+                 cur = seen.get(h.verse.verse_id)
+                 if cur is None or h.combined_score > cur.combined_score:
+                     seen[h.verse.verse_id] = h
+         out = list(seen.values())
+         out.sort(key=lambda h: h.combined_score, reverse=True)
+         return out
+
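+ # Minimal usage sketch (illustrative; assumes the index has already been
+ # built via `python knowledge_base.py --build`):
+ #
+ #   retr = AdvaitaRetriever(top_k=8)
+ #   hits = retr.search_many([
+ #       "fear of losing my job",
+ #       "acting without attachment to results",
+ #   ], k_per=8)
+ #   for h in hits[:3]:
+ #       print(h.verse.verse_ref, round(h.combined_score, 3), h.view_scores)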
+
+ # ──────────────────────────── Formatter for the LLM ────────────────────────────
+ def format_hits_for_llm(hits: list[Hit]) -> str:
+     """Render hits for the SelectPassages and SynthesizeAdvice prompts.
+
+     We expose the verse_ref (so the synthesizer can cite it), the literal
+     translation (so the synthesizer can quote it lightly), the bhāṣya
+     snippet (so the synthesizer can ground its claims), and the advisor-view
+     fields (so the synthesizer knows *why* this verse is being suggested for
+     this user).
+
+     Each hit is bounded in length so the prompt stays tractable on a 26B
+     local model with an 8k context window.
+     """
+     blocks = []
+     for i, h in enumerate(hits, start=1):
+         v = h.verse
+         block = [f"[{i}] {v.verse_ref} — {v.work_display}, {v.section_display}"]
+         block.append(f"    tier: {v.tier}  score: {h.combined_score:.3f}")
+         if v.translation:
+             block.append(f"    Translation: {v.translation.strip()[:600]}")
+         if v.bhashya:
+             block.append(f"    Bhāṣya (Śaṅkara): {v.bhashya.strip()[:800]}")
+         if v.paraphrase:
+             block.append(f"    Teaching: {v.paraphrase}")
+         if v.life_situations:
+             block.append(f"    Speaks to: {'; '.join(v.life_situations)}")
+         if v.emotions_addressed:
+             block.append(f"    Addresses: {', '.join(v.emotions_addressed)}")
+         if v.themes:
+             block.append(f"    Themes: {', '.join(v.themes)}")
+         blocks.append("\n".join(block))
+     return "\n\n".join(blocks)
+
+
+ # Alias kept so that advisor.py — and any prior code that imported the old
+ # name — works without modification. Both names refer to the same function
+ # because the new "passages" the advisor sees ARE Hit objects backed by
+ # EnrichedVerse records; the rendering is identical.
+ format_passages_for_llm = format_hits_for_llm
+
+
+ # ──────────────────────────── CLI ────────────────────────────
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--build", action="store_true",
+                     help="(Re)build the multi-view index from corpus_enriched.jsonl")
+     ap.add_argument("--query", type=str, default=None,
+                     help="Run a test query against the index")
+     args = ap.parse_args()
+
+     if args.build:
+         build_index()
+         return
+
+     if args.query:
+         retr = AdvaitaRetriever()
+         hits = retr.search(args.query)
+         print(format_hits_for_llm(hits))
+         return
+
+     ap.print_help()
+
+
+ if __name__ == "__main__":
+     main()
metrics.py ADDED
@@ -0,0 +1,435 @@
+ """
+ metrics.py — the metric is the specification.
+
+ GEPA optimizes whatever the metric rewards. So the metric here is not a single
+ number; it's a *contract* on what an Advaita-grounded, empathetic, practically
+ useful response looks like — combined with rich textual feedback the reflection
+ LM uses to rewrite prompts.
+
+ We combine three signals:
+   1. Rule-based checks (fast, deterministic)
+      - citation grounding (cites real retrieved sources, not hallucinated)
+      - tier preference (primary + Śaṅkara > supporting)
+      - structural hygiene (length, has actionable element, no therapy clichés)
+   2. LLM-as-judge rubric scoring
+      - Advaita coherence (non-dual, not crypto-dualist)
+      - two-truths discipline (vyāvahārika ↔ pāramārthika)
+      - empathy without dissolving into the user's frame
+      - wit calibration (light around the predicament, never the pain)
+   3. Composite score + structured feedback string
+
+ The function signature matches GEPA's metric contract:
+     metric(gold, pred, trace=None, pred_name=None, pred_trace=None) -> dspy.Prediction
+
+ Returning dspy.Prediction(score=float, feedback=str) is the GEPA happy path.
+ """
+
+ from __future__ import annotations
+ import re
+ import json
+ from typing import Any
+ import dspy
+
+
+ # ──────────────────────────── Rule-based checks ────────────────────────────
+ THERAPY_CLICHES = [
+     "you got this",
+     "be kind to yourself",
+     "self-care",
+     "just remember",
+     "trust the process",
+     "everything happens for a reason",
+     "you are enough",
+     "love and light",
+     "manifesting",
+     "send positive vibes",
+     "good vibes",
+ ]
+
+ # Loose pattern catching citations like "BG 2.47", "Gītā 18.66", "Bṛhadāraṇyaka 4.4.5",
+ # "Vivekacūḍāmaṇi 11", "Kaṭha Up. 1.3.14", etc.
+ CITATION_PATTERN = re.compile(
+     r"\b("
+     r"BG\s*\d+[\.:]\d+"            # BG 2.47
+     r"|G[īi]t[āa]\s*\d+[\.:]\d+"   # Gita 2.47
+     r"|[A-ZĀĪŪṚḌṬṆṢŚḤṂa-zāīūṛḍṭṇṣśḥṃ]{3,}\s*Up\.?\s*\d+(?:[\.:]\d+){0,2}"  # Kaṭha Up. 1.2.3
+     r"|Vivekac[ūu]ḍāmaṇi\s*\d+"
+     r"|Ātmabodha\s*\d+"
+     r"|Tattvabodha\s*\d+"
+     r"|Brahma\s*S[ūu]tra\s*\d+[\.:]\d+(?:[\.:]\d+)?"
+     r"|Aṣṭāvakra\s*G[īi]t[āa]\s*\d+[\.:]\d+"
+     r")\b"
+ )
+
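+ # Quick illustration (not part of the module) of what the pattern does and
+ # doesn't catch. Because the regex has a single outer capture group,
+ # findall() returns the matched citation strings directly:
+ #
+ #   CITATION_PATTERN.findall("See BG 2.47 and Kaṭha Up. 1.3.14.")
+ #       -> ["BG 2.47", "Kaṭha Up. 1.3.14"]
+ #   CITATION_PATTERN.findall("the Gita says somewhere")
+ #       -> []   (no chapter.verse digits, so vague references don't count)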
+ EMPATHY_OPENERS = [
+     "what you", "you're carrying", "you are carrying", "i hear",
+     "this hurts", "this is painful", "the weight", "sitting with",
+     "what you describe", "the ache",
+ ]
+
+ ACTIONABLE_MARKERS = [
+     "this week", "today", "try this", "begin by", "for the next",
+     "each morning", "each evening", "when you notice", "the next time",
+     "as a practice", "sit for", "spend ", "over the next",
+ ]
+
+ NON_DUAL_MARKERS = [
+     "witness", "sākṣī", "sakshi", "non-dual", "advaita",
+     "pāramārthika", "paramarthika", "vyāvahārika", "vyavaharika",
+     "ātman", "atman", "brahman", "adhyāsa", "adhyasa", "māyā", "maya",
+     "neti neti", "tat tvam asi", "ahaṁ brahmāsmi", "aham brahmasmi",
+     "self with a capital", "the seer", "awareness itself",
+ ]
+
+
+ def _word_count(s: str) -> int:
+     return len(s.split())
+
+
+ def _has_any(text: str, needles: list[str]) -> list[str]:
+     low = text.lower()
+     return [n for n in needles if n in low]
+
+
+ def _normalize_for_match(s: str) -> str:
+     return re.sub(r"\s+", " ", s.lower()).strip()
+
+
+ def _citation_grounding(
+     sources_cited: list[str],
+     retrieved_passages: list[dict],
+ ) -> tuple[float, list[str], list[str]]:
+     """Return (grounding_score, grounded_citations, ungrounded_citations).
+
+     With the verse-indexed corpus, each retrieved passage carries an exact
+     verse_ref string ('BG 2.47', 'Muṇḍaka Up. 2.1.3', etc.). Grounding becomes
+     an exact set-membership test rather than fuzzy substring matching, which
+     is dramatically sharper feedback for GEPA's reflection step: 'BG 2.47'
+     is grounded if and only if 'BG 2.47' was in the retrieved set.
+
+     We still tolerate light formatting noise: the synthesizer might write
+     'BG 2.47', 'Bhagavad Gītā 2.47', 'Gita 2:47', etc. We canonicalize to
+     'BG <chap>.<verse>' for Gītā citations before comparing. Other works
+     are matched directly by verse_ref string with whitespace normalized.
+     """
+     if not sources_cited:
+         return 0.0, [], []
+
+     retrieved_refs = {
+         _canonicalize_ref(h.get("verse_ref") or h.get("meta", {}).get("verse_ref", ""))
+         for h in retrieved_passages
+     }
+     retrieved_refs.discard("")
+
+     grounded, ungrounded = [], []
+     for c in sources_cited:
+         canon = _canonicalize_ref(c)
+         # Try direct match first, then a "substring of any retrieved" fallback
+         # for cases where the synthesizer paraphrases the citation
+         # ('chapter 2 verse 47' vs 'BG 2.47').
+         hit = canon in retrieved_refs or any(
+             canon and (canon in r or r in canon) for r in retrieved_refs
+         )
+         (grounded if hit else ungrounded).append(c)
+
+     score = len(grounded) / max(len(sources_cited), 1)
+     return score, grounded, ungrounded
+
+
+ def _canonicalize_ref(s: str) -> str:
+     """Normalize a citation string so 'BG 2.47', 'Bhagavad Gītā 2.47',
+     'Gītā 2:47' all reduce to the same canonical form 'BG 2.47'."""
+     s = re.sub(r"\s+", " ", s.strip())
+     # Gītā variants
+     m = re.match(r"^(?:BG|Bhagavad\s*G[īi]t[āa]|G[īi]t[āa])\s*(\d+)[\.:](\d+)", s, re.I)
+     if m:
+         return f"BG {int(m.group(1))}.{int(m.group(2))}"
+     # Default: lowercased, colons → dots
+     return s.lower().replace(":", ".")
+
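+ # Canonicalization examples (illustrative): the first three all reduce to
+ # "BG 2.47" and therefore ground against a retrieved BG 2.47 hit:
+ #
+ #   _canonicalize_ref("BG 2.47")             -> "BG 2.47"
+ #   _canonicalize_ref("Bhagavad Gītā 2:47")  -> "BG 2.47"
+ #   _canonicalize_ref("gita 2.47")           -> "BG 2.47"   (re.I covers case)
+ #   _canonicalize_ref("Kaṭha Up. 1.3:14")    -> "kaṭha up. 1.3.14"  (non-Gītā fallback)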
+
+ def _tier_preference(
+     sources_cited: list[str],
+     retrieved_passages: list[dict],
+     selected_indices: list[int],
+ ) -> tuple[float, dict]:
+     """Reward responses whose *cited* passages came from primary/Śaṅkara tiers."""
+     if not selected_indices:
+         return 0.0, {"primary": 0, "shankara": 0, "supporting": 0}
+
+     counts = {"primary": 0, "shankara": 0, "supporting": 0}
+     for idx in selected_indices:
+         if 1 <= idx <= len(retrieved_passages):
+             tier = retrieved_passages[idx - 1].get("meta", {}).get("tier", "supporting")
+             counts[tier] = counts.get(tier, 0) + 1
+
+     total = sum(counts.values()) or 1
+     preferred = counts["primary"] + counts["shankara"]
+     return preferred / total, counts
+
+
+ def rule_based_score(pred: dspy.Prediction) -> tuple[float, dict]:
+     """Returns (score in [0,1], breakdown dict)."""
+     response = getattr(pred, "response", "") or ""
+     sources_cited = getattr(pred, "sources_cited", []) or []
+     retrieved = getattr(pred, "retrieved_passages", []) or []
+     selected_idx = getattr(pred, "selected_indices", []) or []
+     felt = getattr(pred, "felt_emotion", "") or ""
+
+     wc = _word_count(response)
+     length_ok = 200 <= wc <= 600
+     length_score = 1.0 if length_ok else max(0.0, 1.0 - abs(wc - 350) / 350)
+
+     citations_in_text = CITATION_PATTERN.findall(response)
+     has_citation = bool(citations_in_text) or bool(sources_cited)
+     citation_score = 1.0 if has_citation else 0.0
+
+     grounding_score, grounded, ungrounded = _citation_grounding(sources_cited, retrieved)
+
+     tier_score, tier_counts = _tier_preference(sources_cited, retrieved, selected_idx)
+
+     cliches = _has_any(response, THERAPY_CLICHES)
+     cliche_penalty = min(1.0, 0.25 * len(cliches))
+     cliche_score = 1.0 - cliche_penalty
+
+     # Empathy: opening should signal acknowledgement of feeling
+     head = response[:300].lower()
+     empathy_hits = [m for m in EMPATHY_OPENERS if m in head]
+     # Bonus if the felt_emotion content is referenced (loosely)
+     if felt:
+         for tok in felt.lower().split():
+             if len(tok) > 4 and tok in head:
+                 empathy_hits.append(f"echoes:{tok}")
+                 break
+     empathy_score = min(1.0, 0.4 + 0.3 * len(empathy_hits))
+
+     actionable_hits = _has_any(response, ACTIONABLE_MARKERS)
+     actionable_score = 1.0 if actionable_hits else 0.4
+
+     nondual_hits = _has_any(response, NON_DUAL_MARKERS)
+     nondual_score = min(1.0, 0.4 + 0.2 * len(nondual_hits))
+
+     # Weighted aggregate
+     components = {
+         "length": (length_score, 0.05),
+         "citation_present": (citation_score, 0.08),
+         "citation_grounding": (grounding_score, 0.18),
+         "tier_preference": (tier_score, 0.12),
+         "no_cliches": (cliche_score, 0.10),
+         "empathy_opening": (empathy_score, 0.15),
+         "actionable": (actionable_score, 0.10),
+         "nondual_register": (nondual_score, 0.22),
+     }
+     score = sum(s * w for s, w in components.values())
+
+     breakdown = {
+         "score": score,
+         "word_count": wc,
+         "components": {k: round(v[0], 3) for k, v in components.items()},
+         "citations_in_text": citations_in_text,
+         "sources_cited": sources_cited,
+         "grounded_citations": grounded,
+         "ungrounded_citations": ungrounded,
+         "tier_counts": tier_counts,
+         "therapy_cliches_found": cliches,
+         "empathy_hits": empathy_hits,
+         "actionable_hits": actionable_hits,
+         "nondual_markers_found": nondual_hits,
+     }
+     return score, breakdown
+
+
+ # ──────────────────────────── LLM-judge rubric ────────────────────────────
+ class JudgeAdvice(dspy.Signature):
+     """You are an examiner of Advaita-Vedānta spiritual counsel in the lineage
+     of Ādi Śaṅkarācārya. Score the advisor's response against the user's
+     question on each rubric (0.0 to 1.0) and write a short critique that an
+     optimizer can use to *improve the prompts that produced this response*.
+
+     Rubrics:
+
+     - advaita_coherence: Does the response reflect genuine non-dualism
+       (jīva-ātman-brahman identity), or does it accidentally smuggle in dualism
+       ('the soul reaches God', 'becoming one with the universe' as if they were
+       separate, etc.)? Does it avoid collapsing into nihilism ('nothing is
+       real')?
+
+     - two_truths_discipline: Does it honor the distinction between
+       vyāvahārika (transactional, where the user's pain and choices are real
+       and matter) and pāramārthika (absolute, where the witness is untouched)?
+       Failure modes: spiritual bypass (denying the pain by pointing to the
+       absolute), or pure-therapy register (forgetting the absolute exists).
+
+     - empathy_without_dissolving: Does it meet the user in their felt
+       experience without either flattening into therapy-speak OR dismissing
+       the feeling with premature transcendence?
+
+     - wit_calibration: Is there a light, dry touch around the cosmic
+       predicament (Śaṅkara himself is dry; this is consistent with the
+       tradition) WITHOUT being flippant about the user's actual pain? Both
+       'too solemn throughout' and 'making jokes about their situation' lose
+       points.
+
+     - source_integration: Are scriptural citations woven into the prose
+       (illuminating the point) rather than dumped as block quotes or used
+       as decoration? Are the references specific (Gītā 2.47, not just
+       "the Gita says")?
+
+     - practical_offering: Does the response close with something the user
+       can actually try — a question to sit with, a practice, a perspective
+       shift — rather than abstract platitudes?
+
+     - draw_from_personal_experiences: Does the response use parables and
+       day-to-day stories as examples that help the user relate to the advice?
+
+     The critique should be specific and prescriptive: what to keep, what to
+     cut, what's missing. Phrase it as you would to a writer revising a draft."""
+
+     user_question: str = dspy.InputField()
+     response: str = dspy.InputField()
+     sources_cited: list[str] = dspy.InputField()
+
+     advaita_coherence: float = dspy.OutputField(desc="0.0 to 1.0")
+     two_truths_discipline: float = dspy.OutputField(desc="0.0 to 1.0")
+     empathy_without_dissolving: float = dspy.OutputField(desc="0.0 to 1.0")
+     wit_calibration: float = dspy.OutputField(desc="0.0 to 1.0")
+     source_integration: float = dspy.OutputField(desc="0.0 to 1.0")
+     practical_offering: float = dspy.OutputField(desc="0.0 to 1.0")
+     draw_from_personal_experiences: float = dspy.OutputField(desc="0.0 to 1.0")
+     critique: str = dspy.OutputField(
+         desc="3-6 sentences of prescriptive feedback for revising the response."
+     )
+
+
+ # Lazily-instantiated judge. Call configure_judge() to use a stronger LM (e.g. gpt-4o)
+ # during GEPA optimization so the reflection LM gets high-quality signal to work from.
+ _judge = None
+ _judge_lm = None  # None means use the globally-configured LM (task LM)
+
+
+ def configure_judge(lm) -> None:
+     """Set the LM used by judge_score. Call before GEPA to use gpt-4o instead of the task LM."""
+     global _judge_lm, _judge
+     _judge_lm = lm
+     _judge = None  # reset so next call recreates with new context
+
+
+ def _get_judge():
+     global _judge
+     if _judge is None:
+         _judge = dspy.ChainOfThought(JudgeAdvice)
+     return _judge
+
+
+ def judge_score(user_question: str, pred: dspy.Prediction) -> tuple[float, dict, str]:
+     judge = _get_judge()
+     try:
+         call_kwargs = dict(
+             user_question=user_question,
+             response=getattr(pred, "response", "") or "",
+             sources_cited=getattr(pred, "sources_cited", []) or [],
+         )
+         if _judge_lm is not None:
+             with dspy.context(lm=_judge_lm):
+                 j = judge(**call_kwargs)
+         else:
+             j = judge(**call_kwargs)
+     except Exception as e:
+         # If the judge fails (parse error, LM hiccup), fall back gracefully.
+         return 0.5, {"judge_error": str(e)}, f"Judge failed: {e}"
+
+     rubric = {
+         "advaita_coherence": float(j.advaita_coherence or 0.0),
+         "two_truths_discipline": float(j.two_truths_discipline or 0.0),
+         "empathy_without_dissolving": float(j.empathy_without_dissolving or 0.0),
+         "wit_calibration": float(j.wit_calibration or 0.0),
+         "source_integration": float(j.source_integration or 0.0),
+         "practical_offering": float(j.practical_offering or 0.0),
+         "draw_from_personal_experiences": float(j.draw_from_personal_experiences or 0.0),
+     }
+     weights = {
+         "advaita_coherence": 0.25,
+         "two_truths_discipline": 0.20,
+         "empathy_without_dissolving": 0.20,
+         "wit_calibration": 0.10,
+         "source_integration": 0.10,
+         "practical_offering": 0.10,
+         "draw_from_personal_experiences": 0.05,
+     }
+     score = sum(rubric[k] * weights[k] for k in rubric)
+     score = max(0.0, min(1.0, score))
+     return score, rubric, j.critique or ""
+
+
+ # ──────────────────────────── Composite GEPA metric ────────────────────────────
+ RULE_WEIGHT = 0.45
+ JUDGE_WEIGHT = 0.55
+
+
+ def _format_feedback(rule_breakdown: dict, judge_rubric: dict, critique: str) -> str:
+     """Concatenate rule-based facts and judge critique into one feedback string
+     that the GEPA reflection LM can read and use to rewrite prompts."""
+     lines = ["FEEDBACK FOR PROMPT IMPROVEMENT", ""]
+
+     lines.append("Rule-based observations:")
+     comps = rule_breakdown.get("components", {})
+     for k, v in comps.items():
+         lines.append(f"  - {k}: {v}")
+     if rule_breakdown.get("therapy_cliches_found"):
+         lines.append(f"  - Therapy clichés to remove: {rule_breakdown['therapy_cliches_found']}")
+     if rule_breakdown.get("ungrounded_citations"):
+         lines.append(
+             f"  - Citations that weren't in retrieved passages (likely hallucinated): "
+             f"{rule_breakdown['ungrounded_citations']}"
+         )
+     if not rule_breakdown.get("nondual_markers_found"):
+         lines.append("  - Response lacks explicit Advaita register; consider invoking "
+                      "concepts like sākṣī, adhyāsa, the two truths, etc.")
+     if not rule_breakdown.get("actionable_hits"):
+         lines.append("  - No concrete practice or this-week shift was offered.")
+     tier_counts = rule_breakdown.get("tier_counts", {})
+     if tier_counts:
+         lines.append(f"  - Selected passage tiers: {tier_counts} "
+                      f"(prefer primary + śaṅkara when both options exist).")
+
+     lines.append("")
+     lines.append("Rubric scores from Advaita-tradition examiner:")
+     for k, v in judge_rubric.items():
+         if isinstance(v, float):
+             lines.append(f"  - {k}: {v:.2f}")
+     lines.append("")
+     lines.append("Examiner critique:")
+     lines.append(critique.strip() or "(no critique returned)")
+     return "\n".join(lines)
+
+
+ def gita_metric(
+     gold: dspy.Example,
+     pred: dspy.Prediction,
+     trace: Any = None,
+     pred_name: str | None = None,
+     pred_trace: Any = None,
+ ) -> dspy.Prediction:
+     """The GEPA-compatible metric.
+
+     Returns dspy.Prediction(score=..., feedback=...). The feedback string is
+     what GEPA's reflection LM ingests when rewriting prompts."""
+     user_q = getattr(gold, "user_question", "") if gold else ""
+
+     rule_score, rule_breakdown = rule_based_score(pred)
+     j_score, j_rubric, critique = judge_score(user_q, pred)
+
+     composite = RULE_WEIGHT * rule_score + JUDGE_WEIGHT * j_score
+     feedback = _format_feedback(rule_breakdown, j_rubric, critique)
+
+     return dspy.Prediction(score=composite, feedback=feedback)
+
+
+ def quick_eval_score(
+     gold: dspy.Example,
+     pred: dspy.Prediction,
+     trace: Any = None,
+ ) -> float:
+     """A pure-float metric for `dspy.Evaluate` — same composite, no feedback."""
+     out = gita_metric(gold, pred, trace=trace)
+     return float(out.score)
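+ # Back-of-envelope composite (illustrative): a response scoring 0.70 on the
+ # rule checks and 0.80 with the judge lands at
+ #
+ #   0.45 * 0.70 + 0.55 * 0.80 = 0.755
+ #
+ # Because JUDGE_WEIGHT > RULE_WEIGHT, GEPA is steered primarily by the rubric
+ # judge, with the deterministic checks acting as a grounding floor.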
optimize_gepa.py ADDED
@@ -0,0 +1,200 @@
+ """
+ optimize_gepa.py — run GEPA reflective prompt evolution.
+
+ GEPA (Genetic-Pareto) treats the program's prompts as an evolving population.
+ At each step it:
+   1. Runs the current candidate(s) on a minibatch of training examples
+   2. Collects the (score, feedback) pairs from our metric
+   3. Asks a *reflection LM* to read the failures + feedback and propose a
+      mutated prompt
+   4. Evaluates the mutant; keeps it if it Pareto-dominates the parent on the
+      validation set
+   5. Repeats
+
+ Because we wrote `gita_metric` to return rich textual feedback, the reflection
+ LM has something substantive to chew on instead of just gradient signal.
+
+ The dataset has no gold labels — that's deliberate. Our metric judges the
+ prediction directly. This is the regime GEPA is designed for.
+
+ Usage:
+     python optimize_gepa.py --auto medium
+     python optimize_gepa.py --max-metric-calls 300 --proxy-task-lm
+     python optimize_gepa.py --auto light --proxy-task-lm   # ~2-3 hrs vs 260 hrs
+
+ Proxy task LM (--proxy-task-lm):
+     Runs GEPA with gpt-4o-mini as the task LM instead of Gemma 4. GEPA only
+     needs to evaluate prompt quality — it doesn't need the final inference model.
+     Optimized prompts are model-agnostic text and transfer back to Gemma 4 when
+     the saved program is loaded at inference time. ~20x speedup over Gemma with
+     thinking enabled.
+ """
+
+ from __future__ import annotations
+ import argparse
+ import json
+ import random
+ from pathlib import Path
+
+ import dspy
+ from dspy import GEPA
+
+ import config
+ from advisor import GitaAdvisor
+ from dataset_generator import load_jsonl, to_dspy_examples
+ import metrics as metrics_module
+ from metrics import gita_metric, quick_eval_score
+
+
+ def split(examples, val_frac: float, seed: int = 42):
+     rng = random.Random(seed)
+     shuffled = examples[:]
+     rng.shuffle(shuffled)
+     n_val = max(20, int(len(shuffled) * val_frac))
+     return shuffled[n_val:], shuffled[:n_val]
+
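+ # Illustrative sketch of the prompt-transfer claim above (hypothetical usage;
+ # `GitaAdvisor().load` is the standard dspy.Module.load, and the config names
+ # follow this repo's conventions):
+ #
+ #   # at inference time, with the local task LM already configured by config.py:
+ #   advisor = GitaAdvisor()
+ #   advisor.load(str(config.OPTIMIZED_PROGRAM_PATH))   # prompts only, no LM choice
+ #   print(advisor(user_question="...").response)
+ #
+ # Also note the split() floor: with 100 examples and val_frac=0.2 you get 20
+ # validation examples; below that, max(20, ...) keeps the validation set from
+ # shrinking into noise.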
+
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--dataset", default=str(config.DATASET_PATH))
+     ap.add_argument("--out", default=str(config.OPTIMIZED_PROGRAM_PATH))
+     ap.add_argument("--val-frac", type=float, default=0.2)
+     ap.add_argument(
+         "--auto",
+         choices=["light", "medium", "heavy"],
+         default="medium",
+         help="GEPA's auto-budget mode. 'light' for smoke-tests, 'medium' for "
+              "a real run, 'heavy' for an overnight run on a meaty box.",
+     )
+     ap.add_argument(
+         "--max-metric-calls",
+         type=int,
+         default=None,
+         help="Override --auto with an explicit metric-call budget.",
+     )
+     ap.add_argument("--track-stats", action="store_true", default=True)
+     ap.add_argument("--seed", type=int, default=42)
+     ap.add_argument(
+         "--proxy-task-lm",
+         action="store_true",
+         default=False,
+         help="Use gpt-4o-mini as the task LM during GEPA instead of Gemma 4. "
+              "~20x faster; optimized prompts transfer back to Gemma 4 at inference. "
+              "Requires OPENAI_API_KEY.",
+     )
+     args = ap.parse_args()
+
+     # Configure DSPy globally and grab the reflection LM
+     task_lm, reflection_lm = config.configure_dspy()
+
+     if args.proxy_task_lm:
+         # Override the task LM with gpt-4o-mini for the duration of this process.
+         # DSPy saves only prompt text (instructions + field descriptions), not the
+         # LM choice — so the optimized JSON loads cleanly onto Gemma 4 at inference.
+         task_lm = dspy.LM(model=config.PROXY_TASK_MODEL, **config.PROXY_TASK_LM_KWARGS)
+         dspy.configure(lm=task_lm, adapter=dspy.ChatAdapter(use_json_adapter_fallback=False))
+         print(f"Task LM (proxy): {task_lm.model} [GEPA optimization only]")
+     else:
+         print(f"Task LM: {task_lm.model}")
+     print(f"Reflection LM: {reflection_lm.model}")
+
+     # Use the reflection LM (gpt-4o) for judging instead of the task LM (Gemma).
+     # Gemma judging its own responses produces noisy, self-congratulatory scores;
+     # gpt-4o gives the reflection step the crisp, tradition-aware feedback it needs.
+     metrics_module.configure_judge(reflection_lm)
+     print(f"Judge LM: {reflection_lm.model} (overriding task LM for judging)")
+
+     # Dataset
+     raw = load_jsonl(Path(args.dataset))
+     examples = to_dspy_examples(raw)
+     if len(examples) < 40:
+         print(f"[warn] Only {len(examples)} examples — generate more with "
+               f"`python dataset_generator.py --n 500`.")
+     train, val = split(examples, args.val_frac, seed=args.seed)
+     print(f"Train: {len(train)}  Val: {len(val)}")
+
+     # Student program
+     student = GitaAdvisor()
+
+     # More threads when hitting an API (no local GPU bottleneck).
+     num_threads = 16 if args.proxy_task_lm or config.TASK_LM_BACKEND == "gemini" else 4
+
+     # Optional: get a baseline number for context
+     print("\nEvaluating baseline (un-optimized) on validation set ...")
+     evaluator = dspy.Evaluate(
+         devset=val,
+         metric=quick_eval_score,
+         num_threads=num_threads,
+         display_progress=True,
+         display_table=0,
+     )
+     try:
+         baseline_result = evaluator(student)
+         baseline_score = float(baseline_result) if hasattr(baseline_result, "__float__") else baseline_result
+         print(f"Baseline score: {baseline_score}")
+     except Exception as e:
+         print(f"Baseline eval failed (continuing to optimization): {e}")
+
+     # GEPA
+     log_dir = str(config.ARTIFACTS_DIR / "gepa_logs")
+     gepa_kwargs = dict(
+         metric=gita_metric,
+         reflection_lm=reflection_lm,
+         track_stats=args.track_stats,
+         seed=args.seed,
+         # Show 6 training examples to the reflection LM per proposal step instead of
+         # the default 3 — our 12 domains need diversity to avoid domain-specific over-fit.
+         reflection_minibatch_size=6,
+         # API-backed runs (proxy or Gemini) can saturate many threads; local GPU is
+         # limited to 4 to avoid OOM / serialization on a single device.
+         num_threads=num_threads,
+         # When the task LM mangles a list field the reflection LM should know the format
+         # broke, not just see a low score with no explanation.
+         add_format_failure_as_feedback=True,
+         # Persist per-step scores and prompts for post-run inspection.
+         log_dir=log_dir,
+     )
+     if args.max_metric_calls is not None:
+         gepa_kwargs["max_metric_calls"] = args.max_metric_calls
+     else:
+         gepa_kwargs["auto"] = args.auto
+
+     print(f"\nStarting GEPA with {gepa_kwargs} ...")
+     optimizer = GEPA(**gepa_kwargs)
+
+     optimized = optimizer.compile(
+         student=student,
+         trainset=train,
+         valset=val,
+     )
+
+     # Save
+     out_path = Path(args.out)
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     optimized.save(str(out_path))
+     print(f"\nSaved optimized program to {out_path}")
+
+     # Side-by-side eval
+     print("\nFinal eval on validation set ...")
+     final_result = evaluator(optimized)
+     final_score = float(final_result) if hasattr(final_result, "__float__") else final_result
+     print(f"Optimized score: {final_score}")
+
+     # Dump the optimized prompts for human inspection
+     inspect_path = out_path.with_suffix(".prompts.txt")
+     with inspect_path.open("w", encoding="utf-8") as f:
+         f.write("# Optimized prompts after GEPA\n\n")
+         for name, predictor in optimized.named_predictors():
+             sig = predictor.signature
+             f.write(f"## {name}\n")
+             f.write(f"### instructions\n{sig.instructions}\n\n")
+             f.write("### fields\n")
+             for fname, field in sig.fields.items():
+                 extra = getattr(field, "json_schema_extra", None)
+                 desc = extra.get("desc", "") if isinstance(extra, dict) else ""
+                 f.write(f"- {fname}: {desc}\n")
+             f.write("\n---\n\n")
+     print(f"Wrote prompt inspection file to {inspect_path}")
+
+
+ if __name__ == "__main__":
+     main()
parsers/__init__.py ADDED
File without changes
parsers/gita_json.py ADDED
@@ -0,0 +1,236 @@
+ """
+ parsers/gita_json.py — turn the gita/gita verse-indexed JSON into Verse records.
+
+ The gita/gita repo (Unlicense, public-domain dedication) gives us four files
+ on the static mirror:
+
+     chapters.json     — chapter metadata (number, name, summary)
+     verse.json        — per-verse Sanskrit + transliteration + word_meanings
+     translation.json  — per-verse English translations keyed by author_id
+     authors.json      — author metadata for the translations
+
+ Why split parsing across multiple sources_registry entries
+ ----------------------------------------------------------
+ We register `gita_json_core` (the verse text) and `gita_json_translations`
+ (the English translations) as separate sources. Both happen to feed this one
+ parser. The reason for the split is that translations come and go from the
+ upstream repo whereas the core verse data is essentially fixed; isolating
+ them lets us pin only what we need.
+
+ Translator allowlist
+ --------------------
+ Not every translator in the gita/gita translation.json is public-domain.
+ We hard-allowlist the ones we know are safe to redistribute. Anyone not on
+ the list is silently skipped — adding more is a one-line change.
+ """
+
+ from __future__ import annotations
+ import json
+ from pathlib import Path
+ from typing import Iterable
+
+ from corpus import Verse
+
+
+ # ──────────────────────────── Translator allowlist ────────────────────────────
+ # The keys are the author_id values used inside translation.json. The values
+ # are display strings + the year we want to use for attribution.
+ #
+ # Why this list and not just "all translations":
+ #   - Some translators in the upstream repo (e.g. ISKCON Prabhupada) have
+ #     active publisher rights that we shouldn't rely on regardless of how the
+ #     upstream chose to license its compilation.
+ #   - Reducing translation count keeps the index lean. Three voices are plenty.
+ #
+ # If you want to add a translator, verify their public-domain status (death
+ # year + 70 in most jurisdictions, or pre-1929 publication for US PD), then
+ # add a row.
+ ALLOWED_TRANSLATORS: dict[str, tuple[str, int | None]] = {
+     # Swami Sivananda — d. 1963 — works are widely shared by The Divine Life
+     # Society in keeping with their founder's non-commercial stance.
+     "sivananda": ("Swami Sivananda", 1969),
+
+     # Swami Tejomayananda — modern; included only because some mirrors
+     # release these under permissive terms; double-check before relying on it.
+     # Disabled by default to be conservative.
+     # "tejomayananda": ("Swami Tejomayananda", 1995),
+
+     # Dr. S. Sankaranarayan — translation of Śaṅkara's Gītā Bhāṣya included
+     # in some forks of gita/gita; verify the specific edition. Off by default.
+     # "shankara": ("Śaṅkara (tr. Sankaranarayan)", 1990),
+
+     # The verse text itself is not a "translation" per se but a copy of the
+     # critical text plus transliteration. We include it under the synthetic
+     # author key 'sanskrit'.
+     "sanskrit": ("Sanskrit text + IAST", None),
+ }
+
+
+ # ──────────────────────────── Helpers ────────────────────────────
+ def _verse_id(chapter: int, verse_no: int) -> str:
+     """Stable global key. Format: bhagavad_gita_<chap>_<verse>, zero-padded
+     to two digits so 1.10 sorts after 1.9 and lexical ordering matches numeric."""
+     return f"bhagavad_gita_{chapter:02d}_{verse_no:02d}"
+
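+ # Examples (illustrative): the zero-padding is what keeps lexical and
+ # numeric order aligned.
+ #
+ #   _verse_id(1, 9)    -> "bhagavad_gita_01_09"
+ #   _verse_id(1, 10)   -> "bhagavad_gita_01_10"   # sorts after 01_09
+ #   _verse_id(18, 66)  -> "bhagavad_gita_18_66"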
+
+ def _verse_ref(chapter: int, verse_no: int) -> str:
+     """Citation form used by the advisor in its replies."""
+     return f"BG {chapter}.{verse_no}"
+
+
+ def _section_display(chapter_meta: dict) -> str:
+     name = chapter_meta.get("name_translation") or chapter_meta.get("name", "")
+     return f"Chapter {chapter_meta.get('chapter_number', '?')}: {name}"
+
+
+ # ──────────────────────────── Parser entry point ────────────────────────────
+ def parse(raw_dir_for_core: Path, raw_dir_for_translations: Path | None = None) -> Iterable[Verse]:
+     """Walk the gita/gita JSON files and yield Verse records.
+
+     Layout expected (after download_sources.py has run):
+         raw_dir_for_core/chapters.json
+         raw_dir_for_core/verse.json
+     [optionally]
+         raw_dir_for_translations/translation.json
+         raw_dir_for_translations/authors.json
+
+     If translations are not present, we still emit Verses with sanskrit +
+     transliteration + word_meanings; the `translation` field falls back to
+     the transliteration so the verse isn't content-empty. (Better: enable
+     the gita_json_translations source.)
+     """
+     chapters = _load(raw_dir_for_core / "chapters.json")
+     verses_raw = _load(raw_dir_for_core / "verse.json")
+
+     chapters_by_id = {c["chapter_number"]: c for c in chapters}
+
+     translations_by_verse: dict[int, dict[str, str]] = {}
+     authors_by_id: dict[str, str] = {}
+     if raw_dir_for_translations is not None:
+         translations_by_verse = _load_translations(raw_dir_for_translations / "translation.json")
+         authors_by_id = _load_authors(raw_dir_for_translations / "authors.json")
+
+     # Pick the best available translator from the allowlist, in priority order.
+     # First match wins. This keeps the index from carrying redundant English
+     # translations of the same verse.
+     translator_priority = ["sivananda", "sanskrit"]
+
+     for v in verses_raw:
+         chap_no = v["chapter_number"]
+         verse_no = v["verse_number"]
+         chap_meta = chapters_by_id.get(chap_no, {})
+         verse_id = _verse_id(chap_no, verse_no)
+
+         # Sanskrit text comes from the core file. The 'text' field has it
+         # in Devanāgarī, often with a trailing newline and verse number.
+         sanskrit = (v.get("text") or "").strip()
+         translit = (v.get("transliteration") or "").strip()
+         word_mean = (v.get("word_meanings") or "").strip()
+
+         # Try to attach an English translation
+         english = ""
+         translator_label = ""
+         v_translations = translations_by_verse.get(v.get("id") or v.get("externalId") or -1, {})
+         for key in translator_priority:
+             text = v_translations.get(key) or _translation_for(v_translations, key)
+             if text:
+                 english = text.strip()
+                 meta = ALLOWED_TRANSLATORS.get(key)
+                 if meta:
+                     translator_label = meta[0]
+                 break
+
+         # Fallback: if no English translation, use word-meanings as a substitute
+         # so the verse isn't content-empty. Better than nothing for retrieval,
+         # though enrichment will be poorer.
+         if not english:
+             english = word_mean or translit
+
+         yield Verse(
+             verse_id=verse_id,
+             work="bhagavad_gita",
+             work_display="Bhagavad Gītā",
+             verse_ref=_verse_ref(chap_no, verse_no),
+             tier="primary",
+             section=f"chapter_{chap_no:02d}",
+             section_display=_section_display(chap_meta),
+             translation=english,
+             translator=translator_label,
+             sanskrit=sanskrit,
+             transliteration=translit,
+             word_meanings=word_mean,
+             bhashya="",  # Gītā Bhāṣya is brought in by the Sastry parser
+             bhashya_translator="",
+             source_key="gita_json_core",
+             license="unlicense",
+         )
+
+
+ # ──────────────────────────── Internals ────────────────────────────
+ def _load(path: Path):
+     with path.open(encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def _load_translations(path: Path) -> dict[int, dict[str, str]]:
+     """The translations file has one entry per (verse, author). Group them
+     by verse_id into a {verse_id: {author_id: text}} map.
+
+     Schema seen in the wild varies slightly between forks of gita/gita; we
+     cope by trying a few key names. If parsing fails entirely we return {}
+     and proceed without translations rather than blowing up the whole ingest.
+     """
+     if not path.exists():
+         return {}
+     try:
+         raw = _load(path)
+     except Exception as e:
+         print(f"[gita_json] failed to load translations: {e}")
+         return {}
+
+     out: dict[int, dict[str, str]] = {}
+     for row in raw:
+         vid = row.get("verse_id") or row.get("verseNumber") or row.get("verse_number_id") or row.get("id")
+         text = row.get("description") or row.get("text") or row.get("translation")
+         if vid is None or not text:
+             continue
+
+         # Skip non-English rows (Ramsukhdas Hindi etc.)
+         lang = (row.get("lang") or "").lower()
+         if lang and lang not in ("english", "en"):
+             continue
+
+         # Map the authorName (e.g. "Swami Sivananda") to an allowlist key
+         # ("sivananda") via case-insensitive substring matching. The numeric
+         # author_id field alone can't match the allowlist, which is why we
+         # prefer authorName here.
+         name_str = str(row.get("authorName") or row.get("author_id") or row.get("author") or "").strip()
+         matched_key = next(
+             (k for k in ALLOWED_TRANSLATORS if k.lower() in name_str.lower()),
+             None,
+         )
+         if matched_key is None:
+             continue
+         out.setdefault(int(vid), {})[matched_key] = text
+     return out
+
+
+ def _load_authors(path: Path) -> dict[str, str]:
+     if not path.exists():
+         return {}
+     try:
+         raw = _load(path)
+     except Exception:
+         return {}
+     return {row.get("id"): row.get("name", "") for row in raw if row.get("id")}
+
+
+ def _translation_for(v_translations: dict, author_key: str) -> str | None:
+     """Tolerant lookup: some files use 'sivananda', some 'Sivananda', etc."""
+     if author_key in v_translations:
+         return v_translations[author_key]
+     lk = author_key.lower()
+     for k, val in v_translations.items():
+         if str(k).lower() == lk:
+             return val
+     return None
parsers/sastry_archive.py ADDED
@@ -0,0 +1,249 @@
+ """
+ parsers/sastry_archive.py — extract verse-attached Śaṅkara bhāṣya from
+ Alladi Mahadeva Sastry's 1897 archive.org OCR text.
+
+ What makes this harder than the gita_json parser
+ ------------------------------------------------
+ The gita/gita JSON gave us each verse already keyed by chapter and verse
+ number. The Sastry archive.org file is OCR'd plain text — about 20 MB of
+ running prose where the only structural cues are:
+
+ 1. Chapter headings, formatted in caps like "SANKHYA YOGA." or
+    "CHAPTER II — SANKHYA YOGA"
+ 2. Verse markers, which appear in two forms in the OCR:
+    - inline as "(II. 47.)" or "II. 47." after a translated verse
+    - as section headings like "47." or "Verse 47." preceding the bhāṣya
+ 3. The rule that when a translated verse appears, Śaṅkara's commentary
+    follows immediately until the next verse marker.
+
+ Add to that: OCR noise. "II" can become "11", "47" can become "4 7", periods
+ become commas, glyphs get dropped. So the parser is forgiving — it tries
+ several patterns and falls back gracefully.
+
+ What we extract
+ ---------------
+ For each verse we find, we yield a Verse with:
+   - tier='shankara'
+   - work='bhagavad_gita_bhashya' (kept distinct from 'bhagavad_gita' so
+     the joiner in ingest_corpus.py knows to merge bhashya into the gita
+     verses by verse_ref)
+   - translation = the verse text as Sastry rendered it (handy as a second
+     English voice alongside Sivananda)
+   - bhashya = Śaṅkara's commentary, as Sastry translated it
+   - bhashya_translator = 'Alladi Mahadeva Sastry, 1897'
+
+ Robustness strategy
+ -------------------
+ We don't try to be perfect. If a verse's bhāṣya is mis-attributed by ±1, the
+ downstream enrichment step will produce paraphrases that don't quite fit, and
+ we'll catch those during the spot-check pass on enriched output. The metric
+ will also penalize ungrounded citations. The key invariant is: never silently
+ emit a wrong (verse_id, bhashya) pair if we're uncertain — better to skip.
+ """
+
+ from __future__ import annotations
+ import re
+ from pathlib import Path
+ from typing import Iterable
+
+ from corpus import Verse
+
+
+ # ──────────────────────────── Patterns ────────────────────────────
+ # Roman numerals (allowing OCR substitutions: I↔1, I↔l, etc.)
+ ROMAN = r"(?:[IVX1l]+|[ivx]+)"
+
+ # A "verse marker" looks like "II. 47" or "(II. 47.)" or "47" alone in a
+ # section heading. We try several shapes and let the most specific win.
+ VERSE_INLINE = re.compile(
+     r"\(?\s*(?P<chap>" + ROMAN + r")\s*[\.\,]\s*(?P<verse>\d{1,3})\s*[\.\,]?\s*\)?",
+     re.IGNORECASE,
+ )
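+ # A quick sanity check of the pattern above (doctest-style):
+ #   >>> VERSE_INLINE.search("(II. 47.)").group("chap", "verse")
+ #   ('II', '47')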
+
+ # Chapter heading: "CHAPTER II" or "II. SANKHYA YOGA" — uppercase-heavy lines
+ CHAPTER_HEADING = re.compile(
+     r"^\s*(?:CHAPTER\s+)?(?P<roman>" + ROMAN + r")\.?\s+[A-Z][A-Z \-—]{4,}",
+     re.MULTILINE,
+ )
+
+ # Roman → arabic
+ ROMAN_MAP = {
+     "I": 1, "II": 2, "III": 3, "IV": 4, "V": 5, "VI": 6, "VII": 7, "VIII": 8,
+     "IX": 9, "X": 10, "XI": 11, "XII": 12, "XIII": 13, "XIV": 14, "XV": 15,
+     "XVI": 16, "XVII": 17, "XVIII": 18,
+ }
+
+
+ def _to_arabic(token: str) -> int | None:
+     """Convert a possibly-noisy roman numeral to an int. OCR sometimes turns
+     'I' into '1' and 'II' into '11', so we accept both forms."""
+     # Undo common OCR substitutions ('l' → 'I' via upper(), '0' → 'O'), then
+     # read runs of '1' as 'I' so that OCR'd '11' maps back to chapter II.
+     t = token.upper().replace("L", "I").replace("0", "O").replace("1", "I")
+     if t in ROMAN_MAP:
+         return ROMAN_MAP[t]
+     # Pure-arabic fallback for tokens that are genuinely arabic numerals
+     if token.isdigit():
+         n = int(token)
+         if 1 <= n <= 18:
+             return n
+     return None
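+ # Sanity checks (assuming the map and substitutions above):
+ #   _to_arabic("II")  → 2
+ #   _to_arabic("11")  → 2    (read as OCR'd 'II'; the source numbers chapters in roman)
+ #   _to_arabic("XIX") → None (the Gītā has only 18 chapters)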
+
+
+ # ──────────────────────────── Main parse ────────────────────────────
+ def parse(raw_dir: Path) -> Iterable[Verse]:
+     """Walk Sastry archive.org text in raw_dir and yield Verse records.
+
+     Expected layout (after download_sources.py):
+         raw_dir/Bhagavad-Gita.with.the.Commentary.of.Sri.Shankaracharya_djvu.txt
+
+     The file is ~20 MB of OCR text. We read it in one pass, track the current
+     chapter as we encounter chapter headings, and at each verse marker yield
+     the accumulated text since the previous marker as the bhāṣya.
+     """
+     # Prefer the *_djvu.txt dump if present; fall back to any .txt
+     txts = list(raw_dir.glob("*_djvu.txt")) + list(raw_dir.glob("*.txt"))
+     if not txts:
+         print(f"[sastry] no .txt under {raw_dir}; did you run download_sources.py?")
+         return
+
+     text = txts[0].read_text(encoding="utf-8", errors="replace")
+     text = _denoise(text)
+
+     # First pass: collect chapter headings and verse markers as (kind, pos, ...)
+     # events, then sort them into document order.
+     current_chapter = 1
+
+     events = []
+     for m in CHAPTER_HEADING.finditer(text):
+         c = _to_arabic(m.group("roman"))
+         if c is not None:
+             events.append(("chapter", m.start(), c))
+
+     for m in VERSE_INLINE.finditer(text):
+         c = _to_arabic(m.group("chap"))
+         try:
+             v = int(m.group("verse"))
+         except (ValueError, TypeError):
+             continue
+         if c is None or not (1 <= v <= 80):
+             continue
+         events.append(("verse", m.start(), c, v, m.end(), m.start("verse")))
+
+     events.sort(key=lambda e: e[1])
+
+     # Second pass: turn consecutive verse markers into (chapter, verse) → span
+     # pairs, where each span is the bhāṣya from one marker to the next. The
+     # chapter comes from the marker itself; chapter headings provide fallback
+     # context as we walk.
+     last_marker_pos: int | None = None
+     last_chap: int | None = None
+     last_verse: int | None = None
+
+     for ev in events:
+         if ev[0] == "chapter":
+             current_chapter = ev[2]
+             continue
+         # ev: ("verse", start, chap, verse, end, verse_pos)
+         _, start, chap, verse, end, verse_pos = ev
+
+         # Only treat markers where the verse NUMBER appears near the start of
+         # its line — those are actual section headings. Inline cross-references
+         # like "(II. 47.)" mid-paragraph have the verse number well into the
+         # line and must not be treated as section boundaries.
+         verse_line_start = text.rfind("\n", 0, verse_pos) + 1
+         on_own_line = (verse_pos - verse_line_start) <= 8
+         if not on_own_line:
+             continue
+         current_chapter = chap
+
+         if last_marker_pos is not None and last_chap is not None and last_verse is not None:
+             bhashya_text = text[last_marker_pos:start].strip()
+             if bhashya_text:
+                 yield _build_verse(
+                     chap=last_chap, verse=last_verse, body=bhashya_text,
+                 )
+
+         last_marker_pos = end
+         last_chap = current_chapter
+         last_verse = verse
+
+     # Flush the trailing one
+     if last_marker_pos is not None and last_chap and last_verse:
+         tail = text[last_marker_pos:].strip()
+         if tail:
+             yield _build_verse(chap=last_chap, verse=last_verse, body=tail)
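+ # A minimal usage sketch (hypothetical path — wherever download_sources.py
+ # put the archive.org text):
+ #   for v in parse(Path("data/raw/sastry")):
+ #       print(v.verse_ref, len(v.bhashya))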
+
+
+ # ──────────────────────────── Builders ────────────────────────────
+ def _build_verse(chap: int, verse: int, body: str) -> Verse:
+     """The body lump contains both Sastry's English of the verse and Śaṅkara's
+     commentary, usually with the verse first (sometimes labeled) and the
+     commentary following. We apply a *light* split heuristic: if the first
+     paragraph is short (30–400 chars) and doesn't open like commentary, treat
+     it as the verse translation; the rest is bhashya. If we can't split
+     confidently, we put everything into bhashya and leave translation empty —
+     the gita_json parser already gave us a translation by another translator."""
+     body = body.strip()
+     translation = ""
+     bhashya = body
+
+     # Heuristic split on the first blank-ish line within reasonable distance
+     para_break = re.search(r"\n\s*\n", body[:600])
+     if para_break and para_break.end() < 500:
+         head = body[:para_break.start()].strip()
+         tail = body[para_break.end():].strip()
+         # Accept the split only if the head looks like a verse: short-ish,
+         # not starting with a typical bhashya opener like "This means" /
+         # "The meaning is" / "Here the Lord says".
+         if 30 < len(head) < 400 and not _looks_like_bhashya_opener(head):
+             translation, bhashya = head, tail
+
+     return Verse(
+         verse_id=f"bhagavad_gita_{chap:02d}_{verse:02d}",
+         work="bhagavad_gita_bhashya",
+         work_display="Bhagavad Gītā with Śaṅkara's Bhāṣya",
+         verse_ref=f"BG {chap}.{verse}",
+         tier="shankara",
+         section=f"chapter_{chap:02d}",
+         section_display=f"Chapter {chap}",
+         translation=translation,
+         translator="Alladi Mahadeva Sastry" if translation else "",
+         bhashya=bhashya,
+         bhashya_translator="Alladi Mahadeva Sastry, 1897",
+         source_key="sastry_gita_bhashya",
+         license="public_domain",
+     )
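+ # e.g. a lump whose first paragraph is a short translated verse followed by a
+ # blank line splits into (translation, bhashya); a lump that opens straight
+ # into "This means ..." stays entirely in bhashya.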
+
+
+ def _looks_like_bhashya_opener(s: str) -> bool:
+     s = s.strip().lower()
+     openers = (
+         "this means", "the meaning is", "the sense is", "here the lord",
+         "here it is said", "the lord says", "the question may", "objection",
+         "the commentator",
+     )
+     return any(s.startswith(o) for o in openers)
+
+
+ # ──────────────────────────── OCR de-noise ────────────────────────────
+ def _denoise(text: str) -> str:
+     """Light cleanup. Aggressive normalization risks losing real signal —
+     we only fix patterns we're confident about."""
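+     # e.g. the passes below turn "lib-\nerty.... Thus" into "liberty. Thus".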
+     # OCR losses of Sanskrit diacritics won't matter for English-language
+     # retrieval, so we leave Sanskrit fragments alone.
+
+     # Collapse runs of repeated punctuation that OCR hallucinated
+     text = re.sub(r"\.{3,}", ".", text)
+     text = re.sub(r" +\.", ".", text)
+
+     # Glue cross-line hyphens: "lib-\nerty" → "liberty"
+     text = re.sub(r"-\n([a-z])", r"\1", text)
+
+     # Normalize whitespace
+     text = re.sub(r"[ \t]+", " ", text)
+     text = re.sub(r"\n[ \t]+", "\n", text)
+     text = re.sub(r"\n{3,}", "\n\n", text)
+
+     return text
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ dspy-ai>=2.6.0
+ chromadb>=0.5.0
+ sentence-transformers>=3.0.0
+ openai>=1.40.0
+ pydantic>=2.0
+ tqdm>=4.66
+ numpy>=1.26
+ rich>=13.7
+ unidecode>=1.3
+ requests>=2.31
+ gradio>=4.0
run_overnight.py ADDED
@@ -0,0 +1,230 @@
+ """
+ run_overnight.py — orchestrates full GEPA optimization through light → medium,
+ then saves prompts and runs a multi-question test suite.
+
+ Usage:
+     python run_overnight.py [--skip-light] [--skip-medium] [--skip-tests]
+
+ Writes a timestamped log to artifacts/overnight_run.log.
+ """
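+ # Typically launched detached so it survives the terminal session, e.g.:
+ #   nohup python run_overnight.py > /dev/null 2>&1 &
+ #   tail -f artifacts/overnight_run.log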
+ from __future__ import annotations
+ import argparse
+ import json
+ import subprocess
+ import sys
+ import time
+ from datetime import datetime
+ from pathlib import Path
+
+ ROOT = Path(__file__).parent.resolve()
+ LOG_PATH = ROOT / "artifacts" / "overnight_run.log"
+ OPTIMIZED_PATH = ROOT / "artifacts" / "optimized_advisor.json"
+ PROMPTS_PATH = ROOT / "artifacts" / "optimized_advisor.prompts.txt"
+ RESULTS_PATH = ROOT / "artifacts" / "test_results.json"
+
+ TEST_QUESTIONS = [
+     "I just got laid off and feel like nothing matters anymore.",
+     "I keep procrastinating on important work and feel guilty about it. How do I stop?",
+     "My relationship ended and I feel like I've lost my identity. Who am I without this person?",
+     "I'm terrified of death and can't stop thinking about it at night.",
+     "I have achieved everything I wanted — career, family, money — and still feel empty.",
+     "I feel angry at everyone around me but don't know why. How should I deal with this?",
+     "I can't stop comparing myself to others and feeling like I'm always falling short.",
+ ]
+
+
+ def ts() -> str:
+     return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+
+ def log(msg: str, f=None):
+     line = f"[{ts()}] {msg}"
+     print(line, flush=True)
+     if f:
+         f.write(line + "\n")
+         f.flush()
+
+
+ def run_phase(cmd: list[str], phase: str, logfile) -> bool:
+     log(f"=== STARTING {phase} ===", logfile)
+     log(f"Command: {' '.join(cmd)}", logfile)
+     start = time.time()
+     try:
+         proc = subprocess.Popen(
+             cmd,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             cwd=str(ROOT),
+         )
+         for line in proc.stdout:
+             logfile.write(line)
+             logfile.flush()
+             # Echo key lines to terminal
+             if any(k in line for k in ["score", "GEPA", "Step", "ERROR", "Saved", "Train:", "Val:", "Baseline"]):
+                 print(line, end="", flush=True)
+         proc.wait()
+         elapsed = time.time() - start
+         if proc.returncode == 0:
+             log(f"=== {phase} COMPLETED in {elapsed/60:.1f} min ===", logfile)
+             return True
+         else:
+             log(f"=== {phase} FAILED (exit {proc.returncode}) after {elapsed/60:.1f} min ===", logfile)
+             return False
+     except Exception as e:
+         log(f"=== {phase} ERROR: {e} ===", logfile)
+         return False
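+ # e.g. a successful phase ends with a log line like (timestamp illustrative):
+ #   [2025-01-01 03:14:15] === GEPA LIGHT COMPLETED in 42.0 min ===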
+
+
+ def run_test_suite(logfile) -> dict:
+     log("=== STARTING TEST SUITE ===", logfile)
+     sys.path.insert(0, str(ROOT))
+
+     import config
+     from advisor import load_optimized
+     from metrics import gita_metric
+     import dspy
+     from concurrent.futures import ThreadPoolExecutor, as_completed
+
+     config.configure_dspy()
+
+     advisor = load_optimized()
+     n = len(TEST_QUESTIONS)
+
+     def run_one(i_q):
+         i, q = i_q
+         try:
+             pred = advisor(user_question=q, history=dspy.History(messages=[]))
+             gold = dspy.Example(user_question=q).with_inputs("user_question")
+             m = gita_metric(gold, pred)
+             return i, q, {
+                 "question": q,
+                 "score": round(float(m.score), 3),
+                 "word_count": len(pred.response.split()),
+                 "sources_cited": pred.sources_cited,
+                 "response_excerpt": pred.response[:200],
+                 "feedback_excerpt": m.feedback[:500],
+             }
+         except Exception as e:
+             return i, q, {"question": q, "error": str(e), "score": 0.0}
+
+     indexed = list(enumerate(TEST_QUESTIONS, 1))
+     results_map = {}
+     with ThreadPoolExecutor(max_workers=n) as pool:
+         futures = {pool.submit(run_one, iq): iq for iq in indexed}
+         for fut in as_completed(futures):
+             i, q, result = fut.result()
+             results_map[i] = result
+             if "error" in result:
+                 log(f"  [{i}/{n}] ERROR: {result['error']}", logfile)
+             else:
+                 log(f"  [{i}/{n}] score={result['score']:.3f} wc={result['word_count']} sources={result['sources_cited']}", logfile)
+
+     results = [results_map[i] for i in range(1, n + 1)]
+     avg = sum(r.get("score", 0) for r in results) / n
+     log(f"=== TEST SUITE DONE — avg score: {avg:.3f} ===", logfile)
+     return {"questions": results, "avg_score": round(avg, 3), "timestamp": ts()}
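+ # The returned dict mirrors what lands in artifacts/test_results.json, e.g.
+ # (values illustrative):
+ #   {"questions": [{"question": "...", "score": 0.87, ...}, ...],
+ #    "avg_score": 0.84, "timestamp": "2025-01-01 07:00:00"}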
+
+
+ def dump_prompts(logfile):
+     """Re-extract and log optimized prompts to a human-readable file."""
+     if not OPTIMIZED_PATH.exists():
+         log("  No optimized program found — skipping prompt dump.", logfile)
+         return
+
+     sys.path.insert(0, str(ROOT))
+     import config
+     from advisor import GitaAdvisor
+     config.configure_dspy()
+
+     advisor = GitaAdvisor()
+     try:
+         advisor.load(str(OPTIMIZED_PATH))
+     except Exception as e:
+         log(f"  Could not load optimized program: {e}", logfile)
+         return
+
+     lines = ["# Optimized Prompts after GEPA overnight run", f"# Extracted at {ts()}", ""]
+     for name, predictor in advisor.named_predictors():
+         sig = predictor.signature
+         lines.append(f"## {name}")
+         lines.append("### Instructions")
+         lines.append(sig.instructions or "(none)")
+         lines.append("")
+         lines.append("### Field descriptions")
+         for fname, field in sig.fields.items():
+             extras = field.json_schema_extra or {}
+             desc = extras.get("desc", "") if isinstance(extras, dict) else ""
+             lines.append(f"  {fname}: {desc}")
+         lines.append("")
+         lines.append("---")
+         lines.append("")
+
+     PROMPTS_PATH.write_text("\n".join(lines), encoding="utf-8")
+     log(f"  Prompts written to {PROMPTS_PATH}", logfile)
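+ # The dump reads roughly like this (predictor names depend on the advisor
+ # module; "generate_response" is illustrative):
+ #   ## generate_response
+ #   ### Instructions
+ #   <GEPA-evolved instruction text>
+ #   ### Field descriptions
+ #     user_question: ...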
+
+
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--skip-light", action="store_true")
+     ap.add_argument("--skip-medium", action="store_true")
+     ap.add_argument("--skip-tests", action="store_true")
+     args = ap.parse_args()
+
+     LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
+
+     with LOG_PATH.open("w", encoding="utf-8") as logfile:
+         log("=== OVERNIGHT GEPA RUN STARTED ===", logfile)
+         log(f"Dataset: {ROOT / 'data' / 'synthetic_questions.jsonl'}", logfile)
+         log(f"Output: {OPTIMIZED_PATH}", logfile)
+
+         python = sys.executable
+
+         # ── Phase 1: Light ──
+         if not args.skip_light:
+             ok = run_phase(
+                 [python, "optimize_gepa.py", "--auto", "light"],
+                 "GEPA LIGHT",
+                 logfile,
+             )
+             if not ok:
+                 log("Light phase failed — stopping overnight run.", logfile)
+                 sys.exit(1)
+             # Back up the light result before the medium phase overwrites it
+             if OPTIMIZED_PATH.exists():
+                 import shutil
+                 shutil.copy(OPTIMIZED_PATH, OPTIMIZED_PATH.with_suffix(".light.json"))
+                 log(f"  Backed up light result to {OPTIMIZED_PATH.with_suffix('.light.json')}", logfile)
+         else:
+             log("Skipping light phase (--skip-light).", logfile)
+
+         # ── Phase 2: Medium ──
+         if not args.skip_medium:
+             ok = run_phase(
+                 [python, "optimize_gepa.py", "--auto", "medium"],
+                 "GEPA MEDIUM",
+                 logfile,
+             )
+             if not ok:
+                 log("Medium phase failed.", logfile)
+                 # Don't exit — still dump whatever we have
+         else:
+             log("Skipping medium phase (--skip-medium).", logfile)
+
+         # ── Dump prompts ──
+         log("Extracting optimized prompts ...", logfile)
+         dump_prompts(logfile)
+
+         # ── Test suite ──
+         if not args.skip_tests:
+             test_results = run_test_suite(logfile)
+             RESULTS_PATH.write_text(json.dumps(test_results, indent=2, ensure_ascii=False), encoding="utf-8")
+             log(f"Test results written to {RESULTS_PATH}", logfile)
+         else:
+             log("Skipping test suite (--skip-tests).", logfile)
+
+         log("=== OVERNIGHT RUN COMPLETE ===", logfile)
+
+
+ if __name__ == "__main__":
+     main()