phureexd committed
Commit 41169c9 · 0 Parent(s)

Clean deploy with LFS for all DB files
.gitattributes ADDED
@@ -0,0 +1,4 @@
+ *.gguf filter=lfs diff=lfs merge=lfs -text
+ chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ chroma_db/**/*.bin filter=lfs diff=lfs merge=lfs -text
+ chroma_db/**/*.pickle filter=lfs diff=lfs merge=lfs -text
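These four rules are what `git lfs track` writes into `.gitattributes`; a minimal sketch of how they would be reproduced, assuming Git LFS is installed:

```bash
# Sketch: generate the LFS tracking rules above with git-lfs
git lfs install
git lfs track "*.gguf"
git lfs track "chroma_db/chroma.sqlite3"
git lfs track "chroma_db/**/*.bin"
git lfs track "chroma_db/**/*.pickle"
git add .gitattributes
```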
.gitignore ADDED
@@ -0,0 +1 @@
+ unsloth.Q4_K_M.gguf
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ # Prevent Hugging Face from showing progress bars
+ ENV HF_HUB_DISABLE_PROGRESS_BARS=1
+ # Disable ChromaDB telemetry
+ ENV ANONYMIZED_TELEMETRY=False
+ # Prevent Python from buffering output
+ ENV PYTHONUNBUFFERED=1
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     python3-dev \
+     build-essential \
+     curl \
+     dos2unix \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Ollama
+ RUN curl -fsSL https://ollama.com/install.sh | sh
+
+ # Install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --upgrade --no-cache-dir pip && pip install --no-cache-dir -r requirements.txt
+
+ # Set HF_HOME so models are stored in a consistent location
+ ENV HF_HOME=/app/hf_cache
+
+ # Pre-download models
+ COPY download_models.py .
+ RUN python download_models.py
+
+ # Copy application files
+ COPY . .
+ COPY chroma_db /app/chroma_db
+
+ # Fix line endings and permissions for shell scripts
+ RUN dos2unix /app/start-ollama.sh && chmod +x /app/start-ollama.sh
+
+ # Expose FastAPI port
+ EXPOSE 7860
+
+ # Start Ollama and FastAPI
+ CMD ["/bin/sh", "-c", "/app/start-ollama.sh && uvicorn main_v2:app --host 0.0.0.0 --port 7860"]
Modelfile.local ADDED
@@ -0,0 +1,62 @@
+ # Modelfile generated by "ollama show"
+ # To build a new Modelfile based on this, replace FROM with:
+ # FROM hf.co/phureexd/qwen3_v2_gguf:Q4_K_M
+ # You must have the .gguf file first
+ FROM /app/unsloth.Q4_K_M.gguf
+ TEMPLATE """{{- if .Messages }}
+ {{- if or .System .Tools }}<|im_start|>system
+ {{- if .System }}
+ {{ .System }}
+ {{- end }}
+ {{- if .Tools }}
+
+ # Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {{- range .Tools }}
+ {"type": "function", "function": {{ .Function }}}
+ {{- end }}
+ </tools>
+
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+ <tool_call>
+ {"name": <function-name>, "arguments": <args-json-object>}
+ </tool_call>
+ {{- end }}<|im_end|>
+ {{ end }}
+ {{- range $i, $_ := .Messages }}
+ {{- $last := eq (len (slice $.Messages $i)) 1 -}}
+ {{- if eq .Role "user" }}<|im_start|>user
+ {{ .Content }}<|im_end|>
+ {{ else if eq .Role "assistant" }}<|im_start|>assistant
+ {{ if .Content }}{{ .Content }}
+ {{- else if .ToolCalls }}<tool_call>
+ {{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+ {{ end }}</tool_call>
+ {{- end }}{{ if not $last }}<|im_end|>
+ {{ end }}
+ {{- else if eq .Role "tool" }}<|im_start|>user
+ <tool_response>
+ {{ .Content }}
+ </tool_response><|im_end|>
+ {{ end }}
+ {{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
+ {{ end }}
+ {{- end }}
+ {{- else }}
+ {{- if .System }}<|im_start|>system
+ {{ .System }}<|im_end|>
+ {{ end }}{{ if .Prompt }}<|im_start|>user
+ {{ .Prompt }}<|im_end|>
+ {{ end }}<|im_start|>assistant
+ {{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}"""
+ PARAMETER temperature 0.7
+ PARAMETER top_p 0.8
+ PARAMETER top_k 20
+ PARAMETER num_predict 512
+ PARAMETER repeat_penalty 1
+ PARAMETER stop <|im_start|>
+ PARAMETER stop <|im_end|>
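To sanity-check this Modelfile outside the container, the same `ollama create` invocation used by `start-ollama.sh` below should work; a sketch, assuming a local Ollama install and the `FROM` path adjusted to wherever `unsloth.Q4_K_M.gguf` was downloaded:

```bash
# Sketch: register and smoke-test the custom model locally
ollama create custom-model -f Modelfile.local
ollama run custom-model "What are common symptoms of dehydration?"
```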
README.md ADDED
@@ -0,0 +1 @@
+ # qwen-healthcare-assistant
chroma_db/ba17ee65-4350-4399-8b7f-ca4660b2aab0/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2475b61e969a2d81d22ce88d8c6e9f26f63f6bbf7bc5d834a43f414c368a34b1
+ size 8472000
chroma_db/ba17ee65-4350-4399-8b7f-ca4660b2aab0/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01f7190252d8675a30dba956d9378f41be948e807392883f5a95ba08085e0efa
+ size 100
chroma_db/ba17ee65-4350-4399-8b7f-ca4660b2aab0/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06f1337839fcfb42cb8148849ea14ad9148b1d24da8fbb177b80ea012228263d
+ size 113967
chroma_db/ba17ee65-4350-4399-8b7f-ca4660b2aab0/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07995695399aa84304fb2a27cae4b5640590e1a6d85a43b2631c958c4c17ff15
+ size 8000
chroma_db/ba17ee65-4350-4399-8b7f-ca4660b2aab0/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:732e4883582480d065df2f20149b50ad1ebbbe55c48725c909fde893fc64aabf
+ size 16976
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef6c315aadf349adb83c6f4b522cce8aafdf726261e6be1ca5e21809409df2fd
+ size 50151424
docker.md ADDED
@@ -0,0 +1,7 @@
+ ```bash
+ docker build -t nlp-app .
+ ```
+
+ ```bash
+ docker run -p 7860:7860 --name nlp-container nlp-app
+ ```
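Once the container is up, the API streams server-sent events from `/generate` (see `main_v2.py` below); a sketch of a smoke test, assuming the port mapping above:

```bash
# Sketch: hit the SSE endpoint; -N disables curl's output buffering
curl -N "http://localhost:7860/generate?query=What%20are%20the%20symptoms%20of%20diabetes&useRAG=true"
```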
download_models.py ADDED
@@ -0,0 +1,63 @@
+ import time
+ from huggingface_hub import snapshot_download, hf_hub_download
+ from sentence_transformers import SentenceTransformer, CrossEncoder
+
+ # Define models to download
+ EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
+ CROSS_ENCODER_MODEL_NAME = "BAAI/bge-reranker-v2-m3"
+
+ def download_with_retries(repo_id, retries=5, delay=10):
+     """Downloads a model repository with retry logic."""
+     for i in range(retries):
+         try:
+             print(f"Downloading {repo_id} (Attempt {i+1}/{retries})...")
+             # Interrupted downloads resume automatically in recent huggingface_hub
+             # versions (the old resume_download=True kwarg is deprecated)
+             snapshot_download(repo_id=repo_id)
+             print(f"Successfully downloaded {repo_id}")
+             return
+         except Exception as e:
+             print(f"Error downloading {repo_id}: {e}")
+             if i < retries - 1:
+                 print(f"Retrying in {delay} seconds...")
+                 time.sleep(delay)
+             else:
+                 print(f"Failed to download {repo_id} after {retries} attempts.")
+                 raise
+
+ def download_models():
+     print(f"Downloading embedding model: {EMBEDDING_MODEL_NAME}")
+     download_with_retries(EMBEDDING_MODEL_NAME)
+
+     # Also initialize SentenceTransformer to ensure it caches correctly for the library
+     print(f"Initializing SentenceTransformer for {EMBEDDING_MODEL_NAME} to populate cache...")
+     try:
+         SentenceTransformer(EMBEDDING_MODEL_NAME)
+     except Exception as e:
+         print(f"Warning: Failed to initialize SentenceTransformer: {e}")
+
+     print(f"Downloading cross-encoder model: {CROSS_ENCODER_MODEL_NAME}")
+     download_with_retries(CROSS_ENCODER_MODEL_NAME)
+
+     # Initialize CrossEncoder to populate cache
+     print(f"Initializing CrossEncoder for {CROSS_ENCODER_MODEL_NAME} to populate cache...")
+     try:
+         CrossEncoder(CROSS_ENCODER_MODEL_NAME)
+     except Exception as e:
+         print(f"Warning: Failed to initialize CrossEncoder: {e}")
+
+     # Download GGUF model for Ollama
+     llm_repo_id = "phureexd/qwen3_v2_gguf"
+     llm_filename = "unsloth.Q4_K_M.gguf"
+     print(f"Downloading LLM: {llm_filename} from {llm_repo_id}")
+     try:
+         # local_dir places the real file in the working directory
+         # (local_dir_use_symlinks is deprecated and ignored in recent hub versions)
+         hf_hub_download(repo_id=llm_repo_id, filename=llm_filename, local_dir=".")
+         print(f"Successfully downloaded {llm_filename}")
+     except Exception as e:
+         print(f"Error downloading LLM: {e}")
+         raise
+
+     print("All models downloaded successfully.")
+
+ if __name__ == "__main__":
+     download_models()
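The script runs at image build time (see the Dockerfile above), so the HF cache ends up at `/app/hf_cache` via `HF_HOME`. A sketch of running it locally with the same cache layout, assuming the pinned dependencies from `requirements.txt` are installed:

```bash
# Sketch: reproduce the build-time download step locally
export HF_HOME=./hf_cache
python download_models.py
```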
graph.png ADDED
main.py ADDED
@@ -0,0 +1,133 @@
+ import os
+
+ os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+
+ # Import unsloth before transformers so it can apply its patches
+ from unsloth import FastLanguageModel
+
+ from threading import Thread
+
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse
+ from transformers import TextIteratorStreamer
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Enable CORS to allow frontend requests
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ ###########################################################################
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
+
+ from langchain_chroma import Chroma
+
+ vector_store = Chroma(
+     embedding_function=embeddings,
+     persist_directory="C:/Users/LENOVO/Downloads/chroma_langchain_db_3",  # Local dev path; adjust for your environment
+ )
+
+
+ # Model configuration
+ # model_name = "phureexd/qwen_model"
+ model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit"
+ max_seq_length = 2048
+ dtype = None
+ load_in_4bit = True
+
+ # Load model and tokenizer
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name=model_name,
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+ )
+
+ FastLanguageModel.for_inference(model)
+
+
+ # Prepare RAG messages: retrieve context and fold it into the system prompt
+ def prepare_rag_messages(messages, vector_store, k=2):
+     query = next(msg["content"] for msg in reversed(messages) if msg["role"] == "user")
+     print("this is query:\n", type(query), query)
+     docs = vector_store.similarity_search(query, k=k)
+     context = "\n\n".join(
+         f"Source: {doc.metadata['source']}\nContent: {doc.page_content}" for doc in docs
+     )
+     print("this is context:\n", context)
+     system_message = messages[0]["content"] + "\n\nContext:\n" + context
+     rag_messages = [
+         {"role": "system", "content": system_message},
+         {"role": "user", "content": query},
+     ]
+     return rag_messages
+
+
+ # Endpoint to generate a response using GET
+ @app.get("/generate")
+ async def generate(query: str):
+     messages = [
+         {
+             "role": "system",
+             "content": """You are a medical professional assistant. You will receive user queries along with relevant context retrieved via RAG.
+ Use the context if it is relevant. If not, rely on your own medical knowledge. If unsure, clearly state so.
+ Always respond in the same language used in the user's query. Keep responses clear, concise, and professional.
+
+ Extremely important: Answer in the same language as the user query.
+ """,
+         },
+         {"role": "user", "content": query},
+     ]
+
+     rag_messages = prepare_rag_messages(messages, vector_store, k=2)
+
+     text = tokenizer.apply_chat_template(
+         rag_messages,
+         tokenize=False,
+         add_generation_prompt=True,  # Must be set for generation
+         enable_thinking=False,
+     )
+
+     inputs = tokenizer(text, return_tensors="pt").to(device=model.device)
+
+     def stream_response():
+         streamer = TextIteratorStreamer(
+             tokenizer, skip_prompt=True, skip_special_tokens=True
+         )
+         # Recommended settings: reasoning inference temperature=0.6, top_p=0.95, top_k=20;
+         # normal chat inference temperature=0.7, top_p=0.8, top_k=20
+         generate_kwargs = dict(
+             **inputs,
+             max_new_tokens=1024,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.8,
+             top_k=20,
+             streamer=streamer,
+         )
+         # Run generation in a background thread so tokens can be streamed as they arrive
+         thread = Thread(target=model.generate, kwargs=generate_kwargs)
+         thread.start()
+
+         for new_text in streamer:
+             yield f"data: {new_text}\n\n"
+
+     return StreamingResponse(
+         stream_response(),
+         media_type="text/event-stream",
+         headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+     )
+
+
+ # Run the server
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
main_v2.py ADDED
@@ -0,0 +1,426 @@
+ import os
+
+ # Suppress TensorFlow oneDNN optimization messages if not needed
+ os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+ # Disable ChromaDB telemetry to prevent log errors
+ os.environ["ANONYMIZED_TELEMETRY"] = "False"
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse
+ from langchain.retrievers import ContextualCompressionRetriever
+ from langchain.retrievers.document_compressors import CrossEncoderReranker
+ from langchain.tools.retriever import create_retriever_tool
+ from langchain_chroma import Chroma
+ from langchain_community.cross_encoders import HuggingFaceCrossEncoder
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_ollama import ChatOllama
+ from langgraph.checkpoint.memory import MemorySaver
+ from langgraph.graph import END, MessagesState, StateGraph
+ from langgraph.prebuilt import ToolNode, tools_condition
+
+ # Set the device for HuggingFace models
+ device = "cpu"
+
+ # --- Configuration Constants ---
+ APP_HOST = "0.0.0.0"
+ APP_PORT = 7860
+
+ THREAD_ID = "global_health_chat_session"  # Unique ID for the chat session
+
+ # Models and Paths
+ EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
+ CROSS_ENCODER_MODEL_NAME = "BAAI/bge-reranker-v2-m3"
+ LLM_MODEL_NAME = "custom-model"  # Replace with your actual model, e.g., "hf.co/phureexd/qwen3_v2_gguf:Q4_K_M"
+ VECTOR_DB_PATH = "/app/chroma_db" if os.path.exists("/app/chroma_db") else "chroma_db"
+
+ # LLM Parameters
+ LLM_TEMPERATURE = 0.7
+ LLM_TOP_P = 0.8
+ LLM_TOP_K = 20
+ LLM_NUM_PREDICT = 512
+
+ # Retriever Parameters
+ RETRIEVER_SEARCH_K = 6  # Number of documents to fetch initially
+ RERANKER_TOP_N = 3  # Number of documents after reranking
+
+ # --- System Prompts ---
+
+ INITIAL_SYSTEM_MESSAGE = SystemMessage(
+     content="""
+ You are a health assistant designed to answer questions related to health, wellness, nutrition, exercise, symptoms, diseases, prevention, treatment, mental health, and medical advice. This explicitly includes general statements about feeling unwell or sick (e.g., "I'm sick", "I don't feel good"). For ANY query that falls into these categories, you MUST use the retrieve_health_info tool to fetch relevant information from the database before providing an answer. This ensures your responses are accurate and based on trusted sources. Do not answer health-related questions directly without using the tool, even if you think you know the answer.
+
+ If the query is clearly unrelated to health (e.g., general knowledge questions), you can answer directly without the tool.
+
+ **Important Guidelines:**
+ - If the query mentions or implies health, feeling unwell, sickness, treatment, symptoms, diseases, nutrition, exercise, mental health, or wellness, use the tool.
+ - Even if the query is only slightly related to health, or is a general statement about feeling unwell, use the tool to provide an informed answer.
+ - Always respond in the same language as the user's query.
+ - When in doubt, err on the side of using the tool.
+
+ **Examples:**
+
+ 1. **Health-Related (Use Tool):**
+    - User: "What are the symptoms of diabetes?"
+    - Assistant: [Uses retrieve_health_info tool] "Common symptoms of diabetes include frequent urination, excessive thirst, and fatigue."
+
+ 2. **Slightly Health-Related (Use Tool):**
+    - User: "Is it okay to exercise when I have a cold?"
+    - Assistant: [Uses retrieve_health_info tool] "Light exercise might be okay, but rest if you have a fever."
+
+ 3. **General Sickness Statement (Use Tool):**
+    - User: "I'm sick."
+    - Assistant: [Uses retrieve_health_info tool] "I'm sorry to hear you're not feeling well. Common advice includes resting and staying hydrated. If you have specific symptoms, I can try to provide more information."
+
+ 4. **Non-Health-Related (No Tool):**
+    - User: "What is the capital of France?"
+    - Assistant: "The capital of France is Paris."
+
+ 5. **Health-Related in Thai (Use Tool):**
+    - User: "อาการของโรคเบาหวานมีอะไรบ้าง?"
+    - Assistant: [Uses retrieve_health_info tool] "อาการทั่วไปของโรคเบาหวาน ได้แก่ ปัสสาวะบ่อย กระหายน้ำมาก และอ่อนเพลีย"
+
+ 6. **Non-Health-Related in Thai (No Tool):**
+    - User: "เมืองหลวงของฝรั่งเศสคืออะไร?"
+    - Assistant: "เมืองหลวงของฝรั่งเศสคือปารีส"
+ /no_think
+ """
+ )
+
+ RAG_SYSTEM_PROMPT_TEMPLATE = """
+ You are a health assistant for question-answering tasks.
+ Use the following retrieved documents to answer the question.
+ If you don't know the answer, say that you don't know.
+ Keep the answer concise and accurate.
+
+ **Extremely important: Answer in the same language as the user query.**
+
+ ### Retrieved documents (if applicable):
+ {docs_content}
+
+ ### Examples of the language model's responses:
+ **Example 1 (English):**
+ User: I feel a bit tired, what could it be?
+ Assistant: Fatigue can be caused by lack of sleep, stress, or dehydration. Ensure you get 7-8 hours of sleep and stay hydrated.
+
+ **Example 2 (English):**
+ User: Does coffee affect my health?
+ Assistant: Moderate coffee consumption can improve alertness but may cause insomnia or anxiety if overconsumed.
+
+ **Example 3 (Thai):**
+ User: ฉันรู้สึกเหนื่อยเล็กน้อย เกิดจากอะไรได้บ้าง?
+ Assistant: อาการเหนื่อยอาจเกิดจากการนอนหลับไม่เพียงพอ ความเครียด หรือภาวะขาดน้ำ ควรนอนหลับ 7-8 ชั่วโมงและดื่มน้ำให้เพียงพอ
+ /no_think
+ """
+
+ # --- Initialization of Langchain Components ---
+
+
+ def init_embeddings(model_name: str):
+     """Initializes HuggingFace embeddings."""
+     return HuggingFaceEmbeddings(model_name=model_name)
+
+
+ def init_vector_store(embedding_function, persist_directory: str):
+     """Initializes Chroma vector store."""
+     return Chroma(
+         embedding_function=embedding_function,
+         persist_directory=persist_directory,
+     )
+
+
+ def init_llm(
+     model_name: str, temperature: float, top_p: float, top_k: int, num_predict: int
+ ):
+     """Initializes ChatOllama LLM."""
+     return ChatOllama(
+         model=model_name,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_predict=num_predict,
+     )
+
+
+ def init_retriever_tool(
+     vector_store_instance,
+     cross_encoder_model_name: str,
+     base_retriever_k: int,
+     reranker_top_n: int,
+ ):
+     """Initializes the retriever tool with reranking."""
+     base_retriever = vector_store_instance.as_retriever(
+         search_kwargs={"k": base_retriever_k}
+     )
+
+     cross_encoder = HuggingFaceCrossEncoder(
+         model_name=cross_encoder_model_name,
+         model_kwargs={"device": device},  # Specify device if needed, e.g., "cuda"
+     )
+     reranker = CrossEncoderReranker(model=cross_encoder, top_n=reranker_top_n)
+
+     compression_retriever = ContextualCompressionRetriever(
+         base_compressor=reranker,
+         base_retriever=base_retriever,
+     )
+
+     return create_retriever_tool(
+         retriever=compression_retriever,
+         name="retrieve_health_info",
+         description=(
+             "Use this tool to retrieve documents relevant to queries about health, "
+             "wellness, nutrition, exercise, symptoms, diseases, treatment, prevention, "
+             "mental health, or medical advice from the database, "
+             "even if the query is only slightly related to health. "
+             f"Returns the top {reranker_top_n} most relevant documents."
+         ),
+         response_format="content_and_artifact",  # Ensures artifact contains Document objects
+     )
+
+
+ # Initialize components
+ print("Initializing Embeddings...")
+ embeddings = init_embeddings(EMBEDDING_MODEL_NAME)
+ print("Embeddings Initialized.")
+
+ print("Initializing Vector Store...")
+ vector_store = init_vector_store(embeddings, VECTOR_DB_PATH)
+ print("Vector Store Initialized.")
+
+ print("Initializing LLM...")
+ llm = init_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, LLM_TOP_P, LLM_TOP_K, LLM_NUM_PREDICT)
+ print("LLM Initialized.")
+
+ print("Initializing Retriever Tool...")
+ retriever_tool = init_retriever_tool(
+     vector_store, CROSS_ENCODER_MODEL_NAME, RETRIEVER_SEARCH_K, RERANKER_TOP_N
+ )
+ print("Retriever Tool Initialized.")
+
+ # --- LangGraph Node Definitions ---
+
+
+ async def query_or_respond_node_logic(state: MessagesState):
+     """
+     Node function: Decides whether to call a tool for retrieval or respond directly.
+     Binds the retriever_tool to the LLM for this decision.
+     """
+     response = await llm.bind_tools([retriever_tool]).ainvoke(state["messages"])
+     return {"messages": [response]}
+
+
+ async def generate_rag_response_node_logic(state: MessagesState):
+     """
+     Node function: Generates a response using retrieved documents (if any).
+     """
+     # Extract the most recent contiguous block of tool messages
+     recent_tool_messages = []
+     for message in reversed(state["messages"]):
+         if message.type == "tool":  # or isinstance(message, ToolMessage)
+             recent_tool_messages.append(message)
+         else:
+             break
+     tool_messages = recent_tool_messages[::-1]
+
+     # Format retrieved document content for the prompt
+     doc_strings = []
+     for tool_msg in tool_messages:
+         # Ensure artifact is a list of Langchain Document objects
+         if hasattr(tool_msg, "artifact") and isinstance(tool_msg.artifact, list):
+             for doc in tool_msg.artifact:
+                 if hasattr(doc, "page_content") and hasattr(
+                     doc, "metadata"
+                 ):  # Document structure check
+                     source = doc.metadata.get("source", "Unknown source")
+                     content = doc.page_content
+                     doc_strings.append(f"Source: {source}\nContent: {content}")
+
+     docs_content = (
+         "\n\n".join(doc_strings)
+         if doc_strings
+         else "No relevant documents were found to answer the current question."
+     )
+
+     # Prepare messages for the generation LLM call (history + new system prompt with docs)
+     # Include human messages, initial system messages, and AI responses (not tool calls)
+     conversation_history_for_llm = [
+         msg
+         for msg in state["messages"]
+         if msg.type in ("human", "system") or (msg.type == "ai" and not msg.tool_calls)
+     ]
+
+     # Construct the system prompt with retrieved documents
+     current_system_prompt_content = RAG_SYSTEM_PROMPT_TEMPLATE.format(
+         docs_content=docs_content
+     )
+
+     prompt_for_generation = [
+         SystemMessage(content=current_system_prompt_content)
+     ] + conversation_history_for_llm
+
+     response = await llm.ainvoke(prompt_for_generation)
+     return {"messages": [response]}
+
+
+ # --- LangGraph Graph Construction ---
+
+
+ def create_lang_graph(checkpointer_instance):
+     """Creates and compiles the LangGraph."""
+     graph_builder = StateGraph(MessagesState)
+
+     # Define nodes
+     graph_builder.add_node("query_or_respond", query_or_respond_node_logic)
+     tools_node = ToolNode([retriever_tool])  # Define tool execution node
+     graph_builder.add_node("tools", tools_node)
+     graph_builder.add_node("generate_rag_response", generate_rag_response_node_logic)
+
+     # Define edges
+     graph_builder.set_entry_point("query_or_respond")
+     graph_builder.add_conditional_edges(
+         "query_or_respond",
+         tools_condition,  # Prebuilt condition to check for tool calls
+         {END: END, "tools": "tools"},
+     )
+     graph_builder.add_edge("tools", "generate_rag_response")
+     graph_builder.add_edge("generate_rag_response", END)
+
+     return graph_builder.compile(checkpointer=checkpointer_instance)
+
+
+ # Initialize checkpointer and compile graph
+ memory_saver = MemorySaver()
+ graph = create_lang_graph(memory_saver)
+
+ # Optional: Save graph visualization
+ # try:
+ #     graph.get_graph().draw_mermaid_png(output_file_path="graph.png")
+ #     print("Graph visualization saved to graph.png")
+ # except Exception as e:
+ #     print(f"Could not save graph visualization: {e}")
+
+
+ # --- FastAPI Application Setup ---
+ app = FastAPI(
+     title="Health Assistant API",
+     description="API for a health assistant using a retrieval-augmented generation approach.",
+ )
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Allow all origins for simplicity; restrict in production
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # --- FastAPI Endpoints ---
+
+
+ @app.get("/generate", summary="Generate a response from the health assistant model")
+ async def generate_endpoint(
+     query: str,
+     useRAG: bool = False,
+     thread_id: str = THREAD_ID,
+ ):
+     """
+     Handles a user query and streams back the assistant's responses.
+     - `query`: The user's question.
+     - `useRAG`: If true, forces the use of the retrieval tool via a system message.
+     - `thread_id`: Unique identifier for the conversation session.
+     """
+     print(f"Received query: '{query}', Force RAG: {useRAG}, Thread ID: {thread_id}")
+
+     config = {"configurable": {"thread_id": thread_id}}
+
+     # Prepare input messages for the graph
+     input_messages = []
+     current_checkpoint_tuple = memory_saver.get_tuple(config)  # Check if history exists
+
+     if current_checkpoint_tuple is None:  # No history, it's a new or cleared session
+         input_messages.append(INITIAL_SYSTEM_MESSAGE)
+         print("Starting new conversation: Added initial system message.")
+
+     if useRAG:
+         # This message is added to strongly encourage tool use for the current query,
+         # supplementing the INITIAL_SYSTEM_MESSAGE.
+         input_messages.append(
+             SystemMessage(
+                 content="You MUST use the retrieve_health_info tool for this query even if it seems unrelated to health."
+             )
+         )
+         print("Forcing RAG for this query with an additional system message.")
+
+     input_messages.append(HumanMessage(content=query))
+     graph_input = {"messages": input_messages}
+
+     async def stream_response_events():
+         # astream with stream_mode="messages" yields (message_chunk, metadata) tuples
+         # as each node emits LLM output.
+         async for messages_in_state in graph.astream(
+             graph_input, config, stream_mode="messages"
+         ):
+             if not messages_in_state:
+                 continue
+
+             # The first element of the tuple is the streamed message
+             latest_message = messages_in_state[0]
+
+             if isinstance(latest_message, AIMessage):
+                 if latest_message.content:  # Final textual response
+                     yield f"data: {latest_message.content}\n\n"
+                 elif latest_message.tool_calls:  # AI message requesting a tool call
+                     print(f"AI requested Tool call: {latest_message.tool_calls}")
+                     # You might want to send a status to the client, e.g., "Thinking..." or "Retrieving info..."
+                     # yield f"event: tool_call\ndata: {json.dumps(latest_message.tool_calls)}\n\n"
+             elif isinstance(
+                 latest_message, ToolMessage
+             ):  # Message containing tool execution results
+                 if latest_message.name == "retrieve_health_info" and hasattr(
+                     latest_message, "artifact"
+                 ):
+                     print(f"Tool '{latest_message.name}' executed. Artifact content:")
+                     if latest_message.artifact and isinstance(
+                         latest_message.artifact, list
+                     ):
+                         # Collect unique sources and log every document in the artifact
+                         source_list = set()
+                         for doc in latest_message.artifact:
+                             source = doc.metadata.get("source", "Unknown source")
+
+                             if source != "Unknown source":
+                                 source_list.add(source)
+
+                             print(f"  Source: {source}\n  Content: {doc.page_content}")
+                         yield f"data: **Source:**{str(source_list)}\n\n"
+
+     return StreamingResponse(
+         stream_response_events(),
+         media_type="text/event-stream",
+         headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+     )
+
+
+ @app.get("/clear", summary="Clear conversation history")
+ async def clear_conversation_endpoint(thread_id: str = THREAD_ID):
+     """Clears the conversation history for the specified thread_id."""
+     try:
+         # MemorySaver keeps checkpoints in an in-memory dict keyed by thread_id;
+         # deleting the thread's entry clears its history. A persistent checkpointer
+         # (e.g., a database) would delete the stored rows here instead.
+         if hasattr(memory_saver, "storage"):
+             if thread_id in memory_saver.storage:
+                 del memory_saver.storage[thread_id]
+
+         print(f"Conversation history cleared for thread_id: {thread_id}")
+         return {"status": "success", "message": "Conversation history cleared."}
+     except Exception as e:
+         print(f"Error clearing conversation history for thread_id {thread_id}: {e}")
+         return {"status": "error", "message": f"Failed to clear history: {e}"}
+
+
+ # --- Main Execution ---
+ if __name__ == "__main__":
+     print(f"Starting Health Assistant API on {APP_HOST}:{APP_PORT}")
+     uvicorn.run(app, host=APP_HOST, port=APP_PORT)
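Conversation state lives in the in-process `MemorySaver`, keyed by `thread_id`, so a session can be reset over HTTP; a sketch against the default thread:

```bash
# Sketch: clear the default conversation thread
curl "http://localhost:7860/clear?thread_id=global_health_chat_session"
```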
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ fastapi==0.115.12
+ uvicorn==0.34.2
+ langchain==0.3.25
+ langchain-core==0.3.58
+ langchain-chroma==0.2.3
+ langchain-huggingface==0.1.2
+ langchain-ollama==0.3.2
+ langchain-community==0.3.23
+ langgraph==0.4.1
+ chromadb==0.6.3
+ huggingface-hub==0.30.2
+ sentence-transformers==3.4.1
+ transformers==4.51.3
+ aiocron==1.8
+ aiohttp==3.11.11
start-ollama.sh ADDED
@@ -0,0 +1,11 @@
+ #!/bin/sh
+
+ # Start Ollama in the background
+ ollama serve &
+
+ # Wait for Ollama to start
+ sleep 5
+
+ # Create a custom model using the Modelfile
+ ollama create custom-model -f /app/Modelfile.local
+
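Since `ollama create` runs before uvicorn starts, the custom model should be registered by the time the API is reachable; a sketch of verifying it, assuming the container name from `docker.md`:

```bash
# Sketch: confirm custom-model is registered inside the running container
docker exec nlp-container ollama list
```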