Spaces:
Sleeping
Sleeping
| from typing import Optional, List, Dict, Any | |
| from pydantic import BaseModel, Field | |
| # Models | |
class OptimizeRequest(BaseModel):
    """
    🧠 Explicit optimization request for RAG (Retrieval-Augmented Generation) pipelines.

    Every field has a default, so an empty payload is a valid request. All
    fields are declared ``Optional``, which means an explicit ``null`` is also
    accepted by the model; how ``None`` is treated by the consumer of this
    request is not visible here. The "Options" listed below come from the
    field descriptions and are not enforced by this model.

    Args:
        docs_path (str, optional): 📂 Folder containing your documents for RAG optimization. Default: "data/docs"
        retriever (List[str], optional): 🔍 Retriever type(s) to use. Default: ['faiss']. Example: 'bm25', 'faiss', 'chroma'
        embedding_model (List[str], optional): 🧠 Embedding model(s) to use. Default: ['sentence-transformers/all-MiniLM-L6-v2']
        strategy (List[str], optional): 🎯 RAG strategy to apply. Default: ['fixed']. Options: 'fixed', 'token', 'sentence'
        chunk_sizes (List[int], optional): 📏 List of chunk sizes to evaluate. Default: [200, 400, 600]
        overlaps (List[int], optional): 🔁 List of overlap values to test. Default: [50, 100, 200]
        rerankers (List[str], optional): ⚖️ Rerankers to apply after retrieval. Default: ['mmr']
        search_type (str, optional): 🔍 Search method for parameter exploration. Default: 'grid'. Options: 'grid', 'random', 'bayesian'
        trials (int, optional): 🧪 Number of optimization trials. Default: 5
        metric (str, optional): 📊 Metric to optimize. Default: 'faithfulness'
        validation_choice (str, optional): ✅ Source of validation data. Default: 'generate'. Options: blank (use default), 'generate', local path, HF dataset ID
        llm_model (str, optional): 🤖 LLM used for QA generation if validation_choice='generate'. Default: 'gemini-2.5-flash-lite'
    """

    # NOTE(review): the emoji prefixes were restored from mis-decoded text.
    # 🧠 🎯 ⚖️ 🧪 🤖 decode unambiguously; 📂 🔍 📏 🔁 📊 ✅ and the "→"
    # separators are best-effort reconstructions — confirm the intended glyphs.

    docs_path: Optional[str] = Field(
        default="data/docs",
        description="📂 Folder containing your documents for RAG optimization. Example: 'data/docs'",
    )
    retriever: Optional[List[str]] = Field(
        default=["faiss"],
        description="🔍 Retriever type to use. Example: 'bm25', 'faiss', 'chroma'",
    )
    embedding_model: Optional[List[str]] = Field(
        default=["sentence-transformers/all-MiniLM-L6-v2"],
        description="🧠 Embedding model name or path. Example: 'sentence-transformers/all-MiniLM-L6-v2'",
    )
    strategy: Optional[List[str]] = Field(
        default=["fixed"],
        description="🎯 RAG strategy name. Example: 'fixed', 'token', 'sentence'",
    )
    chunk_sizes: Optional[List[int]] = Field(
        default=[200, 400, 600],
        description="📏 List of chunk sizes to evaluate. Example: [200, 400, 600]",
    )
    overlaps: Optional[List[int]] = Field(
        default=[50, 100, 200],
        description="🔁 List of overlap values to test. Example: [50, 100, 200]",
    )
    rerankers: Optional[List[str]] = Field(
        default=["mmr"],
        description="⚖️ Rerankers to apply after retrieval. Default: ['mmr']",
    )
    search_type: Optional[str] = Field(
        default="grid",
        description="🔍 Search method to explore parameter space. Options: 'grid', 'random', 'bayesian'",
    )
    trials: Optional[int] = Field(
        default=5,
        description="🧪 Number of optimization trials to run.",
    )
    metric: Optional[str] = Field(
        default="faithfulness",
        description="📊 Evaluation metric for optimization. Options: 'faithfulness'",
    )
    validation_choice: Optional[str] = Field(
        default="generate",
        description=(
            "✅ Validation data source. Options:\n"
            " - Leave blank → use default 'validation_qa.json' if available\n"
            " - 'generate' → auto-generate a validation QA file from your docs\n"
            " - Path to a local JSON file (e.g. 'data/validation_qa.json')\n"
            " - Hugging Face dataset ID (e.g. 'squad')"
        ),
    )
    llm_model: Optional[str] = Field(
        default="gemini-2.5-flash-lite",
        description="🤖 LLM used to generate QA dataset when validation_choice='generate'. Example: 'gemini-pro', 'gpt-4o-mini'",
    )
class AutotuneRequest(BaseModel):
    """
    ⚡ Automatically tunes RAG pipeline parameters based on document analysis.

    Every field has a default, so an empty payload is a valid request. All
    fields are declared ``Optional``, which means an explicit ``null`` is also
    accepted by the model; how ``None`` is treated by the consumer of this
    request is not visible here. The "Options" listed below come from the
    field descriptions and are not enforced by this model.

    Args:
        docs_path (str, optional): 📂 Folder containing documents for RAG optimization. Default: "data/docs"
        embedding_model (str, optional): 🧠 Embedding model to analyze. Default: 'sentence-transformers/all-MiniLM-L6-v2'
        num_chunk_pairs (int, optional): 🔢 Number of chunk pairs to analyze. Default: 5
        metric (str, optional): 📊 Metric to optimize. Default: 'faithfulness'
        search_type (str, optional): 🔍 Search method for parameter exploration. Default: 'grid'. Options: 'grid', 'random', 'bayesian'
        trials (int, optional): 🧪 Number of optimization trials. Default: 5
        validation_choice (str, optional): ✅ Source of validation data. Default: 'generate'. Options: blank, 'generate', local path, HF dataset ID
        llm_model (str, optional): 🤖 LLM used for QA generation if validation_choice='generate'. Default: 'gemini-2.5-flash-lite'
    """

    # NOTE(review): the emoji prefixes were restored from mis-decoded text.
    # ⚡ 🧠 🔢 🧪 🤖 decode unambiguously; 📂 📊 🔍 ✅ and the "→" separators
    # are best-effort reconstructions — confirm the intended glyphs.

    docs_path: Optional[str] = Field(
        default="data/docs",
        description="📂 Folder containing your documents for RAG optimization. Example: 'data/docs'",
    )
    embedding_model: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        description="🧠 Embedding model name or path. Example: 'sentence-transformers/all-MiniLM-L6-v2'",
    )
    num_chunk_pairs: Optional[int] = Field(
        default=5,
        description="🔢 Number of chunk pairs to analyze for tuning.",
    )
    metric: Optional[str] = Field(
        default="faithfulness",
        description="📊 Evaluation metric for optimization. Options: 'faithfulness'",
    )
    search_type: Optional[str] = Field(
        default="grid",
        description="🔍 Search method to explore parameter space. Options: 'grid', 'random', 'bayesian'",
    )
    trials: Optional[int] = Field(
        default=5,
        description="🧪 Number of optimization trials to run.",
    )
    # NOTE(review): the text below names 'validation_qa.jsonl' as the default
    # file while its own example path uses '.json' — confirm the real default
    # filename before changing either; the wording is left as found.
    validation_choice: Optional[str] = Field(
        default="generate",
        description=(
            "✅ Validation data source. Options:\n"
            " - Leave blank → use default 'validation_qa.jsonl' if available\n"
            " - 'generate' → auto-generate a validation QA file from your docs\n"
            " - Path to a local JSON file (e.g. 'data/validation_qa.json')\n"
            " - Hugging Face dataset ID (e.g. 'squad')"
        ),
    )
    llm_model: Optional[str] = Field(
        default="gemini-2.5-flash-lite",
        description="🤖 LLM used to generate QA dataset when validation_choice='generate'. Example: 'gemini-pro', 'gpt-4o-mini'",
    )
class QARequest(BaseModel):
    """
    🧩 Generate a validation QA dataset from documents for RAG evaluation.

    Every field has a default, so an empty payload is a valid request. Unlike
    an ``Optional`` field, none of these is annotated to accept ``null``. No
    relationship between ``min_q`` and ``max_q`` is checked by this model.

    Args:
        docs_path (str): 📂 Folder containing documents. Default: 'data/docs'
        llm_model (str): 🤖 LLM model used for question generation. Default: 'gemini-2.5-flash-lite'
        batch_size (int): 📦 Number of documents per batch. Default: 5
        min_q (int): ❓ Minimum number of questions per document. Default: 3
        max_q (int): ❓ Maximum number of questions per document. Default: 25
    """

    # NOTE(review): the emoji prefixes were restored from mis-decoded text.
    # 🧩 🤖 📦 decode unambiguously; 📂 and ❓ are best-effort
    # reconstructions — confirm the intended glyphs.

    docs_path: str = Field(
        default="data/docs",
        description="📂 Folder containing your documents to generate QA pairs from. Example: 'data/docs'",
    )
    llm_model: str = Field(
        default="gemini-2.5-flash-lite",
        description="🤖 LLM model used for question generation. Example: 'gemini-2.5-flash-lite', 'gpt-4o-mini'",
    )
    batch_size: int = Field(
        default=5,
        description="📦 Number of documents processed per generation batch.",
    )
    min_q: int = Field(
        default=3,
        description="❓ Minimum number of questions per document.",
    )
    max_q: int = Field(
        default=25,
        description="❓ Maximum number of questions per document.",
    )