epirag / search.py
RohanB67's picture
add feature
189df32
raw
history blame
4.12 kB
"""
EpiRAG — search.py
------------------
Multi-provider web search, free fallback chain:
1. DuckDuckGo (ddg)
2. Tavily
Tries DDG first. Falls back to Tavily only if DDG returns nothing and a Tavily API key was supplied.
Domain whitelist applied to both.
"""
import urllib.parse
# Domains whose pages may be returned as web results. A host is accepted when
# it equals one of these entries or is a subdomain of one (see _is_allowed).
# The same list is handed to Tavily as its include_domains filter.
ALLOWED_DOMAINS = [
    "arxiv.org",
    "pubmed.ncbi.nlm.nih.gov",
    "ncbi.nlm.nih.gov",
    "semanticscholar.org",
    "nature.com",
    "science.org",
    "cell.com",
    "plos.org",
    "biorxiv.org",
    "medrxiv.org",
    "academic.oup.com",
    "wiley.com",
    "springer.com",
    "elsevier.com",
    "sciencedirect.com",
    "tandfonline.com",
    "sagepub.com",
    "jstor.org",
    "researchgate.net",
    "openalex.org",
    "europepmc.org",
    "who.int",
    "cdc.gov",
    "nih.gov",
    "pmc.ncbi.nlm.nih.gov",
    "royalsocietypublishing.org",
    "pnas.org",
    "bmj.com",
    "thelancet.com",
    "jamanetwork.com",
    "nejm.org",
    "frontiersin.org",
    "mdpi.com",
    "acm.org",
    "ieee.org",
    "dl.acm.org",
    "ieeexplore.ieee.org",
    "mathoverflow.net",
    "math.stackexchange.com",
    "stats.stackexchange.com",
]

# Upper bound on the number of results a provider hands back to the caller.
MAX_RESULTS = 5
def _is_allowed(url: str, allowed_domains=None) -> bool:
    """Return True when the host of *url* is a whitelisted domain or a subdomain of one.

    Args:
        url: Absolute URL to check. Empty or non-string values, and URLs
            without a network location (e.g. no scheme), are rejected.
        allowed_domains: Optional iterable of lower-case bare domain names to
            match against. Defaults to the module-level ``ALLOWED_DOMAINS``.

    Returns:
        True if the host equals an allowed domain or ends with
        ``"." + domain``; False otherwise, including when the URL cannot be
        parsed.
    """
    if not url or not isinstance(url, str):
        return False
    domains = ALLOWED_DOMAINS if allowed_domains is None else allowed_domains
    try:
        # ``hostname`` (unlike ``netloc``) excludes credentials and the port
        # and is already lower-cased, so "https://arxiv.org:443/..." matches.
        host = urllib.parse.urlparse(url).hostname
    except ValueError:
        # Raised by urlparse for malformed URLs (e.g. a broken IPv6 literal).
        return False
    if not host:
        return False
    # No "www." stripping is needed: the suffix test accepts any subdomain.
    # The previous ``lstrip("www.")`` removed every leading "w"/"." character,
    # turning "who.int" into "ho.int" and "wiley.com" into "iley.com".
    return any(host == d or host.endswith("." + d) for d in domains)
def _fmt(text: str, title: str, url: str, score: float = 0.5) -> dict:
    """Build the result record shared by every search provider.

    Args:
        text: Snippet or body text of the hit.
        title: Title of the hit; when empty, the URL is used as the source label.
        url: Address of the hit.
        score: Relevance score, rounded to four decimal places.

    Returns:
        A dict with the keys ``text``, ``source``, ``similarity``, ``url`` and
        ``type`` (always ``"web"``).
    """
    label = title if title else url
    record = dict(text=text, source=label, similarity=round(score, 4), url=url)
    record["type"] = "web"
    return record
# -- Provider 1: DuckDuckGo------------------------------------------------------------
def _search_ddg(query: str) -> list[dict]:
    """Search DuckDuckGo through the ``ddgs`` package and keep whitelisted hits.

    Requests ``MAX_RESULTS * 3`` raw hits so that enough remain after the
    domain filter, and stops once ``MAX_RESULTS`` allowed hits are collected.

    Args:
        query: Search string passed to DuckDuckGo.

    Returns:
        Up to ``MAX_RESULTS`` records built by ``_fmt`` with a fixed score of
        0.6. Returns an empty list if anything raises, including a missing
        ``ddgs`` package; the failure is printed, not propagated.
    """
    try:
        from ddgs import DDGS

        results = []
        with DDGS() as client:
            for hit in client.text(query, max_results=MAX_RESULTS * 3):
                link = hit.get("href", "")
                if not _is_allowed(link):
                    continue
                results.append(
                    _fmt(
                        text=hit.get("body", ""),
                        title=hit.get("title", ""),
                        url=link,
                        score=0.6,
                    )
                )
                if len(results) >= MAX_RESULTS:
                    break
        return results
    except Exception as e:
        # Best-effort provider: report the failure and let the caller fall back.
        print(f" [DDG] failed: {e}", flush=True)
        return []
# -- Provider 2: Tavily (free 1000/month) ------------------------------------------------------------
def _search_tavily(query: str, api_key: str) -> list[dict]:
    """Search Tavily and keep only hits whose URL passes the domain whitelist.

    The whitelist is sent to Tavily as ``include_domains`` and each returned
    URL is checked again locally with ``_is_allowed``.

    Args:
        query: Search string passed to Tavily.
        api_key: Tavily API key used to construct the client.

    Returns:
        Records built by ``_fmt`` from the response's ``results`` entries.
        Returns an empty list if anything raises, including a missing
        ``tavily`` package; the failure is printed, not propagated.
    """
    try:
        from tavily import TavilyClient

        tavily = TavilyClient(api_key=api_key)
        response = tavily.search(
            query=query,
            search_depth="advanced",
            max_results=MAX_RESULTS,
            include_answer=False,
            topic="general",
            include_domains=ALLOWED_DOMAINS,
        )

        kept = []
        for hit in response.get("results", []):
            link = hit.get("url", "")
            if not _is_allowed(link):
                continue
            kept.append(
                _fmt(
                    text=hit.get("content", ""),
                    title=hit.get("title", hit.get("url", "Web")),
                    url=link,
                    score=hit.get("score", 0.5),
                )
            )
        return kept
    except Exception as e:
        # Best-effort provider: report the failure and return nothing.
        print(f" [Tavily] failed: {e}", flush=True)
        return []
# -- Main entry point ------------------------------------------------------------
def web_search(query: str, tavily_key: str = None, **kwargs) -> list[dict]:
    """Run the provider chain: DuckDuckGo first, then Tavily as a fallback.

    Args:
        query: Search string.
        tavily_key: Tavily API key. Tavily is consulted only when this is
            truthy and DuckDuckGo produced no results.
        **kwargs: Accepted and ignored.

    Returns:
        The result records of the first provider that returns any, or an
        empty list when every provider comes back empty. Progress is printed
        to stdout along the way.
    """
    print(" [Search] Trying DuckDuckGo...", flush=True)
    results = _search_ddg(query)

    if not results and tavily_key:
        print(" [Search] DDG empty, falling back to Tavily...", flush=True)
        results = _search_tavily(query, tavily_key)
        if results:
            print(f" [Search] Tavily: {len(results)} results", flush=True)
    elif results:
        print(f" [Search] DDG: {len(results)} results", flush=True)

    if not results:
        print(" [Search] All providers returned empty", flush=True)
        return []
    return results