"""
Netlify Serverless Function for model data API.
This is a simplified version that works with Netlify Functions.
"""
import json
import os
import sys
# Add parent directory to path to import modules
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from data_loader import ModelDataLoader
from embeddings import ModelEmbedder
from dimensionality_reduction import DimensionReducer
import pandas as pd
import numpy as np
# Global state (persists across invocations in serverless)
# Only `data_loader` is constructed eagerly; everything else is populated
# lazily on the first request (see handler/get_models) and then reused by
# warm function instances to avoid reloading data and re-fitting models.
data_loader = ModelDataLoader()
embedder = None            # ModelEmbedder, created on first /api/models call
reducer = None             # DimensionReducer (UMAP, 2 components), created lazily
df = None                  # full model DataFrame, loaded on first request
embeddings = None          # embeddings for every row of df, cached after first use
reduced_embeddings = None  # 2-D projection of `embeddings`, cached after first use
def handler(event, context):
    """
    Netlify serverless function handler.

    Answers CORS preflight (OPTIONS) requests, lazily loads the model
    dataset on the first invocation (module globals persist across warm
    invocations), then routes GET requests to /api/models or /api/stats.

    Args:
        event: Netlify/AWS-style event dict ('httpMethod', 'path',
            'queryStringParameters', ...).
        context: runtime context object (unused).

    Returns:
        Response dict with 'statusCode', 'headers', and a JSON 'body'.
    """
    global embedder, reducer, df, embeddings, reduced_embeddings

    # Parse query parameters (may be None in the event payload)
    query_params = event.get('queryStringParameters') or {}
    path = event.get('path', '')

    # CORS headers attached to every response
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Methods': 'GET, OPTIONS',
        'Content-Type': 'application/json',
    }

    # Handle OPTIONS (CORS preflight): reply immediately with no body
    if event.get('httpMethod') == 'OPTIONS':
        return {
            'statusCode': 200,
            'headers': headers,
            'body': ''
        }

    # Initialize data on first request; cached for warm starts
    if df is None:
        try:
            print("Loading data...")
            df = data_loader.load_data(sample_size=10000)
            df = data_loader.preprocess_for_embedding(df)
            print(f"Loaded {len(df)} models")
        except Exception as e:
            return {
                'statusCode': 500,
                'headers': headers,
                'body': json.dumps({'error': f'Failed to load data: {str(e)}'})
            }

    # Route requests. `path.endswith(x)` implies `x in path`, so the
    # substring test alone covers both of the original conditions.
    if '/api/models' in path:
        return get_models(query_params, headers)
    elif '/api/stats' in path:
        return get_stats(headers)
    else:
        return {
            'statusCode': 404,
            'headers': headers,
            'body': json.dumps({'error': 'Not found'})
        }
def get_models(query_params, headers):
    """Return filtered models with 2-D embedding coordinates.

    Query parameters (all optional, with defaults):
        min_downloads, min_likes: integer thresholds.
        search_query: free-text filter forwarded to the data loader.
        max_points: cap on returned points (random sample beyond it).

    Returns:
        Response dict whose JSON body is a list of point dicts
        (model_id, x, y, library_name, pipeline_tag, downloads, likes,
        trending_score, tags), or a 500 error payload on failure.
    """
    global df, embedder, reducer, embeddings, reduced_embeddings
    try:
        min_downloads = int(query_params.get('min_downloads', 0))
        min_likes = int(query_params.get('min_likes', 0))
        search_query = query_params.get('search_query')
        max_points = int(query_params.get('max_points', 5000))

        # Filter data
        filtered_df = data_loader.filter_data(
            df=df,
            min_downloads=min_downloads,
            min_likes=min_likes,
            search_query=search_query
        )

        if len(filtered_df) == 0:
            return {
                'statusCode': 200,
                'headers': headers,
                'body': json.dumps([])
            }

        # Cap payload size; fixed seed keeps the sample stable across calls
        if len(filtered_df) > max_points:
            filtered_df = filtered_df.sample(n=max_points, random_state=42)

        # Embeddings and the 2-D projection are computed once over the FULL
        # dataframe and cached in module globals, so that filtered subsets
        # can be looked up on subsequent requests without re-fitting.
        if embedder is None:
            embedder = ModelEmbedder()
        if embeddings is None:
            texts = df['combined_text'].tolist()
            embeddings = embedder.generate_embeddings(texts)
        if reducer is None:
            reducer = DimensionReducer(method="umap", n_components=2)
        if reduced_embeddings is None:
            reduced_embeddings = reducer.fit_transform(embeddings)

        # BUGFIX: reduced_embeddings rows are *positional* (row i of df ->
        # row i), but filtered_df.index holds pandas *labels*. The original
        # code used labels as positions, which is only correct for a clean
        # RangeIndex; map labels to positions explicitly instead.
        positions = df.index.get_indexer(filtered_df.index)
        filtered_reduced = reduced_embeddings[positions]

        # Prepare response
        models = []
        for idx, (_, row) in enumerate(filtered_df.iterrows()):
            trending = row.get('trendingScore')
            tags = row.get('tags')
            models.append({
                'model_id': row.get('model_id', 'Unknown'),
                'x': float(filtered_reduced[idx, 0]),
                'y': float(filtered_reduced[idx, 1]),
                'library_name': row.get('library_name'),
                'pipeline_tag': row.get('pipeline_tag'),
                'downloads': int(row.get('downloads', 0)),
                'likes': int(row.get('likes', 0)),
                'trending_score': float(trending) if pd.notna(trending) else None,
                # BUGFIX: pd.notna on a list/array of tags returns an
                # elementwise array, which makes `if` raise ValueError (and
                # the row 500). Reduce to a single bool with np.all.
                'tags': tags if np.all(pd.notna(tags)) else None
            })

        return {
            'statusCode': 200,
            'headers': headers,
            'body': json.dumps(models)
        }
    except Exception as e:
        return {
            'statusCode': 500,
            'headers': headers,
            'body': json.dumps({'error': str(e)})
        }
def get_stats(headers):
    """Return aggregate statistics for the loaded dataset as a JSON response."""
    global df

    # Data is loaded lazily by the handler; report "unavailable" until then.
    if df is None:
        return {
            'statusCode': 503,
            'headers': headers,
            'body': json.dumps({'error': 'Data not loaded'})
        }

    def _nunique(col):
        # Number of distinct values in `col`, or 0 when the column is absent.
        return df[col].nunique() if col in df.columns else 0

    def _mean(col):
        # Column mean as a float, or 0 when the column is absent.
        return float(df[col].mean()) if col in df.columns else 0

    payload = {
        'total_models': len(df),
        'unique_libraries': _nunique('library_name'),
        'unique_pipelines': _nunique('pipeline_tag'),
        'avg_downloads': _mean('downloads'),
        'avg_likes': _mean('likes'),
    }
    return {
        'statusCode': 200,
        'headers': headers,
        'body': json.dumps(payload)
    }