midah committed on
Commit
bed2f80
·
1 Parent(s): c24ac02

Fix precomputed_loader to make embeddings optional

Browse files
backend/utils/precomputed_loader.py CHANGED
@@ -54,12 +54,11 @@ class PrecomputedDataLoader:
54
  """Check if pre-computed data is available."""
55
  metadata_file = self.data_dir / f"metadata_{self.version}.json"
56
  models_file = self.data_dir / f"models_{self.version}.parquet"
57
- embeddings_file = self.data_dir / f"embeddings_{self.version}.parquet"
58
 
 
59
  return (
60
  metadata_file.exists() and
61
- models_file.exists() and
62
- embeddings_file.exists()
63
  )
64
 
65
  def load_models(self) -> pd.DataFrame:
@@ -115,16 +114,23 @@ class PrecomputedDataLoader:
115
 
116
  return embeddings, model_ids
117
 
118
- def load_all(self) -> Tuple[pd.DataFrame, np.ndarray, Dict]:
119
  """
120
  Load all pre-computed data.
121
 
122
  Returns:
123
- Tuple of (models_df, embeddings_array, metadata_dict)
124
  """
125
  metadata = self.load_metadata()
126
  df = self.load_models()
127
- embeddings, _ = self.load_embeddings()
 
 
 
 
 
 
 
128
 
129
  return df, embeddings, metadata
130
 
 
54
  """Check if pre-computed data is available."""
55
  metadata_file = self.data_dir / f"metadata_{self.version}.json"
56
  models_file = self.data_dir / f"models_{self.version}.parquet"
 
57
 
58
+ # Embeddings file is optional - coordinates are in models file
59
  return (
60
  metadata_file.exists() and
61
+ models_file.exists()
 
62
  )
63
 
64
  def load_models(self) -> pd.DataFrame:
 
114
 
115
  return embeddings, model_ids
116
 
117
+ def load_all(self) -> Tuple[pd.DataFrame, Optional[np.ndarray], Dict]:
118
  """
119
  Load all pre-computed data.
120
 
121
  Returns:
122
+ Tuple of (models_df, embeddings_array_or_None, metadata_dict)
123
  """
124
  metadata = self.load_metadata()
125
  df = self.load_models()
126
+
127
+ # Try to load embeddings, but they're optional
128
+ embeddings_file = self.data_dir / f"embeddings_{self.version}.parquet"
129
+ if embeddings_file.exists():
130
+ embeddings, _ = self.load_embeddings()
131
+ else:
132
+ logger.info("Embeddings file not found, skipping...")
133
+ embeddings = None
134
 
135
  return df, embeddings, metadata
136
 
netlify.toml CHANGED
@@ -1,20 +1,18 @@
1
  [build]
2
  base = "frontend"
3
- publish = "frontend/build"
4
  command = "npm install --legacy-peer-deps && npm run build"
5
 
6
  [build.environment]
7
  NODE_VERSION = "18"
8
- # Set this to your backend URL (Railway, Render, etc.)
9
- # REACT_APP_API_URL = "https://your-backend-url.railway.app"
10
 
11
- # Redirect all routes to index.html for React Router (SPA routing)
12
  [[redirects]]
13
  from = "/*"
14
  to = "/index.html"
15
  status = 200
16
 
17
- # Security headers
18
  [[headers]]
19
  for = "/*"
20
  [headers.values]
@@ -23,3 +21,9 @@
23
  X-Content-Type-Options = "nosniff"
24
  Referrer-Policy = "strict-origin-when-cross-origin"
25
 
 
 
 
 
 
 
 
1
  [build]
2
  base = "frontend"
3
+ publish = "build"
4
  command = "npm install --legacy-peer-deps && npm run build"
5
 
6
  [build.environment]
7
  NODE_VERSION = "18"
8
+ # Set your Railway backend URL in Netlify dashboard:
9
+ # Site settings → Environment variables → REACT_APP_API_URL
10
 
 
11
  [[redirects]]
12
  from = "/*"
13
  to = "/index.html"
14
  status = 200
15
 
 
16
  [[headers]]
17
  for = "/*"
18
  [headers.values]
 
21
  X-Content-Type-Options = "nosniff"
22
  Referrer-Policy = "strict-origin-when-cross-origin"
23
 
24
+ # Cache static assets aggressively
25
+ [[headers]]
26
+ for = "/static/*"
27
+ [headers.values]
28
+ Cache-Control = "public, max-age=31536000, immutable"
29
+
precompute_150k.log ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-29 22:40:15,455 - INFO - ============================================================
2
+ 2025-11-29 22:40:15,455 - INFO - FAST PRE-COMPUTATION STARTED
3
+ 2025-11-29 22:40:15,455 - INFO - ============================================================
4
+ 2025-11-29 22:40:15,455 - INFO - Sample size: 150,000
5
+ 2025-11-29 22:40:15,455 - INFO - Output directory: ../precomputed_data
6
+ 2025-11-29 22:40:15,455 - INFO - Version: v1
7
+ 2025-11-29 22:40:15,455 - INFO - PCA pre-reduction: True (50 dims)
8
+ 2025-11-29 22:40:15,455 - INFO - ============================================================
9
+ 2025-11-29 22:40:15,455 - INFO - Step 1/5: Loading model data (prioritizing base models)...
10
+ Repo card metadata block was not found. Setting CardData to empty.
11
+ 2025-11-29 22:40:15,682 - WARNING - Repo card metadata block was not found. Setting CardData to empty.
12
+ 2025-11-29 22:40:19,390 - INFO - Loaded 150,000 models in 3.9 seconds
13
+ 2025-11-29 22:40:19,390 - INFO - Step 2/5: Generating embeddings...
14
+ 2025-11-29 22:40:19,390 - INFO - Building combined text from model fields...
15
+ 2025-11-29 22:40:19,488 - INFO - Use pytorch device_name: mps
16
+ 2025-11-29 22:40:19,488 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
17
+
18
+ 2025-11-29 22:45:49,794 - INFO - Generated embeddings: (150000, 384) in 5.5 minutes
19
+ 2025-11-29 22:45:49,794 - INFO - Step 2.5/5: PCA reduction (384 -> 50 dims)...
20
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_pca.py:604: RuntimeWarning: divide by zero encountered in matmul
21
+ C = X.T @ X
22
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_pca.py:604: RuntimeWarning: overflow encountered in matmul
23
+ C = X.T @ X
24
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_pca.py:604: RuntimeWarning: invalid value encountered in matmul
25
+ C = X.T @ X
26
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: divide by zero encountered in matmul
27
+ X_transformed = X @ self.components_.T
28
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: overflow encountered in matmul
29
+ X_transformed = X @ self.components_.T
30
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_base.py:148: RuntimeWarning: invalid value encountered in matmul
31
+ X_transformed = X @ self.components_.T
32
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_base.py:155: RuntimeWarning: divide by zero encountered in matmul
33
+ X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
34
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_base.py:155: RuntimeWarning: overflow encountered in matmul
35
+ X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
36
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/sklearn/decomposition/_base.py:155: RuntimeWarning: invalid value encountered in matmul
37
+ X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
38
+ 2025-11-29 22:45:49,880 - INFO - PCA complete in 0.1s (preserved 92.5% variance)
39
+ 2025-11-29 22:45:49,880 - INFO - Reduced embeddings: (150000, 50)
40
+ 2025-11-29 22:45:49,880 - INFO - Step 3/5: Running OPTIMIZED UMAP for 3D coordinates...
41
+ UMAP(low_memory=False, n_components=3, spread=1.5, verbose=True)
42
+ Sat Nov 29 22:45:50 2025 Construct fuzzy simplicial set
43
+ Sat Nov 29 22:45:50 2025 Finding Nearest Neighbors
44
+ Sat Nov 29 22:45:50 2025 Building RP forest with 24 trees
45
+ Sat Nov 29 22:45:52 2025 NN descent for 17 iterations
46
+ 1 / 17
47
+ 2 / 17
48
+ 3 / 17
49
+ 4 / 17
50
+ Stopping threshold met -- exiting after 4 iterations
51
+ Sat Nov 29 22:46:02 2025 Finished Nearest Neighbor Search
52
+ Sat Nov 29 22:46:04 2025 Construct embedding
53
+
54
+ 2025-11-29 22:56:30,884 - INFO - Generated 3D coordinates: (150000, 3) in 10.7 minutes
55
+ 2025-11-29 22:56:30,885 - INFO - Step 4/5: Running OPTIMIZED UMAP for 2D coordinates...
56
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
57
+ failed. This is likely due to too small an eigengap. Consider
58
+ adding some noise or jitter to your data.
59
+
60
+ Falling back to random initialisation!
61
+ warn(
62
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
63
+ failed. This is likely due to too small an eigengap. Consider
64
+ adding some noise or jitter to your data.
65
+
66
+ Falling back to random initialisation!
67
+ warn(
68
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
69
+ failed. This is likely due to too small an eigengap. Consider
70
+ adding some noise or jitter to your data.
71
+
72
+ Falling back to random initialisation!
73
+ warn(
74
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
75
+ failed. This is likely due to too small an eigengap. Consider
76
+ adding some noise or jitter to your data.
77
+
78
+ Falling back to random initialisation!
79
+ warn(
80
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
81
+ failed. This is likely due to too small an eigengap. Consider
82
+ adding some noise or jitter to your data.
83
+
84
+ Falling back to random initialisation!
85
+ warn(
86
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
87
+ failed. This is likely due to too small an eigengap. Consider
88
+ adding some noise or jitter to your data.
89
+
90
+ Falling back to random initialisation!
91
+ warn(
92
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
93
+ failed. This is likely due to too small an eigengap. Consider
94
+ adding some noise or jitter to your data.
95
+
96
+ Falling back to random initialisation!
97
+ warn(
98
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
99
+ failed. This is likely due to too small an eigengap. Consider
100
+ adding some noise or jitter to your data.
101
+
102
+ Falling back to random initialisation!
103
+ warn(
104
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
105
+ failed. This is likely due to too small an eigengap. Consider
106
+ adding some noise or jitter to your data.
107
+
108
+ Falling back to random initialisation!
109
+ warn(
110
+ /Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/umap/spectral.py:548: UserWarning: Spectral initialisation failed! The eigenvector solver
111
+ failed. This is likely due to too small an eigengap. Consider
112
+ adding some noise or jitter to your data.
113
+
114
+ Falling back to random initialisation!
115
+ warn(
116
+ completed 0 / 200 epochs
117
+ completed 20 / 200 epochs
118
+ completed 40 / 200 epochs
119
+ completed 60 / 200 epochs
120
+ completed 80 / 200 epochs
121
+ completed 100 / 200 epochs
122
+ completed 120 / 200 epochs
123
+ completed 140 / 200 epochs
124
+ completed 160 / 200 epochs
125
+ completed 180 / 200 epochs
126
+ Sat Nov 29 22:56:30 2025 Finished embedding
127
+ UMAP(low_memory=False, spread=1.5, verbose=True)
128
+ Sat Nov 29 22:56:30 2025 Construct fuzzy simplicial set
129
+ Sat Nov 29 22:56:30 2025 Finding Nearest Neighbors
130
+ Sat Nov 29 22:56:30 2025 Building RP forest with 24 trees
131
+ Sat Nov 29 22:56:31 2025 NN descent for 17 iterations
132
+ 1 / 17
133
+ 2 / 17
134
+ 3 / 17
135
+ 4 / 17
136
+ Stopping threshold met -- exiting after 4 iterations
137
+ Sat Nov 29 22:56:35 2025 Finished Nearest Neighbor Search
138
+ Sat Nov 29 22:56:35 2025 Construct embedding
139
+
140
+ 2025-11-29 23:16:50,380 - INFO - Generated 2D coordinates: (150000, 2) in 20.3 minutes
141
+ 2025-11-29 23:16:50,381 - INFO - Step 5/5: Saving to Parquet files...
142
+ 2025-11-29 23:16:51,521 - INFO - Saved models data: ../precomputed_data/models_v1.parquet (19.9 MB)
143
+ 2025-11-29 23:16:51,522 - ERROR - Pre-computation failed: 'modelId'
144
+ Traceback (most recent call last):
145
+ File "/Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
146
+ return self._engine.get_loc(casted_key)
147
+ ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
148
+ File "pandas/_libs/index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
149
+ File "pandas/_libs/index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
150
+ File "pandas/_libs/hashtable_class_helper.pxi", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item
151
+ File "pandas/_libs/hashtable_class_helper.pxi", line 7096, in pandas._libs.hashtable.PyObjectHashTable.get_item
152
+ KeyError: 'modelId'
153
+
154
+ The above exception was the direct cause of the following exception:
155
+
156
+ Traceback (most recent call last):
157
+ File "/Users/hamidaho/hf_viz/backend/scripts/precompute_fast.py", line 266, in <module>
158
+ precompute_fast(
159
+ ~~~~~~~~~~~~~~~^
160
+ sample_size=args.sample_size,
161
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
162
+ ...<3 lines>...
163
+ use_pca=not args.no_pca
164
+ ^^^^^^^^^^^^^^^^^^^^^^^
165
+ )
166
+ ^
167
+ File "/Users/hamidaho/hf_viz/backend/scripts/precompute_fast.py", line 187, in precompute_fast
168
+ 'model_id': df['modelId'].values,
169
+ ~~^^^^^^^^^^^
170
+ File "/Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/pandas/core/frame.py", line 4113, in __getitem__
171
+ indexer = self.columns.get_loc(key)
172
+ File "/Users/hamidaho/hf_viz/venv/lib/python3.13/site-packages/pandas/core/indexes/base.py", line 3819, in get_loc
173
+ raise KeyError(key) from err
174
+ KeyError: 'modelId'
175
+ completed 0 / 200 epochs
176
+ completed 20 / 200 epochs
177
+ completed 40 / 200 epochs
178
+ completed 60 / 200 epochs
179
+ completed 80 / 200 epochs
180
+ completed 100 / 200 epochs
181
+ completed 120 / 200 epochs
182
+ completed 140 / 200 epochs
183
+ completed 160 / 200 epochs
184
+ completed 180 / 200 epochs
185
+ Sat Nov 29 23:16:50 2025 Finished embedding
precomputed_data/metadata_v1.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "v1",
3
+ "created_at": "2025-11-29T23:17:45.066035",
4
+ "total_models": 150000,
5
+ "embedding_dim": 384,
6
+ "umap_3d_shape": [
7
+ 150000,
8
+ 3
9
+ ],
10
+ "umap_2d_shape": [
11
+ 150000,
12
+ 2
13
+ ],
14
+ "unique_libraries": 364,
15
+ "unique_pipelines": 54,
16
+ "processing_time_minutes": 35,
17
+ "pca_dims": 50,
18
+ "pca_variance_preserved": 0.925
19
+ }