Improve CLIP tensor extraction from BaseModelOutputWithPooling and add better error handling
classifier.py CHANGED (+83 -22)
@@ -182,8 +182,15 @@ class KikiBoubaClassifier:
         print(f"Loading model: {model_id}")
         # Use CLIPModel/CLIPProcessor for CLIP models, AutoModel/AutoProcessor for SigLIP
         if "clip" in model_id.lower():
-            self.model = CLIPModel.from_pretrained(model_id)
-            self.processor = CLIPProcessor.from_pretrained(model_id)
+            try:
+                self.model = CLIPModel.from_pretrained(model_id)
+                self.processor = CLIPProcessor.from_pretrained(model_id)
+                print(f"Loaded CLIPModel - has get_text_features: {hasattr(self.model, 'get_text_features')}")
+                print(f"Loaded CLIPModel - has get_image_features: {hasattr(self.model, 'get_image_features')}")
+            except Exception as e:
+                print(f"Warning: Failed to load as CLIPModel, trying AutoModel: {e}")
+                self.model = AutoModel.from_pretrained(model_id)
+                self.processor = AutoProcessor.from_pretrained(model_id)
         else:
             self.model = AutoModel.from_pretrained(model_id)
             self.processor = AutoProcessor.from_pretrained(model_id)
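The loader selects CLIPModel by substring match on the model id, and the embedding code further down probes the loaded object with hasattr. As a sanity check that every probe path exists on a stock CLIP checkpoint, a minimal sketch (the checkpoint id is illustrative, not taken from this repo):

from transformers import CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# CLIPModel exposes both the projection-space extractors and the raw
# encoder submodules, so each branch of the extraction code below is reachable.
print(hasattr(model, "get_text_features"))   # True
print(hasattr(model, "get_image_features"))  # True
print(hasattr(model, "text_model"))          # True
print(hasattr(model, "vision_model"))        # True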
@@ -229,26 +236,53 @@ class KikiBoubaClassifier:
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
         with torch.no_grad():
-            # …
+            # CLIPModel has get_text_features method that returns tensor directly
             if hasattr(self.model, 'get_text_features'):
-                embeddings = self.model.get_text_features(**inputs)
+                try:
+                    embeddings = self.model.get_text_features(**inputs)
+                except Exception:
+                    # Fallback: use text_model directly
+                    outputs = self.model.text_model(**inputs)
+                    if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
+                        embeddings = outputs.pooler_output
+                    else:
+                        embeddings = outputs.last_hidden_state.mean(dim=1)
             elif hasattr(self.model, 'text_model'):
-                # …
+                # Direct access to text_model
                 outputs = self.model.text_model(**inputs)
-                # …
+                # Extract tensor from BaseModelOutputWithPooling
                 if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                     embeddings = outputs.pooler_output
-                else:
-                    # Fallback: mean pool the last hidden state
+                elif hasattr(outputs, 'last_hidden_state'):
                     embeddings = outputs.last_hidden_state.mean(dim=1)
+                else:
+                    # Try to get the first attribute that's a tensor
+                    for attr in ['pooler_output', 'last_hidden_state', 'hidden_states']:
+                        if hasattr(outputs, attr):
+                            val = getattr(outputs, attr)
+                            if isinstance(val, torch.Tensor):
+                                if len(val.shape) > 2:
+                                    embeddings = val.mean(dim=1)
+                                else:
+                                    embeddings = val
+                                break
+                    else:
+                        raise ValueError(f"Could not extract tensor from text_model output: {type(outputs)}, attributes: {dir(outputs)}")
             else:
-                # …
+                # Final fallback: use model forward pass
                 outputs = self.model(**inputs)
-                embeddings = outputs.text_embeds
+                if hasattr(outputs, 'text_embeds'):
+                    embeddings = outputs.text_embeds
+                elif isinstance(outputs, tuple) and len(outputs) > 0:
+                    embeddings = outputs[0]
+                else:
+                    raise ValueError(f"Could not extract text embeddings from model output: {type(outputs)}")
 
-            # Ensure embeddings is a tensor
+            # Ensure embeddings is a tensor
             if not isinstance(embeddings, torch.Tensor):
-                raise ValueError(f"Expected tensor, got {type(embeddings)}")
+                raise ValueError(f"Expected tensor, got {type(embeddings)}: {embeddings}")
+
+            # Normalize embeddings
             return F.normalize(embeddings, dim=-1)
 
     def _embed_image(self, image: Union[Image.Image, str]) -> torch.Tensor:
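The pooler_output / last_hidden_state handling above is the heart of the fix: self.model.text_model(**inputs) returns a BaseModelOutputWithPooling, not a tensor, so a tensor has to be picked out of it before F.normalize. A condensed sketch of that extraction as a standalone helper (same pattern, not code from the repo):

import torch

def pool_output(outputs) -> torch.Tensor:
    # BaseModelOutputWithPooling carries pooler_output of shape (batch, hidden)
    # when the model defines a pooler, and last_hidden_state of shape
    # (batch, seq, hidden) always.
    pooled = getattr(outputs, "pooler_output", None)
    if pooled is not None:
        return pooled
    # Fallback: mean pool over the sequence dimension.
    return outputs.last_hidden_state.mean(dim=1)

One caveat: pooler_output lives in the encoder's hidden space, while get_text_features applies CLIP's learned text projection, so the fallback branch can yield embeddings in a different space than the primary branch.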
@@ -269,26 +303,53 @@ class KikiBoubaClassifier:
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
         with torch.no_grad():
-            # …
+            # CLIPModel has get_image_features method that returns tensor directly
             if hasattr(self.model, 'get_image_features'):
-                embedding = self.model.get_image_features(**inputs)
+                try:
+                    embedding = self.model.get_image_features(**inputs)
+                except Exception:
+                    # Fallback: use vision_model directly
+                    outputs = self.model.vision_model(**inputs)
+                    if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
+                        embedding = outputs.pooler_output
+                    else:
+                        embedding = outputs.last_hidden_state.mean(dim=1)
             elif hasattr(self.model, 'vision_model'):
-                # …
+                # Direct access to vision_model
                 outputs = self.model.vision_model(**inputs)
-                # …
+                # Extract tensor from BaseModelOutputWithPooling
                 if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                     embedding = outputs.pooler_output
-                else:
-                    # Fallback: mean pool the last hidden state
+                elif hasattr(outputs, 'last_hidden_state'):
                     embedding = outputs.last_hidden_state.mean(dim=1)
+                else:
+                    # Try to get the first attribute that's a tensor
+                    for attr in ['pooler_output', 'last_hidden_state', 'hidden_states']:
+                        if hasattr(outputs, attr):
+                            val = getattr(outputs, attr)
+                            if isinstance(val, torch.Tensor):
+                                if len(val.shape) > 2:
+                                    embedding = val.mean(dim=1)
+                                else:
+                                    embedding = val
+                                break
+                    else:
+                        raise ValueError(f"Could not extract tensor from vision_model output: {type(outputs)}, attributes: {dir(outputs)}")
             else:
-                # …
+                # Final fallback: use model forward pass
                 outputs = self.model(**inputs)
-                embedding = outputs.image_embeds
+                if hasattr(outputs, 'image_embeds'):
+                    embedding = outputs.image_embeds
+                elif isinstance(outputs, tuple) and len(outputs) > 0:
+                    embedding = outputs[0]
+                else:
+                    raise ValueError(f"Could not extract image embeddings from model output: {type(outputs)}")
 
-            # Ensure embedding is a tensor
+            # Ensure embedding is a tensor
             if not isinstance(embedding, torch.Tensor):
-                raise ValueError(f"Expected tensor, got {type(embedding)}")
+                raise ValueError(f"Expected tensor, got {type(embedding)}: {embedding}")
+
+            # Normalize embedding
             return F.normalize(embedding, dim=-1)
 
     def _compute_domain_scores(self, similarities: torch.Tensor, anchor_domains: List[str],
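Both attribute-scan fallbacks rely on Python's for/else, which is easy to misread: the else clause runs only when the loop finishes without hitting break, which is exactly when no candidate attribute produced a tensor. The skeleton of the scan (condensed, not code from the repo):

import torch

def first_tensor(outputs) -> torch.Tensor:
    # Break on the first candidate attribute that is a tensor, mean-pooling
    # anything that still has a sequence dimension.
    for attr in ("pooler_output", "last_hidden_state", "hidden_states"):
        val = getattr(outputs, attr, None)
        if isinstance(val, torch.Tensor):
            found = val.mean(dim=1) if val.dim() > 2 else val
            break
    else:
        # No break fired: no candidate was a tensor.
        raise ValueError(f"Could not extract tensor from {type(outputs)}")
    return found

Worth knowing: on transformers model outputs, hidden_states is a tuple of tensors (when requested at all), so that candidate never passes the isinstance check; it is harmless but dead weight in the scan.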
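A final caveat on the fallback paths: get_text_features and get_image_features return embeddings in CLIP's shared projection space, while the pooler_output and mean-pooling fallbacks return encoder hidden states. On the standard ViT-B/32 checkpoint the vision hidden size differs from the projection dim, so a fallback image embedding is not comparable to a projected text embedding even after normalization. A quick way to see the mismatch (checkpoint id illustrative):

from transformers import CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
print(model.config.projection_dim)             # 512: shared embedding space
print(model.config.text_config.hidden_size)    # 512: text fallback happens to match
print(model.config.vision_config.hidden_size)  # 768: image fallback does not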