depth-anything-v2-large-api

@@ -11,7 +11,6 @@ class EndpointHandler:
     def __init__(self, path=""):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Load processor + model from the *endpoint repo*
         self.processor = AutoImageProcessor.from_pretrained(path)
         self.model = AutoModelForDepthEstimation.from_pretrained(path)
         self.model.to(self.device)
@@ -19,54 +18,65 @@ class EndpointHandler:
     def __call__(self, data):
         """
-        Expected request body: raw image bytes (recommended)
-        Hugging Face Endpoints typically pass:
-          data["inputs"] -> bytes
         """
-        image_bytes = data.get("inputs", None)
-        if image_bytes is None:
-            raise ValueError('Missing "inputs". Send raw image bytes as the request body.')
         # Load image
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         orig_w, orig_h = image.size
         # Preprocess
-        inputs = self.processor(images=image, return_tensors="pt")
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
         # Inference
         with torch.no_grad():
-            outputs = self.model(**inputs)
-            predicted_depth = outputs.predicted_depth  # shape: [B, H, W] (or similar)
-        # Upsample depth to original image size (as in the docs)
-        # Make it [B,1,H,W] for interpolate
-        depth = predicted_depth.unsqueeze(1)
         depth = F.interpolate(
             depth,
             size=(orig_h, orig_w),
             mode="bicubic",
             align_corners=False,
         )
-        depth = depth.squeeze(1).squeeze(0)  # [H, W]
         depth_np = depth.detach().float().cpu().numpy()
-        # ---- Make a nice visualization PNG (0..255) ----
         dmin, dmax = float(depth_np.min()), float(depth_np.max())
         denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
         depth_norm = (depth_np - dmin) / denom
         depth_uint8 = (depth_norm * 255.0).clip(0, 255).astype(np.uint8)
-        depth_img = Image.fromarray(depth_uint8, mode="L")  # grayscale
         buf = io.BytesIO()
         depth_img.save(buf, format="PNG")
         depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
-        # ---- Optional: return raw depth as float16 bytes (compact) ----
         depth_f16 = depth_np.astype(np.float16)
-        raw_bytes = depth_f16.tobytes()
-        depth_raw_base64_f16 = base64.b64encode(raw_bytes).decode("utf-8")
         return {
             "type": "relative_depth",

     def __init__(self, path=""):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.processor = AutoImageProcessor.from_pretrained(path)
         self.model = AutoModelForDepthEstimation.from_pretrained(path)
         self.model.to(self.device)
     def __call__(self, data):
         """
+        Supports both common endpoint input styles:
+          1) JSON: {"inputs": "<base64-encoded image bytes>"}  (recommended)
+          2) Raw bytes passed through as inputs (fallback)
         """
+        inputs = data.get("inputs", None)
+        if inputs is None:
+            raise ValueError('Missing "inputs". Send JSON {"inputs": "<base64>"} or raw bytes.')
+        # Decode inputs -> image_bytes
+        if isinstance(inputs, str):
+            # JSON base64 string
+            try:
+                image_bytes = base64.b64decode(inputs)
+            except Exception as e:
+                raise ValueError(f'Failed to base64-decode "inputs" string: {e}')
+        elif isinstance(inputs, (bytes, bytearray)):
+            # raw bytes
+            image_bytes = bytes(inputs)
+        else:
+            raise ValueError(f'Unsupported inputs type: {type(inputs)}')
         # Load image
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         orig_w, orig_h = image.size
         # Preprocess
+        inputs_t = self.processor(images=image, return_tensors="pt")
+        inputs_t = {k: v.to(self.device) for k, v in inputs_t.items()}
         # Inference
         with torch.no_grad():
+            outputs = self.model(**inputs_t)
+            predicted_depth = outputs.predicted_depth  # [B, H, W]
+        # Upsample to original image size
+        depth = predicted_depth.unsqueeze(1)  # [B,1,H,W]
         depth = F.interpolate(
             depth,
             size=(orig_h, orig_w),
             mode="bicubic",
             align_corners=False,
         )
+        depth = depth.squeeze(1).squeeze(0)  # [H,W]
         depth_np = depth.detach().float().cpu().numpy()
+        # Visualization (0..255 grayscale)
         dmin, dmax = float(depth_np.min()), float(depth_np.max())
         denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
         depth_norm = (depth_np - dmin) / denom
         depth_uint8 = (depth_norm * 255.0).clip(0, 255).astype(np.uint8)
+        depth_img = Image.fromarray(depth_uint8, mode="L")
         buf = io.BytesIO()
         depth_img.save(buf, format="PNG")
         depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+        # Raw float16 depth (compact) — NOTE: relative depth, not meters
         depth_f16 = depth_np.astype(np.float16)
+        depth_raw_base64_f16 = base64.b64encode(depth_f16.tobytes()).decode("utf-8")
         return {
             "type": "relative_depth",