Spaces:

HorizonRobotics
/

EmbodiedGen-Image-to-3D

Running on Zero

App Files Files Community

xinjie.wang committed on Mar 3

Commit

be013ba

1 Parent(s): 9e78843

update

Browse files

Files changed (8) hide show

app.py.bak → app.bk.py +0 -0
app_full.py.bak +0 -571
common.bk.py +797 -0
common.py +13 -101
embodied_gen/models/sam3d.py +3 -16
embodied_gen/utils/monkey_patch/sam3d.py +4 -4
thirdparty/sam3d/sam3d_objects/pipeline/inference_pipeline.py +1 -11
thirdparty/sam3d/sam3d_objects/pipeline/inference_pipeline_pointmap.py +1 -13

app.py.bak → app.bk.py RENAMED Viewed

File without changes

app_full.py.bak DELETED Viewed

@@ -1,571 +0,0 @@
-import os as _os
-import sys as _sys
-import subprocess as _subprocess
-print("=" * 60, flush=True)
-print("[DEBUG] ===== Environment Diagnostics (no CUDA init) =====", flush=True)
-print(f"[DEBUG] Python: {_sys.version}", flush=True)
-print(f"[DEBUG] CWD: {_os.getcwd()}", flush=True)
-try:
-    _nvcc_out = _subprocess.check_output(["nvcc", "--version"], stderr=_subprocess.STDOUT, text=True)
-    print(f"[DEBUG] nvcc: {_nvcc_out.strip().splitlines()[-1]}", flush=True)
-except Exception as _e:
-    print(f"[DEBUG] nvcc not found: {_e}", flush=True)
-try:
-    _smi_out = _subprocess.check_output(["nvidia-smi", "-L"], stderr=_subprocess.STDOUT, text=True)
-    print(f"[DEBUG] nvidia-smi -L: {_smi_out.strip()}", flush=True)
-except Exception:
-    print("[DEBUG] nvidia-smi not available at startup (expected for ZeroGPU)", flush=True)
-try:
-    with open("/proc/driver/nvidia/version") as _f:
-        _lines = _f.read().strip().splitlines()
-        print(f"[DEBUG] NVIDIA driver: {_lines[0] if _lines else 'unknown'}", flush=True)
-except Exception:
-    print("[DEBUG] /proc/driver/nvidia/version not found", flush=True)
-for _env_key in sorted(_os.environ):
-    if any(_kw in _env_key.upper() for _kw in ["CUDA", "GPU", "NVIDIA", "ZERO", "SPACES"]):
-        print(f"[DEBUG] ENV {_env_key}={_os.environ[_env_key]}", flush=True)
-print("=" * 60, flush=True)
-# Project EmbodiedGen
-#
-# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied. See the License for the specific language governing
-# permissions and limitations under the License.
-import os
-# GRADIO_APP == "imageto3d_sam3d", sam3d object model, by default.
-# GRADIO_APP == "imageto3d", TRELLIS model.
-os.environ["GRADIO_APP"] = "imageto3d_sam3d"
-from glob import glob
-import gradio as gr
-from app_style import custom_theme, image_css, lighting_css
-from common import (
-    MAX_SEED,
-    VERSION,
-    active_btn_by_content,
-    end_session,
-    extract_3d_representations_v3,
-    extract_urdf,
-    get_seed,
-    image_to_3d,
-    preprocess_image_fn,
-    preprocess_sam_image_fn,
-    select_point,
-    start_session,
-)
-app_name = os.getenv("GRADIO_APP")
-if app_name == "imageto3d_sam3d":
-    _enable_pre_resize_default = False
-    sample_step = 25
-    bg_rm_model_name = "rembg"  # "rembg", "rmbg14"
-elif app_name == "imageto3d":
-    _enable_pre_resize_default = True
-    sample_step = 12
-    bg_rm_model_name = "rembg"  # "rembg", "rmbg14"
-current_rmbg_tag = bg_rm_model_name
-def set_current_rmbg_tag(rmbg: str) -> None:
-    global current_rmbg_tag
-    current_rmbg_tag = rmbg
-def preprocess_example_image(
-    img: str,
-) -> tuple[object, object, gr.Button]:
-    image, image_cache = preprocess_image_fn(
-        img, current_rmbg_tag, _enable_pre_resize_default
-    )
-    return image, image_cache, gr.Button(interactive=True)
-with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
-    gr.HTML(image_css, visible=False)
-    # gr.HTML(lighting_css, visible=False)
-    gr.Markdown(
-        """
-        ## ***EmbodiedGen***: Image-to-3D Asset
-        **🔖 Version**: {VERSION}
-        <p style="display: flex; gap: 10px; flex-wrap: nowrap;">
-            <a href="https://horizonrobotics.github.io/EmbodiedGen">
-                <img alt="📖 Documentation" src="https://img.shields.io/badge/📖-Documentation-blue">
-            </a>
-            <a href="https://arxiv.org/abs/2506.10600">
-                <img alt="📄 arXiv" src="https://img.shields.io/badge/📄-arXiv-b31b1b">
-            </a>
-            <a href="https://github.com/HorizonRobotics/EmbodiedGen">
-                <img alt="💻 GitHub" src="https://img.shields.io/badge/GitHub-000000?logo=github">
-            </a>
-            <a href="https://www.youtube.com/watch?v=rG4odybuJRk">
-                <img alt="🎥 Video" src="https://img.shields.io/badge/🎥-Video-red">
-            </a>
-        </p>
-        🖼️ Generate physically plausible 3D asset from single input image.
-        """.format(
-            VERSION=VERSION
-        ),
-        elem_classes=["header"],
-    )
-    enable_pre_resize = gr.State(_enable_pre_resize_default)
-    with gr.Row():
-        with gr.Column(scale=3):
-            with gr.Tabs() as input_tabs:
-                with gr.Tab(
-                    label="Image(auto seg)", id=0
-                ) as single_image_input_tab:
-                    raw_image_cache = gr.Image(
-                        format="png",
-                        image_mode="RGB",
-                        type="pil",
-                        visible=False,
-                    )
-                    image_prompt = gr.Image(
-                        label="Input Image",
-                        format="png",
-                        image_mode="RGBA",
-                        type="pil",
-                        height=400,
-                        elem_classes=["image_fit"],
-                    )
-                    gr.Markdown(
-                        """
-                        If you are not satisfied with the auto segmentation
-                        result, please switch to the `Image(SAM seg)` tab."""
-                    )
-                with gr.Tab(
-                    label="Image(SAM seg)", id=1
-                ) as samimage_input_tab:
-                    with gr.Row():
-                        with gr.Column(scale=1):
-                            image_prompt_sam = gr.Image(
-                                label="Input Image",
-                                type="numpy",
-                                height=400,
-                                elem_classes=["image_fit"],
-                            )
-                            image_seg_sam = gr.Image(
-                                label="SAM Seg Image",
-                                image_mode="RGBA",
-                                type="pil",
-                                height=400,
-                                visible=False,
-                            )
-                        with gr.Column(scale=1):
-                            image_mask_sam = gr.AnnotatedImage(
-                                elem_classes=["image_fit"]
-                            )
-                    fg_bg_radio = gr.Radio(
-                        ["foreground_point", "background_point"],
-                        label="Select foreground(green) or background(red) points, by default foreground",  # noqa
-                        value="foreground_point",
-                    )
-                    gr.Markdown(
-                        """ Click the `Input Image` to select SAM points,
-                        after get the satisified segmentation, click `Generate`
-                         button to generate the 3D asset. \n
-                        Note: If the segmented foreground is too small relative
-                         to the entire image area, the generation will fail.
-                    """
-                    )
-            with gr.Accordion(label="Generation Settings", open=False):
-                with gr.Row():
-                    seed = gr.Slider(
-                        0, MAX_SEED, label="Seed", value=0, step=1
-                    )
-                    texture_size = gr.Slider(
-                        1024,
-                        4096,
-                        label="UV texture size",
-                        value=2048,
-                        step=256,
-                    )
-                    rmbg_tag = gr.Radio(
-                        choices=["rembg", "rmbg14"],
-                        value=bg_rm_model_name,
-                        label="Background Removal Model",
-                    )
-                with gr.Row():
-                    randomize_seed = gr.Checkbox(
-                        label="Randomize Seed", value=False
-                    )
-                    project_delight = gr.Checkbox(
-                        label="Back-project Delight",
-                        value=True,
-                    )
-                gr.Markdown("Geo Structure Generation")
-                with gr.Row():
-                    ss_guidance_strength = gr.Slider(
-                        0.0,
-                        10.0,
-                        label="Guidance Strength",
-                        value=7.5,
-                        step=0.1,
-                    )
-                    ss_sampling_steps = gr.Slider(
-                        1,
-                        50,
-                        label="Sampling Steps",
-                        value=sample_step,
-                        step=1,
-                    )
-                gr.Markdown("Visual Appearance Generation")
-                with gr.Row():
-                    slat_guidance_strength = gr.Slider(
-                        0.0,
-                        10.0,
-                        label="Guidance Strength",
-                        value=3.0,
-                        step=0.1,
-                    )
-                    slat_sampling_steps = gr.Slider(
-                        1,
-                        50,
-                        label="Sampling Steps",
-                        value=sample_step,
-                        step=1,
-                    )
-            generate_btn = gr.Button(
-                "🚀 1. Generate(~2 mins)",
-                variant="primary",
-                interactive=False,
-            )
-            model_output_obj = gr.Textbox(label="raw mesh .obj", visible=False)
-            # with gr.Row():
-            #     extract_rep3d_btn = gr.Button(
-            #         "🔍 2. Extract 3D Representation(~2 mins)",
-            #         variant="primary",
-            #         interactive=False,
-            #     )
-            with gr.Accordion(
-                label="Enter Asset Attributes(optional)", open=False
-            ):
-                asset_cat_text = gr.Textbox(
-                    label="Enter Asset Category (e.g., chair)"
-                )
-                height_range_text = gr.Textbox(
-                    label="Enter **Height Range** in meter (e.g., 0.5-0.6)"
-                )
-                mass_range_text = gr.Textbox(
-                    label="Enter **Mass Range** in kg (e.g., 1.1-1.2)"
-                )
-                asset_version_text = gr.Textbox(
-                    label=f"Enter version (e.g., {VERSION})"
-                )
-            with gr.Row():
-                extract_urdf_btn = gr.Button(
-                    "🧩 2. Extract URDF with physics(~1 mins)",
-                    variant="primary",
-                    interactive=False,
-                )
-            with gr.Row():
-                gr.Markdown(
-                    "#### Estimated Asset 3D Attributes(No input required)"
-                )
-            with gr.Row():
-                est_type_text = gr.Textbox(
-                    label="Asset category", interactive=False
-                )
-                est_height_text = gr.Textbox(
-                    label="Real height(.m)", interactive=False
-                )
-                est_mass_text = gr.Textbox(
-                    label="Mass(.kg)", interactive=False
-                )
-                est_mu_text = gr.Textbox(
-                    label="Friction coefficient", interactive=False
-                )
-            with gr.Row():
-                download_urdf = gr.DownloadButton(
-                    label="⬇️ 3. Download URDF",
-                    variant="primary",
-                    interactive=False,
-                )
-            gr.Markdown(
-                """ NOTE: If `Asset Attributes` are provided, it will guide
-                GPT to perform physical attributes restoration. \n
-                The `Download URDF` file is restored to the real scale and
-                has quality inspection, open with an editor to view details.
-            """
-            )
-            with gr.Row() as single_image_example:
-                examples = gr.Examples(
-                    label="Image Gallery",
-                    examples=[
-                        [image_path]
-                        for image_path in sorted(
-                            glob("assets/example_image/*")
-                        )
-                    ],
-                    inputs=[image_prompt],
-                    fn=preprocess_example_image,
-                    outputs=[image_prompt, raw_image_cache, generate_btn],
-                    run_on_click=True,
-                    examples_per_page=10,
-                    cache_examples=False,
-                )
-            with gr.Row(visible=False) as single_sam_image_example:
-                examples = gr.Examples(
-                    label="Image Gallery",
-                    examples=[
-                        [image_path]
-                        for image_path in sorted(
-                            glob("assets/example_image/*")
-                        )
-                    ],
-                    inputs=[image_prompt_sam],
-                    fn=preprocess_sam_image_fn,
-                    outputs=[image_prompt_sam, raw_image_cache],
-                    run_on_click=True,
-                    examples_per_page=10,
-                )
-        with gr.Column(scale=2):
-            gr.Markdown("<br>")
-            video_output = gr.Video(
-                label="Generated 3D Asset",
-                autoplay=True,
-                loop=True,
-                height=400,
-            )
-            model_output_gs = gr.Model3D(
-                label="Gaussian Representation", height=350, interactive=False
-            )
-            aligned_gs = gr.Textbox(visible=False)
-            gr.Markdown(
-                """ The rendering of `Gaussian Representation` takes additional 10s. """  # noqa
-            )
-            with gr.Row():
-                model_output_mesh = gr.Model3D(
-                    label="Mesh Representation",
-                    height=350,
-                    interactive=False,
-                    clear_color=[0, 0, 0, 1],
-                    elem_id="lighter_mesh",
-                )
-    is_samimage = gr.State(False)
-    output_buf = gr.State()
-    selected_points = gr.State(value=[])
-    demo.load(start_session)
-    demo.unload(end_session)
-    single_image_input_tab.select(
-        lambda: tuple(
-            [False, gr.Row.update(visible=True), gr.Row.update(visible=False)]
-        ),
-        outputs=[is_samimage, single_image_example, single_sam_image_example],
-    )
-    samimage_input_tab.select(
-        lambda: tuple(
-            [True, gr.Row.update(visible=True), gr.Row.update(visible=False)]
-        ),
-        outputs=[is_samimage, single_sam_image_example, single_image_example],
-    )
-    image_prompt.upload(
-        lambda img, rmbg: preprocess_image_fn(img, rmbg, _enable_pre_resize_default),
-        inputs=[image_prompt, rmbg_tag],
-        outputs=[image_prompt, raw_image_cache],
-        queue=False,
-    ).success(
-        active_btn_by_content,
-        inputs=image_prompt,
-        outputs=generate_btn,
-    )
-    rmbg_tag.change(
-        set_current_rmbg_tag,
-        inputs=[rmbg_tag],
-        outputs=[],
-    )
-    image_prompt.change(
-        lambda: tuple(
-            [
-                # gr.Button(interactive=False),
-                gr.Button(interactive=False),
-                gr.Button(interactive=False),
-                None,
-                "",
-                None,
-                None,
-                "",
-                "",
-                "",
-                "",
-                "",
-                "",
-                "",
-                "",
-            ]
-        ),
-        outputs=[
-            # extract_rep3d_btn,
-            extract_urdf_btn,
-            download_urdf,
-            model_output_gs,
-            aligned_gs,
-            model_output_mesh,
-            video_output,
-            asset_cat_text,
-            height_range_text,
-            mass_range_text,
-            asset_version_text,
-            est_type_text,
-            est_height_text,
-            est_mass_text,
-            est_mu_text,
-        ],
-    )
-    image_prompt.clear(
-        lambda: gr.Button(interactive=False),
-        outputs=[generate_btn],
-    )
-    image_prompt_sam.upload(
-        preprocess_sam_image_fn,
-        inputs=[image_prompt_sam],
-        outputs=[image_prompt_sam, raw_image_cache],
-    )
-    image_prompt_sam.change(
-        lambda: tuple(
-            [
-                # gr.Button(interactive=False),
-                gr.Button(interactive=False),
-                gr.Button(interactive=False),
-                None,
-                None,
-                None,
-                "",
-                "",
-                "",
-                "",
-                "",
-                "",
-                "",
-                "",
-                None,
-                [],
-            ]
-        ),
-        outputs=[
-            # extract_rep3d_btn,
-            extract_urdf_btn,
-            download_urdf,
-            model_output_gs,
-            model_output_mesh,
-            video_output,
-            asset_cat_text,
-            height_range_text,
-            mass_range_text,
-            asset_version_text,
-            est_type_text,
-            est_height_text,
-            est_mass_text,
-            est_mu_text,
-            image_mask_sam,
-            selected_points,
-        ],
-    )
-    image_prompt_sam.select(
-        select_point,
-        [
-            image_prompt_sam,
-            selected_points,
-            fg_bg_radio,
-        ],
-        [image_mask_sam, image_seg_sam],
-    )
-    image_seg_sam.change(
-        active_btn_by_content,
-        inputs=image_seg_sam,
-        outputs=generate_btn,
-    )
-    generate_btn.click(
-        get_seed,
-        inputs=[randomize_seed, seed],
-        outputs=[seed],
-    ).success(
-        image_to_3d,
-        inputs=[
-            image_prompt,
-            seed,
-            ss_sampling_steps,
-            slat_sampling_steps,
-            raw_image_cache,
-            ss_guidance_strength,
-            slat_guidance_strength,
-            image_seg_sam,
-            is_samimage,
-        ],
-        outputs=[output_buf, video_output],
-    ).success(
-        extract_3d_representations_v3,
-        inputs=[
-            output_buf,
-            project_delight,
-            texture_size,
-        ],
-        outputs=[
-            model_output_mesh,
-            model_output_gs,
-            model_output_obj,
-            aligned_gs,
-        ],
-    ).success(
-        lambda: gr.Button(interactive=True),
-        outputs=[extract_urdf_btn],
-    )
-    extract_urdf_btn.click(
-        extract_urdf,
-        inputs=[
-            aligned_gs,
-            model_output_obj,
-            asset_cat_text,
-            height_range_text,
-            mass_range_text,
-            asset_version_text,
-        ],
-        outputs=[
-            download_urdf,
-            est_type_text,
-            est_height_text,
-            est_mass_text,
-            est_mu_text,
-        ],
-        queue=True,
-        show_progress="full",
-    ).success(
-        lambda: gr.Button(interactive=True),
-        outputs=[download_urdf],
-    )
-if __name__ == "__main__":
-    demo.launch()

common.bk.py ADDED Viewed

	@@ -0,0 +1,797 @@

+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+import spaces
+from embodied_gen.utils.monkey_patch.trellis import monkey_path_trellis
+monkey_path_trellis()
+import gc
+import logging
+import os
+import shutil
+import subprocess
+import sys
+from glob import glob
+import cv2
+import gradio as gr
+import numpy as np
+import torch
+import trimesh
+from PIL import Image
+from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
+from embodied_gen.data.backproject_v3 import entrypoint as backproject_api_v3
+from embodied_gen.data.differentiable_render import entrypoint as render_api
+from embodied_gen.data.utils import trellis_preprocess, zip_files
+from embodied_gen.models.delight_model import DelightingModel
+from embodied_gen.models.gs_model import GaussianOperator
+from embodied_gen.models.sam3d import Sam3dInference
+from embodied_gen.models.segment_model import (
+    BMGG14Remover,
+    RembgRemover,
+    SAMPredictor,
+)
+from embodied_gen.models.sr_model import ImageRealESRGAN, ImageStableSR
+from embodied_gen.scripts.render_gs import entrypoint as render_gs_api
+from embodied_gen.scripts.render_mv import build_texture_gen_pipe, infer_pipe
+from embodied_gen.scripts.text2image import (
+    build_text2img_ip_pipeline,
+    build_text2img_pipeline,
+    text2img_gen,
+)
+from embodied_gen.utils.gpt_clients import GPT_CLIENT
+from embodied_gen.utils.process_media import (
+    filter_image_small_connected_components,
+    keep_largest_connected_component,
+    merge_images_video,
+)
+from embodied_gen.utils.tags import VERSION
+from embodied_gen.utils.trender import pack_state, render_video, unpack_state
+from embodied_gen.validators.quality_checkers import (
+    BaseChecker,
+    ImageAestheticChecker,
+    ImageSegChecker,
+    MeshGeoChecker,
+)
+from embodied_gen.validators.urdf_convertor import URDFGenerator
+# Append the parent of this file's directory to sys.path before the
+# `thirdparty` imports below (presumably so that package resolves when the
+# module is run from this directory -- TODO confirm).
+current_file_path = os.path.abspath(__file__)
+current_dir = os.path.dirname(current_file_path)
+sys.path.append(os.path.join(current_dir, ".."))
+from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
+from thirdparty.TRELLIS.trellis.utils import postprocessing_utils
+# Root logging configuration: timestamp, level and message at INFO level.
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
+)
+logger = logging.getLogger(__name__)
+os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
+# setdefault only writes the placeholder when OPENAI_API_KEY is not already
+# present in the environment; an existing key is left untouched.
+os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")
+# Default upper bound for `get_seed` (passed as the exclusive `high` argument
+# of np.random.randint there).
+MAX_SEED = 100000
+# DELIGHT = DelightingModel()
+# IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
+# IMAGESR_MODEL = ImageStableSR()
+# Instantiate the models required by the selected demo. The branch is chosen
+# from the GRADIO_APP environment variable, which is read at import time and
+# therefore has to be set before this module is imported.
+# NOTE(review): os.getenv returns None when GRADIO_APP is unset, in which case
+# `.startswith` raises AttributeError on the next line.
+# NOTE(review): there is no `else` branch, so for any other GRADIO_APP value
+# TMP_DIR and the model globals below are never assigned.
+if os.getenv("GRADIO_APP").startswith("imageto3d"):
+    # Image-to-3D: two background removers, a SAM predictor created with
+    # device="cpu", the 3D pipeline and the quality checkers.
+    RBG_REMOVER = RembgRemover()
+    RBG14_REMOVER = BMGG14Remover()
+    SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
+    # An app name containing "sam3d" selects Sam3dInference; otherwise the
+    # TRELLIS image pipeline is loaded (and not moved to CUDA here).
+    if "sam3d" in os.getenv("GRADIO_APP"):
+        PIPELINE = Sam3dInference()
+    else:
+        PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
+            "microsoft/TRELLIS-image-large"
+        )
+        # PIPELINE.cuda()
+    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
+    AESTHETIC_CHECKER = ImageAestheticChecker()
+    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
+    # Per-session outputs live under <this file's dir>/sessions/imageto3d.
+    TMP_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
+    )
+    os.makedirs(TMP_DIR, exist_ok=True)
+elif os.getenv("GRADIO_APP").startswith("textto3d"):
+    # Text-to-3D: same background removers and 3D pipeline choice as above
+    # (no SAM predictor), plus two text-to-image pipelines built from
+    # "weights/Kolors".
+    RBG_REMOVER = RembgRemover()
+    RBG14_REMOVER = BMGG14Remover()
+    if "sam3d" in os.getenv("GRADIO_APP"):
+        PIPELINE = Sam3dInference()
+    else:
+        PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
+            "microsoft/TRELLIS-image-large"
+        )
+        # PIPELINE.cuda()
+    text_model_dir = "weights/Kolors"
+    PIPELINE_IMG_IP = build_text2img_ip_pipeline(text_model_dir, ref_scale=0.3)
+    PIPELINE_IMG = build_text2img_pipeline(text_model_dir)
+    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
+    AESTHETIC_CHECKER = ImageAestheticChecker()
+    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
+    TMP_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "sessions/textto3d"
+    )
+    os.makedirs(TMP_DIR, exist_ok=True)
+elif os.getenv("GRADIO_APP") == "texture_edit":
+    # Texture editing: this is the only branch that assigns DELIGHT and
+    # IMAGESR_MODEL. Two texture pipelines are built on "cuda"; in this call
+    # site they differ only in ip_adapt_scale (0.7 vs 0).
+    DELIGHT = DelightingModel()
+    IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
+    PIPELINE_IP = build_texture_gen_pipe(
+        base_ckpt_dir="./weights",
+        ip_adapt_scale=0.7,
+        device="cuda",
+    )
+    PIPELINE = build_texture_gen_pipe(
+        base_ckpt_dir="./weights",
+        ip_adapt_scale=0,
+        device="cuda",
+    )
+    TMP_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "sessions/texture_edit"
+    )
+    os.makedirs(TMP_DIR, exist_ok=True)
+def start_session(req: gr.Request) -> None:
+    """Create the working directory for the session behind ``req``.
+
+    The directory is ``TMP_DIR/<session_hash>``; an already existing
+    directory is left as it is.
+    """
+    os.makedirs(
+        os.path.join(TMP_DIR, str(req.session_hash)),
+        exist_ok=True,
+    )
+def end_session(req: gr.Request) -> None:
+    """Delete the working directory of the session behind ``req``.
+
+    Nothing happens when ``TMP_DIR/<session_hash>`` does not exist.
+    """
+    session_dir = os.path.join(TMP_DIR, str(req.session_hash))
+    if not os.path.exists(session_dir):
+        return
+    shutil.rmtree(session_dir)
+def preprocess_image_fn(
+    image: str | np.ndarray | Image.Image,
+    rmbg_tag: str = "rembg",
+    preprocess: bool = True,
+) -> tuple[Image.Image, Image.Image]:
+    """Strip the background from an input image.
+
+    Args:
+        image: File path, numpy array or PIL image.
+        rmbg_tag: ``"rembg"`` selects ``RBG_REMOVER``; any other value
+            selects ``RBG14_REMOVER``.
+        preprocess: When true, the result is additionally passed through
+            ``trellis_preprocess``.
+
+    Returns:
+        ``(processed_image, image_cache)`` where ``image_cache`` is a copy
+        of the input taken before background removal.
+    """
+    if isinstance(image, str):
+        pil_image = Image.open(image)
+    elif isinstance(image, np.ndarray):
+        pil_image = Image.fromarray(image)
+    else:
+        pil_image = image
+    untouched = pil_image.copy()
+    if rmbg_tag == "rembg":
+        remover = RBG_REMOVER
+    else:
+        remover = RBG14_REMOVER
+    foreground = keep_largest_connected_component(remover(pil_image))
+    if preprocess:
+        foreground = trellis_preprocess(foreground)
+    return foreground, untouched
+def preprocess_sam_image_fn(
+    image: Image.Image,
+) -> tuple[Image.Image, Image.Image]:
+    """Prepare an image for SAM and register it with the predictor.
+
+    A numpy array input is first converted to a PIL image. The image
+    returned by ``SAM_PREDICTOR.preprocess_image`` is copied, then set as
+    the predictor's current image.
+
+    Returns:
+        ``(sam_image, image_cache)`` -- the prepared image and the copy
+        taken before it was handed to the predictor.
+    """
+    source = Image.fromarray(image) if isinstance(image, np.ndarray) else image
+    prepared = SAM_PREDICTOR.preprocess_image(source)
+    # Copy before set_image so the cache is taken in the same order as before.
+    cached = prepared.copy()
+    SAM_PREDICTOR.predictor.set_image(prepared)
+    return prepared, cached
+def active_btn_by_content(content: gr.Image) -> gr.Button:
+    """Return a button that is interactive only when ``content`` is not None."""
+    return gr.Button(interactive=content is not None)
+def active_btn_by_text_content(content: gr.Textbox) -> gr.Button:
+    """Return a button that is interactive only for non-empty ``content``.
+
+    ``None`` and zero-length content both yield a non-interactive button.
+    """
+    has_text = content is not None and len(content) > 0
+    return gr.Button(interactive=has_text)
+def get_selected_image(
+    choice: str, sample1: str, sample2: str, sample3: str
+) -> str:
+    """Return the sample that ``choice`` names.
+
+    Args:
+        choice: One of ``"sample1"``, ``"sample2"`` or ``"sample3"``.
+        sample1: Value returned for ``"sample1"``.
+        sample2: Value returned for ``"sample2"``.
+        sample3: Value returned for ``"sample3"``.
+
+    Raises:
+        ValueError: If ``choice`` is none of the three names.
+    """
+    match choice:
+        case "sample1":
+            return sample1
+        case "sample2":
+            return sample2
+        case "sample3":
+            return sample3
+        case _:
+            raise ValueError(f"Invalid choice: {choice}")
+def get_cached_image(image_path: str | Image.Image) -> Image.Image:
+    """Return a PIL image for ``image_path``.
+
+    Args:
+        image_path: Either an already loaded PIL image or the path of an
+            image file.
+
+    Returns:
+        The input itself when it is already a PIL image; otherwise the
+        image read from the file, resized to 512x512.
+    """
+    if isinstance(image_path, Image.Image):
+        return image_path
+    # Image.open keeps the file open until the image is closed. resize()
+    # returns a new image, so the source can be closed as soon as it is done.
+    with Image.open(image_path) as source:
+        return source.resize((512, 512))
+def get_seed(randomize_seed: bool, seed: int, max_seed: int = MAX_SEED) -> int:
+    """Return ``seed``, or a random draw when ``randomize_seed`` is set.
+
+    The random value comes from ``np.random.randint(0, max_seed)``.
+    """
+    if not randomize_seed:
+        return seed
+    return np.random.randint(0, max_seed)
+def select_point(
+    image: np.ndarray,
+    sel_pix: list,
+    point_type: str,
+    evt: gr.SelectData,
+):
+    """Record a clicked point, rerun SAM and draw all points on the image.
+
+    Args:
+        image: Image the points refer to; markers are drawn onto it in place.
+        sel_pix: List of ``(index, label)`` pairs collected so far; the new
+            point is appended to it.
+        point_type: ``"background_point"`` gives label 0, any other value
+            gives label 1 (foreground).
+        evt: Selection event whose ``index`` is stored as the point.
+
+    Returns:
+        ``((image, masks), seg_image)`` -- the annotated image with the
+        generated masks, and the segmented image from the SAM predictor.
+    """
+    point_label = 0 if point_type == "background_point" else 1
+    sel_pix.append((evt.index, point_label))
+    masks = SAM_PREDICTOR.generate_masks(image, sel_pix)
+    seg_image = SAM_PREDICTOR.get_segmented_image(image, masks)
+    for pixel, pixel_label in sel_pix:
+        is_background = pixel_label == 0
+        cv2.drawMarker(
+            image,
+            pixel,
+            (255, 0, 0) if is_background else (0, 255, 0),
+            markerType=1 if is_background else 5,
+            markerSize=15,
+            thickness=10,
+        )
+    torch.cuda.empty_cache()
+    return (image, masks), seg_image
+@spaces.GPU(duration=300)
+def image_to_3d(
+    image: Image.Image,
+    seed: int,
+    ss_sampling_steps: int,
+    slat_sampling_steps: int,
+    raw_image_cache: Image.Image,
+    ss_guidance_strength: float,
+    slat_guidance_strength: float,
+    sam_image: Image.Image = None,
+    is_sam_image: bool = False,
+    req: gr.Request = None,
+) -> tuple[dict, str]:
+    """Generate a Gaussian and a mesh model from a segmented image.
+
+    Args:
+        image: Segmented input image, used when ``is_sam_image`` is false.
+        seed: Seed forwarded to ``PIPELINE.run``.
+        ss_sampling_steps: Step count for the first (sparse structure) stage.
+        slat_sampling_steps: Step count for the second (slat) stage.
+        raw_image_cache: Image saved as ``raw_image.png`` in the session
+            directory.
+        ss_guidance_strength: ``cfg_strength`` of the first stage; only
+            passed on the non-``Sam3dInference`` path.
+        slat_guidance_strength: ``cfg_strength`` of the second stage; only
+            passed on the non-``Sam3dInference`` path.
+        sam_image: SAM segmentation result, used when ``is_sam_image`` is
+            true.
+        is_sam_image: Selects ``sam_image`` instead of ``image``.
+        req: Request whose ``session_hash`` names the output directory.
+            NOTE(review): defaults to None but is dereferenced
+            unconditionally -- presumably Gradio always injects it; confirm.
+
+    Returns:
+        ``(state, video_path)`` -- the packed Gaussian/mesh state and the
+        path of the rendered ``gs_mesh.mp4``.
+    """
+    if is_sam_image:
+        # Filter small connected components from the SAM result, then wrap
+        # the returned array as an RGBA PIL image.
+        seg_image = filter_image_small_connected_components(sam_image)
+        seg_image = Image.fromarray(seg_image, mode="RGBA")
+    else:
+        seg_image = image
+    if isinstance(seg_image, np.ndarray):
+        seg_image = Image.fromarray(seg_image)
+    logger.info("Start generating 3D representation from image...")
+    if isinstance(PIPELINE, Sam3dInference):
+        # The guidance strength arguments are not forwarded on this path.
+        outputs = PIPELINE.run(
+            seg_image,
+            seed=seed,
+            stage1_inference_steps=ss_sampling_steps,
+            stage2_inference_steps=slat_sampling_steps,
+        )
+    else:
+        # The pipeline is moved to CUDA only for the duration of this run.
+        PIPELINE.cuda()
+        seg_image = trellis_preprocess(seg_image)
+        outputs = PIPELINE.run(
+            seg_image,
+            seed=seed,
+            formats=["gaussian", "mesh"],
+            preprocess_image=False,
+            sparse_structure_sampler_params={
+                "steps": ss_sampling_steps,
+                "cfg_strength": ss_guidance_strength,
+            },
+            slat_sampler_params={
+                "steps": slat_sampling_steps,
+                "cfg_strength": slat_guidance_strength,
+            },
+        )
+        # Set back to cpu for memory saving.
+        PIPELINE.cpu()
+    gs_model = outputs["gaussian"][0]
+    mesh_model = outputs["mesh"][0]
+    # Color frames come from the Gaussian model, normal frames from the mesh.
+    color_images = render_video(gs_model, r=1.85)["color"]
+    normal_images = render_video(mesh_model, r=1.85)["normal"]
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    os.makedirs(output_root, exist_ok=True)
+    seg_image.save(f"{output_root}/seg_image.png")
+    raw_image_cache.save(f"{output_root}/raw_image.png")
+    video_path = os.path.join(output_root, "gs_mesh.mp4")
+    merge_images_video(color_images, normal_images, video_path)
+    state = pack_state(gs_model, mesh_model)
+    # Collect garbage and clear the CUDA cache before returning.
+    gc.collect()
+    torch.cuda.empty_cache()
+    return state, video_path
+def extract_3d_representations_v2(
+    state: dict,
+    enable_delight: bool,
+    texture_size: int,
+    req: gr.Request,
+):
+    """Back-Projection Version of Texture Super-Resolution.
+
+    Unpacks the Gaussian and mesh models from ``state`` on the CPU, saves
+    and re-aligns the Gaussian PLY, renders color views from the aligned
+    Gaussians, exports the rotated mesh as OBJ and runs ``backproject_api``
+    on it, then exports the returned mesh as GLB. All files are written to
+    ``TMP_DIR/<session_hash>``.
+
+    NOTE(review): ``DELIGHT`` and ``IMAGESR_MODEL`` are module globals; in
+    the visible module setup they are only assigned in the "texture_edit"
+    branch -- confirm they exist in the mode this function is called from.
+
+    Args:
+        state: Packed state as accepted by ``unpack_state``.
+        enable_delight: Forwarded to ``backproject_api`` as ``delight``.
+        texture_size: Edge length used for ``texture_wh``.
+        req: Request whose ``session_hash`` names the session directory.
+
+    Returns:
+        ``(mesh_glb_path, gs_path, mesh_obj_path, aligned_gs_path)``.
+    """
+    output_root = TMP_DIR
+    user_dir = os.path.join(output_root, str(req.session_hash))
+    gs_model, mesh_model = unpack_state(state, device="cpu")
+    filename = "sample"
+    gs_path = os.path.join(user_dir, f"{filename}_gs.ply")
+    gs_model.save_ply(gs_path)
+    # Rotate mesh and GS by 90 degrees around Z-axis.
+    # NOTE(review): as written, rot_matrix leaves the second coordinate
+    # unchanged -- confirm the "Z-axis" wording against the axis convention.
+    rot_matrix = [[0, 0, -1], [0, 1, 0], [1, 0, 0]]
+    gs_add_rot = [[1, 0, 0], [0, -1, 0], [0, 0, -1]]
+    mesh_add_rot = [[1, 0, 0], [0, 0, -1], [0, 1, 0]]
+    # Additional rotation for GS to align mesh.
+    gs_rot = np.array(gs_add_rot) @ np.array(rot_matrix)
+    pose = GaussianOperator.trans_to_quatpose(gs_rot)
+    aligned_gs_path = gs_path.replace(".ply", "_aligned.ply")
+    GaussianOperator.resave_ply(
+        in_ply=gs_path,
+        out_ply=aligned_gs_path,
+        instance_pose=pose,
+        device="cpu",
+    )
+    color_path = os.path.join(user_dir, "color.png")
+    # The elevation list and num_images match those passed to
+    # backproject_api below.
+    render_gs_api(
+        input_gs=aligned_gs_path,
+        output_path=color_path,
+        elevation=[20, -10, 60, -50],
+        num_images=12,
+    )
+    mesh = trimesh.Trimesh(
+        vertices=mesh_model.vertices.cpu().numpy(),
+        faces=mesh_model.faces.cpu().numpy(),
+    )
+    # Apply the mesh-specific rotation first, then the shared rot_matrix.
+    mesh.vertices = mesh.vertices @ np.array(mesh_add_rot)
+    mesh.vertices = mesh.vertices @ np.array(rot_matrix)
+    mesh_obj_path = os.path.join(user_dir, f"{filename}.obj")
+    mesh.export(mesh_obj_path)
+    # mesh_obj_path is passed as both input and output path here.
+    mesh = backproject_api(
+        delight_model=DELIGHT,
+        imagesr_model=IMAGESR_MODEL,
+        color_path=color_path,
+        mesh_path=mesh_obj_path,
+        output_path=mesh_obj_path,
+        skip_fix_mesh=False,
+        delight=enable_delight,
+        texture_wh=[texture_size, texture_size],
+        elevation=[20, -10, 60, -50],
+        num_images=12,
+    )
+    mesh_glb_path = os.path.join(user_dir, f"{filename}.glb")
+    mesh.export(mesh_glb_path)
+    return mesh_glb_path, gs_path, mesh_obj_path, aligned_gs_path
+def extract_3d_representations_v3(
+    state: dict,
+    enable_delight: bool,
+    texture_size: int,
+    req: gr.Request,
+):
+    """Back-Projection Version with Optimization-Based."""
+    output_root = TMP_DIR
+    user_dir = os.path.join(output_root, str(req.session_hash))
+    gs_model, mesh_model = unpack_state(state, device="cpu")
+    filename = "sample"
+    gs_path = os.path.join(user_dir, f"{filename}_gs.ply")
+    gs_model.save_ply(gs_path)
+    # Rotate mesh and GS by 90 degrees around Z-axis.
+    rot_matrix = [[0, 0, -1], [0, 1, 0], [1, 0, 0]]
+    gs_add_rot = [[1, 0, 0], [0, -1, 0], [0, 0, -1]]
+    mesh_add_rot = [[1, 0, 0], [0, 0, -1], [0, 1, 0]]
+    # Addtional rotation for GS to align mesh.
+    gs_rot = np.array(gs_add_rot) @ np.array(rot_matrix)
+    pose = GaussianOperator.trans_to_quatpose(gs_rot)
+    aligned_gs_path = gs_path.replace(".ply", "_aligned.ply")
+    GaussianOperator.resave_ply(
+        in_ply=gs_path,
+        out_ply=aligned_gs_path,
+        instance_pose=pose,
+        device="cpu",
+    )
+    mesh = trimesh.Trimesh(
+        vertices=mesh_model.vertices.cpu().numpy(),
+        faces=mesh_model.faces.cpu().numpy(),
+    )
+    mesh.vertices = mesh.vertices @ np.array(mesh_add_rot)
+    mesh.vertices = mesh.vertices @ np.array(rot_matrix)
+    mesh_obj_path = os.path.join(user_dir, f"{filename}.obj")
+    mesh.export(mesh_obj_path)
+    mesh = backproject_api_v3(
+        gs_path=aligned_gs_path,
+        mesh_path=mesh_obj_path,
+        output_path=mesh_obj_path,
+        skip_fix_mesh=False,
+        texture_size=texture_size,
+    )
+    mesh_glb_path = os.path.join(user_dir, f"{filename}.glb")
+    mesh.export(mesh_glb_path)
+    return mesh_glb_path, gs_path, mesh_obj_path, aligned_gs_path
+def extract_urdf(
+    gs_path: str,
+    mesh_obj_path: str,
+    asset_cat_text: str,
+    height_range_text: str,
+    mass_range_text: str,
+    asset_version_text: str,
+    req: gr.Request = None,
+):
+    output_root = TMP_DIR
+    if req is not None:
+        output_root = os.path.join(output_root, str(req.session_hash))
+    # Convert to URDF and recover attrs by GPT.
+    filename = "sample"
+    urdf_convertor = URDFGenerator(
+        GPT_CLIENT, render_view_num=4, decompose_convex=True
+    )
+    asset_attrs = {
+        "version": VERSION,
+        "gs_model": f"{urdf_convertor.output_mesh_dir}/{filename}_gs.ply",
+    }
+    if asset_version_text:
+        asset_attrs["version"] = asset_version_text
+    if asset_cat_text:
+        asset_attrs["category"] = asset_cat_text.lower()
+    if height_range_text:
+        try:
+            min_height, max_height = map(float, height_range_text.split("-"))
+            asset_attrs["min_height"] = min_height
+            asset_attrs["max_height"] = max_height
+        except ValueError:
+            return "Invalid height input format. Use the format: min-max."
+    if mass_range_text:
+        try:
+            min_mass, max_mass = map(float, mass_range_text.split("-"))
+            asset_attrs["min_mass"] = min_mass
+            asset_attrs["max_mass"] = max_mass
+        except ValueError:
+            return "Invalid mass input format. Use the format: min-max."
+    urdf_path = urdf_convertor(
+        mesh_path=mesh_obj_path,
+        output_root=f"{output_root}/URDF_{filename}",
+        **asset_attrs,
+    )
+    # Rescale GS and save to URDF/mesh folder.
+    real_height = urdf_convertor.get_attr_from_urdf(
+        urdf_path, attr_name="real_height"
+    )
+    out_gs = f"{output_root}/URDF_{filename}/{urdf_convertor.output_mesh_dir}/{filename}_gs.ply"  # noqa
+    GaussianOperator.resave_ply(
+        in_ply=gs_path,
+        out_ply=out_gs,
+        real_height=real_height,
+        device="cpu",
+    )
+    # Quality check and update .urdf file.
+    mesh_out = f"{output_root}/URDF_{filename}/{urdf_convertor.output_mesh_dir}/{filename}.obj"  # noqa
+    trimesh.load(mesh_out).export(mesh_out.replace(".obj", ".glb"))
+    # image_paths = render_asset3d(
+    #     mesh_path=mesh_out,
+    #     output_root=f"{output_root}/URDF_{filename}",
+    #     output_subdir="qa_renders",
+    #     num_images=8,
+    #     elevation=(30, -30),
+    #     distance=5.5,
+    # )
+    image_dir = f"{output_root}/URDF_{filename}/{urdf_convertor.output_render_dir}/image_color"  # noqa
+    image_paths = glob(f"{image_dir}/*.png")
+    images_list = []
+    for checker in CHECKERS:
+        images = image_paths
+        if isinstance(checker, ImageSegChecker):
+            images = [
+                f"{TMP_DIR}/{req.session_hash}/raw_image.png",
+                f"{TMP_DIR}/{req.session_hash}/seg_image.png",
+            ]
+        images_list.append(images)
+    results = BaseChecker.validate(CHECKERS, images_list)
+    urdf_convertor.add_quality_tag(urdf_path, results)
+    # Zip urdf files
+    urdf_zip = zip_files(
+        input_paths=[
+            f"{output_root}/URDF_{filename}/{urdf_convertor.output_mesh_dir}",
+            f"{output_root}/URDF_{filename}/{filename}.urdf",
+        ],
+        output_zip=f"{output_root}/urdf_{filename}.zip",
+    )
+    estimated_type = urdf_convertor.estimated_attrs["category"]
+    estimated_height = urdf_convertor.estimated_attrs["height"]
+    estimated_mass = urdf_convertor.estimated_attrs["mass"]
+    estimated_mu = urdf_convertor.estimated_attrs["mu"]
+    return (
+        urdf_zip,
+        estimated_type,
+        estimated_height,
+        estimated_mass,
+        estimated_mu,
+    )
+@spaces.GPU(duration=300)
+def text2image_fn(
+    prompt: str,
+    guidance_scale: float,
+    infer_step: int = 50,
+    ip_image: Image.Image | str = None,
+    ip_adapt_scale: float = 0.3,
+    image_wh: int | tuple[int, int] = [1024, 1024],
+    rmbg_tag: str = "rembg",
+    seed: int = None,
+    enable_pre_resize: bool = True,
+    n_sample: int = 3,
+    req: gr.Request = None,
+):
+    if isinstance(image_wh, int):
+        image_wh = (image_wh, image_wh)
+    output_root = TMP_DIR
+    if req is not None:
+        output_root = os.path.join(output_root, str(req.session_hash))
+        os.makedirs(output_root, exist_ok=True)
+    pipeline = PIPELINE_IMG if ip_image is None else PIPELINE_IMG_IP
+    if ip_image is not None:
+        pipeline.set_ip_adapter_scale([ip_adapt_scale])
+    images = text2img_gen(
+        prompt=prompt,
+        n_sample=n_sample,
+        guidance_scale=guidance_scale,
+        pipeline=pipeline,
+        ip_image=ip_image,
+        image_wh=image_wh,
+        infer_step=infer_step,
+        seed=seed,
+    )
+    for idx in range(len(images)):
+        image = images[idx]
+        images[idx], _ = preprocess_image_fn(
+            image, rmbg_tag, enable_pre_resize
+        )
+    save_paths = []
+    for idx, image in enumerate(images):
+        save_path = f"{output_root}/sample_{idx}.png"
+        image.save(save_path)
+        save_paths.append(save_path)
+    logger.info(f"Images saved to {output_root}")
+    gc.collect()
+    torch.cuda.empty_cache()
+    return save_paths + save_paths
+@spaces.GPU(duration=120)
+def generate_condition(mesh_path: str, req: gr.Request, uuid: str = "sample"):
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    _ = render_api(
+        mesh_path=mesh_path,
+        output_root=f"{output_root}/condition",
+        uuid=str(uuid),
+    )
+    gc.collect()
+    torch.cuda.empty_cache()
+    return None, None, None
+@spaces.GPU(duration=300)
+def generate_texture_mvimages(
+    prompt: str,
+    controlnet_cond_scale: float = 0.55,
+    guidance_scale: float = 9,
+    strength: float = 0.9,
+    num_inference_steps: int = 50,
+    seed: int = 0,
+    ip_adapt_scale: float = 0,
+    ip_img_path: str = None,
+    uid: str = "sample",
+    sub_idxs: tuple[tuple[int]] = ((0, 1, 2), (3, 4, 5)),
+    req: gr.Request = None,
+) -> list[str]:
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    use_ip_adapter = True if ip_img_path and ip_adapt_scale > 0 else False
+    PIPELINE_IP.set_ip_adapter_scale([ip_adapt_scale])
+    img_save_paths = infer_pipe(
+        index_file=f"{output_root}/condition/index.json",
+        controlnet_cond_scale=controlnet_cond_scale,
+        guidance_scale=guidance_scale,
+        strength=strength,
+        num_inference_steps=num_inference_steps,
+        ip_adapt_scale=ip_adapt_scale,
+        ip_img_path=ip_img_path,
+        uid=uid,
+        prompt=prompt,
+        save_dir=f"{output_root}/multi_view",
+        sub_idxs=sub_idxs,
+        pipeline=PIPELINE_IP if use_ip_adapter else PIPELINE,
+        seed=seed,
+    )
+    gc.collect()
+    torch.cuda.empty_cache()
+    return img_save_paths + img_save_paths
+def backproject_texture(
+    mesh_path: str,
+    input_image: str,
+    texture_size: int,
+    uuid: str = "sample",
+    req: gr.Request = None,
+) -> str:
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    output_dir = os.path.join(output_root, "texture_mesh")
+    os.makedirs(output_dir, exist_ok=True)
+    command = [
+        "backproject-cli",
+        "--mesh_path",
+        mesh_path,
+        "--input_image",
+        input_image,
+        "--output_root",
+        output_dir,
+        "--uuid",
+        f"{uuid}",
+        "--texture_size",
+        str(texture_size),
+        "--skip_fix_mesh",
+    ]
+    _ = subprocess.run(
+        command, capture_output=True, text=True, encoding="utf-8"
+    )
+    output_obj_mesh = os.path.join(output_dir, f"{uuid}.obj")
+    output_glb_mesh = os.path.join(output_dir, f"{uuid}.glb")
+    _ = trimesh.load(output_obj_mesh).export(output_glb_mesh)
+    zip_file = zip_files(
+        input_paths=[
+            output_glb_mesh,
+            output_obj_mesh,
+            os.path.join(output_dir, "material.mtl"),
+            os.path.join(output_dir, "material_0.png"),
+        ],
+        output_zip=os.path.join(output_dir, f"{uuid}.zip"),
+    )
+    gc.collect()
+    torch.cuda.empty_cache()
+    return output_glb_mesh, output_obj_mesh, zip_file
+@spaces.GPU(duration=300)
+def backproject_texture_v2(
+    mesh_path: str,
+    input_image: str,
+    texture_size: int,
+    enable_delight: bool = True,
+    fix_mesh: bool = False,
+    no_mesh_post_process: bool = False,
+    uuid: str = "sample",
+    req: gr.Request = None,
+) -> str:
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    output_dir = os.path.join(output_root, "texture_mesh")
+    os.makedirs(output_dir, exist_ok=True)
+    textured_mesh = backproject_api(
+        delight_model=DELIGHT,
+        imagesr_model=IMAGESR_MODEL,
+        color_path=input_image,
+        mesh_path=mesh_path,
+        output_path=f"{output_dir}/{uuid}.obj",
+        skip_fix_mesh=not fix_mesh,
+        delight=enable_delight,
+        texture_wh=[texture_size, texture_size],
+        no_mesh_post_process=no_mesh_post_process,
+    )
+    output_obj_mesh = os.path.join(output_dir, f"{uuid}.obj")
+    output_glb_mesh = os.path.join(output_dir, f"{uuid}.glb")
+    _ = textured_mesh.export(output_glb_mesh)
+    zip_file = zip_files(
+        input_paths=[
+            output_glb_mesh,
+            output_obj_mesh,
+            os.path.join(output_dir, "material.mtl"),
+            os.path.join(output_dir, "material_0.png"),
+        ],
+        output_zip=os.path.join(output_dir, f"{uuid}.zip"),
+    )
+    gc.collect()
+    torch.cuda.empty_cache()
+    return output_glb_mesh, output_obj_mesh, zip_file
+@spaces.GPU(duration=120)
+def render_result_video(
+    mesh_path: str, video_size: int, req: gr.Request, uuid: str = ""
+) -> str:
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    output_dir = os.path.join(output_root, "texture_mesh")
+    _ = render_api(
+        mesh_path=mesh_path,
+        output_root=output_dir,
+        num_images=90,
+        elevation=[20],
+        with_mtl=True,
+        pbr_light_factor=1,
+        uuid=str(uuid),
+        gen_color_mp4=True,
+        gen_glonormal_mp4=True,
+        distance=5.5,
+        resolution_hw=(video_size, video_size),
+    )
+    gc.collect()
+    torch.cuda.empty_cache()
+    return f"{output_dir}/color.mp4"

common.py CHANGED Viewed

@@ -88,38 +88,20 @@ MAX_SEED = 100000
 # IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
 # IMAGESR_MODEL = ImageStableSR()
 if os.getenv("GRADIO_APP").startswith("imageto3d"):
-    print("[INIT 1/7] Loading RembgRemover ...", flush=True)
     RBG_REMOVER = RembgRemover()
-    print("[INIT 1/7] RembgRemover done.", flush=True)
-    print("[INIT 2/7] Loading BMGG14Remover ...", flush=True)
     RBG14_REMOVER = BMGG14Remover()
-    print("[INIT 2/7] BMGG14Remover done.", flush=True)
-    print("[INIT 3/7] Loading SAMPredictor(cpu) ...", flush=True)
     SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
-    print("[INIT 3/7] SAMPredictor done.", flush=True)
-    if "sam3d" in os.getenv("GRADIO_APP"):
-        print("[INIT 4/7] Loading Sam3dInference ...", flush=True)
-        PIPELINE = Sam3dInference()
-        print("[INIT 4/7] Sam3dInference done.", flush=True)
-    else:
-        print("[INIT 4/7] Loading TrellisImageTo3DPipeline ...", flush=True)
-        PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
-            "microsoft/TRELLIS-image-large"
-        )
-        print("[INIT 4/7] TrellisImageTo3DPipeline done.", flush=True)
-        # PIPELINE.cuda()
-    print("[INIT 5/7] Loading SEG_CHECKER ...", flush=True)
-    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
-    print("[INIT 5/7] SEG_CHECKER done.", flush=True)
-    print("[INIT 6/7] Loading GEO_CHECKER + AESTHETIC_CHECKER ...", flush=True)
-    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
-    AESTHETIC_CHECKER = ImageAestheticChecker()
-    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
-    print("[INIT 6/7] Checkers done.", flush=True)
     TMP_DIR = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
     )
@@ -281,22 +263,8 @@ def select_point(
     return (image, masks), seg_image
-@spaces.GPU(duration=60)
-def _gpu_alloc_test():
-    """Minimal @spaces.GPU test - no model tensors, just GPU allocation."""
-    import torch, time as _t
-    print(f"[GPU-ALLOC-TEST] entered function body at {_t.strftime('%H:%M:%S')}", flush=True)
-    print(f"[GPU-ALLOC-TEST] cuda.is_available={torch.cuda.is_available()}", flush=True)
-    if torch.cuda.is_available():
-        x = torch.randn(4, 4, device="cuda")
-        print(f"[GPU-ALLOC-TEST] test tensor OK: {x.device}", flush=True)
-        del x
-    print("[GPU-ALLOC-TEST] done!", flush=True)
-    return True
-@spaces.GPU(duration=100)
-def _image_to_3d_inner(
     image: Image.Image,
     seed: int,
     ss_sampling_steps: int,
@@ -308,7 +276,6 @@ def _image_to_3d_inner(
     is_sam_image: bool = False,
     req: gr.Request = None,
 ) -> tuple[dict, str]:
-    print("[STEP 0] >>>>>> image_to_3d function body entered! <<<<<<", flush=True)
     if is_sam_image:
         seg_image = filter_image_small_connected_components(sam_image)
         seg_image = Image.fromarray(seg_image, mode="RGBA")
@@ -318,24 +285,16 @@ def _image_to_3d_inner(
     if isinstance(seg_image, np.ndarray):
         seg_image = Image.fromarray(seg_image)
-    print("[STEP 1] image_to_3d entered, cuda available:", torch.cuda.is_available(), flush=True)
-    if torch.cuda.is_available():
-        print("[STEP 1]   device:", torch.cuda.get_device_name(0), flush=True)
     logger.info("Start generating 3D representation from image...")
     if isinstance(PIPELINE, Sam3dInference):
-        print("[STEP 2] Calling PIPELINE.run (Sam3dInference) ...", flush=True)
         outputs = PIPELINE.run(
             seg_image,
             seed=seed,
             stage1_inference_steps=ss_sampling_steps,
             stage2_inference_steps=slat_sampling_steps,
         )
-        print("[STEP 2] PIPELINE.run done.", flush=True)
     else:
-        print("[STEP 2] Moving PIPELINE to cuda ...", flush=True)
         PIPELINE.cuda()
-        print("[STEP 2] PIPELINE.cuda() done. Running inference ...", flush=True)
         seg_image = trellis_preprocess(seg_image)
         outputs = PIPELINE.run(
             seg_image,
@@ -351,76 +310,29 @@ def _image_to_3d_inner(
                 "cfg_strength": slat_guidance_strength,
             },
         )
-        print("[STEP 2] PIPELINE.run done. Moving back to cpu ...", flush=True)
         # Set back to cpu for memory saving.
         PIPELINE.cpu()
-    print("[STEP 3] Extracting gs_model and mesh_model ...", flush=True)
     gs_model = outputs["gaussian"][0]
     mesh_model = outputs["mesh"][0]
-    print("[STEP 4] Rendering color video ...", flush=True)
     color_images = render_video(gs_model, r=1.85)["color"]
-    print("[STEP 4] Rendering normal video ...", flush=True)
     normal_images = render_video(mesh_model, r=1.85)["normal"]
-    print("[STEP 4] Render done.", flush=True)
     output_root = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(output_root, exist_ok=True)
     seg_image.save(f"{output_root}/seg_image.png")
     raw_image_cache.save(f"{output_root}/raw_image.png")
-    print("[STEP 5] Merging video and packing state ...", flush=True)
     video_path = os.path.join(output_root, "gs_mesh.mp4")
     merge_images_video(color_images, normal_images, video_path)
     state = pack_state(gs_model, mesh_model)
     gc.collect()
     torch.cuda.empty_cache()
-    print("[STEP 6] image_to_3d done!", flush=True)
     return state, video_path
-def image_to_3d(
-    image,
-    seed,
-    ss_sampling_steps,
-    slat_sampling_steps,
-    raw_image_cache,
-    ss_guidance_strength,
-    slat_guidance_strength,
-    sam_image=None,
-    is_sam_image=False,
-    req=None,
-):
-    """Wrapper outside @spaces.GPU to diagnose where the hang occurs."""
-    import time as _time
-    _t0 = _time.time()
-    print(f"[WRAPPER] image_to_3d called at {_time.strftime('%H:%M:%S')}", flush=True)
-    print(f"[WRAPPER] Step 1: calling _gpu_alloc_test (minimal @spaces.GPU) ...", flush=True)
-    try:
-        _gpu_alloc_test()
-        print(f"[WRAPPER] Step 1 done in {_time.time()-_t0:.1f}s. GPU alloc works!", flush=True)
-    except Exception as e:
-        print(f"[WRAPPER] Step 1 _gpu_alloc_test FAILED: {type(e).__name__}: {e}", flush=True)
-        raise
-    _t1 = _time.time()
-    print(f"[WRAPPER] Step 2: calling _image_to_3d_inner (heavy, 13.7G tensors) ...", flush=True)
-    try:
-        result = _image_to_3d_inner(
-            image, seed, ss_sampling_steps, slat_sampling_steps,
-            raw_image_cache, ss_guidance_strength, slat_guidance_strength,
-            sam_image, is_sam_image, req,
-        )
-        print(f"[WRAPPER] _image_to_3d_inner returned in {_time.time()-_t1:.1f}s (total {_time.time()-_t0:.1f}s)", flush=True)
-        return result
-    except Exception as e:
-        print(f"[WRAPPER] _image_to_3d_inner FAILED after {_time.time()-_t1:.1f}s: {type(e).__name__}: {e}", flush=True)
-        raise
 def extract_3d_representations_v2(
     state: dict,
     enable_delight: bool,

 # IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
 # IMAGESR_MODEL = ImageStableSR()
 if os.getenv("GRADIO_APP").startswith("imageto3d"):
     RBG_REMOVER = RembgRemover()
     RBG14_REMOVER = BMGG14Remover()
     SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
+    # if "sam3d" in os.getenv("GRADIO_APP"):
+    #     PIPELINE = Sam3dInference()
+    # else:
+    #     PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
+    #         "microsoft/TRELLIS-image-large"
+    #     )
+    #     # PIPELINE.cuda()
+    # SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+    # GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
+    # AESTHETIC_CHECKER = ImageAestheticChecker()
+    # CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
     TMP_DIR = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
     )
     return (image, masks), seg_image
+@spaces.GPU(duration=300)
+def image_to_3d(
     image: Image.Image,
     seed: int,
     ss_sampling_steps: int,
     is_sam_image: bool = False,
     req: gr.Request = None,
 ) -> tuple[dict, str]:
     if is_sam_image:
         seg_image = filter_image_small_connected_components(sam_image)
         seg_image = Image.fromarray(seg_image, mode="RGBA")
     if isinstance(seg_image, np.ndarray):
         seg_image = Image.fromarray(seg_image)
     logger.info("Start generating 3D representation from image...")
     if isinstance(PIPELINE, Sam3dInference):
         outputs = PIPELINE.run(
             seg_image,
             seed=seed,
             stage1_inference_steps=ss_sampling_steps,
             stage2_inference_steps=slat_sampling_steps,
         )
     else:
         PIPELINE.cuda()
         seg_image = trellis_preprocess(seg_image)
         outputs = PIPELINE.run(
             seg_image,
                 "cfg_strength": slat_guidance_strength,
             },
         )
         # Set back to cpu for memory saving.
         PIPELINE.cpu()
     gs_model = outputs["gaussian"][0]
     mesh_model = outputs["mesh"][0]
     color_images = render_video(gs_model, r=1.85)["color"]
     normal_images = render_video(mesh_model, r=1.85)["normal"]
     output_root = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(output_root, exist_ok=True)
     seg_image.save(f"{output_root}/seg_image.png")
     raw_image_cache.save(f"{output_root}/raw_image.png")
     video_path = os.path.join(output_root, "gs_mesh.mp4")
     merge_images_video(color_images, normal_images, video_path)
     state = pack_state(gs_model, mesh_model)
     gc.collect()
     torch.cuda.empty_cache()
     return state, video_path
 def extract_3d_representations_v2(
     state: dict,
     enable_delight: bool,

embodied_gen/models/sam3d.py CHANGED Viewed

@@ -22,8 +22,7 @@ import sys
 import numpy as np
 from hydra.utils import instantiate
-# from modelscope import snapshot_download
-from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 from PIL import Image
@@ -65,12 +64,8 @@ class Sam3dInference:
     def __init__(
         self, local_dir: str = "weights/sam-3d-objects", compile: bool = False
     ) -> None:
-        print("[SAM3D-INIT] Starting Sam3dInference.__init__", flush=True)
         if not os.path.exists(local_dir):
-            print("[SAM3D-INIT] Downloading weights ...", flush=True)
-            # snapshot_download("facebook/sam-3d-objects", local_dir=local_dir)
-            snapshot_download(repo_id="tuandao-zenai/sam-3d-objects", local_dir=local_dir)
-            print("[SAM3D-INIT] Download done.", flush=True)
         config_file = os.path.join(local_dir, "checkpoints/pipeline.yaml")
         config = OmegaConf.load(config_file)
         config.rendering_engine = "nvdiffrast"
@@ -83,9 +78,7 @@ class Sam3dInference:
         config["slat_decoder_gs_ckpt_path"] = config.pop(
             "slat_decoder_gs_4_ckpt_path", "slat_decoder_gs_4.ckpt"
         )
-        print("[SAM3D-INIT] Instantiating InferencePipelinePointMap ...", flush=True)
         self.pipeline: InferencePipelinePointMap = instantiate(config)
-        print("[SAM3D-INIT] Sam3dInference.__init__ done.", flush=True)
     def merge_mask_to_rgba(
         self, image: np.ndarray, mask: np.ndarray
@@ -107,15 +100,11 @@ class Sam3dInference:
         stage1_inference_steps: int = 25,
         stage2_inference_steps: int = 25,
     ) -> dict:
-        print("[SAM3D-RUN] Entering Sam3dInference.run", flush=True)
         if isinstance(image, Image.Image):
             image = np.array(image)
         if mask is not None:
             image = self.merge_mask_to_rgba(image, mask)
-        print(f"[SAM3D-RUN] image shape: {image.shape}, dtype: {image.dtype}", flush=True)
-        print(f"[SAM3D-RUN] seed={seed}, stage1_steps={stage1_inference_steps}, stage2_steps={stage2_inference_steps}", flush=True)
-        print("[SAM3D-RUN] Calling self.pipeline.run ...", flush=True)
-        result = self.pipeline.run(
             image,
             None,
             seed,
@@ -130,8 +119,6 @@ class Sam3dInference:
             stage2_inference_steps=stage2_inference_steps,
             pointmap=pointmap,
         )
-        print("[SAM3D-RUN] self.pipeline.run returned.", flush=True)
-        return result
 if __name__ == "__main__":

 import numpy as np
 from hydra.utils import instantiate
+from modelscope import snapshot_download
 from omegaconf import OmegaConf
 from PIL import Image
     def __init__(
         self, local_dir: str = "weights/sam-3d-objects", compile: bool = False
     ) -> None:
         if not os.path.exists(local_dir):
+            snapshot_download("facebook/sam-3d-objects", local_dir=local_dir)
         config_file = os.path.join(local_dir, "checkpoints/pipeline.yaml")
         config = OmegaConf.load(config_file)
         config.rendering_engine = "nvdiffrast"
         config["slat_decoder_gs_ckpt_path"] = config.pop(
             "slat_decoder_gs_4_ckpt_path", "slat_decoder_gs_4.ckpt"
         )
         self.pipeline: InferencePipelinePointMap = instantiate(config)
     def merge_mask_to_rgba(
         self, image: np.ndarray, mask: np.ndarray
         stage1_inference_steps: int = 25,
         stage2_inference_steps: int = 25,
     ) -> dict:
         if isinstance(image, Image.Image):
             image = np.array(image)
         if mask is not None:
             image = self.merge_mask_to_rgba(image, mask)
+        return self.pipeline.run(
             image,
             None,
             seed,
             stage2_inference_steps=stage2_inference_steps,
             pointmap=pointmap,
         )
 if __name__ == "__main__":

embodied_gen/utils/monkey_patch/sam3d.py CHANGED Viewed

@@ -40,7 +40,7 @@ def monkey_patch_sam3d():
     if sam3d_root not in sys.path:
         sys.path.insert(0, sam3d_root)
-    def patch_pointmap_infer_pipeline():
         """Patches InferencePipelinePointMap.run to handle pointmap generation and 3D structure sampling."""
         try:
             from sam3d_objects.pipeline.inference_pipeline_pointmap import (
@@ -202,7 +202,7 @@ def monkey_patch_sam3d():
         InferencePipelinePointMap.run = patch_run
-    def patch_infer_init():
         """Patches InferencePipeline.__init__ to allow CPU offloading during model initialization."""
         import torch
@@ -380,7 +380,7 @@ def monkey_patch_sam3d():
         InferencePipeline.__init__ = patch_init
-    # patch_pointmap_infer_pipeline()
-    # patch_infer_init()
     return

     if sam3d_root not in sys.path:
         sys.path.insert(0, sam3d_root)
+    def # patch_pointmap_infer_pipeline():
         """Patches InferencePipelinePointMap.run to handle pointmap generation and 3D structure sampling."""
         try:
             from sam3d_objects.pipeline.inference_pipeline_pointmap import (
         InferencePipelinePointMap.run = patch_run
+    def # patch_infer_init():
         """Patches InferencePipeline.__init__ to allow CPU offloading during model initialization."""
         import torch
         InferencePipeline.__init__ = patch_init
+    # # patch_pointmap_infer_pipeline()
+    # # patch_infer_init()
     return

thirdparty/sam3d/sam3d_objects/pipeline/inference_pipeline.py CHANGED Viewed

@@ -98,7 +98,6 @@ class InferencePipeline:
         logger.info(f"self.device: {self.device}")
         logger.info(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', None)}")
         logger.info(f"Actually using GPU: {torch.cuda.current_device()}")
-        print(f"[PIPE-INIT] entering with self.device ({self.device}) ...", flush=True)
         with self.device:
             self.decode_formats = decode_formats
             self.pad_size = pad_size
@@ -130,41 +129,33 @@ class InferencePipeline:
             self.slat_preprocessor = slat_preprocessor
             logger.info("Loading model weights...")
-            print("[PIPE-INIT] Loading ss_generator ...", flush=True)
             ss_generator = self.init_ss_generator(
                 ss_generator_config_path, ss_generator_ckpt_path
             )
-            print("[PIPE-INIT] Loading slat_generator ...", flush=True)
             slat_generator = self.init_slat_generator(
                 slat_generator_config_path, slat_generator_ckpt_path
             )
-            print("[PIPE-INIT] Loading ss_decoder ...", flush=True)
             ss_decoder = self.init_ss_decoder(
                 ss_decoder_config_path, ss_decoder_ckpt_path
             )
-            print("[PIPE-INIT] Loading ss_encoder ...", flush=True)
             ss_encoder = self.init_ss_encoder(
                 ss_encoder_config_path, ss_encoder_ckpt_path
             )
-            print("[PIPE-INIT] Loading slat_decoder_gs ...", flush=True)
             slat_decoder_gs = self.init_slat_decoder_gs(
                 slat_decoder_gs_config_path, slat_decoder_gs_ckpt_path
             )
-            print("[PIPE-INIT] Loading slat_decoder_gs_4 ...", flush=True)
             slat_decoder_gs_4 = self.init_slat_decoder_gs(
                 slat_decoder_gs_4_config_path, slat_decoder_gs_4_ckpt_path
             )
-            print("[PIPE-INIT] Loading slat_decoder_mesh ...", flush=True)
             slat_decoder_mesh = self.init_slat_decoder_mesh(
                 slat_decoder_mesh_config_path, slat_decoder_mesh_ckpt_path
             )
             # Load conditioner embedder so that we only load it once
-            print("[PIPE-INIT] Loading ss_condition_embedder ...", flush=True)
             ss_condition_embedder = self.init_ss_condition_embedder(
                 ss_generator_config_path, ss_generator_ckpt_path
             )
-            print("[PIPE-INIT] Loading slat_condition_embedder ...", flush=True)
             slat_condition_embedder = self.init_slat_condition_embedder(
                 slat_generator_config_path, slat_generator_ckpt_path
             )
@@ -202,7 +193,6 @@ class InferencePipeline:
                     "slat_decoder_mesh": slat_decoder_mesh,
                 }
             )
-            print("[PIPE-INIT] All models loaded into ModuleDict.", flush=True)
             logger.info("Loading model weights completed!")
             if self.compile_model:

         logger.info(f"self.device: {self.device}")
         logger.info(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', None)}")
         logger.info(f"Actually using GPU: {torch.cuda.current_device()}")
         with self.device:
             self.decode_formats = decode_formats
             self.pad_size = pad_size
             self.slat_preprocessor = slat_preprocessor
             logger.info("Loading model weights...")
             ss_generator = self.init_ss_generator(
                 ss_generator_config_path, ss_generator_ckpt_path
             )
             slat_generator = self.init_slat_generator(
                 slat_generator_config_path, slat_generator_ckpt_path
             )
             ss_decoder = self.init_ss_decoder(
                 ss_decoder_config_path, ss_decoder_ckpt_path
             )
             ss_encoder = self.init_ss_encoder(
                 ss_encoder_config_path, ss_encoder_ckpt_path
             )
             slat_decoder_gs = self.init_slat_decoder_gs(
                 slat_decoder_gs_config_path, slat_decoder_gs_ckpt_path
             )
             slat_decoder_gs_4 = self.init_slat_decoder_gs(
                 slat_decoder_gs_4_config_path, slat_decoder_gs_4_ckpt_path
             )
             slat_decoder_mesh = self.init_slat_decoder_mesh(
                 slat_decoder_mesh_config_path, slat_decoder_mesh_ckpt_path
             )
             # Load conditioner embedder so that we only load it once
             ss_condition_embedder = self.init_ss_condition_embedder(
                 ss_generator_config_path, ss_generator_ckpt_path
             )
             slat_condition_embedder = self.init_slat_condition_embedder(
                 slat_generator_config_path, slat_generator_ckpt_path
             )
                     "slat_decoder_mesh": slat_decoder_mesh,
                 }
             )
             logger.info("Loading model weights completed!")
             if self.compile_model:

thirdparty/sam3d/sam3d_objects/pipeline/inference_pipeline_pointmap.py CHANGED Viewed

@@ -332,11 +332,8 @@ class InferencePipelinePointMap(InferencePipeline):
         estimate_plane=False,
     ) -> dict:
         image = self.merge_image_and_mask(image, mask)
-        print(f"[PIPE-RUN] entering with self.device ({self.device}) ...", flush=True)
         with self.device:
-            print("[PIPE-RUN] compute_pointmap ...", flush=True)
             pointmap_dict = self.compute_pointmap(image, pointmap)
-            print("[PIPE-RUN] compute_pointmap done.", flush=True)
             pointmap = pointmap_dict["pointmap"]
             pts = type(self)._down_sample_img(pointmap)
             pts_colors = type(self)._down_sample_img(pointmap_dict["pts_color"])
@@ -344,21 +341,18 @@ class InferencePipelinePointMap(InferencePipeline):
             if estimate_plane:
                 return self.estimate_plane(pointmap_dict, image)
-            print("[PIPE-RUN] preprocess_image (ss) ...", flush=True)
             ss_input_dict = self.preprocess_image(
                 image, self.ss_preprocessor, pointmap=pointmap
             )
-            print("[PIPE-RUN] preprocess_image (slat) ...", flush=True)
             slat_input_dict = self.preprocess_image(image, self.slat_preprocessor)
             if seed is not None:
                 torch.manual_seed(seed)
-            print("[PIPE-RUN] sample_sparse_structure (stage1) ...", flush=True)
             ss_return_dict = self.sample_sparse_structure(
                 ss_input_dict,
                 inference_steps=stage1_inference_steps,
                 use_distillation=use_stage1_distillation,
             )
-            print("[PIPE-RUN] sample_sparse_structure done.", flush=True)
             # We could probably use the decoder from the models themselves
             pointmap_scale = ss_input_dict.get("pointmap_scale", None)
@@ -385,20 +379,15 @@ class InferencePipelinePointMap(InferencePipeline):
                 # return ss_return_dict
             coords = ss_return_dict["coords"]
-            print("[PIPE-RUN] sample_slat (stage2) ...", flush=True)
             slat = self.sample_slat(
                 slat_input_dict,
                 coords,
                 inference_steps=stage2_inference_steps,
                 use_distillation=use_stage2_distillation,
             )
-            print("[PIPE-RUN] sample_slat done.", flush=True)
-            print("[PIPE-RUN] decode_slat ...", flush=True)
             outputs = self.decode_slat(
                 slat, self.decode_formats if decode_formats is None else decode_formats
             )
-            print("[PIPE-RUN] decode_slat done.", flush=True)
-            print("[PIPE-RUN] postprocess_slat_output ...", flush=True)
             outputs = self.postprocess_slat_output(
                 outputs, with_mesh_postprocess, with_texture_baking, use_vertex_color
             )
@@ -424,7 +413,6 @@ class InferencePipelinePointMap(InferencePipeline):
                 )
             # glb.export("sample.glb")
-            print("[PIPE-RUN] ALL DONE, returning results.", flush=True)
             logger.info("Finished!")
             return {

         estimate_plane=False,
     ) -> dict:
         image = self.merge_image_and_mask(image, mask)
         with self.device:
             pointmap_dict = self.compute_pointmap(image, pointmap)
             pointmap = pointmap_dict["pointmap"]
             pts = type(self)._down_sample_img(pointmap)
             pts_colors = type(self)._down_sample_img(pointmap_dict["pts_color"])
             if estimate_plane:
                 return self.estimate_plane(pointmap_dict, image)
             ss_input_dict = self.preprocess_image(
                 image, self.ss_preprocessor, pointmap=pointmap
             )
             slat_input_dict = self.preprocess_image(image, self.slat_preprocessor)
             if seed is not None:
                 torch.manual_seed(seed)
             ss_return_dict = self.sample_sparse_structure(
                 ss_input_dict,
                 inference_steps=stage1_inference_steps,
                 use_distillation=use_stage1_distillation,
             )
             # We could probably use the decoder from the models themselves
             pointmap_scale = ss_input_dict.get("pointmap_scale", None)
                 # return ss_return_dict
             coords = ss_return_dict["coords"]
             slat = self.sample_slat(
                 slat_input_dict,
                 coords,
                 inference_steps=stage2_inference_steps,
                 use_distillation=use_stage2_distillation,
             )
             outputs = self.decode_slat(
                 slat, self.decode_formats if decode_formats is None else decode_formats
             )
             outputs = self.postprocess_slat_output(
                 outputs, with_mesh_postprocess, with_texture_baking, use_vertex_color
             )
                 )
             # glb.export("sample.glb")
             logger.info("Finished!")
             return {